/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
91
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
        unsigned long delta;
        ktime_t soft, hard, now;

        for (;;) {
                if (hrtimer_active(period_timer))
                        break;

                now = hrtimer_cb_get_time(period_timer);
                hrtimer_forward(period_timer, now, period);

                soft = hrtimer_get_softexpires(period_timer);
                hard = hrtimer_get_expires(period_timer);
                delta = ktime_to_ns(ktime_sub(hard, soft));
                __hrtimer_start_range_ns(period_timer, soft, delta,
                                         HRTIMER_MODE_ABS_PINNED, 0);
        }
}
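/*
 * Illustrative sketch (not part of this file): a bandwidth mechanism such as
 * CFS or RT throttling would arm its period timer roughly like this, assuming
 * a hypothetical my_bandwidth struct that embeds an hrtimer and a period:
 *
 *      static void my_bandwidth_start(struct my_bandwidth *b)
 *      {
 *              if (!hrtimer_active(&b->period_timer))
 *                      start_bandwidth_timer(&b->period_timer, b->period);
 *      }
 *
 * The loop above forwards the timer past 'now' and re-arms it with the same
 * soft/hard expiry range, pinned to the current CPU.
 */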
111
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

void update_rq_clock(struct rq *rq)
{
        s64 delta;

        if (rq->skip_clock_update > 0)
                return;

        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
        rq->clock += delta;
        update_rq_clock_task(rq, delta);
}
128
129
130
131
132
133#define SCHED_FEAT(name, enabled) \
134 (1UL << __SCHED_FEAT_##name) * enabled |
135
136const_debug unsigned int sysctl_sched_features =
137#include "features.h"
138 0;
139
140#undef SCHED_FEAT
141
142#ifdef CONFIG_SCHED_DEBUG
143#define SCHED_FEAT(name, enabled) \
144 #name ,
145
146static const char * const sched_feat_names[] = {
147#include "features.h"
148};
149
150#undef SCHED_FEAT
151
152static int sched_feat_show(struct seq_file *m, void *v)
153{
154 int i;
155
156 for (i = 0; i < __SCHED_FEAT_NR; i++) {
157 if (!(sysctl_sched_features & (1UL << i)))
158 seq_puts(m, "NO_");
159 seq_printf(m, "%s ", sched_feat_names[i]);
160 }
161 seq_puts(m, "\n");
162
163 return 0;
164}
165
166#ifdef HAVE_JUMP_LABEL
167
168#define jump_label_key__true STATIC_KEY_INIT_TRUE
169#define jump_label_key__false STATIC_KEY_INIT_FALSE
170
171#define SCHED_FEAT(name, enabled) \
172 jump_label_key__##enabled ,
173
174struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
175#include "features.h"
176};
177
178#undef SCHED_FEAT
179
180static void sched_feat_disable(int i)
181{
182 if (static_key_enabled(&sched_feat_keys[i]))
183 static_key_slow_dec(&sched_feat_keys[i]);
184}
185
186static void sched_feat_enable(int i)
187{
188 if (!static_key_enabled(&sched_feat_keys[i]))
189 static_key_slow_inc(&sched_feat_keys[i]);
190}
191#else
192static void sched_feat_disable(int i) { };
193static void sched_feat_enable(int i) { };
194#endif
195
196static int sched_feat_set(char *cmp)
197{
198 int i;
199 int neg = 0;
200
201 if (strncmp(cmp, "NO_", 3) == 0) {
202 neg = 1;
203 cmp += 3;
204 }
205
206 for (i = 0; i < __SCHED_FEAT_NR; i++) {
207 if (strcmp(cmp, sched_feat_names[i]) == 0) {
208 if (neg) {
209 sysctl_sched_features &= ~(1UL << i);
210 sched_feat_disable(i);
211 } else {
212 sysctl_sched_features |= (1UL << i);
213 sched_feat_enable(i);
214 }
215 break;
216 }
217 }
218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
240 if (i == __SCHED_FEAT_NR)
241 return -EINVAL;
242
243 *ppos += cnt;
244
245 return cnt;
246}
247
static int sched_feat_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
        .open           = sched_feat_open,
        .write          = sched_feat_write,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static __init int sched_init_debug(void)
{
        debugfs_create_file("sched_features", 0644, NULL, NULL,
                        &sched_feat_fops);

        return 0;
}
late_initcall(sched_init_debug);
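/*
 * Usage note (illustrative, assuming debugfs is mounted at /sys/kernel/debug):
 * the file created above can be read to list features and written to toggle
 * them, e.g.:
 *
 *      cat /sys/kernel/debug/sched_features
 *      echo NO_TTWU_QUEUE > /sys/kernel/debug/sched_features
 *      echo TTWU_QUEUE    > /sys/kernel/debug/sched_features
 *
 * Writes go through sched_feat_write()/sched_feat_set(); an unknown feature
 * name is rejected with -EINVAL.
 */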
269#endif
270
271
272
273
274
275const_debug unsigned int sysctl_sched_nr_migrate = 32;
276
277
278
279
280
281
282
283const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
284
285
286
287
288
289unsigned int sysctl_sched_rt_period = 1000000;
290
291__read_mostly int scheduler_running;
292
293
294
295
296
297int sysctl_sched_rt_runtime = 950000;
298
299
300
301
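/*
 * __task_rq_lock() locks the runqueue @p currently resides on. Because the
 * task may be migrated between the task_rq(p) lookup and taking rq->lock,
 * the loop re-checks task_rq(p) under the lock and retries until both agree.
 * The caller must already hold p->pi_lock (see the lockdep assertion), which
 * blocks wakeup-time migrations; load-balancer migrations are caught by the
 * re-check under rq->lock.
 */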
static inline struct rq *__task_rq_lock(struct task_struct *p)
        __acquires(rq->lock)
{
        struct rq *rq;

        lockdep_assert_held(&p->pi_lock);

        for (;;) {
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
                if (likely(rq == task_rq(p)))
                        return rq;
                raw_spin_unlock(&rq->lock);
        }
}
317
318
319
320
321static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
322 __acquires(p->pi_lock)
323 __acquires(rq->lock)
324{
325 struct rq *rq;
326
327 for (;;) {
328 raw_spin_lock_irqsave(&p->pi_lock, *flags);
329 rq = task_rq(p);
330 raw_spin_lock(&rq->lock);
331 if (likely(rq == task_rq(p)))
332 return rq;
333 raw_spin_unlock(&rq->lock);
334 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
335 }
336}
337
338static void __task_rq_unlock(struct rq *rq)
339 __releases(rq->lock)
340{
341 raw_spin_unlock(&rq->lock);
342}
343
344static inline void
345task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
346 __releases(rq->lock)
347 __releases(p->pi_lock)
348{
349 raw_spin_unlock(&rq->lock);
350 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
351}
352
353
354
355
356static struct rq *this_rq_lock(void)
357 __acquires(rq->lock)
358{
359 struct rq *rq;
360
361 local_irq_disable();
362 rq = this_rq();
363 raw_spin_lock(&rq->lock);
364
365 return rq;
366}
367
368#ifdef CONFIG_SCHED_HRTICK
369
370
371
372
373static void hrtick_clear(struct rq *rq)
374{
375 if (hrtimer_active(&rq->hrtick_timer))
376 hrtimer_cancel(&rq->hrtick_timer);
377}
378
379
380
381
382
383static enum hrtimer_restart hrtick(struct hrtimer *timer)
384{
385 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
386
387 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
388
389 raw_spin_lock(&rq->lock);
390 update_rq_clock(rq);
391 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
392 raw_spin_unlock(&rq->lock);
393
394 return HRTIMER_NORESTART;
395}
396
397#ifdef CONFIG_SMP
398
399static int __hrtick_restart(struct rq *rq)
400{
401 struct hrtimer *timer = &rq->hrtick_timer;
402 ktime_t time = hrtimer_get_softexpires(timer);
403
404 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
405}
406
407
408
409
410static void __hrtick_start(void *arg)
411{
412 struct rq *rq = arg;
413
414 raw_spin_lock(&rq->lock);
415 __hrtick_restart(rq);
416 rq->hrtick_csd_pending = 0;
417 raw_spin_unlock(&rq->lock);
418}
419
420
421
422
423
424
425void hrtick_start(struct rq *rq, u64 delay)
426{
427 struct hrtimer *timer = &rq->hrtick_timer;
428 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
429
430 hrtimer_set_expires(timer, time);
431
432 if (rq == this_rq()) {
433 __hrtick_restart(rq);
434 } else if (!rq->hrtick_csd_pending) {
435 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
436 rq->hrtick_csd_pending = 1;
437 }
438}
439
440static int
441hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
442{
443 int cpu = (int)(long)hcpu;
444
445 switch (action) {
446 case CPU_UP_CANCELED:
447 case CPU_UP_CANCELED_FROZEN:
448 case CPU_DOWN_PREPARE:
449 case CPU_DOWN_PREPARE_FROZEN:
450 case CPU_DEAD:
451 case CPU_DEAD_FROZEN:
452 hrtick_clear(cpu_rq(cpu));
453 return NOTIFY_OK;
454 }
455
456 return NOTIFY_DONE;
457}
458
459static __init void init_hrtick(void)
460{
461 hotcpu_notifier(hotplug_hrtick, 0);
462}
463#else
464
465
466
467
468
469void hrtick_start(struct rq *rq, u64 delay)
470{
471 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
472 HRTIMER_MODE_REL_PINNED, 0);
473}
474
475static inline void init_hrtick(void)
476{
477}
478#endif
479
480static void init_rq_hrtick(struct rq *rq)
481{
482#ifdef CONFIG_SMP
483 rq->hrtick_csd_pending = 0;
484
485 rq->hrtick_csd.flags = 0;
486 rq->hrtick_csd.func = __hrtick_start;
487 rq->hrtick_csd.info = rq;
488#endif
489
490 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
491 rq->hrtick_timer.function = hrtick;
492}
493#else
494static inline void hrtick_clear(struct rq *rq)
495{
496}
497
498static inline void init_rq_hrtick(struct rq *rq)
499{
500}
501
502static inline void init_hrtick(void)
503{
504}
505#endif
506
507
508
509
510
511
512
513
void resched_task(struct task_struct *p)
{
        int cpu;

        lockdep_assert_held(&task_rq(p)->lock);

        if (test_tsk_need_resched(p))
                return;

        set_tsk_need_resched(p);

        cpu = task_cpu(p);
        if (cpu == smp_processor_id()) {
                set_preempt_need_resched();
                return;
        }

        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
        if (!tsk_is_polling(p))
                smp_send_reschedule(cpu);
}

void resched_cpu(int cpu)
{
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;

        if (!raw_spin_trylock_irqsave(&rq->lock, flags))
                return;
        resched_task(cpu_curr(cpu));
        raw_spin_unlock_irqrestore(&rq->lock, flags);
}
547
548#ifdef CONFIG_SMP
549#ifdef CONFIG_NO_HZ_COMMON
550
551
552
553
554
555
556
557
558int get_nohz_timer_target(void)
559{
560 int cpu = smp_processor_id();
561 int i;
562 struct sched_domain *sd;
563
564 rcu_read_lock();
565 for_each_domain(cpu, sd) {
566 for_each_cpu(i, sched_domain_span(sd)) {
567 if (!idle_cpu(i)) {
568 cpu = i;
569 goto unlock;
570 }
571 }
572 }
573unlock:
574 rcu_read_unlock();
575 return cpu;
576}
577
578
579
580
581
582
583
584
585
586
587static void wake_up_idle_cpu(int cpu)
588{
589 struct rq *rq = cpu_rq(cpu);
590
591 if (cpu == smp_processor_id())
592 return;
593
594
595
596
597
598
599
600
601 if (rq->curr != rq->idle)
602 return;
603
604
605
606
607
608
609 set_tsk_need_resched(rq->idle);
610
611
612 smp_mb();
613 if (!tsk_is_polling(rq->idle))
614 smp_send_reschedule(cpu);
615}
616
617static bool wake_up_full_nohz_cpu(int cpu)
618{
619 if (tick_nohz_full_cpu(cpu)) {
620 if (cpu != smp_processor_id() ||
621 tick_nohz_tick_stopped())
622 smp_send_reschedule(cpu);
623 return true;
624 }
625
626 return false;
627}
628
629void wake_up_nohz_cpu(int cpu)
630{
631 if (!wake_up_full_nohz_cpu(cpu))
632 wake_up_idle_cpu(cpu);
633}
634
635static inline bool got_nohz_idle_kick(void)
636{
637 int cpu = smp_processor_id();
638
639 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
640 return false;
641
642 if (idle_cpu(cpu) && !need_resched())
643 return true;
644
645
646
647
648
649 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
650 return false;
651}
652
653#else
654
655static inline bool got_nohz_idle_kick(void)
656{
657 return false;
658}
659
660#endif
661
662#ifdef CONFIG_NO_HZ_FULL
663bool sched_can_stop_tick(void)
664{
665 struct rq *rq;
666
667 rq = this_rq();
668
669
670 smp_rmb();
671
672
673 if (rq->nr_running > 1)
674 return false;
675
676 return true;
677}
678#endif
679
680void sched_avg_update(struct rq *rq)
681{
682 s64 period = sched_avg_period();
683
684 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
685
686
687
688
689
690 asm("" : "+rm" (rq->age_stamp));
691 rq->age_stamp += period;
692 rq->rt_avg /= 2;
693 }
694}
695
696#endif
697
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
                        (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate the task_group tree rooted at *from, calling @down when first
 * entering a node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
                             tg_visitor down, tg_visitor up, void *data)
{
        struct task_group *parent, *child;
        int ret;

        parent = from;

down:
        ret = (*down)(parent, data);
        if (ret)
                goto out;
        list_for_each_entry_rcu(child, &parent->children, siblings) {
                parent = child;
                goto down;

up:
                continue;
        }
        ret = (*up)(parent, data);
        if (ret || parent == from)
                goto out;

        child = parent;
        parent = parent->parent;
        if (parent)
                goto up;
out:
        return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
        return 0;
}
#endif
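/*
 * Illustrative sketch (not part of this file): a typical caller walks the
 * whole hierarchy from the root, passing tg_nop() for the direction it does
 * not care about. The callback name below is hypothetical:
 *
 *      static int tg_print_one(struct task_group *tg, void *data)
 *      {
 *              pr_info("visiting task_group %p\n", tg);
 *              return 0;       // non-zero would abort the walk
 *      }
 *
 *      rcu_read_lock();
 *      walk_tg_tree_from(&root_task_group, tg_print_one, tg_nop, NULL);
 *      rcu_read_unlock();
 *
 * (The walk_tg_tree() helper in sched.h wraps exactly this root_task_group
 * case.)
 */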
742
static void set_load_weight(struct task_struct *p)
{
        int prio = p->static_prio - MAX_RT_PRIO;
        struct load_weight *load = &p->se.load;

        /*
         * SCHED_IDLE tasks get minimal weight:
         */
        if (p->policy == SCHED_IDLE) {
                load->weight = scale_load(WEIGHT_IDLEPRIO);
                load->inv_weight = WMULT_IDLEPRIO;
                return;
        }

        load->weight = scale_load(prio_to_weight[prio]);
        load->inv_weight = prio_to_wmult[prio];
}
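/*
 * Worked example: prio_to_weight[] is indexed by static_prio - MAX_RT_PRIO,
 * i.e. by nice + 20. A nice-0 task gets weight 1024 (NICE_0_LOAD); adjacent
 * nice levels differ by roughly 1.25x, so nice -1 maps to 1277 and nice +1
 * to 820. inv_weight caches 2^32 / weight so that later vruntime scaling can
 * use a multiply and shift instead of a division.
 */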
760
761static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
762{
763 update_rq_clock(rq);
764 sched_info_queued(rq, p);
765 p->sched_class->enqueue_task(rq, p, flags);
766}
767
768static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
769{
770 update_rq_clock(rq);
771 sched_info_dequeued(rq, p);
772 p->sched_class->dequeue_task(rq, p, flags);
773}
774
775void activate_task(struct rq *rq, struct task_struct *p, int flags)
776{
777 if (task_contributes_to_load(p))
778 rq->nr_uninterruptible--;
779
780 enqueue_task(rq, p, flags);
781}
782
783void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
784{
785 if (task_contributes_to_load(p))
786 rq->nr_uninterruptible++;
787
788 dequeue_task(rq, p, flags);
789}
790
791static void update_rq_clock_task(struct rq *rq, s64 delta)
792{
793
794
795
796
797#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
798 s64 steal = 0, irq_delta = 0;
799#endif
800#ifdef CONFIG_IRQ_TIME_ACCOUNTING
801 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818 if (irq_delta > delta)
819 irq_delta = delta;
820
821 rq->prev_irq_time += irq_delta;
822 delta -= irq_delta;
823#endif
824#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
        if (static_key_false((&paravirt_steal_rq_enabled))) {
826 u64 st;
827
828 steal = paravirt_steal_clock(cpu_of(rq));
829 steal -= rq->prev_steal_time_rq;
830
831 if (unlikely(steal > delta))
832 steal = delta;
833
834 st = steal_ticks(steal);
835 steal = st * TICK_NSEC;
836
837 rq->prev_steal_time_rq += steal;
838
839 delta -= steal;
840 }
841#endif
842
843 rq->clock_task += delta;
844
845#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
846 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
847 sched_rt_avg_update(rq, irq_delta + steal);
848#endif
849}
850
851void sched_set_stop_task(int cpu, struct task_struct *stop)
852{
853 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
854 struct task_struct *old_stop = cpu_rq(cpu)->stop;
855
856 if (stop) {
857
858
859
860
861
862
863
864
        sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
866
867 stop->sched_class = &stop_sched_class;
868 }
869
870 cpu_rq(cpu)->stop = stop;
871
872 if (old_stop) {
873
874
875
876
877 old_stop->sched_class = &rt_sched_class;
878 }
879}
880
881
882
883
static inline int __normal_prio(struct task_struct *p)
{
        return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
        int prio;

        if (task_has_dl_policy(p))
                prio = MAX_DL_PRIO-1;
        else if (task_has_rt_policy(p))
                prio = MAX_RT_PRIO-1 - p->rt_priority;
        else
                prio = __normal_prio(p);
        return prio;
}
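/*
 * Worked example of the priority ranges used above: deadline tasks get
 * prio MAX_DL_PRIO-1 == -1, realtime tasks map rt_priority 1..99 onto
 * prio 98..0 (MAX_RT_PRIO-1 - rt_priority), and normal tasks keep their
 * static_prio, i.e. 100..139 for nice -20..+19 with 120 at nice 0.
 */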
908
909
910
911
912
913
914
915
916static int effective_prio(struct task_struct *p)
917{
918 p->normal_prio = normal_prio(p);
919
920
921
922
923
924 if (!rt_prio(p->prio))
925 return p->normal_prio;
926 return p->prio;
927}
928
929
930
931
932
933
934
935inline int task_curr(const struct task_struct *p)
936{
937 return cpu_curr(task_cpu(p)) == p;
938}
939
940static inline void check_class_changed(struct rq *rq, struct task_struct *p,
941 const struct sched_class *prev_class,
942 int oldprio)
943{
944 if (prev_class != p->sched_class) {
945 if (prev_class->switched_from)
946 prev_class->switched_from(rq, p);
947 p->sched_class->switched_to(rq, p);
948 } else if (oldprio != p->prio || dl_task(p))
949 p->sched_class->prio_changed(rq, p, oldprio);
950}
951
952void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
953{
954 const struct sched_class *class;
955
956 if (p->sched_class == rq->curr->sched_class) {
957 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
958 } else {
959 for_each_class(class) {
960 if (class == rq->curr->sched_class)
961 break;
962 if (class == p->sched_class) {
963 resched_task(rq->curr);
964 break;
965 }
966 }
967 }
968
969
970
971
972
973 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
974 rq->skip_clock_update = 1;
975}
976
977#ifdef CONFIG_SMP
978void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
979{
980#ifdef CONFIG_SCHED_DEBUG
981
982
983
984
985 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
986 !(task_preempt_count(p) & PREEMPT_ACTIVE));
987
988#ifdef CONFIG_LOCKDEP
989
990
991
992
993
994
995
996
997
998
999 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1000 lockdep_is_held(&task_rq(p)->lock)));
1001#endif
1002#endif
1003
1004 trace_sched_migrate_task(p, new_cpu);
1005
1006 if (task_cpu(p) != new_cpu) {
1007 if (p->sched_class->migrate_task_rq)
1008 p->sched_class->migrate_task_rq(p, new_cpu);
1009 p->se.nr_migrations++;
1010 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1011 }
1012
1013 __set_task_cpu(p, new_cpu);
1014}
1015
1016static void __migrate_swap_task(struct task_struct *p, int cpu)
1017{
1018 if (p->on_rq) {
1019 struct rq *src_rq, *dst_rq;
1020
1021 src_rq = task_rq(p);
1022 dst_rq = cpu_rq(cpu);
1023
1024 deactivate_task(src_rq, p, 0);
1025 set_task_cpu(p, cpu);
1026 activate_task(dst_rq, p, 0);
1027 check_preempt_curr(dst_rq, p, 0);
1028 } else {
1029
1030
1031
1032
1033
1034 p->wake_cpu = cpu;
1035 }
1036}
1037
1038struct migration_swap_arg {
1039 struct task_struct *src_task, *dst_task;
1040 int src_cpu, dst_cpu;
1041};
1042
1043static int migrate_swap_stop(void *data)
1044{
1045 struct migration_swap_arg *arg = data;
1046 struct rq *src_rq, *dst_rq;
1047 int ret = -EAGAIN;
1048
1049 src_rq = cpu_rq(arg->src_cpu);
1050 dst_rq = cpu_rq(arg->dst_cpu);
1051
1052 double_raw_lock(&arg->src_task->pi_lock,
1053 &arg->dst_task->pi_lock);
1054 double_rq_lock(src_rq, dst_rq);
1055 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1056 goto unlock;
1057
1058 if (task_cpu(arg->src_task) != arg->src_cpu)
1059 goto unlock;
1060
1061 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1062 goto unlock;
1063
1064 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1065 goto unlock;
1066
1067 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1068 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1069
1070 ret = 0;
1071
1072unlock:
1073 double_rq_unlock(src_rq, dst_rq);
1074 raw_spin_unlock(&arg->dst_task->pi_lock);
1075 raw_spin_unlock(&arg->src_task->pi_lock);
1076
1077 return ret;
1078}
1079
1080
1081
1082
1083int migrate_swap(struct task_struct *cur, struct task_struct *p)
1084{
1085 struct migration_swap_arg arg;
1086 int ret = -EINVAL;
1087
1088 arg = (struct migration_swap_arg){
1089 .src_task = cur,
1090 .src_cpu = task_cpu(cur),
1091 .dst_task = p,
1092 .dst_cpu = task_cpu(p),
1093 };
1094
1095 if (arg.src_cpu == arg.dst_cpu)
1096 goto out;
1097
1098
1099
1100
1101
1102 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1103 goto out;
1104
1105 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1106 goto out;
1107
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out;
1110
1111 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1112 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1113
1114out:
1115 return ret;
1116}
1117
1118struct migration_arg {
1119 struct task_struct *task;
1120 int dest_cpu;
1121};
1122
1123static int migration_cpu_stop(void *data);
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
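/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is non-zero, it is the @p->state value the caller just
 * observed and expects not to change; if the state does change (i.e. @p may
 * have been woken) zero is returned. On success, a non-zero cookie derived
 * from @p's voluntary context-switch count is returned; if a later call
 * returns the same cookie, the task is known not to have run in between.
 *
 * The caller must ensure the task will unschedule soon, otherwise this can
 * spin for a long time, and it must not be called with interrupts disabled.
 */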
1141unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1142{
1143 unsigned long flags;
1144 int running, on_rq;
1145 unsigned long ncsw;
1146 struct rq *rq;
1147
1148 for (;;) {
1149
1150
1151
1152
1153
1154
1155 rq = task_rq(p);
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168 while (task_running(rq, p)) {
1169 if (match_state && unlikely(p->state != match_state))
1170 return 0;
1171 cpu_relax();
1172 }
1173
1174
1175
1176
1177
1178
1179 rq = task_rq_lock(p, &flags);
1180 trace_sched_wait_task(p);
1181 running = task_running(rq, p);
1182 on_rq = p->on_rq;
1183 ncsw = 0;
1184 if (!match_state || p->state == match_state)
1185 ncsw = p->nvcsw | LONG_MIN;
1186 task_rq_unlock(rq, p, &flags);
1187
1188
1189
1190
1191 if (unlikely(!ncsw))
1192 break;
1193
1194
1195
1196
1197
1198
1199
1200 if (unlikely(running)) {
1201 cpu_relax();
1202 continue;
1203 }
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214 if (unlikely(on_rq)) {
1215 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1216
1217 set_current_state(TASK_UNINTERRUPTIBLE);
1218 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1219 continue;
1220 }
1221
1222
1223
1224
1225
1226
1227 break;
1228 }
1229
1230 return ncsw;
1231}
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246void kick_process(struct task_struct *p)
1247{
1248 int cpu;
1249
1250 preempt_disable();
1251 cpu = task_cpu(p);
1252 if ((cpu != smp_processor_id()) && task_curr(p))
1253 smp_send_reschedule(cpu);
1254 preempt_enable();
1255}
1256EXPORT_SYMBOL_GPL(kick_process);
1257#endif
1258
1259#ifdef CONFIG_SMP
1260
1261
1262
1263static int select_fallback_rq(int cpu, struct task_struct *p)
1264{
1265 int nid = cpu_to_node(cpu);
1266 const struct cpumask *nodemask = NULL;
1267 enum { cpuset, possible, fail } state = cpuset;
1268 int dest_cpu;
1269
1270
1271
1272
1273
1274
1275 if (nid != -1) {
1276 nodemask = cpumask_of_node(nid);
1277
1278
1279 for_each_cpu(dest_cpu, nodemask) {
1280 if (!cpu_online(dest_cpu))
1281 continue;
1282 if (!cpu_active(dest_cpu))
1283 continue;
1284 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1285 return dest_cpu;
1286 }
1287 }
1288
1289 for (;;) {
1290
1291 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1292 if (!cpu_online(dest_cpu))
1293 continue;
1294 if (!cpu_active(dest_cpu))
1295 continue;
1296 goto out;
1297 }
1298
1299 switch (state) {
1300 case cpuset:
1301
1302 cpuset_cpus_allowed_fallback(p);
1303 state = possible;
1304 break;
1305
1306 case possible:
1307 do_set_cpus_allowed(p, cpu_possible_mask);
1308 state = fail;
1309 break;
1310
1311 case fail:
1312 BUG();
1313 break;
1314 }
1315 }
1316
1317out:
1318 if (state != cpuset) {
1319
1320
1321
1322
1323
1324 if (p->mm && printk_ratelimit()) {
1325 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1326 task_pid_nr(p), p->comm, cpu);
1327 }
1328 }
1329
1330 return dest_cpu;
1331}
1332
1333
1334
1335
1336static inline
1337int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1338{
1339 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1352 !cpu_online(cpu)))
1353 cpu = select_fallback_rq(task_cpu(p), p);
1354
1355 return cpu;
1356}
1357
static void update_avg(u64 *avg, u64 sample)
{
        s64 diff = sample - *avg;
        *avg += diff >> 3;
}
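/*
 * update_avg() maintains a simple exponential moving average:
 * avg += (sample - avg) / 8, i.e. new_avg = 7/8 * old_avg + 1/8 * sample.
 * For example, avg = 800 and sample = 1600 gives 800 + (800 >> 3) = 900.
 * It is used below to track rq->avg_idle from observed idle periods.
 */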
1363#endif
1364
1365static void
1366ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1367{
1368#ifdef CONFIG_SCHEDSTATS
1369 struct rq *rq = this_rq();
1370
1371#ifdef CONFIG_SMP
1372 int this_cpu = smp_processor_id();
1373
1374 if (cpu == this_cpu) {
1375 schedstat_inc(rq, ttwu_local);
1376 schedstat_inc(p, se.statistics.nr_wakeups_local);
1377 } else {
1378 struct sched_domain *sd;
1379
1380 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1381 rcu_read_lock();
1382 for_each_domain(this_cpu, sd) {
1383 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1384 schedstat_inc(sd, ttwu_wake_remote);
1385 break;
1386 }
1387 }
1388 rcu_read_unlock();
1389 }
1390
1391 if (wake_flags & WF_MIGRATED)
1392 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1393
1394#endif
1395
1396 schedstat_inc(rq, ttwu_count);
1397 schedstat_inc(p, se.statistics.nr_wakeups);
1398
1399 if (wake_flags & WF_SYNC)
1400 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1401
1402#endif
1403}
1404
1405static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1406{
1407 activate_task(rq, p, en_flags);
1408 p->on_rq = 1;
1409
1410
1411 if (p->flags & PF_WQ_WORKER)
1412 wq_worker_waking_up(p, cpu_of(rq));
1413}
1414
1415
1416
1417
1418static void
1419ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1420{
1421 check_preempt_curr(rq, p, wake_flags);
1422 trace_sched_wakeup(p, true);
1423
1424 p->state = TASK_RUNNING;
1425#ifdef CONFIG_SMP
1426 if (p->sched_class->task_woken)
1427 p->sched_class->task_woken(rq, p);
1428
1429 if (rq->idle_stamp) {
1430 u64 delta = rq_clock(rq) - rq->idle_stamp;
1431 u64 max = 2*rq->max_idle_balance_cost;
1432
1433 update_avg(&rq->avg_idle, delta);
1434
1435 if (rq->avg_idle > max)
1436 rq->avg_idle = max;
1437
1438 rq->idle_stamp = 0;
1439 }
1440#endif
1441}
1442
1443static void
1444ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1445{
1446#ifdef CONFIG_SMP
1447 if (p->sched_contributes_to_load)
1448 rq->nr_uninterruptible--;
1449#endif
1450
1451 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1452 ttwu_do_wakeup(rq, p, wake_flags);
1453}
1454
1455
1456
1457
1458
1459
1460
1461static int ttwu_remote(struct task_struct *p, int wake_flags)
1462{
1463 struct rq *rq;
1464 int ret = 0;
1465
1466 rq = __task_rq_lock(p);
1467 if (p->on_rq) {
1468
1469 update_rq_clock(rq);
1470 ttwu_do_wakeup(rq, p, wake_flags);
1471 ret = 1;
1472 }
1473 __task_rq_unlock(rq);
1474
1475 return ret;
1476}
1477
1478#ifdef CONFIG_SMP
1479static void sched_ttwu_pending(void)
1480{
1481 struct rq *rq = this_rq();
1482 struct llist_node *llist = llist_del_all(&rq->wake_list);
1483 struct task_struct *p;
1484
1485 raw_spin_lock(&rq->lock);
1486
1487 while (llist) {
1488 p = llist_entry(llist, struct task_struct, wake_entry);
1489 llist = llist_next(llist);
1490 ttwu_do_activate(rq, p, 0);
1491 }
1492
1493 raw_spin_unlock(&rq->lock);
1494}
1495
1496void scheduler_ipi(void)
1497{
1498
1499
1500
1501
1502
1503 preempt_fold_need_resched();
1504
1505 if (llist_empty(&this_rq()->wake_list)
1506 && !tick_nohz_full_cpu(smp_processor_id())
1507 && !got_nohz_idle_kick())
1508 return;
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523 irq_enter();
1524 tick_nohz_full_check();
1525 sched_ttwu_pending();
1526
1527
1528
1529
1530 if (unlikely(got_nohz_idle_kick())) {
1531 this_rq()->idle_balance = 1;
1532 raise_softirq_irqoff(SCHED_SOFTIRQ);
1533 }
1534 irq_exit();
1535}
1536
1537static void ttwu_queue_remote(struct task_struct *p, int cpu)
1538{
1539 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1540 smp_send_reschedule(cpu);
1541}
1542
1543bool cpus_share_cache(int this_cpu, int that_cpu)
1544{
1545 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1546}
1547#endif
1548
1549static void ttwu_queue(struct task_struct *p, int cpu)
1550{
1551 struct rq *rq = cpu_rq(cpu);
1552
1553#if defined(CONFIG_SMP)
1554 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1555 sched_clock_cpu(cpu);
1556 ttwu_queue_remote(p, cpu);
1557 return;
1558 }
1559#endif
1560
1561 raw_spin_lock(&rq->lock);
1562 ttwu_do_activate(rq, p, 0);
1563 raw_spin_unlock(&rq->lock);
1564}
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
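/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Puts @p on the runqueue if it is not already there. Returns 1 if @p was
 * woken up, 0 if its state did not match @state or it was already running.
 * The wakeup is done under p->pi_lock; on SMP the target CPU is chosen via
 * select_task_rq() and the task may be queued remotely through ttwu_queue().
 */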
1581static int
1582try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1583{
1584 unsigned long flags;
1585 int cpu, success = 0;
1586
1587
1588
1589
1590
1591
1592
1593 smp_mb__before_spinlock();
1594 raw_spin_lock_irqsave(&p->pi_lock, flags);
1595 if (!(p->state & state))
1596 goto out;
1597
1598 success = 1;
1599 cpu = task_cpu(p);
1600
1601 if (p->on_rq && ttwu_remote(p, wake_flags))
1602 goto stat;
1603
1604#ifdef CONFIG_SMP
1605
1606
1607
1608
1609 while (p->on_cpu)
1610 cpu_relax();
1611
1612
1613
1614 smp_rmb();
1615
1616 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1617 p->state = TASK_WAKING;
1618
1619 if (p->sched_class->task_waking)
1620 p->sched_class->task_waking(p);
1621
1622 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1623 if (task_cpu(p) != cpu) {
1624 wake_flags |= WF_MIGRATED;
1625 set_task_cpu(p, cpu);
1626 }
1627#endif
1628
1629 ttwu_queue(p, cpu);
1630stat:
1631 ttwu_stat(p, cpu, wake_flags);
1632out:
1633 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1634
1635 return success;
1636}
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646static void try_to_wake_up_local(struct task_struct *p)
1647{
1648 struct rq *rq = task_rq(p);
1649
1650 if (WARN_ON_ONCE(rq != this_rq()) ||
1651 WARN_ON_ONCE(p == current))
1652 return;
1653
1654 lockdep_assert_held(&rq->lock);
1655
1656 if (!raw_spin_trylock(&p->pi_lock)) {
1657 raw_spin_unlock(&rq->lock);
1658 raw_spin_lock(&p->pi_lock);
1659 raw_spin_lock(&rq->lock);
1660 }
1661
1662 if (!(p->state & TASK_NORMAL))
1663 goto out;
1664
1665 if (!p->on_rq)
1666 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1667
1668 ttwu_do_wakeup(rq, p, 0);
1669 ttwu_stat(p, smp_processor_id(), 0);
1670out:
1671 raw_spin_unlock(&p->pi_lock);
1672}
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686int wake_up_process(struct task_struct *p)
1687{
1688 WARN_ON(task_is_stopped_or_traced(p));
1689 return try_to_wake_up(p, TASK_NORMAL, 0);
1690}
1691EXPORT_SYMBOL(wake_up_process);
1692
1693int wake_up_state(struct task_struct *p, unsigned int state)
1694{
1695 return try_to_wake_up(p, state, 0);
1696}
1697
1698
1699
1700
1701
1702
1703
1704static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1705{
1706 p->on_rq = 0;
1707
1708 p->se.on_rq = 0;
1709 p->se.exec_start = 0;
1710 p->se.sum_exec_runtime = 0;
1711 p->se.prev_sum_exec_runtime = 0;
1712 p->se.nr_migrations = 0;
1713 p->se.vruntime = 0;
1714 INIT_LIST_HEAD(&p->se.group_node);
1715
1716#ifdef CONFIG_SCHEDSTATS
1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1718#endif
1719
1720 RB_CLEAR_NODE(&p->dl.rb_node);
1721 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1722 p->dl.dl_runtime = p->dl.runtime = 0;
1723 p->dl.dl_deadline = p->dl.deadline = 0;
1724 p->dl.dl_period = 0;
1725 p->dl.flags = 0;
1726
1727 INIT_LIST_HEAD(&p->rt.run_list);
1728
1729#ifdef CONFIG_PREEMPT_NOTIFIERS
1730 INIT_HLIST_HEAD(&p->preempt_notifiers);
1731#endif
1732
1733#ifdef CONFIG_NUMA_BALANCING
1734 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1735 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1736 p->mm->numa_scan_seq = 0;
1737 }
1738
1739 if (clone_flags & CLONE_VM)
1740 p->numa_preferred_nid = current->numa_preferred_nid;
1741 else
1742 p->numa_preferred_nid = -1;
1743
1744 p->node_stamp = 0ULL;
1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1747 p->numa_work.next = &p->numa_work;
1748 p->numa_faults = NULL;
1749 p->numa_faults_buffer = NULL;
1750
1751 INIT_LIST_HEAD(&p->numa_entry);
1752 p->numa_group = NULL;
1753#endif
1754}
1755
1756#ifdef CONFIG_NUMA_BALANCING
1757#ifdef CONFIG_SCHED_DEBUG
1758void set_numabalancing_state(bool enabled)
1759{
1760 if (enabled)
1761 sched_feat_set("NUMA");
1762 else
1763 sched_feat_set("NO_NUMA");
1764}
1765#else
1766__read_mostly bool numabalancing_enabled;
1767
1768void set_numabalancing_state(bool enabled)
1769{
1770 numabalancing_enabled = enabled;
1771}
1772#endif
1773
1774#ifdef CONFIG_PROC_SYSCTL
1775int sysctl_numa_balancing(struct ctl_table *table, int write,
1776 void __user *buffer, size_t *lenp, loff_t *ppos)
1777{
1778 struct ctl_table t;
1779 int err;
1780 int state = numabalancing_enabled;
1781
1782 if (write && !capable(CAP_SYS_ADMIN))
1783 return -EPERM;
1784
1785 t = *table;
1786 t.data = &state;
1787 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
1788 if (err < 0)
1789 return err;
1790 if (write)
1791 set_numabalancing_state(state);
1792 return err;
1793}
1794#endif
1795#endif
1796
1797
1798
1799
1800int sched_fork(unsigned long clone_flags, struct task_struct *p)
1801{
1802 unsigned long flags;
1803 int cpu = get_cpu();
1804
1805 __sched_fork(clone_flags, p);
1806
1807
1808
1809
1810
1811 p->state = TASK_RUNNING;
1812
1813
1814
1815
1816 p->prio = current->normal_prio;
1817
1818
1819
1820
1821 if (unlikely(p->sched_reset_on_fork)) {
1822 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
1823 p->policy = SCHED_NORMAL;
1824 p->static_prio = NICE_TO_PRIO(0);
1825 p->rt_priority = 0;
1826 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1827 p->static_prio = NICE_TO_PRIO(0);
1828
1829 p->prio = p->normal_prio = __normal_prio(p);
1830 set_load_weight(p);
1831
1832
1833
1834
1835
1836 p->sched_reset_on_fork = 0;
1837 }
1838
1839 if (dl_prio(p->prio)) {
1840 put_cpu();
1841 return -EAGAIN;
1842 } else if (rt_prio(p->prio)) {
1843 p->sched_class = &rt_sched_class;
1844 } else {
1845 p->sched_class = &fair_sched_class;
1846 }
1847
1848 if (p->sched_class->task_fork)
1849 p->sched_class->task_fork(p);
1850
1851
1852
1853
1854
1855
1856
1857
1858 raw_spin_lock_irqsave(&p->pi_lock, flags);
1859 set_task_cpu(p, cpu);
1860 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1861
1862#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1863 if (likely(sched_info_on()))
1864 memset(&p->sched_info, 0, sizeof(p->sched_info));
1865#endif
1866#if defined(CONFIG_SMP)
1867 p->on_cpu = 0;
1868#endif
1869 init_task_preempt_count(p);
1870#ifdef CONFIG_SMP
1871 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1872 RB_CLEAR_NODE(&p->pushable_dl_tasks);
1873#endif
1874
1875 put_cpu();
1876 return 0;
1877}
1878
unsigned long to_ratio(u64 period, u64 runtime)
{
        if (runtime == RUNTIME_INF)
                return 1ULL << 20;

        /*
         * Doing this here saves a lot of checks in all
         * the calling paths, and returning zero seems
         * safe for them anyway.
         */
        if (period == 0)
                return 0;

        return div64_u64(runtime << 20, period);
}
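/*
 * Worked example: to_ratio() expresses runtime/period as a 20-bit fixed
 * point fraction, so 1.0 corresponds to 1 << 20 == 1048576. A deadline
 * task with runtime = 10ms and period = 100ms yields
 * (10,000,000 << 20) / 100,000,000 == 104857, i.e. roughly 10% of one CPU;
 * RUNTIME_INF maps to the full 1 << 20.
 */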
1894
1895#ifdef CONFIG_SMP
1896inline struct dl_bw *dl_bw_of(int i)
1897{
1898 return &cpu_rq(i)->rd->dl_bw;
1899}
1900
1901static inline int dl_bw_cpus(int i)
1902{
1903 struct root_domain *rd = cpu_rq(i)->rd;
1904 int cpus = 0;
1905
1906 for_each_cpu_and(i, rd->span, cpu_active_mask)
1907 cpus++;
1908
1909 return cpus;
1910}
1911#else
1912inline struct dl_bw *dl_bw_of(int i)
1913{
1914 return &cpu_rq(i)->dl.dl_bw;
1915}
1916
1917static inline int dl_bw_cpus(int i)
1918{
1919 return 1;
1920}
1921#endif
1922
static inline
void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
{
        dl_b->total_bw -= tsk_bw;
}

static inline
void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
{
        dl_b->total_bw += tsk_bw;
}

static inline
bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
{
        return dl_b->bw != -1 &&
               dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}
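/*
 * Admission-control sketch: with 'cpus' CPUs in the root domain and a
 * per-CPU bandwidth limit 'bw' (derived from the global RT runtime/period,
 * -1 meaning unlimited), a request changing a task's bandwidth from old_bw
 * to new_bw is admissible iff
 *
 *      total_bw - old_bw + new_bw <= bw * cpus
 *
 * __dl_overflow() returns true exactly when this inequality is violated;
 * dl_overflow() below uses it to accept or reject the change.
 */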
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950static int dl_overflow(struct task_struct *p, int policy,
1951 const struct sched_attr *attr)
1952{
1953
1954 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1955 u64 period = attr->sched_period ?: attr->sched_deadline;
1956 u64 runtime = attr->sched_runtime;
1957 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1958 int cpus, err = -1;
1959
1960 if (new_bw == p->dl.dl_bw)
1961 return 0;
1962
1963
1964
1965
1966
1967
1968 raw_spin_lock(&dl_b->lock);
1969 cpus = dl_bw_cpus(task_cpu(p));
1970 if (dl_policy(policy) && !task_has_dl_policy(p) &&
1971 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
1972 __dl_add(dl_b, new_bw);
1973 err = 0;
1974 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
1975 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
1976 __dl_clear(dl_b, p->dl.dl_bw);
1977 __dl_add(dl_b, new_bw);
1978 err = 0;
1979 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
1980 __dl_clear(dl_b, p->dl.dl_bw);
1981 err = 0;
1982 }
1983 raw_spin_unlock(&dl_b->lock);
1984
1985 return err;
1986}
1987
1988extern void init_dl_bw(struct dl_bw *dl_b);
1989
1990
1991
1992
1993
1994
1995
1996
1997void wake_up_new_task(struct task_struct *p)
1998{
1999 unsigned long flags;
2000 struct rq *rq;
2001
2002 raw_spin_lock_irqsave(&p->pi_lock, flags);
2003#ifdef CONFIG_SMP
2004
2005
2006
2007
2008
2009 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2010#endif
2011
2012
2013 init_task_runnable_average(p);
2014 rq = __task_rq_lock(p);
2015 activate_task(rq, p, 0);
2016 p->on_rq = 1;
2017 trace_sched_wakeup_new(p, true);
2018 check_preempt_curr(rq, p, WF_FORK);
2019#ifdef CONFIG_SMP
2020 if (p->sched_class->task_woken)
2021 p->sched_class->task_woken(rq, p);
2022#endif
2023 task_rq_unlock(rq, p, &flags);
2024}
2025
2026#ifdef CONFIG_PREEMPT_NOTIFIERS
2027
2028
2029
2030
2031
void preempt_notifier_register(struct preempt_notifier *notifier)
{
        hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);

/**
 * preempt_notifier_unregister - no longer interested in preemption notifications
 * @notifier: notifier struct to unregister
 *
 * This is safe to call from within a preemption notifier.
 */
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
        hlist_del(&notifier->link);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2049
2050static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2051{
2052 struct preempt_notifier *notifier;
2053
2054 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2055 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2056}
2057
2058static void
2059fire_sched_out_preempt_notifiers(struct task_struct *curr,
2060 struct task_struct *next)
2061{
2062 struct preempt_notifier *notifier;
2063
2064 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2065 notifier->ops->sched_out(notifier, next);
2066}
2067
2068#else
2069
2070static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2071{
2072}
2073
2074static void
2075fire_sched_out_preempt_notifiers(struct task_struct *curr,
2076 struct task_struct *next)
2077{
2078}
2079
2080#endif
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095static inline void
2096prepare_task_switch(struct rq *rq, struct task_struct *prev,
2097 struct task_struct *next)
2098{
2099 trace_sched_switch(prev, next);
2100 sched_info_switch(rq, prev, next);
2101 perf_event_task_sched_out(prev, next);
2102 fire_sched_out_preempt_notifiers(prev, next);
2103 prepare_lock_switch(rq, next);
2104 prepare_arch_switch(next);
2105}
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2123 __releases(rq->lock)
2124{
2125 struct mm_struct *mm = rq->prev_mm;
2126 long prev_state;
2127
2128 rq->prev_mm = NULL;
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141 prev_state = prev->state;
2142 vtime_task_switch(prev);
2143 finish_arch_switch(prev);
2144 perf_event_task_sched_in(prev, current);
2145 finish_lock_switch(rq, prev);
2146 finish_arch_post_lock_switch();
2147
2148 fire_sched_in_preempt_notifiers(current);
2149 if (mm)
2150 mmdrop(mm);
2151 if (unlikely(prev_state == TASK_DEAD)) {
2152 task_numa_free(prev);
2153
2154 if (prev->sched_class->task_dead)
2155 prev->sched_class->task_dead(prev);
2156
2157
2158
2159
2160
2161 kprobe_flush_task(prev);
2162 put_task_struct(prev);
2163 }
2164
2165 tick_nohz_task_switch(current);
2166}
2167
2168#ifdef CONFIG_SMP
2169
2170
2171static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2172{
2173 if (prev->sched_class->pre_schedule)
2174 prev->sched_class->pre_schedule(rq, prev);
2175}
2176
2177
2178static inline void post_schedule(struct rq *rq)
2179{
2180 if (rq->post_schedule) {
2181 unsigned long flags;
2182
2183 raw_spin_lock_irqsave(&rq->lock, flags);
2184 if (rq->curr->sched_class->post_schedule)
2185 rq->curr->sched_class->post_schedule(rq);
2186 raw_spin_unlock_irqrestore(&rq->lock, flags);
2187
2188 rq->post_schedule = 0;
2189 }
2190}
2191
2192#else
2193
2194static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2195{
2196}
2197
2198static inline void post_schedule(struct rq *rq)
2199{
2200}
2201
2202#endif
2203
2204
2205
2206
2207
2208asmlinkage void schedule_tail(struct task_struct *prev)
2209 __releases(rq->lock)
2210{
2211 struct rq *rq = this_rq();
2212
2213 finish_task_switch(rq, prev);
2214
2215
2216
2217
2218
2219 post_schedule(rq);
2220
2221#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2222
2223 preempt_enable();
2224#endif
2225 if (current->set_child_tid)
2226 put_user(task_pid_vnr(current), current->set_child_tid);
2227}
2228
2229
2230
2231
2232
2233static inline void
2234context_switch(struct rq *rq, struct task_struct *prev,
2235 struct task_struct *next)
2236{
2237 struct mm_struct *mm, *oldmm;
2238
2239 prepare_task_switch(rq, prev, next);
2240
2241 mm = next->mm;
2242 oldmm = prev->active_mm;
2243
2244
2245
2246
2247
2248 arch_start_context_switch(prev);
2249
2250 if (!mm) {
2251 next->active_mm = oldmm;
2252 atomic_inc(&oldmm->mm_count);
2253 enter_lazy_tlb(oldmm, next);
2254 } else
2255 switch_mm(oldmm, mm, next);
2256
2257 if (!prev->mm) {
2258 prev->active_mm = NULL;
2259 rq->prev_mm = oldmm;
2260 }
2261
2262
2263
2264
2265
2266
2267#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2268 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2269#endif
2270
2271 context_tracking_task_switch(prev, next);
2272
2273 switch_to(prev, next, prev);
2274
2275 barrier();
2276
2277
2278
2279
2280
2281 finish_task_switch(this_rq(), prev);
2282}
2283
2284
2285
2286
2287
2288
2289
2290unsigned long nr_running(void)
2291{
2292 unsigned long i, sum = 0;
2293
2294 for_each_online_cpu(i)
2295 sum += cpu_rq(i)->nr_running;
2296
2297 return sum;
2298}
2299
2300unsigned long long nr_context_switches(void)
2301{
2302 int i;
2303 unsigned long long sum = 0;
2304
2305 for_each_possible_cpu(i)
2306 sum += cpu_rq(i)->nr_switches;
2307
2308 return sum;
2309}
2310
2311unsigned long nr_iowait(void)
2312{
2313 unsigned long i, sum = 0;
2314
2315 for_each_possible_cpu(i)
2316 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2317
2318 return sum;
2319}
2320
2321unsigned long nr_iowait_cpu(int cpu)
2322{
2323 struct rq *this = cpu_rq(cpu);
2324 return atomic_read(&this->nr_iowait);
2325}
2326
2327#ifdef CONFIG_SMP
2328
2329
2330
2331
2332
2333void sched_exec(void)
2334{
2335 struct task_struct *p = current;
2336 unsigned long flags;
2337 int dest_cpu;
2338
2339 raw_spin_lock_irqsave(&p->pi_lock, flags);
2340 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2341 if (dest_cpu == smp_processor_id())
2342 goto unlock;
2343
2344 if (likely(cpu_active(dest_cpu))) {
2345 struct migration_arg arg = { p, dest_cpu };
2346
2347 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2348 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2349 return;
2350 }
2351unlock:
2352 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2353}
2354
2355#endif
2356
2357DEFINE_PER_CPU(struct kernel_stat, kstat);
2358DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2359
2360EXPORT_PER_CPU_SYMBOL(kstat);
2361EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2362
2363
2364
2365
2366
2367
2368
2369static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2370{
2371 u64 ns = 0;
2372
2373 if (task_current(rq, p)) {
2374 update_rq_clock(rq);
2375 ns = rq_clock_task(rq) - p->se.exec_start;
2376 if ((s64)ns < 0)
2377 ns = 0;
2378 }
2379
2380 return ns;
2381}
2382
2383unsigned long long task_delta_exec(struct task_struct *p)
2384{
2385 unsigned long flags;
2386 struct rq *rq;
2387 u64 ns = 0;
2388
2389 rq = task_rq_lock(p, &flags);
2390 ns = do_task_delta_exec(p, rq);
2391 task_rq_unlock(rq, p, &flags);
2392
2393 return ns;
2394}
2395
2396
2397
2398
2399
2400
2401unsigned long long task_sched_runtime(struct task_struct *p)
2402{
2403 unsigned long flags;
2404 struct rq *rq;
2405 u64 ns = 0;
2406
2407#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417 if (!p->on_cpu)
2418 return p->se.sum_exec_runtime;
2419#endif
2420
2421 rq = task_rq_lock(p, &flags);
2422 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2423 task_rq_unlock(rq, p, &flags);
2424
2425 return ns;
2426}
2427
2428
2429
2430
2431
2432void scheduler_tick(void)
2433{
2434 int cpu = smp_processor_id();
2435 struct rq *rq = cpu_rq(cpu);
2436 struct task_struct *curr = rq->curr;
2437
2438 sched_clock_tick();
2439
2440 raw_spin_lock(&rq->lock);
2441 update_rq_clock(rq);
2442 curr->sched_class->task_tick(rq, curr, 0);
2443 update_cpu_load_active(rq);
2444 raw_spin_unlock(&rq->lock);
2445
2446 perf_event_task_tick();
2447
2448#ifdef CONFIG_SMP
2449 rq->idle_balance = idle_cpu(cpu);
2450 trigger_load_balance(rq);
2451#endif
2452 rq_last_tick_reset(rq);
2453}
2454
2455#ifdef CONFIG_NO_HZ_FULL
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469u64 scheduler_tick_max_deferment(void)
2470{
2471 struct rq *rq = this_rq();
2472 unsigned long next, now = ACCESS_ONCE(jiffies);
2473
2474 next = rq->last_sched_tick + HZ;
2475
2476 if (time_before_eq(next, now))
2477 return 0;
2478
2479 return jiffies_to_nsecs(next - now);
2480}
2481#endif
2482
2483notrace unsigned long get_parent_ip(unsigned long addr)
2484{
2485 if (in_lock_functions(addr)) {
2486 addr = CALLER_ADDR2;
2487 if (in_lock_functions(addr))
2488 addr = CALLER_ADDR3;
2489 }
2490 return addr;
2491}
2492
2493#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2494 defined(CONFIG_PREEMPT_TRACER))
2495
2496void __kprobes preempt_count_add(int val)
2497{
2498#ifdef CONFIG_DEBUG_PREEMPT
2499
2500
2501
2502 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2503 return;
2504#endif
2505 __preempt_count_add(val);
2506#ifdef CONFIG_DEBUG_PREEMPT
2507
2508
2509
2510 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2511 PREEMPT_MASK - 10);
2512#endif
2513 if (preempt_count() == val)
2514 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2515}
2516EXPORT_SYMBOL(preempt_count_add);
2517
2518void __kprobes preempt_count_sub(int val)
2519{
2520#ifdef CONFIG_DEBUG_PREEMPT
2521
2522
2523
2524 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2525 return;
2526
2527
2528
2529 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2530 !(preempt_count() & PREEMPT_MASK)))
2531 return;
2532#endif
2533
2534 if (preempt_count() == val)
2535 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2536 __preempt_count_sub(val);
2537}
2538EXPORT_SYMBOL(preempt_count_sub);
2539
2540#endif
2541
2542
2543
2544
2545static noinline void __schedule_bug(struct task_struct *prev)
2546{
2547 if (oops_in_progress)
2548 return;
2549
2550 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2551 prev->comm, prev->pid, preempt_count());
2552
2553 debug_show_held_locks(prev);
2554 print_modules();
2555 if (irqs_disabled())
2556 print_irqtrace_events(prev);
2557 dump_stack();
2558 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2559}
2560
2561
2562
2563
2564static inline void schedule_debug(struct task_struct *prev)
2565{
2566
2567
2568
2569
2570
2571 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2572 __schedule_bug(prev);
2573 rcu_sleep_check();
2574
2575 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2576
2577 schedstat_inc(this_rq(), sched_count);
2578}
2579
2580static void put_prev_task(struct rq *rq, struct task_struct *prev)
2581{
2582 if (prev->on_rq || rq->skip_clock_update < 0)
2583 update_rq_clock(rq);
2584 prev->sched_class->put_prev_task(rq, prev);
2585}
2586
2587
2588
2589
2590static inline struct task_struct *
2591pick_next_task(struct rq *rq)
2592{
2593 const struct sched_class *class;
2594 struct task_struct *p;
2595
2596
2597
2598
2599
2600 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
2601 p = fair_sched_class.pick_next_task(rq);
2602 if (likely(p))
2603 return p;
2604 }
2605
2606 for_each_class(class) {
2607 p = class->pick_next_task(rq);
2608 if (p)
2609 return p;
2610 }
2611
2612 BUG();
2613}
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
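/*
 * __schedule() is the main scheduler function. It is entered in three ways:
 * explicit blocking (mutex, semaphore, waitqueue, ...), the TIF_NEED_RESCHED
 * flag being noticed on interrupt/userspace return paths (set e.g. from
 * scheduler_tick() to drive preemption), and preemption points such as
 * preempt_enable() or cond_resched() after a wakeup marked the current task
 * for rescheduling. Wakeups themselves only enqueue the task; the actual
 * context switch happens here.
 */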
2652static void __sched __schedule(void)
2653{
2654 struct task_struct *prev, *next;
2655 unsigned long *switch_count;
2656 struct rq *rq;
2657 int cpu;
2658
2659need_resched:
2660 preempt_disable();
2661 cpu = smp_processor_id();
2662 rq = cpu_rq(cpu);
2663 rcu_note_context_switch(cpu);
2664 prev = rq->curr;
2665
2666 schedule_debug(prev);
2667
2668 if (sched_feat(HRTICK))
2669 hrtick_clear(rq);
2670
2671
2672
2673
2674
2675
2676 smp_mb__before_spinlock();
2677 raw_spin_lock_irq(&rq->lock);
2678
2679 switch_count = &prev->nivcsw;
2680 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2681 if (unlikely(signal_pending_state(prev->state, prev))) {
2682 prev->state = TASK_RUNNING;
2683 } else {
2684 deactivate_task(rq, prev, DEQUEUE_SLEEP);
2685 prev->on_rq = 0;
2686
2687
2688
2689
2690
2691
2692 if (prev->flags & PF_WQ_WORKER) {
2693 struct task_struct *to_wakeup;
2694
2695 to_wakeup = wq_worker_sleeping(prev, cpu);
2696 if (to_wakeup)
2697 try_to_wake_up_local(to_wakeup);
2698 }
2699 }
2700 switch_count = &prev->nvcsw;
2701 }
2702
2703 pre_schedule(rq, prev);
2704
2705 if (unlikely(!rq->nr_running))
2706 idle_balance(cpu, rq);
2707
2708 put_prev_task(rq, prev);
2709 next = pick_next_task(rq);
2710 clear_tsk_need_resched(prev);
2711 clear_preempt_need_resched();
2712 rq->skip_clock_update = 0;
2713
2714 if (likely(prev != next)) {
2715 rq->nr_switches++;
2716 rq->curr = next;
2717 ++*switch_count;
2718
2719 context_switch(rq, prev, next);
2720
2721
2722
2723
2724
2725
2726 cpu = smp_processor_id();
2727 rq = cpu_rq(cpu);
2728 } else
2729 raw_spin_unlock_irq(&rq->lock);
2730
2731 post_schedule(rq);
2732
2733 sched_preempt_enable_no_resched();
2734 if (need_resched())
2735 goto need_resched;
2736}
2737
2738static inline void sched_submit_work(struct task_struct *tsk)
2739{
2740 if (!tsk->state || tsk_is_pi_blocked(tsk))
2741 return;
2742
2743
2744
2745
2746 if (blk_needs_flush_plug(tsk))
2747 blk_schedule_flush_plug(tsk);
2748}
2749
2750asmlinkage void __sched schedule(void)
2751{
2752 struct task_struct *tsk = current;
2753
2754 sched_submit_work(tsk);
2755 __schedule();
2756}
2757EXPORT_SYMBOL(schedule);
2758
2759#ifdef CONFIG_CONTEXT_TRACKING
2760asmlinkage void __sched schedule_user(void)
2761{
2762
2763
2764
2765
2766
2767
2768 user_exit();
2769 schedule();
2770 user_enter();
2771}
2772#endif
2773
2774
2775
2776
2777
2778
2779void __sched schedule_preempt_disabled(void)
2780{
2781 sched_preempt_enable_no_resched();
2782 schedule();
2783 preempt_disable();
2784}
2785
2786#ifdef CONFIG_PREEMPT
2787
2788
2789
2790
2791
2792asmlinkage void __sched notrace preempt_schedule(void)
2793{
2794
2795
2796
2797
2798 if (likely(!preemptible()))
2799 return;
2800
2801 do {
2802 __preempt_count_add(PREEMPT_ACTIVE);
2803 __schedule();
2804 __preempt_count_sub(PREEMPT_ACTIVE);
2805
2806
2807
2808
2809
2810 barrier();
2811 } while (need_resched());
2812}
2813EXPORT_SYMBOL(preempt_schedule);
2814#endif
2815
2816
2817
2818
2819
2820
2821
2822asmlinkage void __sched preempt_schedule_irq(void)
2823{
2824 enum ctx_state prev_state;
2825
2826
2827 BUG_ON(preempt_count() || !irqs_disabled());
2828
2829 prev_state = exception_enter();
2830
2831 do {
2832 __preempt_count_add(PREEMPT_ACTIVE);
2833 local_irq_enable();
2834 __schedule();
2835 local_irq_disable();
2836 __preempt_count_sub(PREEMPT_ACTIVE);
2837
2838
2839
2840
2841
2842 barrier();
2843 } while (need_resched());
2844
2845 exception_exit(prev_state);
2846}
2847
2848int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2849 void *key)
2850{
2851 return try_to_wake_up(curr->private, mode, wake_flags);
2852}
2853EXPORT_SYMBOL(default_wake_function);
2854
2855static long __sched
2856sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2857{
2858 unsigned long flags;
2859 wait_queue_t wait;
2860
2861 init_waitqueue_entry(&wait, current);
2862
2863 __set_current_state(state);
2864
2865 spin_lock_irqsave(&q->lock, flags);
2866 __add_wait_queue(q, &wait);
2867 spin_unlock(&q->lock);
2868 timeout = schedule_timeout(timeout);
2869 spin_lock_irq(&q->lock);
2870 __remove_wait_queue(q, &wait);
2871 spin_unlock_irqrestore(&q->lock, flags);
2872
2873 return timeout;
2874}
2875
2876void __sched interruptible_sleep_on(wait_queue_head_t *q)
2877{
2878 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
2879}
2880EXPORT_SYMBOL(interruptible_sleep_on);
2881
2882long __sched
2883interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
2884{
2885 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
2886}
2887EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2888
2889void __sched sleep_on(wait_queue_head_t *q)
2890{
2891 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
2892}
2893EXPORT_SYMBOL(sleep_on);
2894
2895long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
2896{
2897 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
2898}
2899EXPORT_SYMBOL(sleep_on_timeout);
2900
2901#ifdef CONFIG_RT_MUTEXES
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913void rt_mutex_setprio(struct task_struct *p, int prio)
2914{
2915 int oldprio, on_rq, running, enqueue_flag = 0;
2916 struct rq *rq;
2917 const struct sched_class *prev_class;
2918
2919 BUG_ON(prio > MAX_PRIO);
2920
2921 rq = __task_rq_lock(p);
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935 if (unlikely(p == rq->idle)) {
2936 WARN_ON(p != rq->curr);
2937 WARN_ON(p->pi_blocked_on);
2938 goto out_unlock;
2939 }
2940
2941 trace_sched_pi_setprio(p, prio);
2942 p->pi_top_task = rt_mutex_get_top_task(p);
2943 oldprio = p->prio;
2944 prev_class = p->sched_class;
2945 on_rq = p->on_rq;
2946 running = task_current(rq, p);
2947 if (on_rq)
2948 dequeue_task(rq, p, 0);
2949 if (running)
2950 p->sched_class->put_prev_task(rq, p);
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961 if (dl_prio(prio)) {
2962 if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
2963 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
2964 p->dl.dl_boosted = 1;
2965 p->dl.dl_throttled = 0;
2966 enqueue_flag = ENQUEUE_REPLENISH;
2967 } else
2968 p->dl.dl_boosted = 0;
2969 p->sched_class = &dl_sched_class;
2970 } else if (rt_prio(prio)) {
2971 if (dl_prio(oldprio))
2972 p->dl.dl_boosted = 0;
2973 if (oldprio < prio)
2974 enqueue_flag = ENQUEUE_HEAD;
2975 p->sched_class = &rt_sched_class;
2976 } else {
2977 if (dl_prio(oldprio))
2978 p->dl.dl_boosted = 0;
2979 p->sched_class = &fair_sched_class;
2980 }
2981
2982 p->prio = prio;
2983
2984 if (running)
2985 p->sched_class->set_curr_task(rq);
2986 if (on_rq)
2987 enqueue_task(rq, p, enqueue_flag);
2988
2989 check_class_changed(rq, p, prev_class, oldprio);
2990out_unlock:
2991 __task_rq_unlock(rq);
2992}
2993#endif
2994
2995void set_user_nice(struct task_struct *p, long nice)
2996{
2997 int old_prio, delta, on_rq;
2998 unsigned long flags;
2999 struct rq *rq;
3000
3001 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3002 return;
3003
3004
3005
3006
3007 rq = task_rq_lock(p, &flags);
3008
3009
3010
3011
3012
3013
3014 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3015 p->static_prio = NICE_TO_PRIO(nice);
3016 goto out_unlock;
3017 }
3018 on_rq = p->on_rq;
3019 if (on_rq)
3020 dequeue_task(rq, p, 0);
3021
3022 p->static_prio = NICE_TO_PRIO(nice);
3023 set_load_weight(p);
3024 old_prio = p->prio;
3025 p->prio = effective_prio(p);
3026 delta = p->prio - old_prio;
3027
3028 if (on_rq) {
3029 enqueue_task(rq, p, 0);
3030
3031
3032
3033
3034 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3035 resched_task(rq->curr);
3036 }
3037out_unlock:
3038 task_rq_unlock(rq, p, &flags);
3039}
3040EXPORT_SYMBOL(set_user_nice);
3041
3042
3043
3044
3045
3046
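/*
 * can_nice - check whether a task may lower its nice value to @nice:
 * the request is allowed if it stays within RLIMIT_NICE (expressed in
 * the [1,40] rlimit encoding computed below) or if the caller has
 * CAP_SYS_NICE.
 */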
3047int can_nice(const struct task_struct *p, const int nice)
3048{
3049
3050 int nice_rlim = 20 - nice;
3051
3052 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3053 capable(CAP_SYS_NICE));
3054}
3055
3056#ifdef __ARCH_WANT_SYS_NICE
3057
3058
3059
3060
3061
3062
3063
3064
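/*
 * sys_nice - change the priority of the current process.
 * @increment: value added to the current nice level; the result is
 * clamped to the [-20, 19] nice range, and lowering the nice value
 * requires permission (see can_nice()).
 */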
3065SYSCALL_DEFINE1(nice, int, increment)
3066{
3067 long nice, retval;
3068
3069
3070
3071
3072
3073
3074 if (increment < -40)
3075 increment = -40;
3076 if (increment > 40)
3077 increment = 40;
3078
3079 nice = TASK_NICE(current) + increment;
3080 if (nice < -20)
3081 nice = -20;
3082 if (nice > 19)
3083 nice = 19;
3084
3085 if (increment < 0 && !can_nice(current, nice))
3086 return -EPERM;
3087
3088 retval = security_task_setnice(current, nice);
3089 if (retval)
3090 return retval;
3091
3092 set_user_nice(current, nice);
3093 return 0;
3094}
3095
3096#endif
3097
3098
3099
3100
3101
3102
3103
3104
3105
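/*
 * task_prio - return the priority of a task as seen in /proc:
 * p->prio offset by MAX_RT_PRIO, so RT tasks come out negative and
 * normal tasks map nice -20..+19 onto 0..39.
 */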
3106int task_prio(const struct task_struct *p)
3107{
3108 return p->prio - MAX_RT_PRIO;
3109}
3110
3111
3112
3113
3114
3115
3116
3117int task_nice(const struct task_struct *p)
3118{
3119 return TASK_NICE(p);
3120}
3121EXPORT_SYMBOL(task_nice);
3122
3123
3124
3125
3126
3127
3128
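/*
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 *
 * Return: 1 if the CPU is running its idle task with nothing queued
 * (and no pending remote wakeups on SMP), 0 otherwise.
 */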
3129int idle_cpu(int cpu)
3130{
3131 struct rq *rq = cpu_rq(cpu);
3132
3133 if (rq->curr != rq->idle)
3134 return 0;
3135
3136 if (rq->nr_running)
3137 return 0;
3138
3139#ifdef CONFIG_SMP
3140 if (!llist_empty(&rq->wake_list))
3141 return 0;
3142#endif
3143
3144 return 1;
3145}
3146
3147
3148
3149
3150
3151
3152
3153struct task_struct *idle_task(int cpu)
3154{
3155 return cpu_rq(cpu)->idle;
3156}
3157
3158
3159
3160
3161
3162
3163
3164static struct task_struct *find_process_by_pid(pid_t pid)
3165{
3166 return pid ? find_task_by_vpid(pid) : current;
3167}
3168
3169
3170
3171
3172
3173
3174
3175
3176
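/*
 * Initialize the sched_dl_entity of a task that is becoming
 * SCHED_DEADLINE: runtime, deadline and period come from @attr (the
 * period defaults to the deadline), the bandwidth ratio is recomputed
 * and the entity is flagged as new so it receives a fresh absolute
 * deadline on its next enqueue.
 */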
3177static void
3178__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3179{
3180 struct sched_dl_entity *dl_se = &p->dl;
3181
3182 init_dl_task_timer(dl_se);
3183 dl_se->dl_runtime = attr->sched_runtime;
3184 dl_se->dl_deadline = attr->sched_deadline;
3185 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3186 dl_se->flags = attr->sched_flags;
3187 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3188 dl_se->dl_throttled = 0;
3189 dl_se->dl_new = 1;
3190}
3191
3192
3193static void __setscheduler(struct rq *rq, struct task_struct *p,
3194 const struct sched_attr *attr)
3195{
3196 int policy = attr->sched_policy;
3197
3198 if (policy == -1)
3199 policy = p->policy;
3200
3201 p->policy = policy;
3202
3203 if (dl_policy(policy))
3204 __setparam_dl(p, attr);
3205 else if (fair_policy(policy))
3206 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3207
3208
3209
3210
3211
3212
3213 p->rt_priority = attr->sched_priority;
3214
3215 p->normal_prio = normal_prio(p);
3216 p->prio = rt_mutex_getprio(p);
3217
3218 if (dl_prio(p->prio))
3219 p->sched_class = &dl_sched_class;
3220 else if (rt_prio(p->prio))
3221 p->sched_class = &rt_sched_class;
3222 else
3223 p->sched_class = &fair_sched_class;
3224
3225 set_load_weight(p);
3226}
3227
3228static void
3229__getparam_dl(struct task_struct *p, struct sched_attr *attr)
3230{
3231 struct sched_dl_entity *dl_se = &p->dl;
3232
3233 attr->sched_priority = p->rt_priority;
3234 attr->sched_runtime = dl_se->dl_runtime;
3235 attr->sched_deadline = dl_se->dl_deadline;
3236 attr->sched_period = dl_se->dl_period;
3237 attr->sched_flags = dl_se->flags;
3238}
3239
3240
3241
3242
3243
3244
3245
3246
3247
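/*
 * Validate the parameters of a would-be -deadline task: the deadline
 * must be non-zero, period (if given) >= deadline >= runtime, and the
 * runtime must be at least 2^DL_SCALE nanoseconds so tiny values do
 * not break the bandwidth accounting.
 */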
3248static bool
3249__checkparam_dl(const struct sched_attr *attr)
3250{
3251 return attr && attr->sched_deadline != 0 &&
3252 (attr->sched_period == 0 ||
3253 (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
3254 (s64)(attr->sched_deadline - attr->sched_runtime) >= 0 &&
3255 attr->sched_runtime >= (2 << (DL_SCALE - 1));
3256}
3257
3258
3259
3260
3261static bool check_same_owner(struct task_struct *p)
3262{
3263 const struct cred *cred = current_cred(), *pcred;
3264 bool match;
3265
3266 rcu_read_lock();
3267 pcred = __task_cred(p);
3268 match = (uid_eq(cred->euid, pcred->euid) ||
3269 uid_eq(cred->euid, pcred->uid));
3270 rcu_read_unlock();
3271 return match;
3272}
3273
3274static int __sched_setscheduler(struct task_struct *p,
3275 const struct sched_attr *attr,
3276 bool user)
3277{
3278 int retval, oldprio, oldpolicy = -1, on_rq, running;
3279 int policy = attr->sched_policy;
3280 unsigned long flags;
3281 const struct sched_class *prev_class;
3282 struct rq *rq;
3283 int reset_on_fork;
3284
3285
3286 BUG_ON(in_interrupt());
3287recheck:
3288
3289 if (policy < 0) {
3290 reset_on_fork = p->sched_reset_on_fork;
3291 policy = oldpolicy = p->policy;
3292 } else {
3293 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3294
3295 if (policy != SCHED_DEADLINE &&
3296 policy != SCHED_FIFO && policy != SCHED_RR &&
3297 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3298 policy != SCHED_IDLE)
3299 return -EINVAL;
3300 }
3301
3302 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3303 return -EINVAL;
3304
3305
3306
3307
3308
3309
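	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
	 * 1..MAX_USER_RT_PRIO-1 (1..MAX_RT_PRIO-1 for kernel threads);
	 * the only valid priority for SCHED_NORMAL, SCHED_BATCH,
	 * SCHED_IDLE and SCHED_DEADLINE is 0.
	 */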
3310 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3311 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3312 return -EINVAL;
3313 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3314 (rt_policy(policy) != (attr->sched_priority != 0)))
3315 return -EINVAL;
3316
3317
3318
3319
3320 if (user && !capable(CAP_SYS_NICE)) {
3321 if (fair_policy(policy)) {
3322 if (attr->sched_nice < TASK_NICE(p) &&
3323 !can_nice(p, attr->sched_nice))
3324 return -EPERM;
3325 }
3326
3327 if (rt_policy(policy)) {
3328 unsigned long rlim_rtprio =
3329 task_rlimit(p, RLIMIT_RTPRIO);
3330
3331
3332 if (policy != p->policy && !rlim_rtprio)
3333 return -EPERM;
3334
3335
3336 if (attr->sched_priority > p->rt_priority &&
3337 attr->sched_priority > rlim_rtprio)
3338 return -EPERM;
3339 }
3340
3341
3342
3343
3344
3345
3346
3347 if (dl_policy(policy))
3348 return -EPERM;
3349
3350
3351
3352
3353
3354 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3355 if (!can_nice(p, TASK_NICE(p)))
3356 return -EPERM;
3357 }
3358
3359
3360 if (!check_same_owner(p))
3361 return -EPERM;
3362
3363
3364 if (p->sched_reset_on_fork && !reset_on_fork)
3365 return -EPERM;
3366 }
3367
3368 if (user) {
3369 retval = security_task_setscheduler(p);
3370 if (retval)
3371 return retval;
3372 }
3373
3374
3375
3376
3377
3378
3379
3380
3381 rq = task_rq_lock(p, &flags);
3382
3383
3384
3385
3386 if (p == rq->stop) {
3387 task_rq_unlock(rq, p, &flags);
3388 return -EINVAL;
3389 }
3390
3391
3392
3393
3394 if (unlikely(policy == p->policy)) {
3395 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
3396 goto change;
3397 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3398 goto change;
3399 if (dl_policy(policy))
3400 goto change;
3401
3402 task_rq_unlock(rq, p, &flags);
3403 return 0;
3404 }
3405change:
3406
3407 if (user) {
3408#ifdef CONFIG_RT_GROUP_SCHED
3409
3410
3411
3412
3413 if (rt_bandwidth_enabled() && rt_policy(policy) &&
3414 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3415 !task_group_is_autogroup(task_group(p))) {
3416 task_rq_unlock(rq, p, &flags);
3417 return -EPERM;
3418 }
3419#endif
3420#ifdef CONFIG_SMP
3421 if (dl_bandwidth_enabled() && dl_policy(policy)) {
3422 cpumask_t *span = rq->rd->span;
3423
3424
3425
3426
3427
3428
3429 if (!cpumask_subset(span, &p->cpus_allowed) ||
3430 rq->rd->dl_bw.bw == 0) {
3431 task_rq_unlock(rq, p, &flags);
3432 return -EPERM;
3433 }
3434 }
3435#endif
3436 }
3437
3438
3439 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3440 policy = oldpolicy = -1;
3441 task_rq_unlock(rq, p, &flags);
3442 goto recheck;
3443 }
3444
3445
3446
3447
3448
3449
3450 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
3451 task_rq_unlock(rq, p, &flags);
3452 return -EBUSY;
3453 }
3454
3455 on_rq = p->on_rq;
3456 running = task_current(rq, p);
3457 if (on_rq)
3458 dequeue_task(rq, p, 0);
3459 if (running)
3460 p->sched_class->put_prev_task(rq, p);
3461
3462 p->sched_reset_on_fork = reset_on_fork;
3463
3464 oldprio = p->prio;
3465 prev_class = p->sched_class;
3466 __setscheduler(rq, p, attr);
3467
3468 if (running)
3469 p->sched_class->set_curr_task(rq);
3470 if (on_rq)
3471 enqueue_task(rq, p, 0);
3472
3473 check_class_changed(rq, p, prev_class, oldprio);
3474 task_rq_unlock(rq, p, &flags);
3475
3476 rt_mutex_adjust_pi(p);
3477
3478 return 0;
3479}
3480
3481static int _sched_setscheduler(struct task_struct *p, int policy,
3482 const struct sched_param *param, bool check)
3483{
3484 struct sched_attr attr = {
3485 .sched_policy = policy,
3486 .sched_priority = param->sched_priority,
3487 .sched_nice = PRIO_TO_NICE(p->static_prio),
3488 };
3489
3490
3491
3492
3493 if (policy & SCHED_RESET_ON_FORK) {
3494 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3495 policy &= ~SCHED_RESET_ON_FORK;
3496 attr.sched_policy = policy;
3497 }
3498
3499 return __sched_setscheduler(p, &attr, check);
3500}
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511int sched_setscheduler(struct task_struct *p, int policy,
3512 const struct sched_param *param)
3513{
3514 return _sched_setscheduler(p, policy, param, true);
3515}
3516EXPORT_SYMBOL_GPL(sched_setscheduler);
3517
3518int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
3519{
3520 return __sched_setscheduler(p, attr, true);
3521}
3522EXPORT_SYMBOL_GPL(sched_setattr);
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3538 const struct sched_param *param)
3539{
3540 return _sched_setscheduler(p, policy, param, false);
3541}
3542
3543static int
3544do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3545{
3546 struct sched_param lparam;
3547 struct task_struct *p;
3548 int retval;
3549
3550 if (!param || pid < 0)
3551 return -EINVAL;
3552 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3553 return -EFAULT;
3554
3555 rcu_read_lock();
3556 retval = -ESRCH;
3557 p = find_process_by_pid(pid);
3558 if (p != NULL)
3559 retval = sched_setscheduler(p, policy, &lparam);
3560 rcu_read_unlock();
3561
3562 return retval;
3563}
3564
3565
3566
3567
3568static int sched_copy_attr(struct sched_attr __user *uattr,
3569 struct sched_attr *attr)
3570{
3571 u32 size;
3572 int ret;
3573
3574 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
3575 return -EFAULT;
3576
3577
3578
3579
3580 memset(attr, 0, sizeof(*attr));
3581
3582 ret = get_user(size, &uattr->size);
3583 if (ret)
3584 return ret;
3585
3586 if (size > PAGE_SIZE)
3587 goto err_size;
3588
3589 if (!size)
3590 size = SCHED_ATTR_SIZE_VER0;
3591
3592 if (size < SCHED_ATTR_SIZE_VER0)
3593 goto err_size;
3594
3595
3596
3597
3598
3599
3600
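	/*
	 * If user space handed us a bigger struct than the kernel knows
	 * about, make sure all the trailing bytes are zero; otherwise we
	 * would be silently ignoring attributes that a newer ABI might
	 * define.
	 */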
3601 if (size > sizeof(*attr)) {
3602 unsigned char __user *addr;
3603 unsigned char __user *end;
3604 unsigned char val;
3605
3606 addr = (void __user *)uattr + sizeof(*attr);
3607 end = (void __user *)uattr + size;
3608
3609 for (; addr < end; addr++) {
3610 ret = get_user(val, addr);
3611 if (ret)
3612 return ret;
3613 if (val)
3614 goto err_size;
3615 }
3616 size = sizeof(*attr);
3617 }
3618
3619 ret = copy_from_user(attr, uattr, size);
3620 if (ret)
3621 return -EFAULT;
3622
3623
3624
3625
3626
3627 attr->sched_nice = clamp(attr->sched_nice, -20, 19);
3628
3629out:
3630 return ret;
3631
3632err_size:
3633 put_user(sizeof(*attr), &uattr->size);
3634 ret = -E2BIG;
3635 goto out;
3636}
3637
3638
3639
3640
3641
3642
3643
3644
3645
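/*
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success, an error code otherwise.
 */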
3646SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3647 struct sched_param __user *, param)
3648{
3649
3650 if (policy < 0)
3651 return -EINVAL;
3652
3653 return do_sched_setscheduler(pid, policy, param);
3654}
3655
3656
3657
3658
3659
3660
3661
3662
3663SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3664{
3665 return do_sched_setscheduler(pid, -1, param);
3666}
3667
3668
3669
3670
3671
3672
3673SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3674 unsigned int, flags)
3675{
3676 struct sched_attr attr;
3677 struct task_struct *p;
3678 int retval;
3679
3680 if (!uattr || pid < 0 || flags)
3681 return -EINVAL;
3682
3683 if (sched_copy_attr(uattr, &attr))
3684 return -EFAULT;
3685
3686 rcu_read_lock();
3687 retval = -ESRCH;
3688 p = find_process_by_pid(pid);
3689 if (p != NULL)
3690 retval = sched_setattr(p, &attr);
3691 rcu_read_unlock();
3692
3693 return retval;
3694}
3695
3696
3697
3698
3699
3700
3701
3702
3703SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3704{
3705 struct task_struct *p;
3706 int retval;
3707
3708 if (pid < 0)
3709 return -EINVAL;
3710
3711 retval = -ESRCH;
3712 rcu_read_lock();
3713 p = find_process_by_pid(pid);
3714 if (p) {
3715 retval = security_task_getscheduler(p);
3716 if (!retval)
3717 retval = p->policy
3718 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
3719 }
3720 rcu_read_unlock();
3721 return retval;
3722}
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3733{
3734 struct sched_param lp;
3735 struct task_struct *p;
3736 int retval;
3737
3738 if (!param || pid < 0)
3739 return -EINVAL;
3740
3741 rcu_read_lock();
3742 p = find_process_by_pid(pid);
3743 retval = -ESRCH;
3744 if (!p)
3745 goto out_unlock;
3746
3747 retval = security_task_getscheduler(p);
3748 if (retval)
3749 goto out_unlock;
3750
3751 if (task_has_dl_policy(p)) {
3752 retval = -EINVAL;
3753 goto out_unlock;
3754 }
3755 lp.sched_priority = p->rt_priority;
3756 rcu_read_unlock();
3757
3758
3759
3760
3761 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3762
3763 return retval;
3764
3765out_unlock:
3766 rcu_read_unlock();
3767 return retval;
3768}
3769
3770static int sched_read_attr(struct sched_attr __user *uattr,
3771 struct sched_attr *attr,
3772 unsigned int usize)
3773{
3774 int ret;
3775
3776 if (!access_ok(VERIFY_WRITE, uattr, usize))
3777 return -EFAULT;
3778
3779
3780
3781
3782
3783
3784 if (usize < sizeof(*attr)) {
3785 unsigned char *addr;
3786 unsigned char *end;
3787
3788 addr = (void *)attr + usize;
3789 end = (void *)attr + sizeof(*attr);
3790
3791 for (; addr < end; addr++) {
3792 if (*addr)
3793 goto err_size;
3794 }
3795
3796 attr->size = usize;
3797 }
3798
3799 ret = copy_to_user(uattr, attr, attr->size);
3800 if (ret)
3801 return -EFAULT;
3802
3803out:
3804 return ret;
3805
3806err_size:
3807 ret = -E2BIG;
3808 goto out;
3809}
3810
3811
3812
3813
3814
3815
3816
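/*
 * sys_sched_getattr - read the extended scheduling parameters of a task
 * @pid: the pid in question.
 * @uattr: structure the parameters are copied to.
 * @size: sizeof(attr) as known to user space, for ABI compatibility.
 * @flags: reserved, must be 0.
 */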
3817SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3818 unsigned int, size, unsigned int, flags)
3819{
3820 struct sched_attr attr = {
3821 .size = sizeof(struct sched_attr),
3822 };
3823 struct task_struct *p;
3824 int retval;
3825
3826 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3827 size < SCHED_ATTR_SIZE_VER0 || flags)
3828 return -EINVAL;
3829
3830 rcu_read_lock();
3831 p = find_process_by_pid(pid);
3832 retval = -ESRCH;
3833 if (!p)
3834 goto out_unlock;
3835
3836 retval = security_task_getscheduler(p);
3837 if (retval)
3838 goto out_unlock;
3839
3840 attr.sched_policy = p->policy;
3841 if (p->sched_reset_on_fork)
3842 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3843 if (task_has_dl_policy(p))
3844 __getparam_dl(p, &attr);
3845 else if (task_has_rt_policy(p))
3846 attr.sched_priority = p->rt_priority;
3847 else
3848 attr.sched_nice = TASK_NICE(p);
3849
3850 rcu_read_unlock();
3851
3852 retval = sched_read_attr(uattr, &attr, size);
3853 return retval;
3854
3855out_unlock:
3856 rcu_read_unlock();
3857 return retval;
3858}
3859
3860long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3861{
3862 cpumask_var_t cpus_allowed, new_mask;
3863 struct task_struct *p;
3864 int retval;
3865
3866 rcu_read_lock();
3867
3868 p = find_process_by_pid(pid);
3869 if (!p) {
3870 rcu_read_unlock();
3871 return -ESRCH;
3872 }
3873
3874
3875 get_task_struct(p);
3876 rcu_read_unlock();
3877
3878 if (p->flags & PF_NO_SETAFFINITY) {
3879 retval = -EINVAL;
3880 goto out_put_task;
3881 }
3882 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
3883 retval = -ENOMEM;
3884 goto out_put_task;
3885 }
3886 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
3887 retval = -ENOMEM;
3888 goto out_free_cpus_allowed;
3889 }
3890 retval = -EPERM;
3891 if (!check_same_owner(p)) {
3892 rcu_read_lock();
3893 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3894 rcu_read_unlock();
3895 goto out_unlock;
3896 }
3897 rcu_read_unlock();
3898 }
3899
3900 retval = security_task_setscheduler(p);
3901 if (retval)
3902 goto out_unlock;
3903
3904
3905 cpuset_cpus_allowed(p, cpus_allowed);
3906 cpumask_and(new_mask, in_mask, cpus_allowed);
3907
3908
3909
3910
3911
3912
3913
3914#ifdef CONFIG_SMP
3915 if (task_has_dl_policy(p)) {
3916 const struct cpumask *span = task_rq(p)->rd->span;
3917
3918 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
3919 retval = -EBUSY;
3920 goto out_unlock;
3921 }
3922 }
3923#endif
3924again:
3925 retval = set_cpus_allowed_ptr(p, new_mask);
3926
3927 if (!retval) {
3928 cpuset_cpus_allowed(p, cpus_allowed);
3929 if (!cpumask_subset(new_mask, cpus_allowed)) {
3930
3931
3932
3933
3934
3935 cpumask_copy(new_mask, cpus_allowed);
3936 goto again;
3937 }
3938 }
3939out_unlock:
3940 free_cpumask_var(new_mask);
3941out_free_cpus_allowed:
3942 free_cpumask_var(cpus_allowed);
3943out_put_task:
3944 put_task_struct(p);
3945 return retval;
3946}
3947
3948static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3949 struct cpumask *new_mask)
3950{
3951 if (len < cpumask_size())
3952 cpumask_clear(new_mask);
3953 else if (len > cpumask_size())
3954 len = cpumask_size();
3955
3956 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3957}
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
3968 unsigned long __user *, user_mask_ptr)
3969{
3970 cpumask_var_t new_mask;
3971 int retval;
3972
3973 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
3974 return -ENOMEM;
3975
3976 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
3977 if (retval == 0)
3978 retval = sched_setaffinity(pid, new_mask);
3979 free_cpumask_var(new_mask);
3980 return retval;
3981}
3982
3983long sched_getaffinity(pid_t pid, struct cpumask *mask)
3984{
3985 struct task_struct *p;
3986 unsigned long flags;
3987 int retval;
3988
3989 rcu_read_lock();
3990
3991 retval = -ESRCH;
3992 p = find_process_by_pid(pid);
3993 if (!p)
3994 goto out_unlock;
3995
3996 retval = security_task_getscheduler(p);
3997 if (retval)
3998 goto out_unlock;
3999
4000 raw_spin_lock_irqsave(&p->pi_lock, flags);
4001 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
4002 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4003
4004out_unlock:
4005 rcu_read_unlock();
4006
4007 return retval;
4008}
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4019 unsigned long __user *, user_mask_ptr)
4020{
4021 int ret;
4022 cpumask_var_t mask;
4023
4024 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4025 return -EINVAL;
4026 if (len & (sizeof(unsigned long)-1))
4027 return -EINVAL;
4028
4029 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4030 return -ENOMEM;
4031
4032 ret = sched_getaffinity(pid, mask);
4033 if (ret == 0) {
4034 size_t retlen = min_t(size_t, len, cpumask_size());
4035
4036 if (copy_to_user(user_mask_ptr, mask, retlen))
4037 ret = -EFAULT;
4038 else
4039 ret = retlen;
4040 }
4041 free_cpumask_var(mask);
4042
4043 return ret;
4044}
4045
4046
4047
4048
4049
4050
4051
4052
4053
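/*
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will simply
 * return after the reschedule.
 *
 * Return: 0.
 */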
4054SYSCALL_DEFINE0(sched_yield)
4055{
4056 struct rq *rq = this_rq_lock();
4057
4058 schedstat_inc(rq, yld_count);
4059 current->sched_class->yield_task(rq);
4060
4061
4062
4063
4064
4065 __release(rq->lock);
4066 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4067 do_raw_spin_unlock(&rq->lock);
4068 sched_preempt_enable_no_resched();
4069
4070 schedule();
4071
4072 return 0;
4073}
4074
4075static void __cond_resched(void)
4076{
4077 __preempt_count_add(PREEMPT_ACTIVE);
4078 __schedule();
4079 __preempt_count_sub(PREEMPT_ACTIVE);
4080}
4081
4082int __sched _cond_resched(void)
4083{
4084 if (should_resched()) {
4085 __cond_resched();
4086 return 1;
4087 }
4088 return 0;
4089}
4090EXPORT_SYMBOL(_cond_resched);
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100int __cond_resched_lock(spinlock_t *lock)
4101{
4102 int resched = should_resched();
4103 int ret = 0;
4104
4105 lockdep_assert_held(lock);
4106
4107 if (spin_needbreak(lock) || resched) {
4108 spin_unlock(lock);
4109 if (resched)
4110 __cond_resched();
4111 else
4112 cpu_relax();
4113 ret = 1;
4114 spin_lock(lock);
4115 }
4116 return ret;
4117}
4118EXPORT_SYMBOL(__cond_resched_lock);
4119
4120int __sched __cond_resched_softirq(void)
4121{
4122 BUG_ON(!in_softirq());
4123
4124 if (should_resched()) {
4125 local_bh_enable();
4126 __cond_resched();
4127 local_bh_disable();
4128 return 1;
4129 }
4130 return 0;
4131}
4132EXPORT_SYMBOL(__cond_resched_softirq);
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
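/*
 * yield - yield the current processor to other threads.
 *
 * Kernel-space shortcut: mark the task runnable and call
 * sys_sched_yield(). It only makes sense when another runnable task of
 * equal or higher priority is expected; it is not a replacement for
 * sleeping on a wait queue or completion.
 */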
4156void __sched yield(void)
4157{
4158 set_current_state(TASK_RUNNING);
4159 sys_sched_yield();
4160}
4161EXPORT_SYMBOL(yield);
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
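/*
 * yield_to - yield the current processor to another thread in your
 * thread group, or accelerate that thread toward the processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * Return: true if we indeed boosted the target task, false if we
 * failed to, or -ESRCH when both runqueues had only one runnable task
 * so there was nothing worth yielding to.
 */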
4178bool __sched yield_to(struct task_struct *p, bool preempt)
4179{
4180 struct task_struct *curr = current;
4181 struct rq *rq, *p_rq;
4182 unsigned long flags;
4183 int yielded = 0;
4184
4185 local_irq_save(flags);
4186 rq = this_rq();
4187
4188again:
4189 p_rq = task_rq(p);
4190
4191
4192
4193
4194 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4195 yielded = -ESRCH;
4196 goto out_irq;
4197 }
4198
4199 double_rq_lock(rq, p_rq);
4200 if (task_rq(p) != p_rq) {
4201 double_rq_unlock(rq, p_rq);
4202 goto again;
4203 }
4204
4205 if (!curr->sched_class->yield_to_task)
4206 goto out_unlock;
4207
4208 if (curr->sched_class != p->sched_class)
4209 goto out_unlock;
4210
4211 if (task_running(p_rq, p) || p->state)
4212 goto out_unlock;
4213
4214 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4215 if (yielded) {
4216 schedstat_inc(rq, yld_count);
4217
4218
4219
4220
4221 if (preempt && rq != p_rq)
4222 resched_task(p_rq->curr);
4223 }
4224
4225out_unlock:
4226 double_rq_unlock(rq, p_rq);
4227out_irq:
4228 local_irq_restore(flags);
4229
4230 if (yielded > 0)
4231 schedule();
4232
4233 return yielded;
4234}
4235EXPORT_SYMBOL_GPL(yield_to);
4236
4237
4238
4239
4240
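/*
 * io_schedule()/io_schedule_timeout(): mark the task as blocked on IO,
 * bump the runqueue's nr_iowait counter for iowait accounting and
 * delayacct, flush any plugged block requests, and only then sleep via
 * schedule()/schedule_timeout().
 */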
4241void __sched io_schedule(void)
4242{
4243 struct rq *rq = raw_rq();
4244
4245 delayacct_blkio_start();
4246 atomic_inc(&rq->nr_iowait);
4247 blk_flush_plug(current);
4248 current->in_iowait = 1;
4249 schedule();
4250 current->in_iowait = 0;
4251 atomic_dec(&rq->nr_iowait);
4252 delayacct_blkio_end();
4253}
4254EXPORT_SYMBOL(io_schedule);
4255
4256long __sched io_schedule_timeout(long timeout)
4257{
4258 struct rq *rq = raw_rq();
4259 long ret;
4260
4261 delayacct_blkio_start();
4262 atomic_inc(&rq->nr_iowait);
4263 blk_flush_plug(current);
4264 current->in_iowait = 1;
4265 ret = schedule_timeout(timeout);
4266 current->in_iowait = 0;
4267 atomic_dec(&rq->nr_iowait);
4268 delayacct_blkio_end();
4269 return ret;
4270}
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4281{
4282 int ret = -EINVAL;
4283
4284 switch (policy) {
4285 case SCHED_FIFO:
4286 case SCHED_RR:
4287 ret = MAX_USER_RT_PRIO-1;
4288 break;
4289 case SCHED_DEADLINE:
4290 case SCHED_NORMAL:
4291 case SCHED_BATCH:
4292 case SCHED_IDLE:
4293 ret = 0;
4294 break;
4295 }
4296 return ret;
4297}
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4308{
4309 int ret = -EINVAL;
4310
4311 switch (policy) {
4312 case SCHED_FIFO:
4313 case SCHED_RR:
4314 ret = 1;
4315 break;
4316 case SCHED_DEADLINE:
4317 case SCHED_NORMAL:
4318 case SCHED_BATCH:
4319 case SCHED_IDLE:
4320 ret = 0;
4321 }
4322 return ret;
4323}
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4337 struct timespec __user *, interval)
4338{
4339 struct task_struct *p;
4340 unsigned int time_slice;
4341 unsigned long flags;
4342 struct rq *rq;
4343 int retval;
4344 struct timespec t;
4345
4346 if (pid < 0)
4347 return -EINVAL;
4348
4349 retval = -ESRCH;
4350 rcu_read_lock();
4351 p = find_process_by_pid(pid);
4352 if (!p)
4353 goto out_unlock;
4354
4355 retval = security_task_getscheduler(p);
4356 if (retval)
4357 goto out_unlock;
4358
4359 rq = task_rq_lock(p, &flags);
4360 time_slice = 0;
4361 if (p->sched_class->get_rr_interval)
4362 time_slice = p->sched_class->get_rr_interval(rq, p);
4363 task_rq_unlock(rq, p, &flags);
4364
4365 rcu_read_unlock();
4366 jiffies_to_timespec(time_slice, &t);
4367 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4368 return retval;
4369
4370out_unlock:
4371 rcu_read_unlock();
4372 return retval;
4373}
4374
4375static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4376
4377void sched_show_task(struct task_struct *p)
4378{
4379 unsigned long free = 0;
4380 int ppid;
4381 unsigned state;
4382
4383 state = p->state ? __ffs(p->state) + 1 : 0;
4384 printk(KERN_INFO "%-15.15s %c", p->comm,
4385 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4386#if BITS_PER_LONG == 32
4387 if (state == TASK_RUNNING)
4388 printk(KERN_CONT " running ");
4389 else
4390 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4391#else
4392 if (state == TASK_RUNNING)
4393 printk(KERN_CONT " running task ");
4394 else
4395 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4396#endif
4397#ifdef CONFIG_DEBUG_STACK_USAGE
4398 free = stack_not_used(p);
4399#endif
4400 rcu_read_lock();
4401 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4402 rcu_read_unlock();
4403 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4404 task_pid_nr(p), ppid,
4405 (unsigned long)task_thread_info(p)->flags);
4406
4407 print_worker_info(KERN_INFO, p);
4408 show_stack(p, NULL);
4409}
4410
4411void show_state_filter(unsigned long state_filter)
4412{
4413 struct task_struct *g, *p;
4414
4415#if BITS_PER_LONG == 32
4416 printk(KERN_INFO
4417 "  task                PC stack   pid father\n");
4418#else
4419 printk(KERN_INFO
4420 "  task                        PC stack   pid father\n");
4421#endif
4422 rcu_read_lock();
4423 do_each_thread(g, p) {
4424
4425
4426
4427
4428 touch_nmi_watchdog();
4429 if (!state_filter || (p->state & state_filter))
4430 sched_show_task(p);
4431 } while_each_thread(g, p);
4432
4433 touch_all_softlockup_watchdogs();
4434
4435#ifdef CONFIG_SCHED_DEBUG
4436 sysrq_sched_debug_show();
4437#endif
4438 rcu_read_unlock();
4439
4440
4441
4442 if (!state_filter)
4443 debug_show_all_locks();
4444}
4445
4446void init_idle_bootup_task(struct task_struct *idle)
4447{
4448 idle->sched_class = &idle_sched_class;
4449}
4450
4451
4452
4453
4454
4455
4456
4457
4458
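/*
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * Initializes the scheduler state of the per-cpu idle task, pins it to
 * its CPU and installs it as rq->curr and rq->idle.
 */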
4459void init_idle(struct task_struct *idle, int cpu)
4460{
4461 struct rq *rq = cpu_rq(cpu);
4462 unsigned long flags;
4463
4464 raw_spin_lock_irqsave(&rq->lock, flags);
4465
4466 __sched_fork(0, idle);
4467 idle->state = TASK_RUNNING;
4468 idle->se.exec_start = sched_clock();
4469
4470 do_set_cpus_allowed(idle, cpumask_of(cpu));
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481 rcu_read_lock();
4482 __set_task_cpu(idle, cpu);
4483 rcu_read_unlock();
4484
4485 rq->curr = rq->idle = idle;
4486#if defined(CONFIG_SMP)
4487 idle->on_cpu = 1;
4488#endif
4489 raw_spin_unlock_irqrestore(&rq->lock, flags);
4490
4491
4492 init_idle_preempt_count(idle, cpu);
4493
4494
4495
4496
4497 idle->sched_class = &idle_sched_class;
4498 ftrace_graph_init_idle_task(idle, cpu);
4499 vtime_init_idle(idle, cpu);
4500#if defined(CONFIG_SMP)
4501 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4502#endif
4503}
4504
4505#ifdef CONFIG_SMP
4506void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4507{
4508 if (p->sched_class && p->sched_class->set_cpus_allowed)
4509 p->sched_class->set_cpus_allowed(p, new_mask);
4510
4511 cpumask_copy(&p->cpus_allowed, new_mask);
4512 p->nr_cpus_allowed = cpumask_weight(new_mask);
4513}
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
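/*
 * Change a given task's CPU affinity. Migrate the thread to a proper
 * CPU and schedule it away if the CPU it's executing on is removed
 * from the allowed bitmask. The caller must hold a valid reference to
 * the task and must not hold any spinlocks; the call is not atomic.
 */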
4538int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4539{
4540 unsigned long flags;
4541 struct rq *rq;
4542 unsigned int dest_cpu;
4543 int ret = 0;
4544
4545 rq = task_rq_lock(p, &flags);
4546
4547 if (cpumask_equal(&p->cpus_allowed, new_mask))
4548 goto out;
4549
4550 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4551 ret = -EINVAL;
4552 goto out;
4553 }
4554
4555 do_set_cpus_allowed(p, new_mask);
4556
4557
4558 if (cpumask_test_cpu(task_cpu(p), new_mask))
4559 goto out;
4560
4561 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4562 if (p->on_rq) {
4563 struct migration_arg arg = { p, dest_cpu };
4564
4565 task_rq_unlock(rq, p, &flags);
4566 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4567 tlb_migrate_finish(p->mm);
4568 return 0;
4569 }
4570out:
4571 task_rq_unlock(rq, p, &flags);
4572
4573 return ret;
4574}
4575EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
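/*
 * Move a (non-current) task off this cpu, onto the destination cpu,
 * either because it is no longer allowed to run here or because its
 * CPU is going down.
 *
 * Returns non-zero if the task was successfully migrated.
 */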
4588static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4589{
4590 struct rq *rq_dest, *rq_src;
4591 int ret = 0;
4592
4593 if (unlikely(!cpu_active(dest_cpu)))
4594 return ret;
4595
4596 rq_src = cpu_rq(src_cpu);
4597 rq_dest = cpu_rq(dest_cpu);
4598
4599 raw_spin_lock(&p->pi_lock);
4600 double_rq_lock(rq_src, rq_dest);
4601
4602 if (task_cpu(p) != src_cpu)
4603 goto done;
4604
4605 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4606 goto fail;
4607
4608
4609
4610
4611
4612 if (p->on_rq) {
4613 dequeue_task(rq_src, p, 0);
4614 set_task_cpu(p, dest_cpu);
4615 enqueue_task(rq_dest, p, 0);
4616 check_preempt_curr(rq_dest, p, 0);
4617 }
4618done:
4619 ret = 1;
4620fail:
4621 double_rq_unlock(rq_src, rq_dest);
4622 raw_spin_unlock(&p->pi_lock);
4623 return ret;
4624}
4625
4626#ifdef CONFIG_NUMA_BALANCING
4627
4628int migrate_task_to(struct task_struct *p, int target_cpu)
4629{
4630 struct migration_arg arg = { p, target_cpu };
4631 int curr_cpu = task_cpu(p);
4632
4633 if (curr_cpu == target_cpu)
4634 return 0;
4635
4636 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4637 return -EINVAL;
4638
4639
4640
4641 trace_sched_move_numa(p, curr_cpu, target_cpu);
4642 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4643}
4644
4645
4646
4647
4648
4649void sched_setnuma(struct task_struct *p, int nid)
4650{
4651 struct rq *rq;
4652 unsigned long flags;
4653 bool on_rq, running;
4654
4655 rq = task_rq_lock(p, &flags);
4656 on_rq = p->on_rq;
4657 running = task_current(rq, p);
4658
4659 if (on_rq)
4660 dequeue_task(rq, p, 0);
4661 if (running)
4662 p->sched_class->put_prev_task(rq, p);
4663
4664 p->numa_preferred_nid = nid;
4665
4666 if (running)
4667 p->sched_class->set_curr_task(rq);
4668 if (on_rq)
4669 enqueue_task(rq, p, 0);
4670 task_rq_unlock(rq, p, &flags);
4671}
4672#endif
4673
4674
4675
4676
4677
4678
4679static int migration_cpu_stop(void *data)
4680{
4681 struct migration_arg *arg = data;
4682
4683
4684
4685
4686
4687 local_irq_disable();
4688 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4689 local_irq_enable();
4690 return 0;
4691}
4692
4693#ifdef CONFIG_HOTPLUG_CPU
4694
4695
4696
4697
4698
4699void idle_task_exit(void)
4700{
4701 struct mm_struct *mm = current->active_mm;
4702
4703 BUG_ON(cpu_online(smp_processor_id()));
4704
4705 if (mm != &init_mm)
4706 switch_mm(mm, &init_mm, current);
4707 mmdrop(mm);
4708}
4709
4710
4711
4712
4713
4714
4715
4716
4717static void calc_load_migrate(struct rq *rq)
4718{
4719 long delta = calc_load_fold_active(rq);
4720 if (delta)
4721 atomic_long_add(delta, &calc_load_tasks);
4722}
4723
4724
4725
4726
4727
4728
4729
4730
4731
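/*
 * Migrate all runnable tasks off the rq of a CPU that is going down;
 * sleeping tasks will be migrated at wakeup by select_task_rq().
 *
 * Called with rq->lock held; the lock is dropped and reacquired around
 * each __migrate_task() call.
 */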
4732static void migrate_tasks(unsigned int dead_cpu)
4733{
4734 struct rq *rq = cpu_rq(dead_cpu);
4735 struct task_struct *next, *stop = rq->stop;
4736 int dest_cpu;
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747 rq->stop = NULL;
4748
4749
4750
4751
4752
4753
4754 update_rq_clock(rq);
4755
4756 for ( ; ; ) {
4757
4758
4759
4760
4761 if (rq->nr_running == 1)
4762 break;
4763
4764 next = pick_next_task(rq);
4765 BUG_ON(!next);
4766 next->sched_class->put_prev_task(rq, next);
4767
4768
4769 dest_cpu = select_fallback_rq(dead_cpu, next);
4770 raw_spin_unlock(&rq->lock);
4771
4772 __migrate_task(next, dead_cpu, dest_cpu);
4773
4774 raw_spin_lock(&rq->lock);
4775 }
4776
4777 rq->stop = stop;
4778}
4779
4780#endif
4781
4782#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4783
4784static struct ctl_table sd_ctl_dir[] = {
4785 {
4786 .procname = "sched_domain",
4787 .mode = 0555,
4788 },
4789 {}
4790};
4791
4792static struct ctl_table sd_ctl_root[] = {
4793 {
4794 .procname = "kernel",
4795 .mode = 0555,
4796 .child = sd_ctl_dir,
4797 },
4798 {}
4799};
4800
4801static struct ctl_table *sd_alloc_ctl_entry(int n)
4802{
4803 struct ctl_table *entry =
4804 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4805
4806 return entry;
4807}
4808
4809static void sd_free_ctl_entry(struct ctl_table **tablep)
4810{
4811 struct ctl_table *entry;
4812
4813
4814
4815
4816
4817
4818
4819 for (entry = *tablep; entry->mode; entry++) {
4820 if (entry->child)
4821 sd_free_ctl_entry(&entry->child);
4822 if (entry->proc_handler == NULL)
4823 kfree(entry->procname);
4824 }
4825
4826 kfree(*tablep);
4827 *tablep = NULL;
4828}
4829
4830static int min_load_idx = 0;
4831static int max_load_idx = CPU_LOAD_IDX_MAX-1;
4832
4833static void
4834set_table_entry(struct ctl_table *entry,
4835 const char *procname, void *data, int maxlen,
4836 umode_t mode, proc_handler *proc_handler,
4837 bool load_idx)
4838{
4839 entry->procname = procname;
4840 entry->data = data;
4841 entry->maxlen = maxlen;
4842 entry->mode = mode;
4843 entry->proc_handler = proc_handler;
4844
4845 if (load_idx) {
4846 entry->extra1 = &min_load_idx;
4847 entry->extra2 = &max_load_idx;
4848 }
4849}
4850
4851static struct ctl_table *
4852sd_alloc_ctl_domain_table(struct sched_domain *sd)
4853{
4854 struct ctl_table *table = sd_alloc_ctl_entry(13);
4855
4856 if (table == NULL)
4857 return NULL;
4858
4859 set_table_entry(&table[0], "min_interval", &sd->min_interval,
4860 sizeof(long), 0644, proc_doulongvec_minmax, false);
4861 set_table_entry(&table[1], "max_interval", &sd->max_interval,
4862 sizeof(long), 0644, proc_doulongvec_minmax, false);
4863 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
4864 sizeof(int), 0644, proc_dointvec_minmax, true);
4865 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
4866 sizeof(int), 0644, proc_dointvec_minmax, true);
4867 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
4868 sizeof(int), 0644, proc_dointvec_minmax, true);
4869 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
4870 sizeof(int), 0644, proc_dointvec_minmax, true);
4871 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
4872 sizeof(int), 0644, proc_dointvec_minmax, true);
4873 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
4874 sizeof(int), 0644, proc_dointvec_minmax, false);
4875 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
4876 sizeof(int), 0644, proc_dointvec_minmax, false);
4877 set_table_entry(&table[9], "cache_nice_tries",
4878 &sd->cache_nice_tries,
4879 sizeof(int), 0644, proc_dointvec_minmax, false);
4880 set_table_entry(&table[10], "flags", &sd->flags,
4881 sizeof(int), 0644, proc_dointvec_minmax, false);
4882 set_table_entry(&table[11], "name", sd->name,
4883 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
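	/* &table[12] is the all-zero terminator of this 13-entry table */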
4884
4885
4886 return table;
4887}
4888
4889static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
4890{
4891 struct ctl_table *entry, *table;
4892 struct sched_domain *sd;
4893 int domain_num = 0, i;
4894 char buf[32];
4895
4896 for_each_domain(cpu, sd)
4897 domain_num++;
4898 entry = table = sd_alloc_ctl_entry(domain_num + 1);
4899 if (table == NULL)
4900 return NULL;
4901
4902 i = 0;
4903 for_each_domain(cpu, sd) {
4904 snprintf(buf, 32, "domain%d", i);
4905 entry->procname = kstrdup(buf, GFP_KERNEL);
4906 entry->mode = 0555;
4907 entry->child = sd_alloc_ctl_domain_table(sd);
4908 entry++;
4909 i++;
4910 }
4911 return table;
4912}
4913
4914static struct ctl_table_header *sd_sysctl_header;
4915static void register_sched_domain_sysctl(void)
4916{
4917 int i, cpu_num = num_possible_cpus();
4918 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
4919 char buf[32];
4920
4921 WARN_ON(sd_ctl_dir[0].child);
4922 sd_ctl_dir[0].child = entry;
4923
4924 if (entry == NULL)
4925 return;
4926
4927 for_each_possible_cpu(i) {
4928 snprintf(buf, 32, "cpu%d", i);
4929 entry->procname = kstrdup(buf, GFP_KERNEL);
4930 entry->mode = 0555;
4931 entry->child = sd_alloc_ctl_cpu_table(i);
4932 entry++;
4933 }
4934
4935 WARN_ON(sd_sysctl_header);
4936 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
4937}
4938
4939
4940static void unregister_sched_domain_sysctl(void)
4941{
4942 if (sd_sysctl_header)
4943 unregister_sysctl_table(sd_sysctl_header);
4944 sd_sysctl_header = NULL;
4945 if (sd_ctl_dir[0].child)
4946 sd_free_ctl_entry(&sd_ctl_dir[0].child);
4947}
4948#else
4949static void register_sched_domain_sysctl(void)
4950{
4951}
4952static void unregister_sched_domain_sysctl(void)
4953{
4954}
4955#endif
4956
4957static void set_rq_online(struct rq *rq)
4958{
4959 if (!rq->online) {
4960 const struct sched_class *class;
4961
4962 cpumask_set_cpu(rq->cpu, rq->rd->online);
4963 rq->online = 1;
4964
4965 for_each_class(class) {
4966 if (class->rq_online)
4967 class->rq_online(rq);
4968 }
4969 }
4970}
4971
4972static void set_rq_offline(struct rq *rq)
4973{
4974 if (rq->online) {
4975 const struct sched_class *class;
4976
4977 for_each_class(class) {
4978 if (class->rq_offline)
4979 class->rq_offline(rq);
4980 }
4981
4982 cpumask_clear_cpu(rq->cpu, rq->rd->online);
4983 rq->online = 0;
4984 }
4985}
4986
4987
4988
4989
4990
4991static int
4992migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4993{
4994 int cpu = (long)hcpu;
4995 unsigned long flags;
4996 struct rq *rq = cpu_rq(cpu);
4997
4998 switch (action & ~CPU_TASKS_FROZEN) {
4999
5000 case CPU_UP_PREPARE:
5001 rq->calc_load_update = calc_load_update;
5002 break;
5003
5004 case CPU_ONLINE:
5005
5006 raw_spin_lock_irqsave(&rq->lock, flags);
5007 if (rq->rd) {
5008 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5009
5010 set_rq_online(rq);
5011 }
5012 raw_spin_unlock_irqrestore(&rq->lock, flags);
5013 break;
5014
5015#ifdef CONFIG_HOTPLUG_CPU
5016 case CPU_DYING:
5017 sched_ttwu_pending();
5018
5019 raw_spin_lock_irqsave(&rq->lock, flags);
5020 if (rq->rd) {
5021 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5022 set_rq_offline(rq);
5023 }
5024 migrate_tasks(cpu);
5025 BUG_ON(rq->nr_running != 1);
5026 raw_spin_unlock_irqrestore(&rq->lock, flags);
5027 break;
5028
5029 case CPU_DEAD:
5030 calc_load_migrate(rq);
5031 break;
5032#endif
5033 }
5034
5035 update_max_interval();
5036
5037 return NOTIFY_OK;
5038}
5039
5040
5041
5042
5043
5044
5045static struct notifier_block migration_notifier = {
5046 .notifier_call = migration_call,
5047 .priority = CPU_PRI_MIGRATION,
5048};
5049
5050static int sched_cpu_active(struct notifier_block *nfb,
5051 unsigned long action, void *hcpu)
5052{
5053 switch (action & ~CPU_TASKS_FROZEN) {
5054 case CPU_STARTING:
5055 case CPU_DOWN_FAILED:
5056 set_cpu_active((long)hcpu, true);
5057 return NOTIFY_OK;
5058 default:
5059 return NOTIFY_DONE;
5060 }
5061}
5062
5063static int sched_cpu_inactive(struct notifier_block *nfb,
5064 unsigned long action, void *hcpu)
5065{
5066 unsigned long flags;
5067 long cpu = (long)hcpu;
5068
5069 switch (action & ~CPU_TASKS_FROZEN) {
5070 case CPU_DOWN_PREPARE:
5071 set_cpu_active(cpu, false);
5072
5073
5074 if (!(action & CPU_TASKS_FROZEN)) {
5075 struct dl_bw *dl_b = dl_bw_of(cpu);
5076 bool overflow;
5077 int cpus;
5078
5079 raw_spin_lock_irqsave(&dl_b->lock, flags);
5080 cpus = dl_bw_cpus(cpu);
5081 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5082 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5083
5084 if (overflow)
5085 return notifier_from_errno(-EBUSY);
5086 }
5087 return NOTIFY_OK;
5088 }
5089
5090 return NOTIFY_DONE;
5091}
5092
5093static int __init migration_init(void)
5094{
5095 void *cpu = (void *)(long)smp_processor_id();
5096 int err;
5097
5098
5099 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5100 BUG_ON(err == NOTIFY_BAD);
5101 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5102 register_cpu_notifier(&migration_notifier);
5103
5104
5105 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5106 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5107
5108 return 0;
5109}
5110early_initcall(migration_init);
5111#endif
5112
5113#ifdef CONFIG_SMP
5114
5115static cpumask_var_t sched_domains_tmpmask;
5116
5117#ifdef CONFIG_SCHED_DEBUG
5118
5119static __read_mostly int sched_debug_enabled;
5120
5121static int __init sched_debug_setup(char *str)
5122{
5123 sched_debug_enabled = 1;
5124
5125 return 0;
5126}
5127early_param("sched_debug", sched_debug_setup);
5128
5129static inline bool sched_debug(void)
5130{
5131 return sched_debug_enabled;
5132}
5133
5134static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5135 struct cpumask *groupmask)
5136{
5137 struct sched_group *group = sd->groups;
5138 char str[256];
5139
5140 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5141 cpumask_clear(groupmask);
5142
5143 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5144
5145 if (!(sd->flags & SD_LOAD_BALANCE)) {
5146 printk("does not load-balance\n");
5147 if (sd->parent)
5148 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5149 " has parent");
5150 return -1;
5151 }
5152
5153 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5154
5155 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5156 printk(KERN_ERR "ERROR: domain->span does not contain "
5157 "CPU%d\n", cpu);
5158 }
5159 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5160 printk(KERN_ERR "ERROR: domain->groups does not contain"
5161 " CPU%d\n", cpu);
5162 }
5163
5164 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5165 do {
5166 if (!group) {
5167 printk("\n");
5168 printk(KERN_ERR "ERROR: group is NULL\n");
5169 break;
5170 }
5171
5172
5173
5174
5175
5176
5177 if (!group->sgp->power_orig) {
5178 printk(KERN_CONT "\n");
5179 printk(KERN_ERR "ERROR: domain->cpu_power not "
5180 "set\n");
5181 break;
5182 }
5183
5184 if (!cpumask_weight(sched_group_cpus(group))) {
5185 printk(KERN_CONT "\n");
5186 printk(KERN_ERR "ERROR: empty group\n");
5187 break;
5188 }
5189
5190 if (!(sd->flags & SD_OVERLAP) &&
5191 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5192 printk(KERN_CONT "\n");
5193 printk(KERN_ERR "ERROR: repeated CPUs\n");
5194 break;
5195 }
5196
5197 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5198
5199 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5200
5201 printk(KERN_CONT " %s", str);
5202 if (group->sgp->power != SCHED_POWER_SCALE) {
5203 printk(KERN_CONT " (cpu_power = %d)",
5204 group->sgp->power);
5205 }
5206
5207 group = group->next;
5208 } while (group != sd->groups);
5209 printk(KERN_CONT "\n");
5210
5211 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5212 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5213
5214 if (sd->parent &&
5215 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5216 printk(KERN_ERR "ERROR: parent span is not a superset "
5217 "of domain->span\n");
5218 return 0;
5219}
5220
5221static void sched_domain_debug(struct sched_domain *sd, int cpu)
5222{
5223 int level = 0;
5224
5225 if (!sched_debug_enabled)
5226 return;
5227
5228 if (!sd) {
5229 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5230 return;
5231 }
5232
5233 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5234
5235 for (;;) {
5236 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5237 break;
5238 level++;
5239 sd = sd->parent;
5240 if (!sd)
5241 break;
5242 }
5243}
5244#else
5245# define sched_domain_debug(sd, cpu) do { } while (0)
5246static inline bool sched_debug(void)
5247{
5248 return false;
5249}
5250#endif
5251
5252static int sd_degenerate(struct sched_domain *sd)
5253{
5254 if (cpumask_weight(sched_domain_span(sd)) == 1)
5255 return 1;
5256
5257
5258 if (sd->flags & (SD_LOAD_BALANCE |
5259 SD_BALANCE_NEWIDLE |
5260 SD_BALANCE_FORK |
5261 SD_BALANCE_EXEC |
5262 SD_SHARE_CPUPOWER |
5263 SD_SHARE_PKG_RESOURCES)) {
5264 if (sd->groups != sd->groups->next)
5265 return 0;
5266 }
5267
5268
5269 if (sd->flags & (SD_WAKE_AFFINE))
5270 return 0;
5271
5272 return 1;
5273}
5274
5275static int
5276sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5277{
5278 unsigned long cflags = sd->flags, pflags = parent->flags;
5279
5280 if (sd_degenerate(parent))
5281 return 1;
5282
5283 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5284 return 0;
5285
5286
5287 if (parent->groups == parent->groups->next) {
5288 pflags &= ~(SD_LOAD_BALANCE |
5289 SD_BALANCE_NEWIDLE |
5290 SD_BALANCE_FORK |
5291 SD_BALANCE_EXEC |
5292 SD_SHARE_CPUPOWER |
5293 SD_SHARE_PKG_RESOURCES |
5294 SD_PREFER_SIBLING);
5295 if (nr_node_ids == 1)
5296 pflags &= ~SD_SERIALIZE;
5297 }
5298 if (~cflags & pflags)
5299 return 0;
5300
5301 return 1;
5302}
5303
5304static void free_rootdomain(struct rcu_head *rcu)
5305{
5306 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5307
5308 cpupri_cleanup(&rd->cpupri);
5309 cpudl_cleanup(&rd->cpudl);
5310 free_cpumask_var(rd->dlo_mask);
5311 free_cpumask_var(rd->rto_mask);
5312 free_cpumask_var(rd->online);
5313 free_cpumask_var(rd->span);
5314 kfree(rd);
5315}
5316
5317static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5318{
5319 struct root_domain *old_rd = NULL;
5320 unsigned long flags;
5321
5322 raw_spin_lock_irqsave(&rq->lock, flags);
5323
5324 if (rq->rd) {
5325 old_rd = rq->rd;
5326
5327 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5328 set_rq_offline(rq);
5329
5330 cpumask_clear_cpu(rq->cpu, old_rd->span);
5331
5332
5333
5334
5335
5336
5337 if (!atomic_dec_and_test(&old_rd->refcount))
5338 old_rd = NULL;
5339 }
5340
5341 atomic_inc(&rd->refcount);
5342 rq->rd = rd;
5343
5344 cpumask_set_cpu(rq->cpu, rd->span);
5345 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5346 set_rq_online(rq);
5347
5348 raw_spin_unlock_irqrestore(&rq->lock, flags);
5349
5350 if (old_rd)
5351 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5352}
5353
5354static int init_rootdomain(struct root_domain *rd)
5355{
5356 memset(rd, 0, sizeof(*rd));
5357
5358 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5359 goto out;
5360 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5361 goto free_span;
5362 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
5363 goto free_online;
5364 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5365 goto free_dlo_mask;
5366
5367 init_dl_bw(&rd->dl_bw);
5368 if (cpudl_init(&rd->cpudl) != 0)
5369 goto free_dlo_mask;
5370
5371 if (cpupri_init(&rd->cpupri) != 0)
5372 goto free_rto_mask;
5373 return 0;
5374
5375free_rto_mask:
5376 free_cpumask_var(rd->rto_mask);
5377free_dlo_mask:
5378 free_cpumask_var(rd->dlo_mask);
5379free_online:
5380 free_cpumask_var(rd->online);
5381free_span:
5382 free_cpumask_var(rd->span);
5383out:
5384 return -ENOMEM;
5385}
5386
5387
5388
5389
5390
5391struct root_domain def_root_domain;
5392
5393static void init_defrootdomain(void)
5394{
5395 init_rootdomain(&def_root_domain);
5396
5397 atomic_set(&def_root_domain.refcount, 1);
5398}
5399
5400static struct root_domain *alloc_rootdomain(void)
5401{
5402 struct root_domain *rd;
5403
5404 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5405 if (!rd)
5406 return NULL;
5407
5408 if (init_rootdomain(rd) != 0) {
5409 kfree(rd);
5410 return NULL;
5411 }
5412
5413 return rd;
5414}
5415
5416static void free_sched_groups(struct sched_group *sg, int free_sgp)
5417{
5418 struct sched_group *tmp, *first;
5419
5420 if (!sg)
5421 return;
5422
5423 first = sg;
5424 do {
5425 tmp = sg->next;
5426
5427 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5428 kfree(sg->sgp);
5429
5430 kfree(sg);
5431 sg = tmp;
5432 } while (sg != first);
5433}
5434
5435static void free_sched_domain(struct rcu_head *rcu)
5436{
5437 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5438
5439
5440
5441
5442
5443 if (sd->flags & SD_OVERLAP) {
5444 free_sched_groups(sd->groups, 1);
5445 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5446 kfree(sd->groups->sgp);
5447 kfree(sd->groups);
5448 }
5449 kfree(sd);
5450}
5451
5452static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5453{
5454 call_rcu(&sd->rcu, free_sched_domain);
5455}
5456
5457static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5458{
5459 for (; sd; sd = sd->parent)
5460 destroy_sched_domain(sd, cpu);
5461}
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5473DEFINE_PER_CPU(int, sd_llc_size);
5474DEFINE_PER_CPU(int, sd_llc_id);
5475DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5476DEFINE_PER_CPU(struct sched_domain *, sd_busy);
5477DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5478
5479static void update_top_cache_domain(int cpu)
5480{
5481 struct sched_domain *sd;
5482 struct sched_domain *busy_sd = NULL;
5483 int id = cpu;
5484 int size = 1;
5485
5486 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5487 if (sd) {
5488 id = cpumask_first(sched_domain_span(sd));
5489 size = cpumask_weight(sched_domain_span(sd));
5490 busy_sd = sd->parent;
5491 }
5492 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
5493
5494 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5495 per_cpu(sd_llc_size, cpu) = size;
5496 per_cpu(sd_llc_id, cpu) = id;
5497
5498 sd = lowest_flag_domain(cpu, SD_NUMA);
5499 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
5500
5501 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
5502 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5503}
5504
5505
5506
5507
5508
5509static void
5510cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5511{
5512 struct rq *rq = cpu_rq(cpu);
5513 struct sched_domain *tmp;
5514
5515
5516 for (tmp = sd; tmp; ) {
5517 struct sched_domain *parent = tmp->parent;
5518 if (!parent)
5519 break;
5520
5521 if (sd_parent_degenerate(tmp, parent)) {
5522 tmp->parent = parent->parent;
5523 if (parent->parent)
5524 parent->parent->child = tmp;
5525
5526
5527
5528
5529
5530 if (parent->flags & SD_PREFER_SIBLING)
5531 tmp->flags |= SD_PREFER_SIBLING;
5532 destroy_sched_domain(parent, cpu);
5533 } else
5534 tmp = tmp->parent;
5535 }
5536
5537 if (sd && sd_degenerate(sd)) {
5538 tmp = sd;
5539 sd = sd->parent;
5540 destroy_sched_domain(tmp, cpu);
5541 if (sd)
5542 sd->child = NULL;
5543 }
5544
5545 sched_domain_debug(sd, cpu);
5546
5547 rq_attach_root(rq, rd);
5548 tmp = rq->sd;
5549 rcu_assign_pointer(rq->sd, sd);
5550 destroy_sched_domains(tmp, cpu);
5551
5552 update_top_cache_domain(cpu);
5553}
5554
5555
5556static cpumask_var_t cpu_isolated_map;
5557
5558
5559static int __init isolated_cpu_setup(char *str)
5560{
5561 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5562 cpulist_parse(str, cpu_isolated_map);
5563 return 1;
5564}
5565
5566__setup("isolcpus=", isolated_cpu_setup);
5567
5568static const struct cpumask *cpu_cpu_mask(int cpu)
5569{
5570 return cpumask_of_node(cpu_to_node(cpu));
5571}
5572
5573struct sd_data {
5574 struct sched_domain **__percpu sd;
5575 struct sched_group **__percpu sg;
5576 struct sched_group_power **__percpu sgp;
5577};
5578
5579struct s_data {
5580 struct sched_domain ** __percpu sd;
5581 struct root_domain *rd;
5582};
5583
5584enum s_alloc {
5585 sa_rootdomain,
5586 sa_sd,
5587 sa_sd_storage,
5588 sa_none,
5589};
5590
5591struct sched_domain_topology_level;
5592
5593typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5594typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5595
5596#define SDTL_OVERLAP 0x01
5597
5598struct sched_domain_topology_level {
5599 sched_domain_init_f init;
5600 sched_domain_mask_f mask;
5601 int flags;
5602 int numa_level;
5603 struct sd_data data;
5604};
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
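/*
 * Build the balance mask for @sg: walk @sd's span and mark each CPU
 * whose sibling domain at this level actually contains that CPU, so
 * CPUs with empty sibling spans (possible on asymmetric node setups)
 * are excluded. Used when building overlapping (SD_OVERLAP) groups.
 */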
5619static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5620{
5621 const struct cpumask *span = sched_domain_span(sd);
5622 struct sd_data *sdd = sd->private;
5623 struct sched_domain *sibling;
5624 int i;
5625
5626 for_each_cpu(i, span) {
5627 sibling = *per_cpu_ptr(sdd->sd, i);
5628 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5629 continue;
5630
5631 cpumask_set_cpu(i, sched_group_mask(sg));
5632 }
5633}
5634
5635
5636
5637
5638
5639int group_balance_cpu(struct sched_group *sg)
5640{
5641 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5642}
5643
5644static int
5645build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5646{
5647 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5648 const struct cpumask *span = sched_domain_span(sd);
5649 struct cpumask *covered = sched_domains_tmpmask;
5650 struct sd_data *sdd = sd->private;
5651 struct sched_domain *child;
5652 int i;
5653
5654 cpumask_clear(covered);
5655
5656 for_each_cpu(i, span) {
5657 struct cpumask *sg_span;
5658
5659 if (cpumask_test_cpu(i, covered))
5660 continue;
5661
5662 child = *per_cpu_ptr(sdd->sd, i);
5663
5664
5665 if (!cpumask_test_cpu(i, sched_domain_span(child)))
5666 continue;
5667
5668 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5669 GFP_KERNEL, cpu_to_node(cpu));
5670
5671 if (!sg)
5672 goto fail;
5673
5674 sg_span = sched_group_cpus(sg);
5675 if (child->child) {
5676 child = child->child;
5677 cpumask_copy(sg_span, sched_domain_span(child));
5678 } else
5679 cpumask_set_cpu(i, sg_span);
5680
5681 cpumask_or(covered, covered, sg_span);
5682
5683 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5684 if (atomic_inc_return(&sg->sgp->ref) == 1)
5685 build_group_mask(sd, sg);
5686
5687
5688
5689
5690
5691
5692 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5693 sg->sgp->power_orig = sg->sgp->power;
5694
5695
5696
5697
5698
5699
5700 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5701 group_balance_cpu(sg) == cpu)
5702 groups = sg;
5703
5704 if (!first)
5705 first = sg;
5706 if (last)
5707 last->next = sg;
5708 last = sg;
5709 last->next = first;
5710 }
5711 sd->groups = groups;
5712
5713 return 0;
5714
5715fail:
5716 free_sched_groups(first, 0);
5717
5718 return -ENOMEM;
5719}
5720
5721static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5722{
5723 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5724 struct sched_domain *child = sd->child;
5725
5726 if (child)
5727 cpu = cpumask_first(sched_domain_span(child));
5728
5729 if (sg) {
5730 *sg = *per_cpu_ptr(sdd->sg, cpu);
5731 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5732 atomic_set(&(*sg)->sgp->ref, 1);
5733 }
5734
5735 return cpu;
5736}
5737
5738
5739
5740
5741
5742
5743
5744
5745static int
5746build_sched_groups(struct sched_domain *sd, int cpu)
5747{
5748 struct sched_group *first = NULL, *last = NULL;
5749 struct sd_data *sdd = sd->private;
5750 const struct cpumask *span = sched_domain_span(sd);
5751 struct cpumask *covered;
5752 int i;
5753
5754 get_group(cpu, sdd, &sd->groups);
5755 atomic_inc(&sd->groups->ref);
5756
5757 if (cpu != cpumask_first(span))
5758 return 0;
5759
5760 lockdep_assert_held(&sched_domains_mutex);
5761 covered = sched_domains_tmpmask;
5762
5763 cpumask_clear(covered);
5764
5765 for_each_cpu(i, span) {
5766 struct sched_group *sg;
5767 int group, j;
5768
5769 if (cpumask_test_cpu(i, covered))
5770 continue;
5771
5772 group = get_group(i, sdd, &sg);
5773 cpumask_clear(sched_group_cpus(sg));
5774 sg->sgp->power = 0;
5775 cpumask_setall(sched_group_mask(sg));
5776
5777 for_each_cpu(j, span) {
5778 if (get_group(j, sdd, NULL) != group)
5779 continue;
5780
5781 cpumask_set_cpu(j, covered);
5782 cpumask_set_cpu(j, sched_group_cpus(sg));
5783 }
5784
5785 if (!first)
5786 first = sg;
5787 if (last)
5788 last->next = sg;
5789 last = sg;
5790 }
5791 last->next = first;
5792
5793 return 0;
5794}
5795
/*
 * Initialize sched groups cpu_power.
 *
 * cpu_power indicates the capacity of a sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_power for all the groups in a sched domain will be the same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * the group having more cpu_power will pick up more load compared to the
 * group having less cpu_power.
 */
5806static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5807{
5808 struct sched_group *sg = sd->groups;
5809
5810 WARN_ON(!sg);
5811
5812 do {
5813 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5814 sg = sg->next;
5815 } while (sg != sd->groups);
5816
5817 if (cpu != group_balance_cpu(sg))
5818 return;
5819
5820 update_group_power(sd, cpu);
5821 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5822}
5823
5824int __weak arch_sd_sibling_asym_packing(void)
5825{
5826 return 0*SD_ASYM_PACKING;
5827}
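/*
 * Note: the weak default above evaluates to 0, i.e. SD_ASYM_PACKING stays
 * clear unless an architecture (e.g. powerpc for asymmetric SMT on POWER7)
 * overrides this function to return SD_ASYM_PACKING.
 */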
5828
/*
 * Initializers for the various sched-domain levels.
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains().
 */
5834#ifdef CONFIG_SCHED_DEBUG
5835# define SD_INIT_NAME(sd, type) sd->name = #type
5836#else
5837# define SD_INIT_NAME(sd, type) do { } while (0)
5838#endif
5839
5840#define SD_INIT_FUNC(type) \
5841static noinline struct sched_domain * \
5842sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5843{ \
5844 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5845 *sd = SD_##type##_INIT; \
5846 SD_INIT_NAME(sd, type); \
5847 sd->private = &tl->data; \
5848 return sd; \
5849}
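/*
 * For illustration, SD_INIT_FUNC(CPU) below expands to roughly:
 *
 *	static noinline struct sched_domain *
 *	sd_init_CPU(struct sched_domain_topology_level *tl, int cpu)
 *	{
 *		struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
 *		*sd = SD_CPU_INIT;
 *		SD_INIT_NAME(sd, CPU);
 *		sd->private = &tl->data;
 *		return sd;
 *	}
 *
 * with SD_CPU_INIT supplying the default tuning values for that level.
 */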
5850
5851SD_INIT_FUNC(CPU)
5852#ifdef CONFIG_SCHED_SMT
5853 SD_INIT_FUNC(SIBLING)
5854#endif
5855#ifdef CONFIG_SCHED_MC
5856 SD_INIT_FUNC(MC)
5857#endif
5858#ifdef CONFIG_SCHED_BOOK
5859 SD_INIT_FUNC(BOOK)
5860#endif
5861
5862static int default_relax_domain_level = -1;
5863int sched_domain_level_max;
5864
5865static int __init setup_relax_domain_level(char *str)
5866{
5867 if (kstrtoint(str, 0, &default_relax_domain_level))
5868 pr_warn("Unable to set relax_domain_level\n");
5869
5870 return 1;
5871}
5872__setup("relax_domain_level=", setup_relax_domain_level);
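/*
 * Example: booting with "relax_domain_level=2" makes domains whose level is
 * above 2 drop SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE (see
 * set_domain_attribute() below), so wakeup and newly-idle balancing are only
 * attempted in the smaller, cheaper domains.
 */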
5873
5874static void set_domain_attribute(struct sched_domain *sd,
5875 struct sched_domain_attr *attr)
5876{
5877 int request;
5878
5879 if (!attr || attr->relax_domain_level < 0) {
5880 if (default_relax_domain_level < 0)
5881 return;
5882 else
5883 request = default_relax_domain_level;
5884 } else
5885 request = attr->relax_domain_level;
	if (request < sd->level) {
		/* turn off idle balance on this domain */
		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	} else {
		/* turn on idle balance on this domain */
		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	}
5893}
5894
5895static void __sdt_free(const struct cpumask *cpu_map);
5896static int __sdt_alloc(const struct cpumask *cpu_map);
5897
5898static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
5899 const struct cpumask *cpu_map)
5900{
5901 switch (what) {
	case sa_rootdomain:
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu); /* fall through */
	case sa_sd:
		free_percpu(d->sd); /* fall through */
	case sa_sd_storage:
		__sdt_free(cpu_map); /* fall through */
5909 case sa_none:
5910 break;
5911 }
5912}
5913
5914static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
5915 const struct cpumask *cpu_map)
5916{
5917 memset(d, 0, sizeof(*d));
5918
5919 if (__sdt_alloc(cpu_map))
5920 return sa_sd_storage;
5921 d->sd = alloc_percpu(struct sched_domain *);
5922 if (!d->sd)
5923 return sa_sd_storage;
5924 d->rd = alloc_rootdomain();
5925 if (!d->rd)
5926 return sa_sd;
5927 return sa_rootdomain;
5928}
5929
/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */
5935static void claim_allocations(int cpu, struct sched_domain *sd)
5936{
5937 struct sd_data *sdd = sd->private;
5938
5939 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
5940 *per_cpu_ptr(sdd->sd, cpu) = NULL;
5941
5942 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5943 *per_cpu_ptr(sdd->sg, cpu) = NULL;
5944
5945 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
5946 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5947}
5948
5949#ifdef CONFIG_SCHED_SMT
5950static const struct cpumask *cpu_smt_mask(int cpu)
5951{
5952 return topology_thread_cpumask(cpu);
5953}
5954#endif
5955
/*
 * Topology list, bottom-up.
 */
5959static struct sched_domain_topology_level default_topology[] = {
5960#ifdef CONFIG_SCHED_SMT
5961 { sd_init_SIBLING, cpu_smt_mask, },
5962#endif
5963#ifdef CONFIG_SCHED_MC
5964 { sd_init_MC, cpu_coregroup_mask, },
5965#endif
5966#ifdef CONFIG_SCHED_BOOK
5967 { sd_init_BOOK, cpu_book_mask, },
5968#endif
5969 { sd_init_CPU, cpu_cpu_mask, },
5970 { NULL, },
5971};
5972
5973static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5974
5975#define for_each_sd_topology(tl) \
5976 for (tl = sched_domain_topology; tl->init; tl++)
5977
5978#ifdef CONFIG_NUMA
5979
5980static int sched_domains_numa_levels;
5981static int *sched_domains_numa_distance;
5982static struct cpumask ***sched_domains_numa_masks;
5983static int sched_domains_curr_level;
5984
5985static inline int sd_local_flags(int level)
5986{
5987 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
5988 return 0;
5989
5990 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
5991}
5992
5993static struct sched_domain *
5994sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5995{
5996 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
5997 int level = tl->numa_level;
5998 int sd_weight = cpumask_weight(
5999 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6000
6001 *sd = (struct sched_domain){
6002 .min_interval = sd_weight,
6003 .max_interval = 2*sd_weight,
6004 .busy_factor = 32,
6005 .imbalance_pct = 125,
6006 .cache_nice_tries = 2,
6007 .busy_idx = 3,
6008 .idle_idx = 2,
6009 .newidle_idx = 0,
6010 .wake_idx = 0,
6011 .forkexec_idx = 0,
6012
6013 .flags = 1*SD_LOAD_BALANCE
6014 | 1*SD_BALANCE_NEWIDLE
6015 | 0*SD_BALANCE_EXEC
6016 | 0*SD_BALANCE_FORK
6017 | 0*SD_BALANCE_WAKE
6018 | 0*SD_WAKE_AFFINE
6019 | 0*SD_SHARE_CPUPOWER
6020 | 0*SD_SHARE_PKG_RESOURCES
6021 | 1*SD_SERIALIZE
6022 | 0*SD_PREFER_SIBLING
6023 | 1*SD_NUMA
6024 | sd_local_flags(level)
6025 ,
6026 .last_balance = jiffies,
6027 .balance_interval = sd_weight,
6028 };
6029 SD_INIT_NAME(sd, NUMA);
6030 sd->private = &tl->data;
6031
	/*
	 * Ugly hack to pass state to sd_numa_mask()...
	 */
6035 sched_domains_curr_level = tl->numa_level;
6036
6037 return sd;
6038}
6039
6040static const struct cpumask *sd_numa_mask(int cpu)
6041{
6042 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6043}
6044
6045static void sched_numa_warn(const char *str)
6046{
6047 static int done = false;
6048 int i,j;
6049
6050 if (done)
6051 return;
6052
6053 done = true;
6054
6055 printk(KERN_WARNING "ERROR: %s\n\n", str);
6056
6057 for (i = 0; i < nr_node_ids; i++) {
6058 printk(KERN_WARNING " ");
6059 for (j = 0; j < nr_node_ids; j++)
6060 printk(KERN_CONT "%02d ", node_distance(i,j));
6061 printk(KERN_CONT "\n");
6062 }
6063 printk(KERN_WARNING "\n");
6064}
6065
6066static bool find_numa_distance(int distance)
6067{
6068 int i;
6069
6070 if (distance == node_distance(0, 0))
6071 return true;
6072
6073 for (i = 0; i < sched_domains_numa_levels; i++) {
6074 if (sched_domains_numa_distance[i] == distance)
6075 return true;
6076 }
6077
6078 return false;
6079}
6080
6081static void sched_init_numa(void)
6082{
6083 int next_distance, curr_distance = node_distance(0, 0);
6084 struct sched_domain_topology_level *tl;
6085 int level = 0;
6086 int i, j, k;
6087
6088 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6089 if (!sched_domains_numa_distance)
6090 return;
6091
	/*
	 * O(nr_node_ids^2) deduplicating selection sort -- in order to find
	 * the unique distances in the node_distance() table.
	 *
	 * Assumes node_distance(0,j) includes all distances in
	 * node_distance(i,j) in order to avoid cubic time.
	 */
6099 next_distance = curr_distance;
6100 for (i = 0; i < nr_node_ids; i++) {
6101 for (j = 0; j < nr_node_ids; j++) {
6102 for (k = 0; k < nr_node_ids; k++) {
6103 int distance = node_distance(i, k);
6104
6105 if (distance > curr_distance &&
6106 (distance < next_distance ||
6107 next_distance == curr_distance))
6108 next_distance = distance;
6109
				/*
				 * While not a strong assumption, it would be
				 * nice to validate the distance table:
				 */
6115 if (sched_debug() && node_distance(k, i) != distance)
6116 sched_numa_warn("Node-distance not symmetric");
6117
6118 if (sched_debug() && i && !find_numa_distance(distance))
6119 sched_numa_warn("Node-0 not representative");
6120 }
6121 if (next_distance != curr_distance) {
6122 sched_domains_numa_distance[level++] = next_distance;
6123 sched_domains_numa_levels = level;
6124 curr_distance = next_distance;
6125 } else break;
6126 }
6127
		/*
		 * In case of sched_debug() we verify the above assumption
		 * for all the other rows as well.
		 */
6131 if (!sched_debug())
6132 break;
6133 }
6134
	/*
	 * 'level' contains the number of unique distances, excluding the
	 * identity distance node_distance(i,i).
	 *
	 * The sched_domains_numa_distance[] array includes the actual
	 * distance numbers.
	 */

	/*
	 * Temporarily reset sched_domains_numa_levels to 0. If any of the
	 * sched_domains_numa_masks[][] allocations below fails, the array
	 * would contain fewer than 'level' members, which would be dangerous
	 * for anyone iterating over it (e.g. sched_domains_numa_masks_set()
	 * and _clear()). It is set back to 'level' once all the masks have
	 * been built.
	 */
6151 sched_domains_numa_levels = 0;
6152
6153 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6154 if (!sched_domains_numa_masks)
6155 return;
6156
	/*
	 * Now for each level, construct a mask per node which contains all
	 * CPUs of nodes that are that many hops away from us.
	 */
6161 for (i = 0; i < level; i++) {
6162 sched_domains_numa_masks[i] =
6163 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6164 if (!sched_domains_numa_masks[i])
6165 return;
6166
6167 for (j = 0; j < nr_node_ids; j++) {
6168 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6169 if (!mask)
6170 return;
6171
6172 sched_domains_numa_masks[i][j] = mask;
6173
6174 for (k = 0; k < nr_node_ids; k++) {
6175 if (node_distance(j, k) > sched_domains_numa_distance[i])
6176 continue;
6177
6178 cpumask_or(mask, mask, cpumask_of_node(k));
6179 }
6180 }
6181 }
6182
6183 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6184 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6185 if (!tl)
6186 return;
6187
	/*
	 * Copy the default topology bits..
	 */
6191 for (i = 0; default_topology[i].init; i++)
6192 tl[i] = default_topology[i];
6193
	/*
	 * .. and append 'level' levels of NUMA goodness.
	 */
6197 for (j = 0; j < level; i++, j++) {
6198 tl[i] = (struct sched_domain_topology_level){
6199 .init = sd_numa_init,
6200 .mask = sd_numa_mask,
6201 .flags = SDTL_OVERLAP,
6202 .numa_level = j,
6203 };
6204 }
6205
6206 sched_domain_topology = tl;
6207
6208 sched_domains_numa_levels = level;
6209}
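/*
 * Worked example (hypothetical two-socket box with a SLIT of):
 *
 *	node   0   1
 *	  0:  10  20
 *	  1:  20  10
 *
 * The search above finds a single unique remote distance (20), so level == 1
 * and sched_domains_numa_distance[] == { 20 }. The one mask built per node
 * then covers every CPU within distance 20 -- i.e. the whole machine --
 * yielding one NUMA topology level stacked on top of the default ones.
 */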
6210
6211static void sched_domains_numa_masks_set(int cpu)
6212{
6213 int i, j;
6214 int node = cpu_to_node(cpu);
6215
6216 for (i = 0; i < sched_domains_numa_levels; i++) {
6217 for (j = 0; j < nr_node_ids; j++) {
6218 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6219 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6220 }
6221 }
6222}
6223
6224static void sched_domains_numa_masks_clear(int cpu)
6225{
6226 int i, j;
6227 for (i = 0; i < sched_domains_numa_levels; i++) {
6228 for (j = 0; j < nr_node_ids; j++)
6229 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6230 }
6231}
6232
/*
 * Update the sched_domains_numa_masks[level][node] arrays as CPUs come
 * online or go away.
 */
6237static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6238 unsigned long action,
6239 void *hcpu)
6240{
6241 int cpu = (long)hcpu;
6242
6243 switch (action & ~CPU_TASKS_FROZEN) {
6244 case CPU_ONLINE:
6245 sched_domains_numa_masks_set(cpu);
6246 break;
6247
6248 case CPU_DEAD:
6249 sched_domains_numa_masks_clear(cpu);
6250 break;
6251
6252 default:
6253 return NOTIFY_DONE;
6254 }
6255
6256 return NOTIFY_OK;
6257}
6258#else
6259static inline void sched_init_numa(void)
6260{
6261}
6262
6263static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6264 unsigned long action,
6265 void *hcpu)
6266{
6267 return 0;
6268}
6269#endif
6270
6271static int __sdt_alloc(const struct cpumask *cpu_map)
6272{
6273 struct sched_domain_topology_level *tl;
6274 int j;
6275
6276 for_each_sd_topology(tl) {
6277 struct sd_data *sdd = &tl->data;
6278
6279 sdd->sd = alloc_percpu(struct sched_domain *);
6280 if (!sdd->sd)
6281 return -ENOMEM;
6282
6283 sdd->sg = alloc_percpu(struct sched_group *);
6284 if (!sdd->sg)
6285 return -ENOMEM;
6286
6287 sdd->sgp = alloc_percpu(struct sched_group_power *);
6288 if (!sdd->sgp)
6289 return -ENOMEM;
6290
6291 for_each_cpu(j, cpu_map) {
6292 struct sched_domain *sd;
6293 struct sched_group *sg;
6294 struct sched_group_power *sgp;
6295
6296 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6297 GFP_KERNEL, cpu_to_node(j));
6298 if (!sd)
6299 return -ENOMEM;
6300
6301 *per_cpu_ptr(sdd->sd, j) = sd;
6302
6303 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6304 GFP_KERNEL, cpu_to_node(j));
6305 if (!sg)
6306 return -ENOMEM;
6307
6308 sg->next = sg;
6309
6310 *per_cpu_ptr(sdd->sg, j) = sg;
6311
6312 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6313 GFP_KERNEL, cpu_to_node(j));
6314 if (!sgp)
6315 return -ENOMEM;
6316
6317 *per_cpu_ptr(sdd->sgp, j) = sgp;
6318 }
6319 }
6320
6321 return 0;
6322}
6323
6324static void __sdt_free(const struct cpumask *cpu_map)
6325{
6326 struct sched_domain_topology_level *tl;
6327 int j;
6328
6329 for_each_sd_topology(tl) {
6330 struct sd_data *sdd = &tl->data;
6331
6332 for_each_cpu(j, cpu_map) {
6333 struct sched_domain *sd;
6334
6335 if (sdd->sd) {
6336 sd = *per_cpu_ptr(sdd->sd, j);
6337 if (sd && (sd->flags & SD_OVERLAP))
6338 free_sched_groups(sd->groups, 0);
6339 kfree(*per_cpu_ptr(sdd->sd, j));
6340 }
6341
6342 if (sdd->sg)
6343 kfree(*per_cpu_ptr(sdd->sg, j));
6344 if (sdd->sgp)
6345 kfree(*per_cpu_ptr(sdd->sgp, j));
6346 }
6347 free_percpu(sdd->sd);
6348 sdd->sd = NULL;
6349 free_percpu(sdd->sg);
6350 sdd->sg = NULL;
6351 free_percpu(sdd->sgp);
6352 sdd->sgp = NULL;
6353 }
6354}
6355
6356struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6357 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6358 struct sched_domain *child, int cpu)
6359{
6360 struct sched_domain *sd = tl->init(tl, cpu);
6361 if (!sd)
6362 return child;
6363
6364 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6365 if (child) {
6366 sd->level = child->level + 1;
6367 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6368 child->parent = sd;
6369 sd->child = child;
6370 }
6371 set_domain_attribute(sd, attr);
6372
6373 return sd;
6374}
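/*
 * Rough illustration: with SMT+MC+CPU levels configured, the per-CPU loop in
 * build_sched_domains() below ends up building, for each CPU,
 *
 *	SIBLING -> MC -> CPU		(following ->parent)
 *
 * with *per_cpu_ptr(d.sd, cpu) pointing at the lowest (SIBLING) level;
 * NUMA levels, if any, are stacked on top of CPU.
 */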
6375
/*
 * Build sched domains for a given set of CPUs and attach the sched domains
 * to the individual CPUs.
 */
6380static int build_sched_domains(const struct cpumask *cpu_map,
6381 struct sched_domain_attr *attr)
6382{
6383 enum s_alloc alloc_state;
6384 struct sched_domain *sd;
6385 struct s_data d;
6386 int i, ret = -ENOMEM;
6387
6388 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6389 if (alloc_state != sa_rootdomain)
6390 goto error;
6391
6392
6393 for_each_cpu(i, cpu_map) {
6394 struct sched_domain_topology_level *tl;
6395
6396 sd = NULL;
6397 for_each_sd_topology(tl) {
6398 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6399 if (tl == sched_domain_topology)
6400 *per_cpu_ptr(d.sd, i) = sd;
6401 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6402 sd->flags |= SD_OVERLAP;
6403 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6404 break;
6405 }
6406 }
6407
6408
6409 for_each_cpu(i, cpu_map) {
6410 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6411 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6412 if (sd->flags & SD_OVERLAP) {
6413 if (build_overlap_sched_groups(sd, i))
6414 goto error;
6415 } else {
6416 if (build_sched_groups(sd, i))
6417 goto error;
6418 }
6419 }
6420 }
6421
6422
6423 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6424 if (!cpumask_test_cpu(i, cpu_map))
6425 continue;
6426
6427 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6428 claim_allocations(i, sd);
6429 init_sched_groups_power(i, sd);
6430 }
6431 }
6432
6433
6434 rcu_read_lock();
6435 for_each_cpu(i, cpu_map) {
6436 sd = *per_cpu_ptr(d.sd, i);
6437 cpu_attach_domain(sd, d.rd, i);
6438 }
6439 rcu_read_unlock();
6440
6441 ret = 0;
6442error:
6443 __free_domain_allocs(&d, alloc_state, cpu_map);
6444 return ret;
6445}
6446
static cpumask_var_t *doms_cur;	/* current sched domains */
static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
				/* attributes of custom domains in 'doms_cur' */
6450
/*
 * Special case: If a kmalloc() of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
6457static cpumask_var_t fallback_doms;
6458
/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * cpu core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
6464int __attribute__((weak)) arch_update_cpu_topology(void)
6465{
6466 return 0;
6467}
6468
6469cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6470{
6471 int i;
6472 cpumask_var_t *doms;
6473
6474 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6475 if (!doms)
6476 return NULL;
6477 for (i = 0; i < ndoms; i++) {
6478 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6479 free_sched_domains(doms, i);
6480 return NULL;
6481 }
6482 }
6483 return doms;
6484}
6485
6486void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6487{
6488 unsigned int i;
6489 for (i = 0; i < ndoms; i++)
6490 free_cpumask_var(doms[i]);
6491 kfree(doms);
6492}
6493
/*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 * For now this just excludes isolated CPUs, but could be used to
 * exclude other special cases in the future.
 */
6499static int init_sched_domains(const struct cpumask *cpu_map)
6500{
6501 int err;
6502
6503 arch_update_cpu_topology();
6504 ndoms_cur = 1;
6505 doms_cur = alloc_sched_domains(ndoms_cur);
6506 if (!doms_cur)
6507 doms_cur = &fallback_doms;
6508 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6509 err = build_sched_domains(doms_cur[0], NULL);
6510 register_sched_domain_sysctl();
6511
6512 return err;
6513}
6514
/*
 * Detach sched domains from a group of CPUs specified in cpu_map.
 * These CPUs will now be attached to the NULL domain.
 */
6519static void detach_destroy_domains(const struct cpumask *cpu_map)
6520{
6521 int i;
6522
6523 rcu_read_lock();
6524 for_each_cpu(i, cpu_map)
6525 cpu_attach_domain(NULL, &def_root_domain, i);
6526 rcu_read_unlock();
6527}
6528
6529
6530static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6531 struct sched_domain_attr *new, int idx_new)
6532{
6533 struct sched_domain_attr tmp;
6534
6535
6536 if (!new && !cur)
6537 return 1;
6538
6539 tmp = SD_ATTR_INIT;
6540 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6541 new ? (new + idx_new) : &tmp,
6542 sizeof(struct sched_domain_attr));
6543}
6544
/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks. This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap.) We should set up one
 * sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domains and in the new 'doms_new', we can leave
 * it as it is.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains. This routine takes ownership of it and will
 * free_sched_domains it when done with it. If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'; it also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with hotplug lock held.
 */
6571void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6572 struct sched_domain_attr *dattr_new)
6573{
6574 int i, j, n;
6575 int new_topology;
6576
6577 mutex_lock(&sched_domains_mutex);
6578
6579
6580 unregister_sched_domain_sysctl();
6581
6582
6583 new_topology = arch_update_cpu_topology();
6584
6585 n = doms_new ? ndoms_new : 0;
6586
6587
6588 for (i = 0; i < ndoms_cur; i++) {
6589 for (j = 0; j < n && !new_topology; j++) {
6590 if (cpumask_equal(doms_cur[i], doms_new[j])
6591 && dattrs_equal(dattr_cur, i, dattr_new, j))
6592 goto match1;
6593 }
6594
6595 detach_destroy_domains(doms_cur[i]);
6596match1:
6597 ;
6598 }
6599
6600 n = ndoms_cur;
6601 if (doms_new == NULL) {
6602 n = 0;
6603 doms_new = &fallback_doms;
6604 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6605 WARN_ON_ONCE(dattr_new);
6606 }
6607
6608
6609 for (i = 0; i < ndoms_new; i++) {
6610 for (j = 0; j < n && !new_topology; j++) {
6611 if (cpumask_equal(doms_new[i], doms_cur[j])
6612 && dattrs_equal(dattr_new, i, dattr_cur, j))
6613 goto match2;
6614 }
6615
6616 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6617match2:
6618 ;
6619 }
6620
6621
6622 if (doms_cur != &fallback_doms)
6623 free_sched_domains(doms_cur, ndoms_cur);
6624 kfree(dattr_cur);
6625 doms_cur = doms_new;
6626 dattr_cur = dattr_new;
6627 ndoms_cur = ndoms_new;
6628
6629 register_sched_domain_sysctl();
6630
6631 mutex_unlock(&sched_domains_mutex);
6632}
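/*
 * Typical callers, for orientation: cpusets passes one cpumask per
 * load-balanced cpuset, while the hotplug notifiers below call
 * partition_sched_domains(1, NULL, NULL) to collapse everything back into a
 * single domain spanning the active, non-isolated CPUs (via fallback_doms).
 */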
6633
static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
6635
/*
 * Update cpusets according to cpu_active mask. If cpusets are
 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
 * around partition_sched_domains().
 *
 * If we come here as part of a suspend/resume, don't touch cpusets because
 * we want to restore them back to their original state upon resume anyway.
 */
6644static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6645 void *hcpu)
6646{
6647 switch (action) {
6648 case CPU_ONLINE_FROZEN:
6649 case CPU_DOWN_FAILED_FROZEN:
6650
		/*
		 * num_cpus_frozen tracks how many CPUs are involved in the
		 * suspend/resume sequence. As long as this is not the last
		 * online operation in the resume sequence, just build a
		 * single sched domain, ignoring cpusets.
		 */
6657 num_cpus_frozen--;
6658 if (likely(num_cpus_frozen)) {
6659 partition_sched_domains(1, NULL, NULL);
6660 break;
6661 }
6662
		/*
		 * This is the last CPU online operation. So fall through and
		 * restore the original sched domain behaviour.
		 */
6669 case CPU_ONLINE:
6670 case CPU_DOWN_FAILED:
6671 cpuset_update_active_cpus(true);
6672 break;
6673 default:
6674 return NOTIFY_DONE;
6675 }
6676 return NOTIFY_OK;
6677}
6678
6679static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6680 void *hcpu)
6681{
6682 switch (action) {
6683 case CPU_DOWN_PREPARE:
6684 cpuset_update_active_cpus(false);
6685 break;
6686 case CPU_DOWN_PREPARE_FROZEN:
6687 num_cpus_frozen++;
6688 partition_sched_domains(1, NULL, NULL);
6689 break;
6690 default:
6691 return NOTIFY_DONE;
6692 }
6693 return NOTIFY_OK;
6694}
6695
6696void __init sched_init_smp(void)
6697{
6698 cpumask_var_t non_isolated_cpus;
6699
6700 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6701 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6702
6703 sched_init_numa();
6704
	/*
	 * There's no userspace yet to cause hotplug operations; hence all the
	 * CPU masks are stable and all blatant races in the below code cannot
	 * be happening.
	 */
6710 mutex_lock(&sched_domains_mutex);
6711 init_sched_domains(cpu_active_mask);
6712 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6713 if (cpumask_empty(non_isolated_cpus))
6714 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6715 mutex_unlock(&sched_domains_mutex);
6716
6717 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6718 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6719 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6720
6721 init_hrtick();
6722
6723
6724 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6725 BUG();
6726 sched_init_granularity();
6727 free_cpumask_var(non_isolated_cpus);
6728
6729 init_sched_rt_class();
6730 init_sched_dl_class();
6731}
6732#else
6733void __init sched_init_smp(void)
6734{
6735 sched_init_granularity();
6736}
6737#endif
6738
6739const_debug unsigned int sysctl_timer_migration = 1;
6740
6741int in_sched_functions(unsigned long addr)
6742{
6743 return in_lock_functions(addr) ||
6744 (addr >= (unsigned long)__sched_text_start
6745 && addr < (unsigned long)__sched_text_end);
6746}
6747
6748#ifdef CONFIG_CGROUP_SCHED
6749
/*
 * Default task group.
 * Every task in the system belongs to this group at bootup.
 */
6753struct task_group root_task_group;
6754LIST_HEAD(task_groups);
6755#endif
6756
6757DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6758
6759void __init sched_init(void)
6760{
6761 int i, j;
6762 unsigned long alloc_size = 0, ptr;
6763
6764#ifdef CONFIG_FAIR_GROUP_SCHED
6765 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6766#endif
6767#ifdef CONFIG_RT_GROUP_SCHED
6768 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6769#endif
6770#ifdef CONFIG_CPUMASK_OFFSTACK
6771 alloc_size += num_possible_cpus() * cpumask_size();
6772#endif
6773 if (alloc_size) {
6774 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6775
6776#ifdef CONFIG_FAIR_GROUP_SCHED
6777 root_task_group.se = (struct sched_entity **)ptr;
6778 ptr += nr_cpu_ids * sizeof(void **);
6779
6780 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6781 ptr += nr_cpu_ids * sizeof(void **);
6782
6783#endif
6784#ifdef CONFIG_RT_GROUP_SCHED
6785 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6786 ptr += nr_cpu_ids * sizeof(void **);
6787
6788 root_task_group.rt_rq = (struct rt_rq **)ptr;
6789 ptr += nr_cpu_ids * sizeof(void **);
6790
6791#endif
6792#ifdef CONFIG_CPUMASK_OFFSTACK
6793 for_each_possible_cpu(i) {
6794 per_cpu(load_balance_mask, i) = (void *)ptr;
6795 ptr += cpumask_size();
6796 }
6797#endif
6798 }
6799
6800 init_rt_bandwidth(&def_rt_bandwidth,
6801 global_rt_period(), global_rt_runtime());
6802 init_dl_bandwidth(&def_dl_bandwidth,
6803 global_rt_period(), global_rt_runtime());
6804
6805#ifdef CONFIG_SMP
6806 init_defrootdomain();
6807#endif
6808
6809#ifdef CONFIG_RT_GROUP_SCHED
6810 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6811 global_rt_period(), global_rt_runtime());
6812#endif
6813
6814#ifdef CONFIG_CGROUP_SCHED
6815 list_add(&root_task_group.list, &task_groups);
6816 INIT_LIST_HEAD(&root_task_group.children);
6817 INIT_LIST_HEAD(&root_task_group.siblings);
6818 autogroup_init(&init_task);
6819
6820#endif
6821
6822 for_each_possible_cpu(i) {
6823 struct rq *rq;
6824
6825 rq = cpu_rq(i);
6826 raw_spin_lock_init(&rq->lock);
6827 rq->nr_running = 0;
6828 rq->calc_load_active = 0;
6829 rq->calc_load_update = jiffies + LOAD_FREQ;
6830 init_cfs_rq(&rq->cfs);
6831 init_rt_rq(&rq->rt, rq);
6832 init_dl_rq(&rq->dl, rq);
6833#ifdef CONFIG_FAIR_GROUP_SCHED
6834 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6835 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		/*
		 * How much cpu bandwidth does root_task_group get?
		 *
		 * In case of task-groups formed through the cgroup filesystem,
		 * it gets 100% of the cpu resources in the system. This
		 * overall system cpu resource is divided among the tasks of
		 * root_task_group and its child task-groups in a fair manner,
		 * based on each entity's (task or task-group's) weight
		 * (se->load.weight).
		 *
		 * For example, if root_task_group has 10 tasks of weight 1024
		 * and two child groups A0 and A1 (of weight 1024 each), then
		 * A0's share of the cpu resource is:
		 *
		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
		 *
		 * We achieve this by letting root_task_group's tasks sit
		 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
		 */
6855 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6856 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6857#endif
6858
6859 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6860#ifdef CONFIG_RT_GROUP_SCHED
6861 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6862 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6863#endif
6864
6865 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6866 rq->cpu_load[j] = 0;
6867
6868 rq->last_load_update_tick = jiffies;
6869
6870#ifdef CONFIG_SMP
6871 rq->sd = NULL;
6872 rq->rd = NULL;
6873 rq->cpu_power = SCHED_POWER_SCALE;
6874 rq->post_schedule = 0;
6875 rq->active_balance = 0;
6876 rq->next_balance = jiffies;
6877 rq->push_cpu = 0;
6878 rq->cpu = i;
6879 rq->online = 0;
6880 rq->idle_stamp = 0;
6881 rq->avg_idle = 2*sysctl_sched_migration_cost;
6882 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6883
6884 INIT_LIST_HEAD(&rq->cfs_tasks);
6885
6886 rq_attach_root(rq, &def_root_domain);
6887#ifdef CONFIG_NO_HZ_COMMON
6888 rq->nohz_flags = 0;
6889#endif
6890#ifdef CONFIG_NO_HZ_FULL
6891 rq->last_sched_tick = 0;
6892#endif
6893#endif
6894 init_rq_hrtick(rq);
6895 atomic_set(&rq->nr_iowait, 0);
6896 }
6897
6898 set_load_weight(&init_task);
6899
6900#ifdef CONFIG_PREEMPT_NOTIFIERS
6901 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6902#endif
6903
	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
6907 atomic_inc(&init_mm.mm_count);
6908 enter_lazy_tlb(&init_mm, current);
6909
6910
	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
6916 init_idle(current, smp_processor_id());
6917
6918 calc_load_update = jiffies + LOAD_FREQ;
6919
	/*
	 * During early bootup we pretend to be a normal task:
	 */
6923 current->sched_class = &fair_sched_class;
6924
6925#ifdef CONFIG_SMP
6926 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6927
6928 if (cpu_isolated_map == NULL)
6929 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6930 idle_thread_set_boot_cpu();
6931#endif
6932 init_sched_fair_class();
6933
6934 scheduler_running = 1;
6935}
6936
6937#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6938static inline int preempt_count_equals(int preempt_offset)
6939{
6940 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
6941
6942 return (nested == preempt_offset);
6943}
6944
6945void __might_sleep(const char *file, int line, int preempt_offset)
6946{
6947 static unsigned long prev_jiffy;
6948
6949 rcu_sleep_check();
6950 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
6951 system_state != SYSTEM_RUNNING || oops_in_progress)
6952 return;
6953 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6954 return;
6955 prev_jiffy = jiffies;
6956
6957 printk(KERN_ERR
6958 "BUG: sleeping function called from invalid context at %s:%d\n",
6959 file, line);
6960 printk(KERN_ERR
6961 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6962 in_atomic(), irqs_disabled(),
6963 current->pid, current->comm);
6964
6965 debug_show_held_locks(current);
6966 if (irqs_disabled())
6967 print_irqtrace_events(current);
6968 dump_stack();
6969}
6970EXPORT_SYMBOL(__might_sleep);
6971#endif
6972
6973#ifdef CONFIG_MAGIC_SYSRQ
6974static void normalize_task(struct rq *rq, struct task_struct *p)
6975{
6976 const struct sched_class *prev_class = p->sched_class;
6977 struct sched_attr attr = {
6978 .sched_policy = SCHED_NORMAL,
6979 };
6980 int old_prio = p->prio;
6981 int on_rq;
6982
6983 on_rq = p->on_rq;
6984 if (on_rq)
6985 dequeue_task(rq, p, 0);
6986 __setscheduler(rq, p, &attr);
6987 if (on_rq) {
6988 enqueue_task(rq, p, 0);
6989 resched_task(rq->curr);
6990 }
6991
6992 check_class_changed(rq, p, prev_class, old_prio);
6993}
6994
6995void normalize_rt_tasks(void)
6996{
6997 struct task_struct *g, *p;
6998 unsigned long flags;
6999 struct rq *rq;
7000
7001 read_lock_irqsave(&tasklist_lock, flags);
7002 do_each_thread(g, p) {
		/*
		 * Only normalize user tasks:
		 */
7006 if (!p->mm)
7007 continue;
7008
7009 p->se.exec_start = 0;
7010#ifdef CONFIG_SCHEDSTATS
7011 p->se.statistics.wait_start = 0;
7012 p->se.statistics.sleep_start = 0;
7013 p->se.statistics.block_start = 0;
7014#endif
7015
7016 if (!dl_task(p) && !rt_task(p)) {
			/*
			 * Renice negative nice level userspace
			 * tasks back to 0:
			 */
7021 if (TASK_NICE(p) < 0 && p->mm)
7022 set_user_nice(p, 0);
7023 continue;
7024 }
7025
7026 raw_spin_lock(&p->pi_lock);
7027 rq = __task_rq_lock(p);
7028
7029 normalize_task(rq, p);
7030
7031 __task_rq_unlock(rq);
7032 raw_spin_unlock(&p->pi_lock);
7033 } while_each_thread(g, p);
7034
7035 read_unlock_irqrestore(&tasklist_lock, flags);
7036}
7037
7038#endif
7039
7040#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for the IA64 MCA handling, or kdb.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The current task for @cpu.
 */
7059struct task_struct *curr_task(int cpu)
7060{
7061 return cpu_curr(cpu);
7062}
7063
7064#endif
7065
7066#ifdef CONFIG_IA64
/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a cpu in a non-blocking manner. This function
 * must be called with all CPUs synchronized, and interrupts disabled; the
 * caller must save the original value of the current task (see curr_task()
 * above) and restore that value before reenabling interrupts and re-starting
 * the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
7082void set_curr_task(int cpu, struct task_struct *p)
7083{
7084 cpu_curr(cpu) = p;
7085}
7086
7087#endif
7088
7089#ifdef CONFIG_CGROUP_SCHED
7090
7091static DEFINE_SPINLOCK(task_group_lock);
7092
7093static void free_sched_group(struct task_group *tg)
7094{
7095 free_fair_sched_group(tg);
7096 free_rt_sched_group(tg);
7097 autogroup_free(tg);
7098 kfree(tg);
7099}
7100
7101
7102struct task_group *sched_create_group(struct task_group *parent)
7103{
7104 struct task_group *tg;
7105
7106 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7107 if (!tg)
7108 return ERR_PTR(-ENOMEM);
7109
7110 if (!alloc_fair_sched_group(tg, parent))
7111 goto err;
7112
7113 if (!alloc_rt_sched_group(tg, parent))
7114 goto err;
7115
7116 return tg;
7117
7118err:
7119 free_sched_group(tg);
7120 return ERR_PTR(-ENOMEM);
7121}
7122
7123void sched_online_group(struct task_group *tg, struct task_group *parent)
7124{
7125 unsigned long flags;
7126
7127 spin_lock_irqsave(&task_group_lock, flags);
7128 list_add_rcu(&tg->list, &task_groups);
7129
7130 WARN_ON(!parent);
7131
7132 tg->parent = parent;
7133 INIT_LIST_HEAD(&tg->children);
7134 list_add_rcu(&tg->siblings, &parent->children);
7135 spin_unlock_irqrestore(&task_group_lock, flags);
7136}
7137
7138
7139static void free_sched_group_rcu(struct rcu_head *rhp)
7140{
7141
7142 free_sched_group(container_of(rhp, struct task_group, rcu));
7143}
7144
7145
7146void sched_destroy_group(struct task_group *tg)
7147{
7148
7149 call_rcu(&tg->rcu, free_sched_group_rcu);
7150}
7151
7152void sched_offline_group(struct task_group *tg)
7153{
7154 unsigned long flags;
7155 int i;
7156
7157
7158 for_each_possible_cpu(i)
7159 unregister_fair_sched_group(tg, i);
7160
7161 spin_lock_irqsave(&task_group_lock, flags);
7162 list_del_rcu(&tg->list);
7163 list_del_rcu(&tg->siblings);
7164 spin_unlock_irqrestore(&task_group_lock, flags);
7165}
7166

/*
 * Change a task's runqueue when it moves between groups.
 *	The caller of this function should have put the task in its new group
 *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent
 *	to reflect its new group.
 */
7172void sched_move_task(struct task_struct *tsk)
7173{
7174 struct task_group *tg;
7175 int on_rq, running;
7176 unsigned long flags;
7177 struct rq *rq;
7178
7179 rq = task_rq_lock(tsk, &flags);
7180
7181 running = task_current(rq, tsk);
7182 on_rq = tsk->on_rq;
7183
7184 if (on_rq)
7185 dequeue_task(rq, tsk, 0);
7186 if (unlikely(running))
7187 tsk->sched_class->put_prev_task(rq, tsk);
7188
7189 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
7190 lockdep_is_held(&tsk->sighand->siglock)),
7191 struct task_group, css);
7192 tg = autogroup_task_group(tsk, tg);
7193 tsk->sched_task_group = tg;
7194
7195#ifdef CONFIG_FAIR_GROUP_SCHED
7196 if (tsk->sched_class->task_move_group)
7197 tsk->sched_class->task_move_group(tsk, on_rq);
7198 else
7199#endif
7200 set_task_rq(tsk, task_cpu(tsk));
7201
7202 if (unlikely(running))
7203 tsk->sched_class->set_curr_task(rq);
7204 if (on_rq)
7205 enqueue_task(rq, tsk, 0);
7206
7207 task_rq_unlock(rq, tsk, &flags);
7208}
7209#endif
7210
7211#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
7215static DEFINE_MUTEX(rt_constraints_mutex);
7216
7217
7218static inline int tg_has_rt_tasks(struct task_group *tg)
7219{
7220 struct task_struct *g, *p;
7221
7222 do_each_thread(g, p) {
7223 if (rt_task(p) && task_rq(p)->rt.tg == tg)
7224 return 1;
7225 } while_each_thread(g, p);
7226
7227 return 0;
7228}
7229
7230struct rt_schedulable_data {
7231 struct task_group *tg;
7232 u64 rt_period;
7233 u64 rt_runtime;
7234};
7235
7236static int tg_rt_schedulable(struct task_group *tg, void *data)
7237{
7238 struct rt_schedulable_data *d = data;
7239 struct task_group *child;
7240 unsigned long total, sum = 0;
7241 u64 period, runtime;
7242
7243 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7244 runtime = tg->rt_bandwidth.rt_runtime;
7245
7246 if (tg == d->tg) {
7247 period = d->rt_period;
7248 runtime = d->rt_runtime;
7249 }
7250
	/*
	 * Cannot have more runtime than the period.
	 */
7254 if (runtime > period && runtime != RUNTIME_INF)
7255 return -EINVAL;
7256
	/*
	 * Ensure we don't starve existing RT tasks.
	 */
7260 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7261 return -EBUSY;
7262
7263 total = to_ratio(period, runtime);
7264
	/*
	 * Nobody can have more than the global setting allows.
	 */
7268 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7269 return -EINVAL;
7270
	/*
	 * The sum of our children's runtime should not exceed our own.
	 */
7274 list_for_each_entry_rcu(child, &tg->children, siblings) {
7275 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7276 runtime = child->rt_bandwidth.rt_runtime;
7277
7278 if (child == d->tg) {
7279 period = d->rt_period;
7280 runtime = d->rt_runtime;
7281 }
7282
7283 sum += to_ratio(period, runtime);
7284 }
7285
7286 if (sum > total)
7287 return -EINVAL;
7288
7289 return 0;
7290}
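/*
 * Worked example (group numbers are made up): with the global defaults of
 * sched_rt_period_us = 1000000 and sched_rt_runtime_us = 950000, the global
 * ratio is 0.95 (scaled by to_ratio()). A group asking for runtime 600000
 * over period 1000000 passes on its own (0.6 <= 0.95), but two such child
 * groups under it would sum to 1.2 > 0.6 and trip the "sum > total" check
 * above with -EINVAL.
 */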
7291
7292static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7293{
7294 int ret;
7295
7296 struct rt_schedulable_data data = {
7297 .tg = tg,
7298 .rt_period = period,
7299 .rt_runtime = runtime,
7300 };
7301
7302 rcu_read_lock();
7303 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7304 rcu_read_unlock();
7305
7306 return ret;
7307}
7308
7309static int tg_set_rt_bandwidth(struct task_group *tg,
7310 u64 rt_period, u64 rt_runtime)
7311{
7312 int i, err = 0;
7313
7314 mutex_lock(&rt_constraints_mutex);
7315 read_lock(&tasklist_lock);
7316 err = __rt_schedulable(tg, rt_period, rt_runtime);
7317 if (err)
7318 goto unlock;
7319
7320 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7321 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7322 tg->rt_bandwidth.rt_runtime = rt_runtime;
7323
7324 for_each_possible_cpu(i) {
7325 struct rt_rq *rt_rq = tg->rt_rq[i];
7326
7327 raw_spin_lock(&rt_rq->rt_runtime_lock);
7328 rt_rq->rt_runtime = rt_runtime;
7329 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7330 }
7331 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7332unlock:
7333 read_unlock(&tasklist_lock);
7334 mutex_unlock(&rt_constraints_mutex);
7335
7336 return err;
7337}
7338
7339static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7340{
7341 u64 rt_runtime, rt_period;
7342
7343 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7344 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7345 if (rt_runtime_us < 0)
7346 rt_runtime = RUNTIME_INF;
7347
7348 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7349}
7350
7351static long sched_group_rt_runtime(struct task_group *tg)
7352{
7353 u64 rt_runtime_us;
7354
7355 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7356 return -1;
7357
7358 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7359 do_div(rt_runtime_us, NSEC_PER_USEC);
7360 return rt_runtime_us;
7361}
7362
7363static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7364{
7365 u64 rt_runtime, rt_period;
7366
7367 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7368 rt_runtime = tg->rt_bandwidth.rt_runtime;
7369
7370 if (rt_period == 0)
7371 return -EINVAL;
7372
7373 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7374}
7375
7376static long sched_group_rt_period(struct task_group *tg)
7377{
7378 u64 rt_period_us;
7379
7380 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7381 do_div(rt_period_us, NSEC_PER_USEC);
7382 return rt_period_us;
7383}
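/*
 * These helpers back the cpu cgroup's "rt_runtime_us" and "rt_period_us"
 * files. E.g. (hypothetical cgroup mount point):
 *
 *	echo 300000 > /sys/fs/cgroup/cpu/mygroup/cpu.rt_runtime_us
 *
 * grants tasks in "mygroup" up to 300ms of RT time per (default) 1s period;
 * writing -1 selects RUNTIME_INF, i.e. no group-local limit. Note that new
 * groups start with rt_runtime == 0, so RT tasks cannot be attached to them
 * until some runtime is granted (see sched_rt_can_attach() below).
 */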
7384#endif
7385
7386#ifdef CONFIG_RT_GROUP_SCHED
7387static int sched_rt_global_constraints(void)
7388{
7389 int ret = 0;
7390
7391 mutex_lock(&rt_constraints_mutex);
7392 read_lock(&tasklist_lock);
7393 ret = __rt_schedulable(NULL, 0, 0);
7394 read_unlock(&tasklist_lock);
7395 mutex_unlock(&rt_constraints_mutex);
7396
7397 return ret;
7398}
7399
7400static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7401{
7402
7403 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7404 return 0;
7405
7406 return 1;
7407}
7408
7409#else
7410static int sched_rt_global_constraints(void)
7411{
7412 unsigned long flags;
7413 int i, ret = 0;
7414
7415 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7416 for_each_possible_cpu(i) {
7417 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7418
7419 raw_spin_lock(&rt_rq->rt_runtime_lock);
7420 rt_rq->rt_runtime = global_rt_runtime();
7421 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7422 }
7423 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7424
7425 return ret;
7426}
7427#endif
7428
7429static int sched_dl_global_constraints(void)
7430{
7431 u64 runtime = global_rt_runtime();
7432 u64 period = global_rt_period();
7433 u64 new_bw = to_ratio(period, runtime);
7434 int cpu, ret = 0;
7435 unsigned long flags;
7436
	/*
	 * Here we want to check that the bandwidth is not being set to a
	 * value smaller than the currently allocated bandwidth in
	 * any of the root_domains.
	 *
	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
	 * cycling on root_domains... Discussion on different/better
	 * solutions is welcome!
	 */
7446 for_each_possible_cpu(cpu) {
7447 struct dl_bw *dl_b = dl_bw_of(cpu);
7448
7449 raw_spin_lock_irqsave(&dl_b->lock, flags);
7450 if (new_bw < dl_b->total_bw)
7451 ret = -EBUSY;
7452 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7453
7454 if (ret)
7455 break;
7456 }
7457
7458 return ret;
7459}
7460
7461static void sched_dl_do_global(void)
7462{
7463 u64 new_bw = -1;
7464 int cpu;
7465 unsigned long flags;
7466
7467 def_dl_bandwidth.dl_period = global_rt_period();
7468 def_dl_bandwidth.dl_runtime = global_rt_runtime();
7469
7470 if (global_rt_runtime() != RUNTIME_INF)
7471 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
7472
	/*
	 * FIXME: As above, cycling on all the CPUs is more than is strictly
	 * needed; per root_domain would do.
	 */
7476 for_each_possible_cpu(cpu) {
7477 struct dl_bw *dl_b = dl_bw_of(cpu);
7478
7479 raw_spin_lock_irqsave(&dl_b->lock, flags);
7480 dl_b->bw = new_bw;
7481 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7482 }
7483}
7484
7485static int sched_rt_global_validate(void)
7486{
7487 if (sysctl_sched_rt_period <= 0)
7488 return -EINVAL;
7489
7490 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
7491 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
7492 return -EINVAL;
7493
7494 return 0;
7495}
7496
7497static void sched_rt_do_global(void)
7498{
7499 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7500 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
7501}
7502
7503int sched_rt_handler(struct ctl_table *table, int write,
7504 void __user *buffer, size_t *lenp,
7505 loff_t *ppos)
7506{
7507 int old_period, old_runtime;
7508 static DEFINE_MUTEX(mutex);
7509 int ret;
7510
7511 mutex_lock(&mutex);
7512 old_period = sysctl_sched_rt_period;
7513 old_runtime = sysctl_sched_rt_runtime;
7514
7515 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7516
7517 if (!ret && write) {
7518 ret = sched_rt_global_validate();
7519 if (ret)
7520 goto undo;
7521
7522 ret = sched_rt_global_constraints();
7523 if (ret)
7524 goto undo;
7525
7526 ret = sched_dl_global_constraints();
7527 if (ret)
7528 goto undo;
7529
7530 sched_rt_do_global();
7531 sched_dl_do_global();
7532 }
7533 if (0) {
7534undo:
7535 sysctl_sched_rt_period = old_period;
7536 sysctl_sched_rt_runtime = old_runtime;
7537 }
7538 mutex_unlock(&mutex);
7539
7540 return ret;
7541}
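/*
 * This handler sits behind /proc/sys/kernel/sched_rt_period_us and
 * /proc/sys/kernel/sched_rt_runtime_us. For instance,
 *
 *	echo -1 > /proc/sys/kernel/sched_rt_runtime_us
 *
 * sets RUNTIME_INF and effectively disables RT throttling, subject to the
 * validate/constraint checks performed above before the new values stick.
 */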
7542
7543int sched_rr_handler(struct ctl_table *table, int write,
7544 void __user *buffer, size_t *lenp,
7545 loff_t *ppos)
7546{
7547 int ret;
7548 static DEFINE_MUTEX(mutex);
7549
7550 mutex_lock(&mutex);
7551 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7552
7553
7554 if (!ret && write) {
7555 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7556 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7557 }
7558 mutex_unlock(&mutex);
7559 return ret;
7560}
7561
7562#ifdef CONFIG_CGROUP_SCHED
7563
7564static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7565{
7566 return css ? container_of(css, struct task_group, css) : NULL;
7567}
7568
7569static struct cgroup_subsys_state *
7570cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7571{
7572 struct task_group *parent = css_tg(parent_css);
7573 struct task_group *tg;
7574
7575 if (!parent) {
7576
7577 return &root_task_group.css;
7578 }
7579
7580 tg = sched_create_group(parent);
7581 if (IS_ERR(tg))
7582 return ERR_PTR(-ENOMEM);
7583
7584 return &tg->css;
7585}
7586
7587static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7588{
7589 struct task_group *tg = css_tg(css);
7590 struct task_group *parent = css_tg(css_parent(css));
7591
7592 if (parent)
7593 sched_online_group(tg, parent);
7594 return 0;
7595}
7596
7597static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7598{
7599 struct task_group *tg = css_tg(css);
7600
7601 sched_destroy_group(tg);
7602}
7603
7604static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7605{
7606 struct task_group *tg = css_tg(css);
7607
7608 sched_offline_group(tg);
7609}
7610
7611static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7612 struct cgroup_taskset *tset)
7613{
7614 struct task_struct *task;
7615
7616 cgroup_taskset_for_each(task, css, tset) {
7617#ifdef CONFIG_RT_GROUP_SCHED
7618 if (!sched_rt_can_attach(css_tg(css), task))
7619 return -EINVAL;
7620#else
7621
7622 if (task->sched_class != &fair_sched_class)
7623 return -EINVAL;
7624#endif
7625 }
7626 return 0;
7627}
7628
7629static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7630 struct cgroup_taskset *tset)
7631{
7632 struct task_struct *task;
7633
7634 cgroup_taskset_for_each(task, css, tset)
7635 sched_move_task(task);
7636}
7637
7638static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7639 struct cgroup_subsys_state *old_css,
7640 struct task_struct *task)
7641{
	/*
	 * cgroup_exit() is called in the copy_process() failure path.
	 * Ignore this case since the task hasn't run yet, this avoids
	 * trying to poke a half freed task state from generic code.
	 */
7647 if (!(task->flags & PF_EXITING))
7648 return;
7649
7650 sched_move_task(task);
7651}
7652
7653#ifdef CONFIG_FAIR_GROUP_SCHED
7654static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7655 struct cftype *cftype, u64 shareval)
7656{
7657 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7658}
7659
7660static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7661 struct cftype *cft)
7662{
7663 struct task_group *tg = css_tg(css);
7664
7665 return (u64) scale_load_down(tg->shares);
7666}
7667
7668#ifdef CONFIG_CFS_BANDWIDTH
7669static DEFINE_MUTEX(cfs_constraints_mutex);
7670
7671const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
7672const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
7673
7674static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7675
7676static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7677{
7678 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7679 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7680
7681 if (tg == &root_task_group)
7682 return -EINVAL;
7683
	/*
	 * Ensure we have at least some amount of bandwidth every period. This
	 * is to prevent reaching a state of large arrears when throttled via
	 * entity_tick() resulting in prolonged exit starvation.
	 */
7689 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7690 return -EINVAL;
7691
	/*
	 * Likewise, bound things on the other side by preventing insane quota
	 * periods. This also allows us to normalize in computing quota
	 * feasibility.
	 */
7697 if (period > max_cfs_quota_period)
7698 return -EINVAL;
7699
7700 mutex_lock(&cfs_constraints_mutex);
7701 ret = __cfs_schedulable(tg, period, quota);
7702 if (ret)
7703 goto out_unlock;
7704
7705 runtime_enabled = quota != RUNTIME_INF;
7706 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7707
	/*
	 * If we need to toggle cfs_bandwidth_used, off->on must occur
	 * before making related changes, and on->off must occur afterwards.
	 */
7711 if (runtime_enabled && !runtime_was_enabled)
7712 cfs_bandwidth_usage_inc();
7713 raw_spin_lock_irq(&cfs_b->lock);
7714 cfs_b->period = ns_to_ktime(period);
7715 cfs_b->quota = quota;
7716
7717 __refill_cfs_bandwidth_runtime(cfs_b);
7718
7719 if (runtime_enabled && cfs_b->timer_active) {
7720
7721 cfs_b->timer_active = 0;
7722 __start_cfs_bandwidth(cfs_b);
7723 }
7724 raw_spin_unlock_irq(&cfs_b->lock);
7725
7726 for_each_possible_cpu(i) {
7727 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7728 struct rq *rq = cfs_rq->rq;
7729
7730 raw_spin_lock_irq(&rq->lock);
7731 cfs_rq->runtime_enabled = runtime_enabled;
7732 cfs_rq->runtime_remaining = 0;
7733
7734 if (cfs_rq->throttled)
7735 unthrottle_cfs_rq(cfs_rq);
7736 raw_spin_unlock_irq(&rq->lock);
7737 }
7738 if (runtime_was_enabled && !runtime_enabled)
7739 cfs_bandwidth_usage_dec();
7740out_unlock:
7741 mutex_unlock(&cfs_constraints_mutex);
7742
7743 return ret;
7744}
7745
7746int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7747{
7748 u64 quota, period;
7749
7750 period = ktime_to_ns(tg->cfs_bandwidth.period);
7751 if (cfs_quota_us < 0)
7752 quota = RUNTIME_INF;
7753 else
7754 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7755
7756 return tg_set_cfs_bandwidth(tg, period, quota);
7757}
7758
7759long tg_get_cfs_quota(struct task_group *tg)
7760{
7761 u64 quota_us;
7762
7763 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7764 return -1;
7765
7766 quota_us = tg->cfs_bandwidth.quota;
7767 do_div(quota_us, NSEC_PER_USEC);
7768
7769 return quota_us;
7770}
7771
7772int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7773{
7774 u64 quota, period;
7775
7776 period = (u64)cfs_period_us * NSEC_PER_USEC;
7777 quota = tg->cfs_bandwidth.quota;
7778
7779 return tg_set_cfs_bandwidth(tg, period, quota);
7780}
7781
7782long tg_get_cfs_period(struct task_group *tg)
7783{
7784 u64 cfs_period_us;
7785
7786 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7787 do_div(cfs_period_us, NSEC_PER_USEC);
7788
7789 return cfs_period_us;
7790}
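/*
 * Example of the resulting cgroup interface (hypothetical mount point):
 *
 *	echo 100000 > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_period_us
 *	echo  50000 > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_quota_us
 *
 * caps "mygroup" at 50ms of CPU time per 100ms period, i.e. half of one
 * CPU's worth of bandwidth; writing -1 to cfs_quota_us removes the limit.
 */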
7791
7792static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7793 struct cftype *cft)
7794{
7795 return tg_get_cfs_quota(css_tg(css));
7796}
7797
7798static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7799 struct cftype *cftype, s64 cfs_quota_us)
7800{
7801 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7802}
7803
7804static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7805 struct cftype *cft)
7806{
7807 return tg_get_cfs_period(css_tg(css));
7808}
7809
7810static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7811 struct cftype *cftype, u64 cfs_period_us)
7812{
7813 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7814}
7815
7816struct cfs_schedulable_data {
7817 struct task_group *tg;
7818 u64 period, quota;
7819};
7820
/*
 * Normalize group quota/period to be quota/max_period.
 * Note: units are usecs.
 */
7825static u64 normalize_cfs_quota(struct task_group *tg,
7826 struct cfs_schedulable_data *d)
7827{
7828 u64 quota, period;
7829
7830 if (tg == d->tg) {
7831 period = d->period;
7832 quota = d->quota;
7833 } else {
7834 period = tg_get_cfs_period(tg);
7835 quota = tg_get_cfs_quota(tg);
7836 }
7837
7838
7839 if (quota == RUNTIME_INF || quota == -1)
7840 return RUNTIME_INF;
7841
7842 return to_ratio(period, quota);
7843}
7844
7845static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7846{
7847 struct cfs_schedulable_data *d = data;
7848 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7849 s64 quota = 0, parent_quota = -1;
7850
7851 if (!tg->parent) {
7852 quota = RUNTIME_INF;
7853 } else {
7854 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7855
7856 quota = normalize_cfs_quota(tg, d);
7857 parent_quota = parent_b->hierarchal_quota;
7858
		/*
		 * Ensure max(child_quota) <= parent_quota, inherit when no
		 * limit is set:
		 */
7863 if (quota == RUNTIME_INF)
7864 quota = parent_quota;
7865 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7866 return -EINVAL;
7867 }
7868 cfs_b->hierarchal_quota = quota;
7869
7870 return 0;
7871}
7872
7873static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7874{
7875 int ret;
7876 struct cfs_schedulable_data data = {
7877 .tg = tg,
7878 .period = period,
7879 .quota = quota,
7880 };
7881
7882 if (quota != RUNTIME_INF) {
7883 do_div(data.period, NSEC_PER_USEC);
7884 do_div(data.quota, NSEC_PER_USEC);
7885 }
7886
7887 rcu_read_lock();
7888 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7889 rcu_read_unlock();
7890
7891 return ret;
7892}
7893
7894static int cpu_stats_show(struct seq_file *sf, void *v)
7895{
7896 struct task_group *tg = css_tg(seq_css(sf));
7897 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7898
7899 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7900 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7901 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7902
7903 return 0;
7904}
7905#endif
7906#endif
7907
7908#ifdef CONFIG_RT_GROUP_SCHED
7909static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7910 struct cftype *cft, s64 val)
7911{
7912 return sched_group_set_rt_runtime(css_tg(css), val);
7913}
7914
7915static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7916 struct cftype *cft)
7917{
7918 return sched_group_rt_runtime(css_tg(css));
7919}
7920
7921static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7922 struct cftype *cftype, u64 rt_period_us)
7923{
7924 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7925}
7926
7927static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7928 struct cftype *cft)
7929{
7930 return sched_group_rt_period(css_tg(css));
7931}
7932#endif
7933
7934static struct cftype cpu_files[] = {
7935#ifdef CONFIG_FAIR_GROUP_SCHED
7936 {
7937 .name = "shares",
7938 .read_u64 = cpu_shares_read_u64,
7939 .write_u64 = cpu_shares_write_u64,
7940 },
7941#endif
7942#ifdef CONFIG_CFS_BANDWIDTH
7943 {
7944 .name = "cfs_quota_us",
7945 .read_s64 = cpu_cfs_quota_read_s64,
7946 .write_s64 = cpu_cfs_quota_write_s64,
7947 },
7948 {
7949 .name = "cfs_period_us",
7950 .read_u64 = cpu_cfs_period_read_u64,
7951 .write_u64 = cpu_cfs_period_write_u64,
7952 },
7953 {
7954 .name = "stat",
7955 .seq_show = cpu_stats_show,
7956 },
7957#endif
7958#ifdef CONFIG_RT_GROUP_SCHED
7959 {
7960 .name = "rt_runtime_us",
7961 .read_s64 = cpu_rt_runtime_read,
7962 .write_s64 = cpu_rt_runtime_write,
7963 },
7964 {
7965 .name = "rt_period_us",
7966 .read_u64 = cpu_rt_period_read_uint,
7967 .write_u64 = cpu_rt_period_write_uint,
7968 },
7969#endif
7970 { }
7971};
7972
7973struct cgroup_subsys cpu_cgroup_subsys = {
7974 .name = "cpu",
7975 .css_alloc = cpu_cgroup_css_alloc,
7976 .css_free = cpu_cgroup_css_free,
7977 .css_online = cpu_cgroup_css_online,
7978 .css_offline = cpu_cgroup_css_offline,
7979 .can_attach = cpu_cgroup_can_attach,
7980 .attach = cpu_cgroup_attach,
7981 .exit = cpu_cgroup_exit,
7982 .subsys_id = cpu_cgroup_subsys_id,
7983 .base_cftypes = cpu_files,
7984 .early_init = 1,
7985};
7986
7987#endif
7988
7989void dump_cpu_task(int cpu)
7990{
7991 pr_info("Task dump for CPU %d:\n", cpu);
7992 sched_show_task(cpu_curr(cpu));
7993}
7994