/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 */
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/unistd.h>
66#include <linux/pagemap.h>
67#include <linux/hrtimer.h>
68#include <linux/tick.h>
69#include <linux/debugfs.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
76#include <linux/compiler.h>
77#include <linux/frame.h>
78#include <linux/sched/mm.h>
79
80#include <asm/switch_to.h>
81#include <asm/tlb.h>
82#include <asm/irq_regs.h>
83#include <asm/mutex.h>
84#ifdef CONFIG_PARAVIRT
85#include <asm/paravirt.h>
86#endif
87
88#include "sched.h"
89#include "../workqueue_internal.h"
90#include "../smpboot.h"
91
92#define CREATE_TRACE_POINTS
93#include <trace/events/sched.h>
94
95#ifdef smp_mb__before_atomic
96void __smp_mb__before_atomic(void)
97{
98 smp_mb__before_atomic();
99}
100EXPORT_SYMBOL(__smp_mb__before_atomic);
101#endif
102
103#ifdef smp_mb__after_atomic
104void __smp_mb__after_atomic(void)
105{
106 smp_mb__after_atomic();
107}
108EXPORT_SYMBOL(__smp_mb__after_atomic);
109#endif
110
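/*
 * Arm a bandwidth period timer: if it is not already active, forward
 * its expiry past now by whole periods and (re)start it in pinned
 * absolute mode on this CPU.
 */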
111void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
112{
113 if (hrtimer_active(period_timer))
114 return;
115
116 hrtimer_forward_now(period_timer, period);
117 hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
118}
119
120DEFINE_MUTEX(sched_domains_mutex);
121DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
122
123static void update_rq_clock_task(struct rq *rq, s64 delta);
124
125void update_rq_clock(struct rq *rq)
126{
127 s64 delta;
128
129 if (rq->skip_clock_update > 0)
130 return;
131
132 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
133 if (delta < 0)
134 return;
135 rq->clock += delta;
136 update_rq_clock_task(rq, delta);
137}
138
139
140
141
142
143#define SCHED_FEAT(name, enabled) \
144 (1UL << __SCHED_FEAT_##name) * enabled |
145
146const_debug unsigned int sysctl_sched_features =
147#include "features.h"
148 0;
149
150#undef SCHED_FEAT
151
152#ifdef CONFIG_SCHED_DEBUG
153#define SCHED_FEAT(name, enabled) \
154 #name ,
155
156static const char * const sched_feat_names[] = {
157#include "features.h"
158};
159
160#undef SCHED_FEAT
161
162static int sched_feat_show(struct seq_file *m, void *v)
163{
164 int i;
165
166 for (i = 0; i < __SCHED_FEAT_NR; i++) {
167 if (!(sysctl_sched_features & (1UL << i)))
168 seq_puts(m, "NO_");
169 seq_printf(m, "%s ", sched_feat_names[i]);
170 }
171 seq_puts(m, "\n");
172
173 return 0;
174}
175
176#ifdef HAVE_JUMP_LABEL
177
178#define jump_label_key__true STATIC_KEY_INIT_TRUE
179#define jump_label_key__false STATIC_KEY_INIT_FALSE
180
181#define SCHED_FEAT(name, enabled) \
182 jump_label_key__##enabled ,
183
184struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
185#include "features.h"
186};
187
188#undef SCHED_FEAT
189
190static void sched_feat_disable(int i)
191{
192 if (static_key_enabled(&sched_feat_keys[i]))
193 static_key_slow_dec(&sched_feat_keys[i]);
194}
195
196static void sched_feat_enable(int i)
197{
198 if (!static_key_enabled(&sched_feat_keys[i]))
199 static_key_slow_inc(&sched_feat_keys[i]);
200}
201#else
202static void sched_feat_disable(int i) { };
203static void sched_feat_enable(int i) { };
204#endif
205
206static int sched_feat_set(char *cmp)
207{
208 int i;
209 int neg = 0;
210
211 if (strncmp(cmp, "NO_", 3) == 0) {
212 neg = 1;
213 cmp += 3;
214 }
215
216 for (i = 0; i < __SCHED_FEAT_NR; i++) {
217 if (strcmp(cmp, sched_feat_names[i]) == 0) {
218 if (neg) {
219 sysctl_sched_features &= ~(1UL << i);
220 sched_feat_disable(i);
221 } else {
222 sysctl_sched_features |= (1UL << i);
223 sched_feat_enable(i);
224 }
225 break;
226 }
227 }
228
229 return i;
230}
231
232static ssize_t
233sched_feat_write(struct file *filp, const char __user *ubuf,
234 size_t cnt, loff_t *ppos)
235{
236 char buf[64];
237 char *cmp;
238 int i;
239
240 if (cnt > 63)
241 cnt = 63;
242
243 if (copy_from_user(&buf, ubuf, cnt))
244 return -EFAULT;
245
246 buf[cnt] = 0;
247 cmp = strstrip(buf);
248
249 i = sched_feat_set(cmp);
250 if (i == __SCHED_FEAT_NR)
251 return -EINVAL;
252
253 *ppos += cnt;
254
255 return cnt;
256}
257
258static int sched_feat_open(struct inode *inode, struct file *filp)
259{
260 return single_open(filp, sched_feat_show, NULL);
261}
262
263static const struct file_operations sched_feat_fops = {
264 .open = sched_feat_open,
265 .write = sched_feat_write,
266 .read = seq_read,
267 .llseek = seq_lseek,
268 .release = single_release,
269};
270
271static __init int sched_init_debug(void)
272{
273 debugfs_create_file("sched_features", 0644, NULL, NULL,
274 &sched_feat_fops);
275
276 return 0;
277}
278late_initcall(sched_init_debug);
279#endif
280
281
282
283
284
285const_debug unsigned int sysctl_sched_nr_migrate = 32;
286
287
288
289
290
291
292
293const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
294
295
296cpumask_var_t cpu_isolated_map;
297
298
299
300
301
302unsigned int sysctl_sched_rt_period = 1000000;
303
304__read_mostly int scheduler_running;
305
306
307
308
309
310int sysctl_sched_rt_runtime = 950000;
311
312
313
314
315static struct rq *this_rq_lock(void)
316 __acquires(rq->lock)
317{
318 struct rq *rq;
319
320 local_irq_disable();
321 rq = this_rq();
322 raw_spin_lock(&rq->lock);
323
324 return rq;
325}
326
327#ifdef CONFIG_SCHED_HRTICK
328
329
330
331
332
333
334
335
336
337
338
339static void hrtick_clear(struct rq *rq)
340{
341 if (hrtimer_active(&rq->hrtick_timer))
342 hrtimer_cancel(&rq->hrtick_timer);
343}
344
345
346
347
348
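/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled; drives the
 * current task's sched_class tick handler under rq->lock.
 */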
349static enum hrtimer_restart hrtick(struct hrtimer *timer)
350{
351 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
352
353 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
354
355 raw_spin_lock(&rq->lock);
356 update_rq_clock(rq);
357 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
358 raw_spin_unlock(&rq->lock);
359
360 return HRTIMER_NORESTART;
361}
362
363#ifdef CONFIG_SMP
364
365
366
367static void __hrtick_start(void *arg)
368{
369 struct rq *rq = arg;
370
371 raw_spin_lock(&rq->lock);
372 hrtimer_restart(&rq->hrtick_timer);
373 rq->hrtick_csd_pending = 0;
374 raw_spin_unlock(&rq->lock);
375}
376
377
378
379
380
381
382void hrtick_start(struct rq *rq, u64 delay)
383{
384 struct hrtimer *timer = &rq->hrtick_timer;
385 ktime_t time;
386 s64 delta;
387
388
389
390
391
392 delta = max_t(s64, delay, 10000LL);
393 time = ktime_add_ns(timer->base->get_time(), delta);
394
395 hrtimer_set_expires(timer, time);
396
397 if (rq == this_rq()) {
398 hrtimer_restart(timer);
399 } else if (!rq->hrtick_csd_pending) {
400 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
401 rq->hrtick_csd_pending = 1;
402 }
403}
404
405static int
406hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
407{
408 int cpu = (int)(long)hcpu;
409
410 switch (action) {
411 case CPU_UP_CANCELED:
412 case CPU_UP_CANCELED_FROZEN:
413 case CPU_DOWN_PREPARE:
414 case CPU_DOWN_PREPARE_FROZEN:
415 case CPU_DEAD:
416 case CPU_DEAD_FROZEN:
417 hrtick_clear(cpu_rq(cpu));
418 return NOTIFY_OK;
419 }
420
421 return NOTIFY_DONE;
422}
423
424static __init void init_hrtick(void)
425{
426 hotcpu_notifier(hotplug_hrtick, 0);
427}
428#else
429
430
431
432
433
434void hrtick_start(struct rq *rq, u64 delay)
435{
436
437
438
439
440 delay = max_t(u64, delay, 10000LL);
441 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
442 HRTIMER_MODE_REL_PINNED);
443}
444
445static inline void init_hrtick(void)
446{
447}
448#endif
449
450static void init_rq_hrtick(struct rq *rq)
451{
452#ifdef CONFIG_SMP
453 rq->hrtick_csd_pending = 0;
454
455 rq->hrtick_csd.flags = 0;
456 rq->hrtick_csd.func = __hrtick_start;
457 rq->hrtick_csd.info = rq;
458#endif
459
460 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
461 rq->hrtick_timer.function = hrtick;
462}
463#else
464static inline void hrtick_clear(struct rq *rq)
465{
466}
467
468static inline void init_rq_hrtick(struct rq *rq)
469{
470}
471
472static inline void init_hrtick(void)
473{
474}
475#endif
476
477
478
479
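/*
 * fetch_or() - atomically OR @val into *@ptr and return the previous
 * value, implemented as a cmpxchg() retry loop.
 */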
480#define fetch_or(ptr, val) \
481({ typeof(*(ptr)) __old, __val = *(ptr); \
482 for (;;) { \
483 __old = cmpxchg((ptr), __val, __val | (val)); \
484 if (__old == __val) \
485 break; \
486 __val = __old; \
487 } \
488 __old; \
489})
490
491#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
492
493
494
495
496
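/*
 * Atomically set TIF_NEED_RESCHED while testing TIF_POLLING_NRFLAG;
 * returns true if an IPI is still required because the remote CPU was
 * not polling on need_resched.
 */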
497static bool set_nr_and_not_polling(struct task_struct *p)
498{
499 struct thread_info *ti = task_thread_info(p);
500 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
501}
502
503
504
505
506
507
508
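/*
 * Atomically set TIF_NEED_RESCHED, but only if TIF_POLLING_NRFLAG is
 * set. Returns true if the flag is (now) set and no IPI is needed,
 * false if the task was not polling and the caller must kick it.
 */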
509static bool set_nr_if_polling(struct task_struct *p)
510{
511 struct thread_info *ti = task_thread_info(p);
512 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
513
514 for (;;) {
515 if (!(val & _TIF_POLLING_NRFLAG))
516 return false;
517 if (val & _TIF_NEED_RESCHED)
518 return true;
519 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
520 if (old == val)
521 break;
522 val = old;
523 }
524 return true;
525}
526
527#else
528static bool set_nr_and_not_polling(struct task_struct *p)
529{
530 set_tsk_need_resched(p);
531 return true;
532}
533
534#ifdef CONFIG_SMP
535static bool set_nr_if_polling(struct task_struct *p)
536{
537 return false;
538}
539#endif
540#endif
541
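/*
 * wake_q_add - queue @task on @head for a later wakeup by wake_up_q().
 * The cmpxchg() on wake_q.next ensures a task can only be queued once;
 * a task reference is taken so it cannot go away before it is woken.
 */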
542void wake_q_add(struct wake_q_head *head, struct task_struct *task)
543{
544 struct wake_q_node *node = &task->wake_q;
545
546
547
548
549
550
551
552
553
554 if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
555 return;
556
557 get_task_struct(task);
558
559
560
561
562 *head->lastp = node;
563 head->lastp = &node->next;
564}
565
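/*
 * wake_up_q - drain a wake_q list built by wake_q_add(), waking each
 * queued task exactly once and dropping the reference taken when it
 * was queued.
 */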
566void wake_up_q(struct wake_q_head *head)
567{
568 struct wake_q_node *node = head->first;
569
570 while (node != WAKE_Q_TAIL) {
571 struct task_struct *task;
572
573 task = container_of(node, struct task_struct, wake_q);
574 BUG_ON(!task);
575
576 node = node->next;
577 task->wake_q.next = NULL;
578
579
580
581
582
583 wake_up_process(task);
584 put_task_struct(task);
585 }
586}
587
588
589
590
591
592
593
594
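/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this just sets TIF_NEED_RESCHED; on SMP it may also send a
 * reschedule IPI to the CPU owning @rq, unless that CPU is polling on
 * need_resched and will notice the flag on its own.
 */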
595#ifdef CONFIG_SMP
596void resched_curr(struct rq *rq)
597{
598 struct task_struct *curr = rq->curr;
599 int cpu;
600
601 assert_raw_spin_locked(&rq->lock);
602
603 if (test_tsk_need_resched(curr))
604 return;
605
606 cpu = cpu_of(rq);
607 if (cpu == smp_processor_id()) {
608 set_tsk_need_resched(curr);
609 return;
610 }
611
612 if (set_nr_and_not_polling(curr))
613 smp_send_reschedule(cpu);
614 else
615 trace_sched_wake_idle_without_ipi(cpu);
616}
617
618void resched_cpu(int cpu)
619{
620 struct rq *rq = cpu_rq(cpu);
621 unsigned long flags;
622
623 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
624 return;
625 resched_curr(rq);
626 raw_spin_unlock_irqrestore(&rq->lock, flags);
627}
628
629#ifdef CONFIG_NO_HZ_COMMON
630
631
632
633
634
635
636
637
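/*
 * In the semi-idle case, prefer the nearest busy housekeeping CPU as
 * the target for migrating timers, so that an idle CPU does not have
 * to be woken up just to service a timer.
 */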
638int get_nohz_timer_target(void)
639{
640 int i, cpu = smp_processor_id();
641 struct sched_domain *sd;
642
643 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
644 return cpu;
645
646 rcu_read_lock();
647 for_each_domain(cpu, sd) {
648 for_each_cpu(i, sched_domain_span(sd)) {
649 if (cpu == i)
650 continue;
651
652 if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
653 cpu = i;
654 goto unlock;
655 }
656 }
657 }
658
659 if (!is_housekeeping_cpu(cpu))
660 cpu = housekeeping_any_cpu();
661unlock:
662 rcu_read_unlock();
663 return cpu;
664}
665
666
667
668
669
670
671
672
673
674
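/*
 * Kick a remote idle CPU when a new timer is enqueued on it under
 * NO_HZ, so that it leaves the idle loop, notices the timer and
 * reprograms its tick.
 */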
675static void wake_up_idle_cpu(int cpu)
676{
677 struct rq *rq = cpu_rq(cpu);
678
679 if (cpu == smp_processor_id())
680 return;
681
682
683
684
685
686
687
688
689 if (rq->curr != rq->idle)
690 return;
691
692
693
694
695
696
697 set_tsk_need_resched(rq->idle);
698
699
700 smp_mb();
701 if (!tsk_is_polling(rq->idle))
702 smp_send_reschedule(cpu);
703 else
704 trace_sched_wake_idle_without_ipi(cpu);
705}
706
707static bool wake_up_full_nohz_cpu(int cpu)
708{
709 if (tick_nohz_full_cpu(cpu)) {
710 if (cpu != smp_processor_id() ||
711 tick_nohz_tick_stopped())
712 smp_send_reschedule(cpu);
713 return true;
714 }
715
716 return false;
717}
718
719void wake_up_nohz_cpu(int cpu)
720{
721 if (!wake_up_full_nohz_cpu(cpu))
722 wake_up_idle_cpu(cpu);
723}
724
725static inline bool got_nohz_idle_kick(void)
726{
727 int cpu = smp_processor_id();
728
729 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
730 return false;
731
732 if (idle_cpu(cpu) && !need_resched())
733 return true;
734
735
736
737
738
739 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
740 return false;
741}
742
743#else
744
745static inline bool got_nohz_idle_kick(void)
746{
747 return false;
748}
749
750#endif
751
752#ifdef CONFIG_NO_HZ_FULL
753bool sched_can_stop_tick(void)
754{
755 struct rq *rq;
756
757
758
759
760
761 if (current->policy == SCHED_FIFO)
762 return true;
763
764
765
766
767
768 if (current->policy == SCHED_RR) {
		struct sched_rt_entity *rt_se = &current->rt;
770
771 return rt_se->run_list.prev == rt_se->run_list.next;
772 }
773
774 rq = this_rq();
775
776
777 smp_rmb();
778
779
780 if (rq->nr_running > 1)
781 return false;
782
783 return true;
784}
785#endif
786
787void sched_avg_update(struct rq *rq)
788{
789 s64 period = sched_avg_period();
790
791 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
792
793
794
795
796
797 asm("" : "+rm" (rq->age_stamp));
798 rq->age_stamp += period;
799 rq->rt_avg /= 2;
800 }
801}
802
803#else
804void resched_curr(struct rq *rq)
805{
806 struct task_struct *curr = rq->curr;
807
808 assert_raw_spin_locked(&rq->lock);
809 set_tsk_need_resched(curr);
810}
811#endif
812
813#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
814 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
815
816
817
818
819
820
821int walk_tg_tree_from(struct task_group *from,
822 tg_visitor down, tg_visitor up, void *data)
823{
824 struct task_group *parent, *child;
825 int ret;
826
827 parent = from;
828
829down:
830 ret = (*down)(parent, data);
831 if (ret)
832 goto out;
833 list_for_each_entry_rcu(child, &parent->children, siblings) {
834 parent = child;
835 goto down;
836
837up:
838 continue;
839 }
840 ret = (*up)(parent, data);
841 if (ret || parent == from)
842 goto out;
843
844 child = parent;
845 parent = parent->parent;
846 if (parent)
847 goto up;
848out:
849 return ret;
850}
851
852int tg_nop(struct task_group *tg, void *data)
853{
854 return 0;
855}
856#endif
857
858static void set_load_weight(struct task_struct *p)
859{
860 int prio = p->static_prio - MAX_RT_PRIO;
861 struct load_weight *load = &p->se.load;
862
863
864
865
866 if (p->policy == SCHED_IDLE) {
867 load->weight = scale_load(WEIGHT_IDLEPRIO);
868 load->inv_weight = WMULT_IDLEPRIO;
869 return;
870 }
871
872 load->weight = scale_load(prio_to_weight[prio]);
873 load->inv_weight = prio_to_wmult[prio];
874}
875
876static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
877{
878 update_rq_clock(rq);
879 if (!(flags & ENQUEUE_RESTORE))
880 sched_info_queued(p);
881 p->sched_class->enqueue_task(rq, p, flags);
882}
883
884static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
885{
886 update_rq_clock(rq);
887 if (!(flags & DEQUEUE_SAVE))
888 sched_info_dequeued(p);
889 p->sched_class->dequeue_task(rq, p, flags);
890}
891
892void activate_task(struct rq *rq, struct task_struct *p, int flags)
893{
894 if (task_contributes_to_load(p))
895 rq->nr_uninterruptible--;
896
897 enqueue_task(rq, p, flags);
898}
899
900void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
901{
902 if (task_contributes_to_load(p))
903 rq->nr_uninterruptible++;
904
905 dequeue_task(rq, p, flags);
906}
907
908static void update_rq_clock_task(struct rq *rq, s64 delta)
909{
910
911
912
913
914#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
915 s64 steal = 0, irq_delta = 0;
916#endif
917#ifdef CONFIG_IRQ_TIME_ACCOUNTING
918 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935 if (irq_delta > delta)
936 irq_delta = delta;
937
938 rq->prev_irq_time += irq_delta;
939 delta -= irq_delta;
940#endif
941#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
943 u64 st;
944
945 steal = paravirt_steal_clock(cpu_of(rq));
946 steal -= rq->prev_steal_time_rq;
947
948 if (unlikely(steal > delta))
949 steal = delta;
950
951 st = steal_ticks(steal);
952 steal = st * TICK_NSEC;
953
954 rq->prev_steal_time_rq += steal;
955
956 delta -= steal;
957 }
958#endif
959
960 rq->clock_task += delta;
961
962#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
963 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
964 sched_rt_avg_update(rq, irq_delta + steal);
965#endif
966}
967
968void sched_set_stop_task(int cpu, struct task_struct *stop)
969{
970 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
971 struct task_struct *old_stop = cpu_rq(cpu)->stop;
972
973 if (stop) {
974
975
976
977
978
979
980
981
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
983
984 stop->sched_class = &stop_sched_class;
985 }
986
987 cpu_rq(cpu)->stop = stop;
988
989 if (old_stop) {
990
991
992
993
994 old_stop->sched_class = &rt_sched_class;
995 }
996}
997
998
999
1000
1001static inline int __normal_prio(struct task_struct *p)
1002{
1003 return p->static_prio;
1004}
1005
1006
1007
1008
1009
1010
1011
1012
1013static inline int normal_prio(struct task_struct *p)
1014{
1015 int prio;
1016
1017 if (task_has_dl_policy(p))
1018 prio = MAX_DL_PRIO-1;
1019 else if (task_has_rt_policy(p))
1020 prio = MAX_RT_PRIO-1 - p->rt_priority;
1021 else
1022 prio = __normal_prio(p);
1023 return prio;
1024}
1025
1026
1027
1028
1029
1030
1031
1032
1033static int effective_prio(struct task_struct *p)
1034{
1035 p->normal_prio = normal_prio(p);
1036
1037
1038
1039
1040
1041 if (!rt_prio(p->prio))
1042 return p->normal_prio;
1043 return p->prio;
1044}
1045
1046
1047
1048
1049
1050
1051
1052inline int task_curr(const struct task_struct *p)
1053{
1054 return cpu_curr(task_cpu(p)) == p;
1055}
1056
1057
1058
1059
1060static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1061 const struct sched_class *prev_class,
1062 int oldprio)
1063{
1064 if (prev_class != p->sched_class) {
1065 if (prev_class->switched_from)
1066 prev_class->switched_from(rq, p);
1067
1068 p->sched_class->switched_to(rq, p);
1069 } else if (oldprio != p->prio || dl_task(p))
1070 p->sched_class->prio_changed(rq, p, oldprio);
1071}
1072
1073void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1074{
1075 const struct sched_class *class;
1076
1077 if (p->sched_class == rq->curr->sched_class) {
1078 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1079 } else {
1080 for_each_class(class) {
1081 if (class == rq->curr->sched_class)
1082 break;
1083 if (class == p->sched_class) {
1084 resched_curr(rq);
1085 break;
1086 }
1087 }
1088 }
1089
1090
1091
1092
1093
1094 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1095 rq->skip_clock_update = 1;
1096}
1097
1098#ifdef CONFIG_SMP
1099void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1100{
1101#ifdef CONFIG_SCHED_DEBUG
1102
1103
1104
1105
1106 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1107 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
1108
1109#ifdef CONFIG_LOCKDEP
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1121 lockdep_is_held(&task_rq(p)->lock)));
1122#endif
1123#endif
1124
1125 trace_sched_migrate_task(p, new_cpu);
1126
1127 if (task_cpu(p) != new_cpu) {
1128 if (p->sched_class->migrate_task_rq)
1129 p->sched_class->migrate_task_rq(p, new_cpu);
1130 p->se.nr_migrations++;
1131 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
1132 }
1133
1134 __set_task_cpu(p, new_cpu);
1135}
1136
1137static void __migrate_swap_task(struct task_struct *p, int cpu)
1138{
1139 if (task_on_rq_queued(p)) {
1140 struct rq *src_rq, *dst_rq;
1141
1142 src_rq = task_rq(p);
1143 dst_rq = cpu_rq(cpu);
1144
1145 deactivate_task(src_rq, p, 0);
1146 set_task_cpu(p, cpu);
1147 activate_task(dst_rq, p, 0);
1148 check_preempt_curr(dst_rq, p, 0);
1149 } else {
1150
1151
1152
1153
1154
1155 p->wake_cpu = cpu;
1156 }
1157}
1158
1159struct migration_swap_arg {
1160 struct task_struct *src_task, *dst_task;
1161 int src_cpu, dst_cpu;
1162};
1163
1164static int migrate_swap_stop(void *data)
1165{
1166 struct migration_swap_arg *arg = data;
1167 struct rq *src_rq, *dst_rq;
1168 int ret = -EAGAIN;
1169
1170 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1171 return -EAGAIN;
1172
1173 src_rq = cpu_rq(arg->src_cpu);
1174 dst_rq = cpu_rq(arg->dst_cpu);
1175
1176 double_raw_lock(&arg->src_task->pi_lock,
1177 &arg->dst_task->pi_lock);
1178 double_rq_lock(src_rq, dst_rq);
1179
1180 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1181 goto unlock;
1182
1183 if (task_cpu(arg->src_task) != arg->src_cpu)
1184 goto unlock;
1185
1186 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1187 goto unlock;
1188
1189 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1190 goto unlock;
1191
1192 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1193 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1194
1195 ret = 0;
1196
1197unlock:
1198 double_rq_unlock(src_rq, dst_rq);
1199 raw_spin_unlock(&arg->dst_task->pi_lock);
1200 raw_spin_unlock(&arg->src_task->pi_lock);
1201
1202 return ret;
1203}
1204
1205
1206
1207
1208int migrate_swap(struct task_struct *cur, struct task_struct *p)
1209{
1210 struct migration_swap_arg arg;
1211 int ret = -EINVAL;
1212
1213 arg = (struct migration_swap_arg){
1214 .src_task = cur,
1215 .src_cpu = task_cpu(cur),
1216 .dst_task = p,
1217 .dst_cpu = task_cpu(p),
1218 };
1219
1220 if (arg.src_cpu == arg.dst_cpu)
1221 goto out;
1222
1223
1224
1225
1226
1227 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1228 goto out;
1229
1230 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1231 goto out;
1232
1233 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1234 goto out;
1235
1236 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1237 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1238
1239out:
1240 return ret;
1241}
1242
1243struct migration_arg {
1244 struct task_struct *task;
1245 int dest_cpu;
1246};
1247
1248static int migration_cpu_stop(void *data);
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
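/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change.  If it changes, i.e. @p might have woken up,
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count).  If a second call
 * a short while later returns the same number, the caller can be sure
 * that @p has remained unscheduled the whole time.
 */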
1266unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1267{
1268 unsigned long flags;
1269 int running, queued;
1270 unsigned long ncsw;
1271 struct rq *rq;
1272
1273 for (;;) {
1274
1275
1276
1277
1278
1279
1280 rq = task_rq(p);
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293 while (task_running(rq, p)) {
1294 if (match_state && unlikely(p->state != match_state))
1295 return 0;
1296 cpu_relax();
1297 }
1298
1299
1300
1301
1302
1303
1304 rq = task_rq_lock(p, &flags);
1305 trace_sched_wait_task(p);
1306 running = task_running(rq, p);
1307 queued = task_on_rq_queued(p);
1308 ncsw = 0;
1309 if (!match_state || p->state == match_state)
1310 ncsw = p->nvcsw | LONG_MIN;
1311 task_rq_unlock(rq, p, &flags);
1312
1313
1314
1315
1316 if (unlikely(!ncsw))
1317 break;
1318
1319
1320
1321
1322
1323
1324
1325 if (unlikely(running)) {
1326 cpu_relax();
1327 continue;
1328 }
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339 if (unlikely(queued)) {
1340 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1341
1342 set_current_state(TASK_UNINTERRUPTIBLE);
1343 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1344 continue;
1345 }
1346
1347
1348
1349
1350
1351
1352 break;
1353 }
1354
1355 return ncsw;
1356}
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371void kick_process(struct task_struct *p)
1372{
1373 int cpu;
1374
1375 preempt_disable();
1376 cpu = task_cpu(p);
1377 if ((cpu != smp_processor_id()) && task_curr(p))
1378 smp_send_reschedule(cpu);
1379 preempt_enable();
1380}
1381EXPORT_SYMBOL_GPL(kick_process);
1382#endif
1383
1384#ifdef CONFIG_SMP
1385
1386
1387
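/*
 * Called when the task's chosen CPU is not usable (offline, inactive,
 * or not in its affinity mask): pick a fallback, preferring online and
 * active CPUs on the same node, then any allowed CPU, then widening
 * the mask via the cpuset and finally cpu_possible_mask.
 */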
1388static int select_fallback_rq(int cpu, struct task_struct *p)
1389{
1390 int nid = cpu_to_node(cpu);
1391 const struct cpumask *nodemask = NULL;
1392 enum { cpuset, possible, fail } state = cpuset;
1393 int dest_cpu;
1394
1395
1396
1397
1398
1399
1400 if (nid != -1) {
1401 nodemask = cpumask_of_node(nid);
1402
1403
1404 for_each_cpu(dest_cpu, nodemask) {
1405 if (!cpu_online(dest_cpu))
1406 continue;
1407 if (!cpu_active(dest_cpu))
1408 continue;
1409 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1410 return dest_cpu;
1411 }
1412 }
1413
1414 for (;;) {
1415
1416 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1417 if (!cpu_online(dest_cpu))
1418 continue;
1419 if (!cpu_active(dest_cpu))
1420 continue;
1421 goto out;
1422 }
1423
1424 switch (state) {
1425 case cpuset:
1426
1427 cpuset_cpus_allowed_fallback(p);
1428 state = possible;
1429 break;
1430
1431 case possible:
1432 do_set_cpus_allowed(p, cpu_possible_mask);
1433 state = fail;
1434 break;
1435
1436 case fail:
1437 BUG();
1438 break;
1439 }
1440 }
1441
1442out:
1443 if (state != cpuset) {
1444
1445
1446
1447
1448
1449 if (p->mm && printk_ratelimit()) {
1450 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1451 task_pid_nr(p), p->comm, cpu);
1452 }
1453 }
1454
1455 return dest_cpu;
1456}
1457
1458
1459
1460
1461static inline
1462int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1463{
1464 if (p->nr_cpus_allowed > 1)
1465 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1478 !cpu_online(cpu)))
1479 cpu = select_fallback_rq(task_cpu(p), p);
1480
1481 return cpu;
1482}
1483
1484static void update_avg(u64 *avg, u64 sample)
1485{
1486 s64 diff = sample - *avg;
1487 *avg += diff >> 3;
1488}
1489#endif
1490
1491static void
1492ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1493{
1494#ifdef CONFIG_SCHEDSTATS
1495 struct rq *rq = this_rq();
1496
1497#ifdef CONFIG_SMP
1498 int this_cpu = smp_processor_id();
1499
1500 if (cpu == this_cpu) {
1501 schedstat_inc(rq, ttwu_local);
1502 schedstat_inc(p, se.statistics->nr_wakeups_local);
1503 } else {
1504 struct sched_domain *sd;
1505
1506 schedstat_inc(p, se.statistics->nr_wakeups_remote);
1507 rcu_read_lock();
1508 for_each_domain(this_cpu, sd) {
1509 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1510 schedstat_inc(sd, ttwu_wake_remote);
1511 break;
1512 }
1513 }
1514 rcu_read_unlock();
1515 }
1516
1517 if (wake_flags & WF_MIGRATED)
1518 schedstat_inc(p, se.statistics->nr_wakeups_migrate);
1519
1520#endif
1521
1522 schedstat_inc(rq, ttwu_count);
1523 schedstat_inc(p, se.statistics->nr_wakeups);
1524
1525 if (wake_flags & WF_SYNC)
1526 schedstat_inc(p, se.statistics->nr_wakeups_sync);
1527
1528#endif
1529}
1530
1531static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1532{
1533 activate_task(rq, p, en_flags);
1534 p->on_rq = TASK_ON_RQ_QUEUED;
1535
1536
1537 if (p->flags & PF_WQ_WORKER)
1538 wq_worker_waking_up(p, cpu_of(rq));
1539}
1540
1541
1542
1543
1544static void
1545ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1546{
1547 check_preempt_curr(rq, p, wake_flags);
1548 trace_sched_wakeup(p, true);
1549
1550 p->state = TASK_RUNNING;
1551#ifdef CONFIG_SMP
1552 if (p->sched_class->task_woken)
1553 p->sched_class->task_woken(rq, p);
1554
1555 if (rq->idle_stamp) {
1556 u64 delta = rq_clock(rq) - rq->idle_stamp;
1557 u64 max = 2*rq->max_idle_balance_cost;
1558
1559 update_avg(&rq->avg_idle, delta);
1560
1561 if (rq->avg_idle > max)
1562 rq->avg_idle = max;
1563
1564 rq->idle_stamp = 0;
1565 }
1566#endif
1567}
1568
1569static void
1570ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1571{
1572 int en_flags = ENQUEUE_WAKEUP;
1573
1574#ifdef CONFIG_SMP
1575 if (p->sched_contributes_to_load)
1576 rq->nr_uninterruptible--;
1577
1578 if (wake_flags & WF_MIGRATED)
1579 en_flags |= ENQUEUE_MIGRATED;
1580#endif
1581
1582 ttwu_activate(rq, p, en_flags);
1583 ttwu_do_wakeup(rq, p, wake_flags);
1584}
1585
1586
1587
1588
1589
1590
1591
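/*
 * Called in case @p is not fully descheduled from its runqueue yet; in
 * this case we simply complete the wakeup under the runqueue lock and
 * return 1, otherwise return 0 so the caller does a full activation.
 */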
1592static int ttwu_remote(struct task_struct *p, int wake_flags)
1593{
1594 struct rq *rq;
1595 int ret = 0;
1596
1597 rq = __task_rq_lock(p);
1598 if (task_on_rq_queued(p)) {
1599 ttwu_do_wakeup(rq, p, wake_flags);
1600 ret = 1;
1601 }
1602 __task_rq_unlock(rq);
1603
1604 return ret;
1605}
1606
1607#ifdef CONFIG_SMP
1608void sched_ttwu_pending(void)
1609{
1610 struct rq *rq = this_rq();
1611 struct llist_node *llist = llist_del_all(&rq->wake_list);
1612 struct task_struct *p;
1613 unsigned long flags;
1614
1615 if (!llist)
1616 return;
1617
1618 raw_spin_lock_irqsave(&rq->lock, flags);
1619
1620 while (llist) {
1621 int wake_flags = 0;
1622
1623 p = llist_entry(llist, struct task_struct, wake_entry);
1624 llist = llist_next(llist);
1625
1626 if (p->sched_remote_wakeup)
1627 wake_flags = WF_MIGRATED;
1628
1629 ttwu_do_activate(rq, p, wake_flags);
1630 }
1631
1632 raw_spin_unlock_irqrestore(&rq->lock, flags);
1633}
1634
1635void scheduler_ipi(void)
1636{
1637 if (llist_empty(&this_rq()->wake_list)
1638 && !tick_nohz_full_cpu(smp_processor_id())
1639 && !got_nohz_idle_kick())
1640 return;
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655 irq_enter();
1656 tick_nohz_full_check();
1657 sched_ttwu_pending();
1658
1659
1660
1661
1662 if (unlikely(got_nohz_idle_kick())) {
1663 this_rq()->idle_balance = 1;
1664 raise_softirq_irqoff(SCHED_SOFTIRQ);
1665 }
1666 irq_exit();
1667}
1668
1669static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
1670{
1671 struct rq *rq = cpu_rq(cpu);
1672
1673 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
1674
1675 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1676 if (!set_nr_if_polling(rq->idle))
1677 smp_send_reschedule(cpu);
1678 else
1679 trace_sched_wake_idle_without_ipi(cpu);
1680 }
1681}
1682
1683bool cpus_share_cache(int this_cpu, int that_cpu)
1684{
1685 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1686}
1687#endif
1688
1689static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1690{
1691 struct rq *rq = cpu_rq(cpu);
1692
1693#if defined(CONFIG_SMP)
1694 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1695 sched_clock_cpu(cpu);
1696 ttwu_queue_remote(p, cpu, wake_flags);
1697 return;
1698 }
1699#endif
1700
1701 raw_spin_lock(&rq->lock);
1702 ttwu_do_activate(rq, p, wake_flags);
1703 raw_spin_unlock(&rq->lock);
1704}
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
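/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put @p on the run-queue if it's not already there.  The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Return: 1 if @p was woken up, 0 if it was already running or its
 * state did not match @state.
 */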
1721static int
1722try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1723{
1724 unsigned long flags;
1725 int cpu, success = 0;
1726
1727
1728
1729
1730
1731
1732
1733 raw_spin_lock_irqsave(&p->pi_lock, flags);
1734 smp_mb__after_spinlock();
1735 if (!(p->state & state))
1736 goto out;
1737
1738 success = 1;
1739 cpu = task_cpu(p);
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762 smp_rmb();
1763 if (p->on_rq && ttwu_remote(p, wake_flags))
1764 goto stat;
1765
1766#ifdef CONFIG_SMP
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784 smp_rmb();
1785
1786
1787
1788
1789
1790 while (p->on_cpu)
1791 cpu_relax();
1792
1793
1794
1795 smp_rmb();
1796
1797 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1798 p->state = TASK_WAKING;
1799
1800 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1801 if (task_cpu(p) != cpu) {
1802 wake_flags |= WF_MIGRATED;
1803 set_task_cpu(p, cpu);
1804 }
1805#endif
1806
1807 ttwu_queue(p, cpu, wake_flags);
1808stat:
1809 if (schedstat_enabled())
1810 ttwu_stat(p, cpu, wake_flags);
1811out:
1812 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1813
1814 return success;
1815}
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825static void try_to_wake_up_local(struct task_struct *p)
1826{
1827 struct rq *rq = task_rq(p);
1828
1829 if (WARN_ON_ONCE(rq != this_rq()) ||
1830 WARN_ON_ONCE(p == current))
1831 return;
1832
1833 lockdep_assert_held(&rq->lock);
1834
1835 if (!raw_spin_trylock(&p->pi_lock)) {
1836 raw_spin_unlock(&rq->lock);
1837 raw_spin_lock(&p->pi_lock);
1838 raw_spin_lock(&rq->lock);
1839 }
1840
1841 if (!(p->state & TASK_NORMAL))
1842 goto out;
1843
1844 if (!task_on_rq_queued(p))
1845 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1846
1847 ttwu_do_wakeup(rq, p, 0);
1848 if (schedstat_enabled())
1849 ttwu_stat(p, smp_processor_id(), 0);
1850out:
1851 raw_spin_unlock(&p->pi_lock);
1852}
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866int wake_up_process(struct task_struct *p)
1867{
1868 return try_to_wake_up(p, TASK_NORMAL, 0);
1869}
1870EXPORT_SYMBOL(wake_up_process);
1871
1872int wake_up_state(struct task_struct *p, unsigned int state)
1873{
1874 return try_to_wake_up(p, state, 0);
1875}
1876
1877
1878
1879
1880void __dl_clear_params(struct task_struct *p)
1881{
1882 struct sched_dl_entity *dl_se = &p->dl;
1883
1884 dl_se->dl_runtime = 0;
1885 dl_se->dl_deadline = 0;
1886 dl_se->dl_period = 0;
1887 dl_se->flags = 0;
1888 dl_se->dl_bw = 0;
1889 dl_se->dl_density = 0;
1890
1891 dl_se->dl_throttled = 0;
1892 dl_se->dl_yielded = 0;
1893 dl_se->dl_non_contending = 0;
1894}
1895
1896
1897
1898
1899
1900
1901
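/*
 * Perform scheduler related setup for a newly forked task @p.
 * @p is forked by current.
 */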
1902static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1903{
1904 p->on_rq = 0;
1905
1906 p->se.on_rq = 0;
1907 p->se.exec_start = 0;
1908 p->se.sum_exec_runtime = 0;
1909 p->se.prev_sum_exec_runtime = 0;
1910 p->se.nr_migrations = 0;
1911 p->se.vruntime = 0;
1912 INIT_LIST_HEAD(&p->se.group_node);
1913
1914#ifdef CONFIG_SCHEDSTATS
1915
1916 p->se.statistics = &p->statistics;
1917 memset(p->se.statistics, 0, sizeof(*p->se.statistics));
1918#endif
1919
1920 RB_CLEAR_NODE(&p->dl.rb_node);
1921 init_dl_task_timer(&p->dl);
1922 init_dl_inactive_task_timer(&p->dl);
1923 __dl_clear_params(p);
1924
1925 INIT_LIST_HEAD(&p->rt.run_list);
1926
1927#ifdef CONFIG_PREEMPT_NOTIFIERS
1928 INIT_HLIST_HEAD(&p->preempt_notifiers);
1929#endif
1930
1931#ifdef CONFIG_NUMA_BALANCING
1932 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1933 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1934 p->mm->numa_scan_seq = 0;
1935 }
1936
1937 if (clone_flags & CLONE_VM)
1938 p->numa_preferred_nid = current->numa_preferred_nid;
1939 else
1940 p->numa_preferred_nid = -1;
1941
1942 p->node_stamp = 0ULL;
1943 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1944 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1945 p->numa_work.next = &p->numa_work;
1946 p->numa_faults_memory = NULL;
1947 p->numa_faults_buffer_memory = NULL;
1948 p->last_task_numa_placement = 0;
1949 p->last_sum_exec_runtime = 0;
1950
1951 INIT_LIST_HEAD(&p->numa_entry);
1952 p->numa_group = NULL;
1953#endif
1954}
1955
1956#ifdef CONFIG_NUMA_BALANCING
1957#ifdef CONFIG_SCHED_DEBUG
1958void set_numabalancing_state(bool enabled)
1959{
1960 if (enabled)
1961 sched_feat_set("NUMA");
1962 else
1963 sched_feat_set("NO_NUMA");
1964}
1965#else
1966__read_mostly bool numabalancing_enabled;
1967
1968void set_numabalancing_state(bool enabled)
1969{
1970 numabalancing_enabled = enabled;
1971}
1972#endif
1973
1974#ifdef CONFIG_PROC_SYSCTL
1975int sysctl_numa_balancing(struct ctl_table *table, int write,
1976 void __user *buffer, size_t *lenp, loff_t *ppos)
1977{
1978 struct ctl_table t;
1979 int err;
1980 int state = numabalancing_enabled;
1981
1982 if (write && !capable(CAP_SYS_ADMIN))
1983 return -EPERM;
1984
1985 t = *table;
1986 t.data = &state;
1987 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
1988 if (err < 0)
1989 return err;
1990 if (write)
1991 set_numabalancing_state(state);
1992 return err;
1993}
1994#endif
1995#endif
1996
1997#ifdef CONFIG_SCHEDSTATS
1998
1999struct static_key sched_schedstats __read_mostly = STATIC_KEY_INIT_FALSE;
2000static bool __initdata __sched_schedstats = false;
2001
2002static void set_schedstats(bool enable)
2003{
2004 if (enable && !schedstat_enabled())
2005 static_key_slow_inc(&sched_schedstats);
2006 else if (!enable && schedstat_enabled())
2007 static_key_slow_dec(&sched_schedstats);
2008}
2009
2010void force_schedstat_enabled(void)
2011{
2012 if (!schedstat_enabled()) {
2013 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
2014 static_key_slow_inc(&sched_schedstats);
2015 }
2016}
2017
2018static int __init setup_schedstats(char *str)
2019{
2020 int ret = 0;
2021 if (!str)
2022 goto out;
2023
2024
2025
2026
2027
2028
2029 if (!strcmp(str, "enable")) {
2030 __sched_schedstats = true;
2031 ret = 1;
2032 } else if (!strcmp(str, "disable")) {
2033 __sched_schedstats = false;
2034 ret = 1;
2035 }
2036out:
2037 if (!ret)
2038 pr_warn("Unable to parse schedstats=\n");
2039
2040 return ret;
2041}
2042__setup("schedstats=", setup_schedstats);
2043
2044static void __init init_schedstats(void)
2045{
2046 set_schedstats(__sched_schedstats);
2047}
2048
2049#ifdef CONFIG_PROC_SYSCTL
2050int sysctl_schedstats(struct ctl_table *table, int write,
2051 void __user *buffer, size_t *lenp, loff_t *ppos)
2052{
2053 struct ctl_table t;
2054 int err;
2055 int state = static_key_false(&sched_schedstats);
2056
2057 if (write && !capable(CAP_SYS_ADMIN))
2058 return -EPERM;
2059
2060 t = *table;
2061 t.data = &state;
2062 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2063 if (err < 0)
2064 return err;
2065 if (write)
2066 set_schedstats(state);
2067 return err;
2068}
2069#endif
2070#else
2071static inline void init_schedstats(void) {}
2072#endif
2073
2074
2075
2076
2077int sched_fork(unsigned long clone_flags, struct task_struct *p)
2078{
2079 unsigned long flags;
2080 int cpu = get_cpu();
2081
2082 __sched_fork(clone_flags, p);
2083
2084
2085
2086
2087
2088 p->state = TASK_RUNNING;
2089
2090
2091
2092
2093 p->prio = current->normal_prio;
2094
2095
2096
2097
2098 if (unlikely(p->sched_reset_on_fork)) {
2099 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2100 p->policy = SCHED_NORMAL;
2101 p->static_prio = NICE_TO_PRIO(0);
2102 p->rt_priority = 0;
2103 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2104 p->static_prio = NICE_TO_PRIO(0);
2105
2106 p->prio = p->normal_prio = __normal_prio(p);
2107 set_load_weight(p);
2108
2109
2110
2111
2112
2113 p->sched_reset_on_fork = 0;
2114 }
2115
2116 if (dl_prio(p->prio)) {
2117 put_cpu();
2118 return -EAGAIN;
2119 } else if (rt_prio(p->prio)) {
2120 p->sched_class = &rt_sched_class;
2121 } else {
2122 p->sched_class = &fair_sched_class;
2123 }
2124
2125 if (p->sched_class->task_fork)
2126 p->sched_class->task_fork(p);
2127
2128
2129
2130
2131
2132
2133
2134
2135 raw_spin_lock_irqsave(&p->pi_lock, flags);
2136 set_task_cpu(p, cpu);
2137 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2138
2139#ifdef CONFIG_SCHED_INFO
2140 if (likely(sched_info_on()))
2141 memset(&p->sched_info, 0, sizeof(p->sched_info));
2142#endif
2143#if defined(CONFIG_SMP)
2144 p->on_cpu = 0;
2145#endif
2146#ifdef CONFIG_PREEMPT_COUNT
2147
2148 task_thread_info(p)->preempt_count = 1;
2149#endif
2150#ifdef CONFIG_SMP
2151 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2152 RB_CLEAR_NODE(&p->pushable_dl_tasks);
2153#endif
2154
2155 put_cpu();
2156 return 0;
2157}
2158
2159unsigned long to_ratio(u64 period, u64 runtime)
2160{
2161 if (runtime == RUNTIME_INF)
2162 return BW_UNIT;
2163
2164
2165
2166
2167
2168
2169 if (period == 0)
2170 return 0;
2171
2172 return div64_u64(runtime << BW_SHIFT, period);
2173}
2174
2175#ifdef CONFIG_SMP
2176inline struct dl_bw *dl_bw_of(int i)
2177{
2178 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2179 "sched RCU must be held");
2180 return &cpu_rq(i)->rd->dl_bw;
2181}
2182
2183inline int dl_bw_cpus(int i)
2184{
2185 struct root_domain *rd = cpu_rq(i)->rd;
2186 int cpus = 0;
2187
2188 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2189 "sched RCU must be held");
2190 for_each_cpu_and(i, rd->span, cpu_active_mask)
2191 cpus++;
2192
2193 return cpus;
2194}
2195#else
2196inline struct dl_bw *dl_bw_of(int i)
2197{
2198 return &cpu_rq(i)->dl.dl_bw;
2199}
2200
2201inline int dl_bw_cpus(int i)
2202{
2203 return 1;
2204}
2205#endif
2206
2207
2208
2209
2210
2211
2212
2213
2214
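/*
 * We must make sure that accepting a new -deadline task (or changing
 * the parameters of an existing one) is consistent with the bandwidth
 * constraints: the total allocated deadline bandwidth in the root
 * domain must not exceed the configured limit.
 */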
2215static int dl_overflow(struct task_struct *p, int policy,
2216 const struct sched_attr *attr)
2217{
2218
2219 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
2220 u64 period = attr->sched_period ?: attr->sched_deadline;
2221 u64 runtime = attr->sched_runtime;
2222 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
2223 int cpus, err = -1;
2224
2225 if (new_bw == p->dl.dl_bw)
2226 return 0;
2227
2228
2229
2230
2231
2232
2233 raw_spin_lock(&dl_b->lock);
2234 cpus = dl_bw_cpus(task_cpu(p));
2235 if (dl_policy(policy) && !task_has_dl_policy(p) &&
2236 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
2237 if (hrtimer_active(&p->dl.inactive_timer))
2238 __dl_clear(dl_b, p->dl.dl_bw, cpus);
2239 __dl_add(dl_b, new_bw, cpus);
2240 err = 0;
2241 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
2242 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
2243
2244
2245
2246
2247
2248
2249
2250 __dl_clear(dl_b, p->dl.dl_bw, cpus);
2251 __dl_add(dl_b, new_bw, cpus);
2252 dl_change_utilization(p, new_bw);
2253 err = 0;
2254 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
2255
2256
2257
2258
2259
2260 err = 0;
2261 }
2262 raw_spin_unlock(&dl_b->lock);
2263
2264 return err;
2265}
2266
2267extern void init_dl_bw(struct dl_bw *dl_b);
2268
2269
2270
2271
2272
2273
2274
2275
2276void wake_up_new_task(struct task_struct *p)
2277{
2278 unsigned long flags;
2279 struct rq *rq;
2280
2281 raw_spin_lock_irqsave(&p->pi_lock, flags);
2282#ifdef CONFIG_SMP
2283
2284
2285
2286
2287
2288 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2289#endif
2290
2291
2292 init_task_runnable_average(p);
2293 rq = __task_rq_lock(p);
2294 activate_task(rq, p, 0);
2295 p->on_rq = TASK_ON_RQ_QUEUED;
2296 trace_sched_wakeup_new(p, true);
2297 check_preempt_curr(rq, p, WF_FORK);
2298#ifdef CONFIG_SMP
2299 if (p->sched_class->task_woken)
2300 p->sched_class->task_woken(rq, p);
2301#endif
2302 task_rq_unlock(rq, p, &flags);
2303}
2304
2305#ifdef CONFIG_PREEMPT_NOTIFIERS
2306
2307
2308
2309
2310
2311void preempt_notifier_register(struct preempt_notifier *notifier)
2312{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
2314}
2315EXPORT_SYMBOL_GPL(preempt_notifier_register);
2316
2317
2318
2319
2320
2321
2322
2323void preempt_notifier_unregister(struct preempt_notifier *notifier)
2324{
	hlist_del(&notifier->link);
2326}
2327EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2328
2329static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2330{
2331 struct preempt_notifier *notifier;
2332
2333 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2334 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2335}
2336
2337static void
2338fire_sched_out_preempt_notifiers(struct task_struct *curr,
2339 struct task_struct *next)
2340{
2341 struct preempt_notifier *notifier;
2342
2343 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2344 notifier->ops->sched_out(notifier, next);
2345}
2346
2347#else
2348
2349static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2350{
2351}
2352
2353static void
2354fire_sched_out_preempt_notifiers(struct task_struct *curr,
2355 struct task_struct *next)
2356{
2357}
2358
2359#endif
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374static inline void
2375prepare_task_switch(struct rq *rq, struct task_struct *prev,
2376 struct task_struct *next)
2377{
2378 trace_sched_switch(prev, next);
2379 sched_info_switch(prev, next);
2380 perf_event_task_sched_out(prev, next);
2381 fire_sched_out_preempt_notifiers(prev, next);
2382 prepare_lock_switch(rq, next);
2383 prepare_arch_switch(next);
2384}
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2402 __releases(rq->lock)
2403{
2404 struct mm_struct *mm = rq->prev_mm;
2405 long prev_state;
2406
2407 rq->prev_mm = NULL;
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420 prev_state = prev->state;
2421 vtime_task_switch(prev);
2422 finish_arch_switch(prev);
2423 perf_event_task_sched_in(prev, current);
2424 finish_lock_switch(rq, prev);
2425 finish_arch_post_lock_switch();
2426
2427 fire_sched_in_preempt_notifiers(current);
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440 if (mm) {
2441 membarrier_mm_sync_core_before_usermode(mm);
2442 mmdrop(mm);
2443 }
2444 if (unlikely(prev_state == TASK_DEAD)) {
2445 task_numa_free(prev);
2446
2447 if (prev->sched_class->task_dead)
2448 prev->sched_class->task_dead(prev);
2449
2450
2451
2452
2453
2454 kprobe_flush_task(prev);
2455 put_task_struct(prev);
2456 }
2457
2458 tick_nohz_task_switch(current);
2459}
2460
2461#ifdef CONFIG_SMP
2462
2463
2464static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2465{
2466 if (prev->sched_class->pre_schedule)
2467 prev->sched_class->pre_schedule(rq, prev);
2468}
2469
2470
2471static inline void post_schedule(struct rq *rq)
2472{
2473 if (rq->post_schedule) {
2474 unsigned long flags;
2475
2476 raw_spin_lock_irqsave(&rq->lock, flags);
2477 if (rq->curr->sched_class->post_schedule)
2478 rq->curr->sched_class->post_schedule(rq);
2479 raw_spin_unlock_irqrestore(&rq->lock, flags);
2480
2481 rq->post_schedule = 0;
2482 }
2483}
2484
2485#else
2486
2487static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2488{
2489}
2490
2491static inline void post_schedule(struct rq *rq)
2492{
2493}
2494
2495#endif
2496
2497
2498
2499
2500
2501asmlinkage void schedule_tail(struct task_struct *prev)
2502 __releases(rq->lock)
2503{
2504 struct rq *rq = this_rq();
2505
2506 finish_task_switch(rq, prev);
2507
2508
2509
2510
2511
2512 post_schedule(rq);
2513
2514#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2515
2516 preempt_enable();
2517#endif
2518 if (current->set_child_tid)
2519 put_user(task_pid_vnr(current), current->set_child_tid);
2520}
2521
2522
2523
2524
2525
2526static inline void
2527context_switch(struct rq *rq, struct task_struct *prev,
2528 struct task_struct *next)
2529{
2530 struct mm_struct *mm, *oldmm;
2531
2532 prepare_task_switch(rq, prev, next);
2533
2534 mm = next->mm;
2535 oldmm = prev->active_mm;
2536
2537
2538
2539
2540
2541 arch_start_context_switch(prev);
2542
2543
2544
2545
2546
2547
2548
2549
2550 if (!mm) {
2551 next->active_mm = oldmm;
2552 atomic_inc(&oldmm->mm_count);
2553 enter_lazy_tlb(oldmm, next);
2554 } else
2555 switch_mm_irqs_off(oldmm, mm, next);
2556
2557 if (!prev->mm) {
2558 prev->active_mm = NULL;
2559 rq->prev_mm = oldmm;
2560 }
2561
2562
2563
2564
2565
2566
2567#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2568 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2569#endif
2570
2571 context_tracking_task_switch(prev, next);
2572
2573 switch_to(prev, next, prev);
2574
2575 barrier();
2576
2577
2578
2579
2580
2581 finish_task_switch(this_rq(), prev);
2582}
2583
2584
2585
2586
2587
2588
2589
2590unsigned long nr_running(void)
2591{
2592 unsigned long i, sum = 0;
2593
2594 for_each_online_cpu(i)
2595 sum += cpu_rq(i)->nr_running;
2596
2597 return sum;
2598}
2599
2600
2601
2602
2603bool single_task_running(void)
2604{
2605 if (cpu_rq(smp_processor_id())->nr_running == 1)
2606 return true;
2607 else
2608 return false;
2609}
2610EXPORT_SYMBOL(single_task_running);
2611
2612unsigned long long nr_context_switches(void)
2613{
2614 int i;
2615 unsigned long long sum = 0;
2616
2617 for_each_possible_cpu(i)
2618 sum += cpu_rq(i)->nr_switches;
2619
2620 return sum;
2621}
2622
2623unsigned long nr_iowait(void)
2624{
2625 unsigned long i, sum = 0;
2626
2627 for_each_possible_cpu(i)
2628 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2629
2630 return sum;
2631}
2632
2633unsigned long nr_iowait_cpu(int cpu)
2634{
2635 struct rq *this = cpu_rq(cpu);
2636 return atomic_read(&this->nr_iowait);
2637}
2638
2639unsigned long this_cpu_load(void)
2640{
2641 struct rq *this = this_rq();
2642 return this->cpu_load[0];
2643}
2644
/*
 * Global load-average calculations.
 *
 * The calculation is done in a distributed, asynchronous fashion: each
 * CPU folds its local active count (nr_running + nr_uninterruptible)
 * into the global calc_load_tasks counter roughly every LOAD_FREQ, and
 * the tick handler then folds that counter into the avenrun[]
 * exponential moving averages.
 */
2694static atomic_long_t calc_load_tasks;
2695static unsigned long calc_load_update;
2696unsigned long avenrun[3];
2697EXPORT_SYMBOL(avenrun);
2698
2699
2700
2701
2702
2703
2704
2705
2706
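/**
 * get_avenrun - get the load average array
 * @loads:	pointer to dest load array
 * @offset:	offset to add
 * @shift:	shift count to shift the result left
 *
 * These values are estimates at best, so no need for locking.
 */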
2707void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2708{
2709 loads[0] = (avenrun[0] + offset) << shift;
2710 loads[1] = (avenrun[1] + offset) << shift;
2711 loads[2] = (avenrun[2] + offset) << shift;
2712}
2713
2714static long calc_load_fold_active(struct rq *this_rq)
2715{
2716 long nr_active, delta = 0;
2717
2718 nr_active = this_rq->nr_running;
2719 nr_active += (long) this_rq->nr_uninterruptible;
2720
2721 if (nr_active != this_rq->calc_load_active) {
2722 delta = nr_active - this_rq->calc_load_active;
2723 this_rq->calc_load_active = nr_active;
2724 }
2725
2726 return delta;
2727}
2728
2729
2730
2731
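/*
 * Single decay step of the load average:
 *
 *   a1 = a0 * e + a * (1 - e)
 *
 * @load is the old average, @exp the decay factor and @active the
 * instantaneous active count, all in FSHIFT-bit fixed point.
 */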
2732static unsigned long
2733calc_load(unsigned long load, unsigned long exp, unsigned long active)
2734{
2735 load *= exp;
2736 load += active * (FIXED_1 - exp);
2737 load += 1UL << (FSHIFT - 1);
2738 return load >> FSHIFT;
2739}
2740
2741#ifdef CONFIG_NO_HZ_COMMON
/*
 * Handling NO_HZ for the global load-average:
 *
 * An idle CPU cannot fold its active delta at the LOAD_FREQ tick, so on
 * idle entry it records the delta in one of the calc_load_idle[]
 * buckets and the tick CPU folds the idle contribution from
 * calc_global_nohz().  Two buckets, selected by calc_load_idx, are used
 * so that deltas arriving while a fold is in progress land in the next
 * sample window instead of being lost.
 */
2784static atomic_long_t calc_load_idle[2];
2785static int calc_load_idx;
2786
2787static inline int calc_load_write_idx(void)
2788{
2789 int idx = calc_load_idx;
2790
2791
2792
2793
2794
2795 smp_rmb();
2796
2797
2798
2799
2800
2801 if (!time_before(jiffies, calc_load_update))
2802 idx++;
2803
2804 return idx & 1;
2805}
2806
2807static inline int calc_load_read_idx(void)
2808{
2809 return calc_load_idx & 1;
2810}
2811
2812void calc_load_enter_idle(void)
2813{
2814 struct rq *this_rq = this_rq();
2815 long delta;
2816
2817
2818
2819
2820
2821 delta = calc_load_fold_active(this_rq);
2822 if (delta) {
2823 int idx = calc_load_write_idx();
2824 atomic_long_add(delta, &calc_load_idle[idx]);
2825 }
2826}
2827
2828void calc_load_exit_idle(void)
2829{
2830 struct rq *this_rq = this_rq();
2831
2832
2833
2834
2835 if (time_before(jiffies, this_rq->calc_load_update))
2836 return;
2837
2838
2839
2840
2841
2842
2843 this_rq->calc_load_update = calc_load_update;
2844 if (time_before(jiffies, this_rq->calc_load_update + 10))
2845 this_rq->calc_load_update += LOAD_FREQ;
2846}
2847
2848static long calc_load_fold_idle(void)
2849{
2850 int idx = calc_load_read_idx();
2851 long delta = 0;
2852
2853 if (atomic_long_read(&calc_load_idle[idx]))
2854 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2855
2856 return delta;
2857}
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
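/*
 * fixed_power_int - compute x^n in fixed point by repeated squaring:
 * walk the binary representation of @n, multiplying the result by the
 * current power of @x for every set bit.  Each multiplication is
 * renormalised back to @frac_bits fractional bits with rounding.
 */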
2874static unsigned long
2875fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2876{
2877 unsigned long result = 1UL << frac_bits;
2878
2879 if (n) for (;;) {
2880 if (n & 1) {
2881 result *= x;
2882 result += 1UL << (frac_bits - 1);
2883 result >>= frac_bits;
2884 }
2885 n >>= 1;
2886 if (!n)
2887 break;
2888 x *= x;
2889 x += 1UL << (frac_bits - 1);
2890 x >>= frac_bits;
2891 }
2892
2893 return result;
2894}
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
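/*
 * calc_load_n - fold n consecutive missed LOAD_FREQ intervals at once
 * by raising the decay factor to the n-th power:
 *
 *   a_n = a_0 * e^n + a * (1 - e^n)
 */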
2919static unsigned long
2920calc_load_n(unsigned long load, unsigned long exp,
2921 unsigned long active, unsigned int n)
2922{
2923
2924 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2925}
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936static void calc_global_nohz(void)
2937{
2938 long delta, active, n;
2939
2940 if (!time_before(jiffies, calc_load_update + 10)) {
2941
2942
2943
2944 delta = jiffies - calc_load_update - 10;
2945 n = 1 + (delta / LOAD_FREQ);
2946
2947 active = atomic_long_read(&calc_load_tasks);
2948 active = active > 0 ? active * FIXED_1 : 0;
2949
2950 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2951 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2952 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2953
2954 calc_load_update += n * LOAD_FREQ;
2955 }
2956
2957
2958
2959
2960
2961
2962
2963
2964 smp_wmb();
2965 calc_load_idx++;
2966}
2967#else
2968
2969static inline long calc_load_fold_idle(void) { return 0; }
2970static inline void calc_global_nohz(void) { }
2971
2972#endif
2973
2974
2975
2976
2977
2978void calc_global_load(unsigned long ticks)
2979{
2980 long active, delta;
2981
2982 if (time_before(jiffies, calc_load_update + 10))
2983 return;
2984
2985
2986
2987
2988 delta = calc_load_fold_idle();
2989 if (delta)
2990 atomic_long_add(delta, &calc_load_tasks);
2991
2992 active = atomic_long_read(&calc_load_tasks);
2993 active = active > 0 ? active * FIXED_1 : 0;
2994
2995 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2996 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2997 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2998
2999 calc_load_update += LOAD_FREQ;
3000
3001
3002
3003
3004 calc_global_nohz();
3005}
3006
3007
3008
3009
3010
3011static void calc_load_account_active(struct rq *this_rq)
3012{
3013 long delta;
3014
3015 if (time_before(jiffies, this_rq->calc_load_update))
3016 return;
3017
3018 delta = calc_load_fold_active(this_rq);
3019 if (delta)
3020 atomic_long_add(delta, &calc_load_tasks);
3021
3022 this_rq->calc_load_update += LOAD_FREQ;
3023}
3024
/*
 * The exact cpu_load[i] decay for a tickless idle period is
 * approximated with pre-computed factors: degrade_factor[i][j] is
 * (1 - 1/2^i)^(2^j) scaled by 2^DEGRADE_SHIFT, and
 * degrade_zero_ticks[i] is the number of missed ticks after which the
 * remaining load is simply treated as zero.
 */
3056#define DEGRADE_SHIFT 7
3057static const unsigned char
3058 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3059static const unsigned char
3060 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3061 {0, 0, 0, 0, 0, 0, 0, 0},
3062 {64, 32, 8, 0, 0, 0, 0, 0},
3063 {96, 72, 40, 12, 1, 0, 0},
3064 {112, 98, 75, 43, 15, 1, 0},
3065 {120, 112, 98, 76, 45, 16, 2} };
3066
3067
3068
3069
3070
3071
3072static unsigned long
3073decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3074{
3075 int j = 0;
3076
3077 if (!missed_updates)
3078 return load;
3079
3080 if (missed_updates >= degrade_zero_ticks[idx])
3081 return 0;
3082
3083 if (idx == 1)
3084 return load >> missed_updates;
3085
3086 while (missed_updates) {
3087 if (missed_updates % 2)
3088 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3089
3090 missed_updates >>= 1;
3091 j++;
3092 }
3093 return load;
3094}
3095
3096
3097
3098
3099
3100
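/*
 * Update rq->cpu_load[] statistics.  This is normally called every
 * scheduler tick; with tickless idle some ticks are skipped, so
 * @pending_updates says how many ticks were missed and lets the old
 * samples be decayed accordingly.
 */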
3101static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
3102 unsigned long pending_updates)
3103{
3104 int i, scale;
3105
3106 this_rq->nr_load_updates++;
3107
3108
3109 this_rq->cpu_load[0] = this_load;
3110 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3111 unsigned long old_load, new_load;
3112
3113
3114
3115 old_load = this_rq->cpu_load[i];
3116 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3117 new_load = this_load;
3118
3119
3120
3121
3122
3123 if (new_load > old_load)
3124 new_load += scale - 1;
3125
3126 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3127 }
3128
3129 sched_avg_update(this_rq);
3130}
3131
3132#ifdef CONFIG_NO_HZ_COMMON
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
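/*
 * Catch up the cpu_load[] of an idle, tickless CPU by decaying the stale
 * values for the jiffies that passed without a tick; bails out if the CPU
 * has picked up load or was already updated this jiffy.
 */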
3150void update_idle_cpu_load(struct rq *this_rq)
3151{
3152 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
3153 unsigned long load = this_rq->load.weight;
3154 unsigned long pending_updates;
3155
3156
3157
3158
3159 if (load || curr_jiffies == this_rq->last_load_update_tick)
3160 return;
3161
3162 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3163 this_rq->last_load_update_tick = curr_jiffies;
3164
3165 __update_cpu_load(this_rq, load, pending_updates);
3166}
3167
3168
3169
3170
3171void update_cpu_load_nohz(void)
3172{
3173 struct rq *this_rq = this_rq();
3174 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
3175 unsigned long pending_updates;
3176
3177 if (curr_jiffies == this_rq->last_load_update_tick)
3178 return;
3179
3180 raw_spin_lock(&this_rq->lock);
3181 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3182 if (pending_updates) {
3183 this_rq->last_load_update_tick = curr_jiffies;
3184
3185
3186
3187
3188 __update_cpu_load(this_rq, 0, pending_updates);
3189 }
3190 raw_spin_unlock(&this_rq->lock);
3191}
3192#endif
3193
3194
3195
3196
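/*
 * Called from scheduler_tick(): update cpu_load[] with the current runqueue
 * weight and account the active count for the global load average.
 */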
3197static void update_cpu_load_active(struct rq *this_rq)
3198{
3199
3200
3201
3202 this_rq->last_load_update_tick = jiffies;
3203 __update_cpu_load(this_rq, this_rq->load.weight, 1);
3204
3205 calc_load_account_active(this_rq);
3206}
3207
3208#ifdef CONFIG_SMP
3209
3210
3211
3212
3213
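/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has its smallest effective memory and cache
 * footprint; migrate it to the CPU its class selects for SD_BALANCE_EXEC.
 */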
3214void sched_exec(void)
3215{
3216 struct task_struct *p = current;
3217 unsigned long flags;
3218 int dest_cpu;
3219
3220 raw_spin_lock_irqsave(&p->pi_lock, flags);
3221 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
3222 if (dest_cpu == smp_processor_id())
3223 goto unlock;
3224
3225 if (likely(cpu_active(dest_cpu))) {
3226 struct migration_arg arg = { p, dest_cpu };
3227
3228 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3229 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3230 return;
3231 }
3232unlock:
3233 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3234}
3235
3236#endif
3237
3238DEFINE_PER_CPU(struct kernel_stat, kstat);
3239DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3240
3241EXPORT_PER_CPU_SYMBOL(kstat);
3242EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3243
3244
3245
3246
3247
3248
3249
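/*
 * Return accounted runtime for the task.  If the task is currently running,
 * update the runqueue clock and the class's view of curr first, so that
 * pending, not yet accounted runtime is included.
 */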
3250unsigned long long task_sched_runtime(struct task_struct *p)
3251{
3252 unsigned long flags;
3253 struct rq *rq;
3254 u64 ns;
3255
3256#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268 if (!p->on_cpu || !task_on_rq_queued(p))
3269 return p->se.sum_exec_runtime;
3270#endif
3271 rq = task_rq_lock(p, &flags);
3272
3273
3274
3275
3276
3277 if (task_current(rq, p) && task_on_rq_queued(p)) {
3278 update_rq_clock(rq);
3279 p->sched_class->update_curr(rq);
3280 }
3281 ns = p->se.sum_exec_runtime;
3282 task_rq_unlock(rq, p, &flags);
3283
3284 return ns;
3285}
3286
3287
3288
3289
3290
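/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */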
3291void scheduler_tick(void)
3292{
3293 int cpu = smp_processor_id();
3294 struct rq *rq = cpu_rq(cpu);
3295 struct task_struct *curr = rq->curr;
3296
3297 sched_clock_tick();
3298
3299 raw_spin_lock(&rq->lock);
3300 update_rq_clock(rq);
3301 update_cpu_load_active(rq);
3302 curr->sched_class->task_tick(rq, curr, 0);
3303 raw_spin_unlock(&rq->lock);
3304
3305 perf_event_task_tick();
3306
3307#ifdef CONFIG_SMP
3308 rq->idle_balance = idle_cpu(cpu);
3309 trigger_load_balance(rq, cpu);
3310#endif
3311 rq_last_tick_reset(rq);
3312}
3313
3314#ifdef CONFIG_NO_HZ_FULL
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
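/*
 * scheduler_tick_max_deferment - the maximum time a full dynticks CPU may
 * defer its next tick.  Keep at least one tick per second so jiffies-based
 * state does not go stale for too long; returns the delay in nanoseconds.
 */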
3328u64 scheduler_tick_max_deferment(void)
3329{
3330 struct rq *rq = this_rq();
3331 unsigned long next, now = ACCESS_ONCE(jiffies);
3332
3333 next = rq->last_sched_tick + HZ;
3334
3335 if (time_before_eq(next, now))
3336 return 0;
3337
3338 return jiffies_to_nsecs(next - now);
3339}
3340#endif
3341
3342notrace unsigned long get_parent_ip(unsigned long addr)
3343{
3344 if (in_lock_functions(addr)) {
3345 addr = CALLER_ADDR2;
3346 if (in_lock_functions(addr))
3347 addr = CALLER_ADDR3;
3348 }
3349 return addr;
3350}
3351
3352#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3353 defined(CONFIG_PREEMPT_TRACER))
3354
3355void __kprobes add_preempt_count(int val)
3356{
3357#ifdef CONFIG_DEBUG_PREEMPT
3358
3359
3360
3361 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3362 return;
3363#endif
3364 preempt_count() += val;
3365#ifdef CONFIG_DEBUG_PREEMPT
3366
3367
3368
3369 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3370 PREEMPT_MASK - 10);
3371#endif
3372 if (preempt_count() == val)
3373 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3374}
3375EXPORT_SYMBOL(add_preempt_count);
3376
3377void __kprobes sub_preempt_count(int val)
3378{
3379#ifdef CONFIG_DEBUG_PREEMPT
3380
3381
3382
3383 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3384 return;
3385
3386
3387
3388 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3389 !(preempt_count() & PREEMPT_MASK)))
3390 return;
3391#endif
3392
3393 if (preempt_count() == val)
3394 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3395 preempt_count() -= val;
3396}
3397EXPORT_SYMBOL(sub_preempt_count);
3398
3399#endif
3400
3401
3402
3403
3404static noinline void __schedule_bug(struct task_struct *prev)
3405{
3406 if (oops_in_progress)
3407 return;
3408
3409 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3410 prev->comm, prev->pid, preempt_count());
3411
3412 debug_show_held_locks(prev);
3413 print_modules();
3414 if (irqs_disabled())
3415 print_irqtrace_events(prev);
3416 if (panic_on_warn)
3417 panic("scheduling while atomic\n");
3418
3419 dump_stack();
3420 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3421}
3422
3423
3424
3425
3426static inline void schedule_debug(struct task_struct *prev)
3427{
3428
3429
3430
3431
3432
3433 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3434 __schedule_bug(prev);
3435 rcu_sleep_check();
3436
3437 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3438
3439 schedstat_inc(this_rq(), sched_count);
3440}
3441
3442static void put_prev_task(struct rq *rq, struct task_struct *prev)
3443{
3444 if (prev->on_rq || rq->skip_clock_update < 0)
3445 update_rq_clock(rq);
3446 prev->sched_class->put_prev_task(rq, prev);
3447}
3448
3449
3450
3451
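/*
 * Pick up the highest-prio task:
 */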
3452static inline struct task_struct *
3453pick_next_task(struct rq *rq)
3454{
3455 const struct sched_class *class;
3456 struct task_struct *p;
3457
3458
3459
3460
3461
3462 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
3463 p = fair_sched_class.pick_next_task(rq);
3464 if (likely(p))
3465 return p;
3466
3467
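		/* nothing runnable in CFS: fall back to the idle class */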
3468 else
3469 p = idle_sched_class.pick_next_task(rq);
3470
3471 return p;
3472 }
3473
3474 for_each_class(class) {
3475 p = class->pick_next_task(rq);
3476 if (p)
3477 return p;
3478 }
3479
3480 BUG();
3481}
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
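/*
 * __schedule() is the main scheduler function.  It is entered via explicit
 * blocking (mutex, semaphore, waitqueue, etc.), via TIF_NEED_RESCHED being
 * set on the tick or wakeup path, and via kernel/user preemption on the
 * preempt_enable() and interrupt/syscall return paths.  Runs with
 * preemption disabled.
 */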
3520static void __sched __schedule(void)
3521{
3522 struct task_struct *prev, *next;
3523 unsigned long *switch_count;
3524 struct rq *rq;
3525 int cpu;
3526
3527need_resched:
3528 preempt_disable();
3529 cpu = smp_processor_id();
3530 rq = cpu_rq(cpu);
3531 rcu_note_context_switch(cpu);
3532 prev = rq->curr;
3533
3534 schedule_debug(prev);
3535
3536 if (sched_feat(HRTICK))
3537 hrtick_clear(rq);
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547 raw_spin_lock_irq(&rq->lock);
3548 smp_mb__after_spinlock();
3549
3550 switch_count = &prev->nivcsw;
3551 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3552 if (unlikely(signal_pending_state(prev->state, prev))) {
3553 prev->state = TASK_RUNNING;
3554 } else {
3555 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3556 prev->on_rq = 0;
3557
3558
3559
3560
3561
3562
3563 if (prev->flags & PF_WQ_WORKER) {
3564 struct task_struct *to_wakeup;
3565
3566 to_wakeup = wq_worker_sleeping(prev, cpu);
3567 if (to_wakeup)
3568 try_to_wake_up_local(to_wakeup);
3569 }
3570 }
3571 switch_count = &prev->nvcsw;
3572 }
3573
3574 pre_schedule(rq, prev);
3575
3576 if (unlikely(!rq->nr_running))
3577 idle_balance(cpu, rq);
3578
3579 put_prev_task(rq, prev);
3580 next = pick_next_task(rq);
3581 clear_tsk_need_resched(prev);
3582 rq->skip_clock_update = 0;
3583
3584 if (likely(prev != next)) {
3585 rq->nr_switches++;
3586 rq->curr = next;
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601 ++*switch_count;
3602
3603 context_switch(rq, prev, next);
3604
3605
3606
3607
3608
3609
3610 cpu = smp_processor_id();
3611 rq = cpu_rq(cpu);
3612 } else
3613 raw_spin_unlock_irq(&rq->lock);
3614
3615 post_schedule(rq);
3616
3617 sched_preempt_enable_no_resched();
3618 if (need_resched())
3619 goto need_resched;
3620}
3621STACK_FRAME_NON_STANDARD(__schedule);
3622
3623static inline void sched_submit_work(struct task_struct *tsk)
3624{
3625 if (!tsk->state || tsk_is_pi_blocked(tsk))
3626 return;
3627
3628
3629
3630
3631 if (blk_needs_flush_plug(tsk))
3632 blk_schedule_flush_plug(tsk);
3633}
3634
3635asmlinkage void __sched schedule(void)
3636{
3637 struct task_struct *tsk = current;
3638
3639 sched_submit_work(tsk);
3640 __schedule();
3641}
3642EXPORT_SYMBOL(schedule);
3643
3644#ifdef CONFIG_CONTEXT_TRACKING
3645asmlinkage void __sched schedule_user(void)
3646{
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657 enum ctx_state prev_state = exception_enter();
3658 schedule();
3659 exception_exit(prev_state);
3660}
3661#endif
3662
3663
3664
3665
3666
3667
3668void __sched schedule_preempt_disabled(void)
3669{
3670 sched_preempt_enable_no_resched();
3671 schedule();
3672 preempt_disable();
3673}
3674
3675#ifdef CONFIG_PREEMPT
3676
3677
3678
3679
3680
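/*
 * This is the entry point to schedule() from in-kernel preemption off of
 * preempt_enable().  PREEMPT_ACTIVE is held across __schedule() so that we
 * do not recurse.
 */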
3681asmlinkage void __sched notrace preempt_schedule(void)
3682{
3683
3684
3685
3686
3687 if (likely(!preemptible()))
3688 return;
3689
3690 do {
3691 add_preempt_count_notrace(PREEMPT_ACTIVE);
3692 __schedule();
3693 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3694
3695
3696
3697
3698
3699 barrier();
3700 } while (need_resched());
3701}
3702EXPORT_SYMBOL(preempt_schedule);
3703
3704
3705
3706
3707
3708
3709
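/*
 * This is the entry point to schedule() from kernel preemption off of IRQ
 * context (the interrupt-return path).  Note: it is called and returns
 * with IRQs disabled.
 */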
3710asmlinkage void __sched preempt_schedule_irq(void)
3711{
3712 struct thread_info *ti = current_thread_info();
3713 enum ctx_state prev_state;
3714
3715
3716 BUG_ON(ti->preempt_count || !irqs_disabled());
3717
3718 prev_state = exception_enter();
3719
3720 do {
3721 add_preempt_count(PREEMPT_ACTIVE);
3722 local_irq_enable();
3723 __schedule();
3724 local_irq_disable();
3725 sub_preempt_count(PREEMPT_ACTIVE);
3726
3727
3728
3729
3730
3731 barrier();
3732 } while (need_resched());
3733
3734 exception_exit(prev_state);
3735}
3736
3737#endif
3738
3739int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3740 void *key)
3741{
3742 return try_to_wake_up(curr->private, mode, wake_flags);
3743}
3744EXPORT_SYMBOL(default_wake_function);
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
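/*
 * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0)
 * just wake everything up; exclusive wakeups stop after waking
 * nr_exclusive WQ_FLAG_EXCLUSIVE waiters.
 */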
3755static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3756 int nr_exclusive, int wake_flags, void *key)
3757{
3758 wait_queue_t *curr, *next;
3759
3760 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3761 unsigned flags = curr->flags;
3762
3763 if (curr->func(curr, mode, wake_flags, key) &&
3764 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3765 break;
3766 }
3767}
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779void __wake_up(wait_queue_head_t *q, unsigned int mode,
3780 int nr_exclusive, void *key)
3781{
3782 unsigned long flags;
3783
3784 spin_lock_irqsave(&q->lock, flags);
3785 __wake_up_common(q, mode, nr_exclusive, 0, key);
3786 spin_unlock_irqrestore(&q->lock, flags);
3787}
3788EXPORT_SYMBOL(__wake_up);
3789
3790
3791
3792
3793void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3794{
3795 __wake_up_common(q, mode, nr, 0, NULL);
3796}
3797EXPORT_SYMBOL_GPL(__wake_up_locked);
3798
3799void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3800{
3801 __wake_up_common(q, mode, 1, 0, key);
3802}
3803EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3823 int nr_exclusive, void *key)
3824{
3825 unsigned long flags;
3826 int wake_flags = WF_SYNC;
3827
3828 if (unlikely(!q))
3829 return;
3830
3831 if (unlikely(!nr_exclusive))
3832 wake_flags = 0;
3833
3834 spin_lock_irqsave(&q->lock, flags);
3835 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3836 spin_unlock_irqrestore(&q->lock, flags);
3837}
3838EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3839
3840
3841
3842
3843void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3844{
3845 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3846}
3847EXPORT_SYMBOL_GPL(__wake_up_sync);
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
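/**
 * complete: - signals a single thread waiting on this completion
 * @x:  holds the state of this particular completion
 *
 * This will wake up a single thread waiting on this completion.  Threads
 * will be awakened in the same order in which they were queued.
 */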
3861void complete(struct completion *x)
3862{
3863 unsigned long flags;
3864
3865 spin_lock_irqsave(&x->wait.lock, flags);
3866 x->done++;
3867 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3868 spin_unlock_irqrestore(&x->wait.lock, flags);
3869}
3870EXPORT_SYMBOL(complete);
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881void complete_all(struct completion *x)
3882{
3883 unsigned long flags;
3884
3885 spin_lock_irqsave(&x->wait.lock, flags);
3886 x->done += UINT_MAX/2;
3887 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3888 spin_unlock_irqrestore(&x->wait.lock, flags);
3889}
3890EXPORT_SYMBOL(complete_all);
3891
3892static inline long __sched
3893do_wait_for_common(struct completion *x,
3894 long (*action)(long), long timeout, int state)
3895{
3896 if (!x->done) {
3897 DECLARE_WAITQUEUE(wait, current);
3898
3899 __add_wait_queue_tail_exclusive(&x->wait, &wait);
3900 do {
3901 if (signal_pending_state(state, current)) {
3902 timeout = -ERESTARTSYS;
3903 break;
3904 }
3905 __set_current_state(state);
3906 spin_unlock_irq(&x->wait.lock);
3907 timeout = action(timeout);
3908 spin_lock_irq(&x->wait.lock);
3909 } while (!x->done && timeout);
3910 __remove_wait_queue(&x->wait, &wait);
3911 if (!x->done)
3912 return timeout;
3913 }
3914 x->done--;
3915 return timeout ?: 1;
3916}
3917
3918static inline long __sched
3919__wait_for_common(struct completion *x,
3920 long (*action)(long), long timeout, int state)
3921{
3922 might_sleep();
3923
3924 spin_lock_irq(&x->wait.lock);
3925 timeout = do_wait_for_common(x, action, timeout, state);
3926 spin_unlock_irq(&x->wait.lock);
3927 return timeout;
3928}
3929
3930static long __sched
3931wait_for_common(struct completion *x, long timeout, int state)
3932{
3933 return __wait_for_common(x, schedule_timeout, timeout, state);
3934}
3935
3936static long __sched
3937wait_for_common_io(struct completion *x, long timeout, int state)
3938{
3939 return __wait_for_common(x, io_schedule_timeout, timeout, state);
3940}
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952void __sched wait_for_completion(struct completion *x)
3953{
3954 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3955}
3956EXPORT_SYMBOL(wait_for_completion);
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970unsigned long __sched
3971wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3972{
3973 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3974}
3975EXPORT_SYMBOL(wait_for_completion_timeout);
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985void __sched wait_for_completion_io(struct completion *x)
3986{
3987 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3988}
3989EXPORT_SYMBOL(wait_for_completion_io);
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003unsigned long __sched
4004wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
4005{
4006 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
4007}
4008EXPORT_SYMBOL(wait_for_completion_io_timeout);
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019int __sched wait_for_completion_interruptible(struct completion *x)
4020{
4021 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4022 if (t == -ERESTARTSYS)
4023 return t;
4024 return 0;
4025}
4026EXPORT_SYMBOL(wait_for_completion_interruptible);
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039long __sched
4040wait_for_completion_interruptible_timeout(struct completion *x,
4041 unsigned long timeout)
4042{
4043 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4044}
4045EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056int __sched wait_for_completion_killable(struct completion *x)
4057{
4058 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4059 if (t == -ERESTARTSYS)
4060 return t;
4061 return 0;
4062}
4063EXPORT_SYMBOL(wait_for_completion_killable);
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077long __sched
4078wait_for_completion_killable_timeout(struct completion *x,
4079 unsigned long timeout)
4080{
4081 return wait_for_common(x, timeout, TASK_KILLABLE);
4082}
4083EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
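/**
 * try_wait_for_completion - try to decrement a completion without blocking
 * @x:  completion structure
 *
 * Return: 0 if a decrement cannot be done without blocking,
 *         1 if a decrement succeeded.
 */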
4097bool try_wait_for_completion(struct completion *x)
4098{
4099 unsigned long flags;
4100 int ret = 1;
4101
4102 spin_lock_irqsave(&x->wait.lock, flags);
4103 if (!x->done)
4104 ret = 0;
4105 else
4106 x->done--;
4107 spin_unlock_irqrestore(&x->wait.lock, flags);
4108 return ret;
4109}
4110EXPORT_SYMBOL(try_wait_for_completion);
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120bool completion_done(struct completion *x)
4121{
4122 unsigned long flags;
4123 int ret = 1;
4124
4125 spin_lock_irqsave(&x->wait.lock, flags);
4126 if (!x->done)
4127 ret = 0;
4128 spin_unlock_irqrestore(&x->wait.lock, flags);
4129 return ret;
4130}
4131EXPORT_SYMBOL(completion_done);
4132
4133static long __sched
4134sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4135{
4136 unsigned long flags;
4137 wait_queue_t wait;
4138
4139 init_waitqueue_entry(&wait, current);
4140
4141 __set_current_state(state);
4142
4143 spin_lock_irqsave(&q->lock, flags);
4144 __add_wait_queue(q, &wait);
4145 spin_unlock(&q->lock);
4146 timeout = schedule_timeout(timeout);
4147 spin_lock_irq(&q->lock);
4148 __remove_wait_queue(q, &wait);
4149 spin_unlock_irqrestore(&q->lock, flags);
4150
4151 return timeout;
4152}
4153
4154void __sched interruptible_sleep_on(wait_queue_head_t *q)
4155{
4156 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4157}
4158EXPORT_SYMBOL(interruptible_sleep_on);
4159
4160long __sched
4161interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4162{
4163 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4164}
4165EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4166
4167void __sched sleep_on(wait_queue_head_t *q)
4168{
4169 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4170}
4171EXPORT_SYMBOL(sleep_on);
4172
4173long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4174{
4175 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4176}
4177EXPORT_SYMBOL(sleep_on_timeout);
4178
4179#ifdef CONFIG_RT_MUTEXES
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
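/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */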
4191void rt_mutex_setprio(struct task_struct *p, int prio)
4192{
4193 int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
4194 struct rq *rq;
4195 const struct sched_class *prev_class;
4196
4197 BUG_ON(prio > MAX_PRIO);
4198
4199 rq = __task_rq_lock(p);
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213 if (unlikely(p == rq->idle)) {
4214 WARN_ON(p != rq->curr);
4215 WARN_ON(p->pi_blocked_on);
4216 goto out_unlock;
4217 }
4218
4219 trace_sched_pi_setprio(p, prio);
4220 oldprio = p->prio;
4221 prev_class = p->sched_class;
4222 queued = task_on_rq_queued(p);
4223 running = task_current(rq, p);
4224 if (queued)
4225 dequeue_task(rq, p, DEQUEUE_SAVE);
4226 if (running)
4227 p->sched_class->put_prev_task(rq, p);
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238 if (dl_prio(prio)) {
4239 struct task_struct *pi_task = rt_mutex_get_top_task(p);
4240 if (!dl_prio(p->normal_prio) ||
4241 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
4242 p->dl.dl_boosted = 1;
4243 enqueue_flag |= ENQUEUE_REPLENISH;
4244 } else
4245 p->dl.dl_boosted = 0;
4246 p->sched_class = &dl_sched_class;
4247 } else if (rt_prio(prio)) {
4248 if (dl_prio(oldprio))
4249 p->dl.dl_boosted = 0;
4250 if (oldprio < prio)
4251 enqueue_flag |= ENQUEUE_HEAD;
4252 p->sched_class = &rt_sched_class;
4253 } else {
4254 if (dl_prio(oldprio))
4255 p->dl.dl_boosted = 0;
4256 p->sched_class = &fair_sched_class;
4257 }
4258
4259 p->prio = prio;
4260
4261 if (running)
4262 p->sched_class->set_curr_task(rq);
4263 if (queued)
4264 enqueue_task(rq, p, enqueue_flag);
4265
4266 check_class_changed(rq, p, prev_class, oldprio);
4267out_unlock:
4268 __task_rq_unlock(rq);
4269}
4270#endif
4271
4272void set_user_nice(struct task_struct *p, long nice)
4273{
4274 int old_prio, delta, queued;
4275 unsigned long flags;
4276 struct rq *rq;
4277
4278 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4279 return;
4280
4281
4282
4283
4284 rq = task_rq_lock(p, &flags);
4285
4286
4287
4288
4289
4290
4291 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
4292 p->static_prio = NICE_TO_PRIO(nice);
4293 goto out_unlock;
4294 }
4295 queued = task_on_rq_queued(p);
4296 if (queued)
4297 dequeue_task(rq, p, DEQUEUE_SAVE);
4298
4299 p->static_prio = NICE_TO_PRIO(nice);
4300 set_load_weight(p);
4301 old_prio = p->prio;
4302 p->prio = effective_prio(p);
4303 delta = p->prio - old_prio;
4304
4305 if (queued) {
4306 enqueue_task(rq, p, ENQUEUE_RESTORE);
4307
4308
4309
4310
4311 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4312 resched_curr(rq);
4313 }
4314out_unlock:
4315 task_rq_unlock(rq, p, &flags);
4316}
4317EXPORT_SYMBOL(set_user_nice);
4318
4319
4320
4321
4322
4323
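/*
 * can_nice - check if a task can reduce its nice value
 * @p: task
 * @nice: nice value
 */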
4324int can_nice(const struct task_struct *p, const int nice)
4325{
4326
4327 int nice_rlim = 20 - nice;
4328
4329 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4330 capable(CAP_SYS_NICE));
4331}
4332
4333#ifdef __ARCH_WANT_SYS_NICE
4334
4335
4336
4337
4338
4339
4340
4341
4342SYSCALL_DEFINE1(nice, int, increment)
4343{
4344 long nice, retval;
4345
4346
4347
4348
4349
4350
4351 if (increment < -40)
4352 increment = -40;
4353 if (increment > 40)
4354 increment = 40;
4355
4356 nice = TASK_NICE(current) + increment;
4357 if (nice < -20)
4358 nice = -20;
4359 if (nice > 19)
4360 nice = 19;
4361
4362 if (increment < 0 && !can_nice(current, nice))
4363 return -EPERM;
4364
4365 retval = security_task_setnice(current, nice);
4366 if (retval)
4367 return retval;
4368
4369 set_user_nice(current, nice);
4370 return 0;
4371}
4372
4373#endif
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383int task_prio(const struct task_struct *p)
4384{
4385 return p->prio - MAX_RT_PRIO;
4386}
4387
4388
4389
4390
4391
4392
4393
4394int task_nice(const struct task_struct *p)
4395{
4396 return TASK_NICE(p);
4397}
4398EXPORT_SYMBOL(task_nice);
4399
4400
4401
4402
4403
4404
4405
4406int idle_cpu(int cpu)
4407{
4408 struct rq *rq = cpu_rq(cpu);
4409
4410 if (rq->curr != rq->idle)
4411 return 0;
4412
4413 if (rq->nr_running)
4414 return 0;
4415
4416#ifdef CONFIG_SMP
4417 if (!llist_empty(&rq->wake_list))
4418 return 0;
4419#endif
4420
4421 return 1;
4422}
4423
4424
4425
4426
4427
4428
4429
4430struct task_struct *idle_task(int cpu)
4431{
4432 return cpu_rq(cpu)->idle;
4433}
4434
4435
4436
4437
4438
4439
4440
4441static struct task_struct *find_process_by_pid(pid_t pid)
4442{
4443 return pid ? find_task_by_vpid(pid) : current;
4444}
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454static void
4455__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
4456{
4457 struct sched_dl_entity *dl_se = &p->dl;
4458
4459 dl_se->dl_runtime = attr->sched_runtime;
4460 dl_se->dl_deadline = attr->sched_deadline;
4461 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
4462 dl_se->flags = attr->sched_flags;
4463 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
4464 dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
4465}
4466
4467
4468static void __setscheduler(struct rq *rq, struct task_struct *p,
4469 const struct sched_attr *attr)
4470{
4471 int policy = attr->sched_policy;
4472
4473 if (policy == -1)
4474 policy = p->policy;
4475
4476 p->policy = policy;
4477
4478 if (dl_policy(policy))
4479 __setparam_dl(p, attr);
4480 else if (fair_policy(policy))
4481 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
4482
4483
4484
4485
4486
4487
4488 p->rt_priority = attr->sched_priority;
4489
4490 p->normal_prio = normal_prio(p);
4491 p->prio = rt_mutex_getprio(p);
4492
4493 if (dl_prio(p->prio))
4494 p->sched_class = &dl_sched_class;
4495 else if (rt_prio(p->prio))
4496 p->sched_class = &rt_sched_class;
4497 else
4498 p->sched_class = &fair_sched_class;
4499
4500 set_load_weight(p);
4501}
4502
4503static void
4504__getparam_dl(struct task_struct *p, struct sched_attr *attr)
4505{
4506 struct sched_dl_entity *dl_se = &p->dl;
4507
4508 attr->sched_priority = p->rt_priority;
4509 attr->sched_runtime = dl_se->dl_runtime;
4510 attr->sched_deadline = dl_se->dl_deadline;
4511 attr->sched_period = dl_se->dl_period;
4512 attr->sched_flags = dl_se->flags;
4513}
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
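/*
 * Check the validity of the deadline parameters: a relative deadline must
 * be given, the runtime must be at least the minimum granularity
 * (1 << DL_SCALE), runtime <= deadline <= period (when period != 0), and
 * deadline/period must not use the sign bit so later s64 arithmetic does
 * not overflow.
 */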
4525static bool
4526__checkparam_dl(const struct sched_attr *attr)
4527{
4528
4529 if (attr->sched_deadline == 0)
4530 return false;
4531
4532
4533
4534
4535
4536 if (attr->sched_runtime < (1ULL << DL_SCALE))
4537 return false;
4538
4539
4540
4541
4542
4543 if (attr->sched_deadline & (1ULL << 63) ||
4544 attr->sched_period & (1ULL << 63))
4545 return false;
4546
4547
4548 if ((attr->sched_period != 0 &&
4549 attr->sched_period < attr->sched_deadline) ||
4550 attr->sched_deadline < attr->sched_runtime)
4551 return false;
4552
4553 return true;
4554}
4555
4556
4557
4558
4559static bool check_same_owner(struct task_struct *p)
4560{
4561 const struct cred *cred = current_cred(), *pcred;
4562 bool match;
4563
4564 rcu_read_lock();
4565 pcred = __task_cred(p);
4566 match = (uid_eq(cred->euid, pcred->euid) ||
4567 uid_eq(cred->euid, pcred->uid));
4568 rcu_read_unlock();
4569 return match;
4570}
4571
4572static int __sched_setscheduler(struct task_struct *p,
4573 const struct sched_attr *attr,
4574 bool user)
4575{
4576 int retval, oldprio, oldpolicy = -1, queued, running;
4577 int policy = attr->sched_policy;
4578 unsigned long flags;
4579 const struct sched_class *prev_class;
4580 struct rq *rq;
4581 int reset_on_fork;
4582
4583
4584 BUG_ON(in_interrupt());
4585recheck:
4586
4587 if (policy < 0) {
4588 reset_on_fork = p->sched_reset_on_fork;
4589 policy = oldpolicy = p->policy;
4590 } else {
4591 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
4592
4593 if (policy != SCHED_DEADLINE &&
4594 policy != SCHED_FIFO && policy != SCHED_RR &&
4595 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4596 policy != SCHED_IDLE)
4597 return -EINVAL;
4598 }
4599
4600 if (attr->sched_flags &
4601 ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
4602 return -EINVAL;
4603
4604
4605
4606
4607
4608
4609 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
4610 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
4611 return -EINVAL;
4612 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
4613 (rt_policy(policy) != (attr->sched_priority != 0)))
4614 return -EINVAL;
4615
4616
4617
4618
4619 if (user && !capable(CAP_SYS_NICE)) {
4620 if (fair_policy(policy)) {
4621 if (attr->sched_nice < TASK_NICE(p) &&
4622 !can_nice(p, attr->sched_nice))
4623 return -EPERM;
4624 }
4625
4626 if (rt_policy(policy)) {
4627 unsigned long rlim_rtprio =
4628 task_rlimit(p, RLIMIT_RTPRIO);
4629
4630
4631 if (policy != p->policy && !rlim_rtprio)
4632 return -EPERM;
4633
4634
4635 if (attr->sched_priority > p->rt_priority &&
4636 attr->sched_priority > rlim_rtprio)
4637 return -EPERM;
4638 }
4639
4640
4641
4642
4643
4644
4645
4646 if (dl_policy(policy))
4647 return -EPERM;
4648
4649
4650
4651
4652
4653 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4654 if (!can_nice(p, TASK_NICE(p)))
4655 return -EPERM;
4656 }
4657
4658
4659 if (!check_same_owner(p))
4660 return -EPERM;
4661
4662
4663 if (p->sched_reset_on_fork && !reset_on_fork)
4664 return -EPERM;
4665 }
4666
4667 if (user) {
4668 retval = security_task_setscheduler(p);
4669 if (retval)
4670 return retval;
4671 }
4672
4673
4674
4675
4676
4677
4678
4679
4680 rq = task_rq_lock(p, &flags);
4681
4682
4683
4684
4685 if (p == rq->stop) {
4686 task_rq_unlock(rq, p, &flags);
4687 return -EINVAL;
4688 }
4689
4690
4691
4692
4693 if (unlikely(policy == p->policy)) {
4694 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
4695 goto change;
4696 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
4697 goto change;
4698 if (dl_policy(policy))
4699 goto change;
4700
4701 task_rq_unlock(rq, p, &flags);
4702 return 0;
4703 }
4704change:
4705
4706 if (user) {
4707#ifdef CONFIG_RT_GROUP_SCHED
4708
4709
4710
4711
4712 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4713 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4714 !task_group_is_autogroup(task_group(p))) {
4715 task_rq_unlock(rq, p, &flags);
4716 return -EPERM;
4717 }
4718#endif
4719#ifdef CONFIG_SMP
4720 if (dl_bandwidth_enabled() && dl_policy(policy)) {
4721 cpumask_t *span = rq->rd->span;
4722
4723
4724
4725
4726
4727
4728 if (!cpumask_subset(span, &p->cpus_allowed) ||
4729 rq->rd->dl_bw.bw == 0) {
4730 task_rq_unlock(rq, p, &flags);
4731 return -EPERM;
4732 }
4733 }
4734#endif
4735 }
4736
4737
4738 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4739 policy = oldpolicy = -1;
4740 task_rq_unlock(rq, p, &flags);
4741 goto recheck;
4742 }
4743
4744
4745
4746
4747
4748
4749 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
4750 task_rq_unlock(rq, p, &flags);
4751 return -EBUSY;
4752 }
4753
4754 queued = task_on_rq_queued(p);
4755 running = task_current(rq, p);
4756 if (queued)
4757 dequeue_task(rq, p, DEQUEUE_SAVE);
4758 if (running)
4759 p->sched_class->put_prev_task(rq, p);
4760
4761 p->sched_reset_on_fork = reset_on_fork;
4762
4763 oldprio = p->prio;
4764 prev_class = p->sched_class;
4765 __setscheduler(rq, p, attr);
4766
4767 if (running)
4768 p->sched_class->set_curr_task(rq);
4769 if (queued)
4770 enqueue_task(rq, p, ENQUEUE_RESTORE);
4771
4772 check_class_changed(rq, p, prev_class, oldprio);
4773 task_rq_unlock(rq, p, &flags);
4774
4775 rt_mutex_adjust_pi(p);
4776
4777 return 0;
4778}
4779
4780static int _sched_setscheduler(struct task_struct *p, int policy,
4781 const struct sched_param *param, bool check)
4782{
4783 struct sched_attr attr = {
4784 .sched_policy = policy,
4785 .sched_priority = param->sched_priority,
4786 .sched_nice = PRIO_TO_NICE(p->static_prio),
4787 };
4788
4789
4790
4791
4792
4793 if ((policy != -1) && (policy & SCHED_RESET_ON_FORK)) {
4794 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4795 policy &= ~SCHED_RESET_ON_FORK;
4796 attr.sched_policy = policy;
4797 }
4798
4799 return __sched_setscheduler(p, &attr, check);
4800}
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811int sched_setscheduler(struct task_struct *p, int policy,
4812 const struct sched_param *param)
4813{
4814 return _sched_setscheduler(p, policy, param, true);
4815}
4816EXPORT_SYMBOL_GPL(sched_setscheduler);
4817
4818int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
4819{
4820 return __sched_setscheduler(p, attr, true);
4821}
4822EXPORT_SYMBOL_GPL(sched_setattr);
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4838 const struct sched_param *param)
4839{
4840 return _sched_setscheduler(p, policy, param, false);
4841}
4842EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4843
4844static int
4845do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4846{
4847 struct sched_param lparam;
4848 struct task_struct *p;
4849 int retval;
4850
4851 if (!param || pid < 0)
4852 return -EINVAL;
4853 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4854 return -EFAULT;
4855
4856 rcu_read_lock();
4857 retval = -ESRCH;
4858 p = find_process_by_pid(pid);
4859 if (p != NULL)
4860 retval = sched_setscheduler(p, policy, &lparam);
4861 rcu_read_unlock();
4862
4863 return retval;
4864}
4865
4866
4867
4868
4869static int sched_copy_attr(struct sched_attr __user *uattr,
4870 struct sched_attr *attr)
4871{
4872 u32 size;
4873 int ret;
4874
4875 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
4876 return -EFAULT;
4877
4878
4879
4880
4881 memset(attr, 0, sizeof(*attr));
4882
4883 ret = get_user(size, &uattr->size);
4884 if (ret)
4885 return ret;
4886
4887 if (size > PAGE_SIZE)
4888 goto err_size;
4889
4890 if (!size)
4891 size = SCHED_ATTR_SIZE_VER0;
4892
4893 if (size < SCHED_ATTR_SIZE_VER0)
4894 goto err_size;
4895
4896
4897
4898
4899
4900
4901
4902 if (size > sizeof(*attr)) {
4903 unsigned char __user *addr;
4904 unsigned char __user *end;
4905 unsigned char val;
4906
4907 addr = (void __user *)uattr + sizeof(*attr);
4908 end = (void __user *)uattr + size;
4909
4910 for (; addr < end; addr++) {
4911 ret = get_user(val, addr);
4912 if (ret)
4913 return ret;
4914 if (val)
4915 goto err_size;
4916 }
4917 size = sizeof(*attr);
4918 }
4919
4920 ret = copy_from_user(attr, uattr, size);
4921 if (ret)
4922 return -EFAULT;
4923
4924
4925
4926
4927
4928 attr->sched_nice = clamp(attr->sched_nice, -20, 19);
4929
4930out:
4931 return ret;
4932
4933err_size:
4934 put_user(sizeof(*attr), &uattr->size);
4935 ret = -E2BIG;
4936 goto out;
4937}
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4948 struct sched_param __user *, param)
4949{
4950
4951 if (policy < 0)
4952 return -EINVAL;
4953
4954 return do_sched_setscheduler(pid, policy, param);
4955}
4956
4957
4958
4959
4960
4961
4962
4963
4964SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4965{
4966 return do_sched_setscheduler(pid, -1, param);
4967}
4968
4969
4970
4971
4972
4973
4974
4975SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4976 unsigned int, flags)
4977{
4978 struct sched_attr attr;
4979 struct task_struct *p;
4980 int retval;
4981
4982 if (!uattr || pid < 0 || flags)
4983 return -EINVAL;
4984
4985 retval = sched_copy_attr(uattr, &attr);
4986 if (retval)
4987 return retval;
4988
4989 if ((int)attr.sched_policy < 0)
4990 return -EINVAL;
4991
4992 rcu_read_lock();
4993 retval = -ESRCH;
4994 p = find_process_by_pid(pid);
4995 if (p != NULL)
4996 retval = sched_setattr(p, &attr);
4997 rcu_read_unlock();
4998
4999 return retval;
5000}
5001
5002
5003
5004
5005
5006
5007
5008
5009SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5010{
5011 struct task_struct *p;
5012 int retval;
5013
5014 if (pid < 0)
5015 return -EINVAL;
5016
5017 retval = -ESRCH;
5018 rcu_read_lock();
5019 p = find_process_by_pid(pid);
5020 if (p) {
5021 retval = security_task_getscheduler(p);
5022 if (!retval)
5023 retval = p->policy
5024 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5025 }
5026 rcu_read_unlock();
5027 return retval;
5028}
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5039{
5040 struct sched_param lp;
5041 struct task_struct *p;
5042 int retval;
5043
5044 if (!param || pid < 0)
5045 return -EINVAL;
5046
5047 rcu_read_lock();
5048 p = find_process_by_pid(pid);
5049 retval = -ESRCH;
5050 if (!p)
5051 goto out_unlock;
5052
5053 retval = security_task_getscheduler(p);
5054 if (retval)
5055 goto out_unlock;
5056
5057 if (task_has_dl_policy(p)) {
5058 retval = -EINVAL;
5059 goto out_unlock;
5060 }
5061 lp.sched_priority = p->rt_priority;
5062 rcu_read_unlock();
5063
5064
5065
5066
5067 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5068
5069 return retval;
5070
5071out_unlock:
5072 rcu_read_unlock();
5073 return retval;
5074}
5075
5076static int sched_read_attr(struct sched_attr __user *uattr,
5077 struct sched_attr *attr,
5078 unsigned int usize)
5079{
5080 int ret;
5081
5082 if (!access_ok(VERIFY_WRITE, uattr, usize))
5083 return -EFAULT;
5084
5085
5086
5087
5088
5089
5090 if (usize < sizeof(*attr)) {
5091 unsigned char *addr;
5092 unsigned char *end;
5093
5094 addr = (void *)attr + usize;
5095 end = (void *)attr + sizeof(*attr);
5096
5097 for (; addr < end; addr++) {
5098 if (*addr)
5099 goto err_size;
5100 }
5101
5102 attr->size = usize;
5103 }
5104
5105 ret = copy_to_user(uattr, attr, attr->size);
5106 if (ret)
5107 return -EFAULT;
5108
5109out:
5110 return ret;
5111
5112err_size:
5113 ret = -E2BIG;
5114 goto out;
5115}
5116
5117
5118
5119
5120
5121
5122
5123
5124SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
5125 unsigned int, size, unsigned int, flags)
5126{
5127 struct sched_attr attr = {
5128 .size = sizeof(struct sched_attr),
5129 };
5130 struct task_struct *p;
5131 int retval;
5132
5133 if (!uattr || pid < 0 || size > PAGE_SIZE ||
5134 size < SCHED_ATTR_SIZE_VER0 || flags)
5135 return -EINVAL;
5136
5137 rcu_read_lock();
5138 p = find_process_by_pid(pid);
5139 retval = -ESRCH;
5140 if (!p)
5141 goto out_unlock;
5142
5143 retval = security_task_getscheduler(p);
5144 if (retval)
5145 goto out_unlock;
5146
5147 attr.sched_policy = p->policy;
5148 if (p->sched_reset_on_fork)
5149 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5150 if (task_has_dl_policy(p))
5151 __getparam_dl(p, &attr);
5152 else if (task_has_rt_policy(p))
5153 attr.sched_priority = p->rt_priority;
5154 else
5155 attr.sched_nice = TASK_NICE(p);
5156
5157 rcu_read_unlock();
5158
5159 retval = sched_read_attr(uattr, &attr, size);
5160 return retval;
5161
5162out_unlock:
5163 rcu_read_unlock();
5164 return retval;
5165}
5166
5167long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5168{
5169 cpumask_var_t cpus_allowed, new_mask;
5170 struct task_struct *p;
5171 int retval;
5172
5173 rcu_read_lock();
5174
5175 p = find_process_by_pid(pid);
5176 if (!p) {
5177 rcu_read_unlock();
5178 return -ESRCH;
5179 }
5180
5181
5182 get_task_struct(p);
5183 rcu_read_unlock();
5184
5185 if (p->flags & PF_NO_SETAFFINITY) {
5186 retval = -EINVAL;
5187 goto out_put_task;
5188 }
5189 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5190 retval = -ENOMEM;
5191 goto out_put_task;
5192 }
5193 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5194 retval = -ENOMEM;
5195 goto out_free_cpus_allowed;
5196 }
5197 retval = -EPERM;
5198 if (!check_same_owner(p)) {
5199 rcu_read_lock();
5200 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
5201 rcu_read_unlock();
5202 goto out_unlock;
5203 }
5204 rcu_read_unlock();
5205 }
5206
5207 retval = security_task_setscheduler(p);
5208 if (retval)
5209 goto out_unlock;
5210
5211
5212 cpuset_cpus_allowed(p, cpus_allowed);
5213 cpumask_and(new_mask, in_mask, cpus_allowed);
5214
5215
5216
5217
5218
5219
5220
5221#ifdef CONFIG_SMP
5222 if (task_has_dl_policy(p)) {
5223 const struct cpumask *span = task_rq(p)->rd->span;
5224
5225 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
5226 retval = -EBUSY;
5227 goto out_unlock;
5228 }
5229 }
5230#endif
5231again:
5232 retval = set_cpus_allowed_ptr(p, new_mask);
5233
5234 if (!retval) {
5235 cpuset_cpus_allowed(p, cpus_allowed);
5236 if (!cpumask_subset(new_mask, cpus_allowed)) {
5237
5238
5239
5240
5241
5242 cpumask_copy(new_mask, cpus_allowed);
5243 goto again;
5244 }
5245 }
5246out_unlock:
5247 free_cpumask_var(new_mask);
5248out_free_cpus_allowed:
5249 free_cpumask_var(cpus_allowed);
5250out_put_task:
5251 put_task_struct(p);
5252 return retval;
5253}
5254
5255static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5256 struct cpumask *new_mask)
5257{
5258 if (len < cpumask_size())
5259 cpumask_clear(new_mask);
5260 else if (len > cpumask_size())
5261 len = cpumask_size();
5262
5263 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5264}
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5275 unsigned long __user *, user_mask_ptr)
5276{
5277 cpumask_var_t new_mask;
5278 int retval;
5279
5280 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5281 return -ENOMEM;
5282
5283 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5284 if (retval == 0)
5285 retval = sched_setaffinity(pid, new_mask);
5286 free_cpumask_var(new_mask);
5287 return retval;
5288}
5289
5290long sched_getaffinity(pid_t pid, struct cpumask *mask)
5291{
5292 struct task_struct *p;
5293 unsigned long flags;
5294 int retval;
5295
5296 rcu_read_lock();
5297
5298 retval = -ESRCH;
5299 p = find_process_by_pid(pid);
5300 if (!p)
5301 goto out_unlock;
5302
5303 retval = security_task_getscheduler(p);
5304 if (retval)
5305 goto out_unlock;
5306
5307 raw_spin_lock_irqsave(&p->pi_lock, flags);
5308 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
5309 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5310
5311out_unlock:
5312 rcu_read_unlock();
5313
5314 return retval;
5315}
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5326 unsigned long __user *, user_mask_ptr)
5327{
5328 int ret;
5329 cpumask_var_t mask;
5330
5331 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
5332 return -EINVAL;
5333 if (len & (sizeof(unsigned long)-1))
5334 return -EINVAL;
5335
5336 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5337 return -ENOMEM;
5338
5339 ret = sched_getaffinity(pid, mask);
5340 if (ret == 0) {
5341 size_t retlen = min_t(size_t, len, cpumask_size());
5342
5343 if (copy_to_user(user_mask_ptr, mask, retlen))
5344 ret = -EFAULT;
5345 else
5346 ret = retlen;
5347 }
5348 free_cpumask_var(mask);
5349
5350 return ret;
5351}
5352
5353
5354
5355
5356
5357
5358
5359
5360
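/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks.  If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */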
5361SYSCALL_DEFINE0(sched_yield)
5362{
5363 struct rq *rq = this_rq_lock();
5364
5365 schedstat_inc(rq, yld_count);
5366 current->sched_class->yield_task(rq);
5367
5368
5369
5370
5371
5372 __release(rq->lock);
5373 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5374 do_raw_spin_unlock(&rq->lock);
5375 sched_preempt_enable_no_resched();
5376
5377 schedule();
5378
5379 return 0;
5380}
5381
5382static inline int should_resched(void)
5383{
5384 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
5385}
5386
5387static void __cond_resched(void)
5388{
5389 add_preempt_count(PREEMPT_ACTIVE);
5390 __schedule();
5391 sub_preempt_count(PREEMPT_ACTIVE);
5392}
5393
5394int __sched _cond_resched(void)
5395{
5396 if (should_resched()) {
5397 __cond_resched();
5398 return 1;
5399 }
5400 return 0;
5401}
5402EXPORT_SYMBOL(_cond_resched);
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412int __cond_resched_lock(spinlock_t *lock)
5413{
5414 int resched = should_resched();
5415 int ret = 0;
5416
5417 lockdep_assert_held(lock);
5418
5419 if (spin_needbreak(lock) || resched) {
5420 spin_unlock(lock);
5421 if (resched)
5422 __cond_resched();
5423 else
5424 cpu_relax();
5425 ret = 1;
5426 spin_lock(lock);
5427 }
5428 return ret;
5429}
5430EXPORT_SYMBOL(__cond_resched_lock);
5431
5432int __sched __cond_resched_softirq(void)
5433{
5434 BUG_ON(!in_softirq());
5435
5436 if (should_resched()) {
5437 local_bh_enable();
5438 __cond_resched();
5439 local_bh_disable();
5440 return 1;
5441 }
5442 return 0;
5443}
5444EXPORT_SYMBOL(__cond_resched_softirq);
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468void __sched yield(void)
5469{
5470 set_current_state(TASK_RUNNING);
5471 sys_sched_yield();
5472}
5473EXPORT_SYMBOL(yield);
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
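/**
 * yield_to - yield the current processor to another thread in your thread
 * group, or accelerate that thread toward the processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * Return: true (>0) if we indeed boosted the target task, false (0) if we
 * failed to boost, and -ESRCH if there is no point in yielding (both
 * runqueues have only a single runnable task).
 */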
5490int __sched yield_to(struct task_struct *p, bool preempt)
5491{
5492 struct task_struct *curr = current;
5493 struct rq *rq, *p_rq;
5494 unsigned long flags;
5495 int yielded = 0;
5496
5497 local_irq_save(flags);
5498 rq = this_rq();
5499
5500again:
5501 p_rq = task_rq(p);
5502
5503
5504
5505
5506 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
5507 yielded = -ESRCH;
5508 goto out_irq;
5509 }
5510
5511 double_rq_lock(rq, p_rq);
5512 if (task_rq(p) != p_rq) {
5513 double_rq_unlock(rq, p_rq);
5514 goto again;
5515 }
5516
5517 if (!curr->sched_class->yield_to_task)
5518 goto out_unlock;
5519
5520 if (curr->sched_class != p->sched_class)
5521 goto out_unlock;
5522
5523 if (task_running(p_rq, p) || p->state)
5524 goto out_unlock;
5525
5526 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5527 if (yielded) {
5528 schedstat_inc(rq, yld_count);
5529
5530
5531
5532
5533 if (preempt && rq != p_rq)
5534 resched_curr(p_rq);
5535 }
5536
5537out_unlock:
5538 double_rq_unlock(rq, p_rq);
5539out_irq:
5540 local_irq_restore(flags);
5541
5542 if (yielded > 0)
5543 schedule();
5544
5545 return yielded;
5546}
5547EXPORT_SYMBOL_GPL(yield_to);
5548
5549
5550
5551
5552
5553void __sched io_schedule(void)
5554{
5555 io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
5556}
5557EXPORT_SYMBOL(io_schedule);
5558
5559long __sched io_schedule_timeout(long timeout)
5560{
5561 int old_iowait = current->in_iowait;
5562 struct rq *rq;
5563 long ret;
5564
5565 current->in_iowait = 1;
5566 if (old_iowait)
5567 blk_schedule_flush_plug(current);
5568 else
5569 blk_flush_plug(current);
5570
5571 delayacct_blkio_start();
5572 rq = raw_rq();
5573 atomic_inc(&rq->nr_iowait);
5574 ret = schedule_timeout(timeout);
5575 current->in_iowait = old_iowait;
5576 atomic_dec(&rq->nr_iowait);
5577 delayacct_blkio_end();
5578
5579 return ret;
5580}
5581EXPORT_SYMBOL(io_schedule_timeout);
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5592{
5593 int ret = -EINVAL;
5594
5595 switch (policy) {
5596 case SCHED_FIFO:
5597 case SCHED_RR:
5598 ret = MAX_USER_RT_PRIO-1;
5599 break;
5600 case SCHED_DEADLINE:
5601 case SCHED_NORMAL:
5602 case SCHED_BATCH:
5603 case SCHED_IDLE:
5604 ret = 0;
5605 break;
5606 }
5607 return ret;
5608}
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5619{
5620 int ret = -EINVAL;
5621
5622 switch (policy) {
5623 case SCHED_FIFO:
5624 case SCHED_RR:
5625 ret = 1;
5626 break;
5627 case SCHED_DEADLINE:
5628 case SCHED_NORMAL:
5629 case SCHED_BATCH:
5630 case SCHED_IDLE:
5631 ret = 0;
5632 }
5633 return ret;
5634}
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5648 struct timespec __user *, interval)
5649{
5650 struct task_struct *p;
5651 unsigned int time_slice;
5652 unsigned long flags;
5653 struct rq *rq;
5654 int retval;
5655 struct timespec t;
5656
5657 if (pid < 0)
5658 return -EINVAL;
5659
5660 retval = -ESRCH;
5661 rcu_read_lock();
5662 p = find_process_by_pid(pid);
5663 if (!p)
5664 goto out_unlock;
5665
5666 retval = security_task_getscheduler(p);
5667 if (retval)
5668 goto out_unlock;
5669
5670 rq = task_rq_lock(p, &flags);
5671 time_slice = p->sched_class->get_rr_interval(rq, p);
5672 task_rq_unlock(rq, p, &flags);
5673
5674 rcu_read_unlock();
5675 jiffies_to_timespec(time_slice, &t);
5676 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5677 return retval;
5678
5679out_unlock:
5680 rcu_read_unlock();
5681 return retval;
5682}
5683
5684static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5685
5686void sched_show_task(struct task_struct *p)
5687{
5688 unsigned long free = 0;
5689 int ppid;
5690 unsigned state;
5691
5692 state = p->state ? __ffs(p->state) + 1 : 0;
5693 printk(KERN_INFO "%-15.15s %c", p->comm,
5694 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5695#if BITS_PER_LONG == 32
5696 if (state == TASK_RUNNING)
5697 printk(KERN_CONT " running ");
5698 else
5699 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5700#else
5701 if (state == TASK_RUNNING)
5702 printk(KERN_CONT " running task ");
5703 else
5704 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5705#endif
5706#ifdef CONFIG_DEBUG_STACK_USAGE
5707 free = stack_not_used(p);
5708#endif
5709 rcu_read_lock();
5710 ppid = task_pid_nr(rcu_dereference(p->real_parent));
5711 rcu_read_unlock();
5712 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5713 task_pid_nr(p), ppid,
5714 (unsigned long)task_thread_info(p)->flags);
5715
5716 print_worker_info(KERN_INFO, p);
5717 show_stack(p, NULL);
5718}
5719
5720void show_state_filter(unsigned long state_filter)
5721{
5722 struct task_struct *g, *p;
5723
5724#if BITS_PER_LONG == 32
5725 printk(KERN_INFO
5726 " task PC stack pid father\n");
5727#else
5728 printk(KERN_INFO
5729 " task PC stack pid father\n");
5730#endif
5731 rcu_read_lock();
5732 do_each_thread(g, p) {
5733
5734
5735
5736
5737 touch_nmi_watchdog();
5738 if (!state_filter || (p->state & state_filter))
5739 sched_show_task(p);
5740 } while_each_thread(g, p);
5741
5742 touch_all_softlockup_watchdogs();
5743
5744#ifdef CONFIG_SCHED_DEBUG
5745 sysrq_sched_debug_show();
5746#endif
5747 rcu_read_unlock();
5748
5749
5750
5751 if (!state_filter)
5752 debug_show_all_locks();
5753}
5754
5755void init_idle_bootup_task(struct task_struct *idle)
5756{
5757 idle->sched_class = &idle_sched_class;
5758}
5759
5760
5761
5762
5763
5764
5765
5766
5767
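/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 */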
5768void init_idle(struct task_struct *idle, int cpu)
5769{
5770 struct rq *rq = cpu_rq(cpu);
5771 unsigned long flags;
5772
5773 raw_spin_lock_irqsave(&rq->lock, flags);
5774
5775 __sched_fork(0, idle);
5776 idle->state = TASK_RUNNING;
5777 idle->se.exec_start = sched_clock();
5778
5779 do_set_cpus_allowed(idle, cpumask_of(cpu));
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790 rcu_read_lock();
5791 __set_task_cpu(idle, cpu);
5792 rcu_read_unlock();
5793
5794 rq->curr = rq->idle = idle;
5795 idle->on_rq = TASK_ON_RQ_QUEUED;
5796#if defined(CONFIG_SMP)
5797 idle->on_cpu = 1;
5798#endif
5799 raw_spin_unlock_irqrestore(&rq->lock, flags);
5800
5801
5802 task_thread_info(idle)->preempt_count = 0;
5803
5804
5805
5806
5807 idle->sched_class = &idle_sched_class;
5808 ftrace_graph_init_idle_task(idle, cpu);
5809 vtime_init_idle(idle, cpu);
5810#if defined(CONFIG_SMP)
5811 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5812#endif
5813}
5814
5815int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5816 const struct cpumask *trial)
5817{
5818 int ret = 1, trial_cpus;
5819 struct dl_bw *cur_dl_b;
5820 unsigned long flags;
5821
5822 if (!cpumask_weight(cur))
5823 return ret;
5824
5825 rcu_read_lock_sched();
5826 cur_dl_b = dl_bw_of(cpumask_any(cur));
5827 trial_cpus = cpumask_weight(trial);
5828
5829 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
5830 if (cur_dl_b->bw != -1 &&
5831 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
5832 ret = 0;
5833 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
5834 rcu_read_unlock_sched();
5835
5836 return ret;
5837}
5838
5839int task_can_attach(struct task_struct *p,
5840 const struct cpumask *cs_cpus_allowed)
5841{
5842 int ret = 0;
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853 if (p->flags & PF_NO_SETAFFINITY) {
5854 ret = -EINVAL;
5855 goto out;
5856 }
5857
5858#ifdef CONFIG_SMP
5859 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5860 cs_cpus_allowed)) {
5861 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
5862 cs_cpus_allowed);
5863 struct dl_bw *dl_b;
5864 bool overflow;
5865 int cpus;
5866 unsigned long flags;
5867
5868 rcu_read_lock_sched();
5869 dl_b = dl_bw_of(dest_cpu);
5870 raw_spin_lock_irqsave(&dl_b->lock, flags);
5871 cpus = dl_bw_cpus(dest_cpu);
5872 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
5873 if (overflow)
5874 ret = -EBUSY;
5875 else {
5876
5877
5878
5879
5880
5881
5882 __dl_add(dl_b, p->dl.dl_bw, cpus);
5883 }
5884 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5885 rcu_read_unlock_sched();
5886
5887 }
5888#endif
5889out:
5890 return ret;
5891}
5892
5893#ifdef CONFIG_SMP
5894void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
5895{
5896 if (p->sched_class && p->sched_class->set_cpus_allowed)
5897 p->sched_class->set_cpus_allowed(p, new_mask);
5898
5899 cpumask_copy(&p->cpus_allowed, new_mask);
5900 p->nr_cpus_allowed = cpumask_weight(new_mask);
5901}
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
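/*
 * Change a given task's CPU affinity.  Migrate the thread to a proper CPU
 * and schedule it away if the CPU it's executing on is removed from the
 * allowed bitmask.  The caller must have a valid reference to the task and
 * the task must not exit() or exec() while this is called.
 */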
5926int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5927{
5928 unsigned long flags;
5929 struct rq *rq;
5930 unsigned int dest_cpu;
5931 int ret = 0;
5932
5933 rq = task_rq_lock(p, &flags);
5934
5935 if (cpumask_equal(&p->cpus_allowed, new_mask))
5936 goto out;
5937
5938 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5939 ret = -EINVAL;
5940 goto out;
5941 }
5942
5943 do_set_cpus_allowed(p, new_mask);
5944
5945
5946 if (cpumask_test_cpu(task_cpu(p), new_mask))
5947 goto out;
5948
5949 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5950 if (task_on_rq_queued(p)) {
5951 struct migration_arg arg = { p, dest_cpu };
5952
5953 task_rq_unlock(rq, p, &flags);
5954 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5955 tlb_migrate_finish(p->mm);
5956 return 0;
5957 }
5958out:
5959 task_rq_unlock(rq, p, &flags);
5960
5961 return ret;
5962}
5963EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5977{
5978 struct rq *rq_dest, *rq_src;
5979 int ret = 0;
5980
5981 if (unlikely(!cpu_active(dest_cpu)))
5982 return ret;
5983
5984 rq_src = cpu_rq(src_cpu);
5985 rq_dest = cpu_rq(dest_cpu);
5986
5987 raw_spin_lock(&p->pi_lock);
5988 double_rq_lock(rq_src, rq_dest);
5989
5990 if (task_cpu(p) != src_cpu)
5991 goto done;
5992
5993 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
5994 goto fail;
5995
5996
5997
5998
5999
6000 if (task_on_rq_queued(p)) {
6001 dequeue_task(rq_src, p, DEQUEUE_SAVE);
6002 set_task_cpu(p, dest_cpu);
6003 enqueue_task(rq_dest, p, ENQUEUE_RESTORE);
6004 check_preempt_curr(rq_dest, p, 0);
6005 }
6006done:
6007 ret = 1;
6008fail:
6009 double_rq_unlock(rq_src, rq_dest);
6010 raw_spin_unlock(&p->pi_lock);
6011 return ret;
6012}
6013
6014static bool sched_smp_initialized __read_mostly;
6015
6016#ifdef CONFIG_NUMA_BALANCING
6017
6018int migrate_task_to(struct task_struct *p, int target_cpu)
6019{
6020 struct migration_arg arg = { p, target_cpu };
6021 int curr_cpu = task_cpu(p);
6022
6023 if (curr_cpu == target_cpu)
6024 return 0;
6025
6026 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
6027 return -EINVAL;
6028
6029
6030
6031 trace_sched_move_numa(p, curr_cpu, target_cpu);
6032 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
6033}
6034
6035
6036
6037
6038
6039void sched_setnuma(struct task_struct *p, int nid)
6040{
6041 struct rq *rq;
6042 unsigned long flags;
6043 bool queued, running;
6044
6045 rq = task_rq_lock(p, &flags);
6046 queued = task_on_rq_queued(p);
6047 running = task_current(rq, p);
6048
6049 if (queued)
6050 dequeue_task(rq, p, DEQUEUE_SAVE);
6051 if (running)
6052 p->sched_class->put_prev_task(rq, p);
6053
6054 p->numa_preferred_nid = nid;
6055
6056 if (running)
6057 p->sched_class->set_curr_task(rq);
6058 if (queued)
6059 enqueue_task(rq, p, ENQUEUE_RESTORE);
6060 task_rq_unlock(rq, p, &flags);
6061}
6062#endif
6063
6064
6065
6066
6067
6068
6069static int migration_cpu_stop(void *data)
6070{
6071 struct migration_arg *arg = data;
6072
6073
6074
6075
6076
6077 local_irq_disable();
6078 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
6079 local_irq_enable();
6080 return 0;
6081}
6082
6083#ifdef CONFIG_HOTPLUG_CPU
6084
6085
6086
6087
6088
6089void idle_task_exit(void)
6090{
6091 struct mm_struct *mm = current->active_mm;
6092
6093 BUG_ON(cpu_online(smp_processor_id()));
6094
6095 if (mm != &init_mm)
6096 switch_mm(mm, &init_mm, current);
6097 mmdrop(mm);
6098}
6099
6100
6101
6102
6103
6104
6105
6106
6107static void calc_load_migrate(struct rq *rq)
6108{
6109 long delta = calc_load_fold_active(rq);
6110 if (delta)
6111 atomic_long_add(delta, &calc_load_tasks);
6112}
6113
6114
6115
6116
6117
6118
6119
6120
6121
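/*
 * Migrate all tasks from the rq; sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().  Called with rq->lock held even
 * though we are in stop_machine() and there is no concurrency possible.
 */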
6122static void migrate_tasks(unsigned int dead_cpu)
6123{
6124 struct rq *rq = cpu_rq(dead_cpu);
6125 struct task_struct *next, *stop = rq->stop;
6126 int dest_cpu;
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137 rq->stop = NULL;
6138
6139 for ( ; ; ) {
6140
6141
6142
6143
6144 if (rq->nr_running == 1)
6145 break;
6146
6147 next = pick_next_task(rq);
6148 BUG_ON(!next);
6149 next->sched_class->put_prev_task(rq, next);
6150
6151
6152 dest_cpu = select_fallback_rq(dead_cpu, next);
6153 raw_spin_unlock(&rq->lock);
6154
6155 __migrate_task(next, dead_cpu, dest_cpu);
6156
6157 raw_spin_lock(&rq->lock);
6158 }
6159
6160 rq->stop = stop;
6161}
6162
6163#endif
6164
6165#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6166
6167static struct ctl_table sd_ctl_dir[] = {
6168 {
6169 .procname = "sched_domain",
6170 .mode = 0555,
6171 },
6172 {}
6173};
6174
6175static struct ctl_table sd_ctl_root[] = {
6176 {
6177 .procname = "kernel",
6178 .mode = 0555,
6179 .child = sd_ctl_dir,
6180 },
6181 {}
6182};
6183
6184static struct ctl_table *sd_alloc_ctl_entry(int n)
6185{
6186 struct ctl_table *entry =
6187 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6188
6189 return entry;
6190}
6191
6192static void sd_free_ctl_entry(struct ctl_table **tablep)
6193{
6194 struct ctl_table *entry;
6195
6196
6197
6198
6199
6200
6201
6202 for (entry = *tablep; entry->mode; entry++) {
6203 if (entry->child)
6204 sd_free_ctl_entry(&entry->child);
6205 if (entry->proc_handler == NULL)
6206 kfree(entry->procname);
6207 }
6208
6209 kfree(*tablep);
6210 *tablep = NULL;
6211}
6212
6213static int min_load_idx = 0;
6214static int max_load_idx = CPU_LOAD_IDX_MAX-1;
6215
6216static void
6217set_table_entry(struct ctl_table *entry,
6218 const char *procname, void *data, int maxlen,
6219 umode_t mode, proc_handler *proc_handler,
6220 bool load_idx)
6221{
6222 entry->procname = procname;
6223 entry->data = data;
6224 entry->maxlen = maxlen;
6225 entry->mode = mode;
6226 entry->proc_handler = proc_handler;
6227
6228 if (load_idx) {
6229 entry->extra1 = &min_load_idx;
6230 entry->extra2 = &max_load_idx;
6231 }
6232}
6233
6234static struct ctl_table *
6235sd_alloc_ctl_domain_table(struct sched_domain *sd)
6236{
6237 struct ctl_table *table = sd_alloc_ctl_entry(14);
6238
6239 if (table == NULL)
6240 return NULL;
6241
6242 set_table_entry(&table[0], "min_interval", &sd->min_interval,
6243 sizeof(long), 0644, proc_doulongvec_minmax, false);
6244 set_table_entry(&table[1], "max_interval", &sd->max_interval,
6245 sizeof(long), 0644, proc_doulongvec_minmax, false);
6246 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6247 sizeof(int), 0644, proc_dointvec_minmax, true);
6248 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6249 sizeof(int), 0644, proc_dointvec_minmax, true);
6250 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6251 sizeof(int), 0644, proc_dointvec_minmax, true);
6252 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6253 sizeof(int), 0644, proc_dointvec_minmax, true);
6254 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6255 sizeof(int), 0644, proc_dointvec_minmax, true);
6256 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6257 sizeof(int), 0644, proc_dointvec_minmax, false);
6258 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6259 sizeof(int), 0644, proc_dointvec_minmax, false);
6260 set_table_entry(&table[9], "cache_nice_tries",
6261 &sd->cache_nice_tries,
6262 sizeof(int), 0644, proc_dointvec_minmax, false);
6263 set_table_entry(&table[10], "flags", &sd->flags,
6264 sizeof(int), 0644, proc_dointvec_minmax, false);
6265 set_table_entry(&table[11], "max_newidle_lb_cost",
6266 &sd->max_newidle_lb_cost,
6267 sizeof(long), 0644, proc_doulongvec_minmax, false);
6268 set_table_entry(&table[12], "name", sd->name,
6269 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
6270
6271
6272 return table;
6273}
6274
6275static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6276{
6277 struct ctl_table *entry, *table;
6278 struct sched_domain *sd;
6279 int domain_num = 0, i;
6280 char buf[32];
6281
6282 for_each_domain(cpu, sd)
6283 domain_num++;
6284 entry = table = sd_alloc_ctl_entry(domain_num + 1);
6285 if (table == NULL)
6286 return NULL;
6287
6288 i = 0;
6289 for_each_domain(cpu, sd) {
6290 snprintf(buf, 32, "domain%d", i);
6291 entry->procname = kstrdup(buf, GFP_KERNEL);
6292 entry->mode = 0555;
6293 entry->child = sd_alloc_ctl_domain_table(sd);
6294 entry++;
6295 i++;
6296 }
6297 return table;
6298}
6299
6300static struct ctl_table_header *sd_sysctl_header;
6301static void register_sched_domain_sysctl(void)
6302{
6303 int i, cpu_num = num_possible_cpus();
6304 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6305 char buf[32];
6306
6307 WARN_ON(sd_ctl_dir[0].child);
6308 sd_ctl_dir[0].child = entry;
6309
6310 if (entry == NULL)
6311 return;
6312
6313 for_each_possible_cpu(i) {
6314 snprintf(buf, 32, "cpu%d", i);
6315 entry->procname = kstrdup(buf, GFP_KERNEL);
6316 entry->mode = 0555;
6317 entry->child = sd_alloc_ctl_cpu_table(i);
6318 entry++;
6319 }
6320
6321 WARN_ON(sd_sysctl_header);
6322 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6323}
6324
6325
6326static void unregister_sched_domain_sysctl(void)
6327{
6328 if (sd_sysctl_header)
6329 unregister_sysctl_table(sd_sysctl_header);
6330 sd_sysctl_header = NULL;
6331 if (sd_ctl_dir[0].child)
6332 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6333}
6334#else
6335static void register_sched_domain_sysctl(void)
6336{
6337}
6338static void unregister_sched_domain_sysctl(void)
6339{
6340}
6341#endif
6342
6343static void set_rq_online(struct rq *rq)
6344{
6345 if (!rq->online) {
6346 const struct sched_class *class;
6347
6348 cpumask_set_cpu(rq->cpu, rq->rd->online);
6349 rq->online = 1;
6350
6351 for_each_class(class) {
6352 if (class->rq_online)
6353 class->rq_online(rq);
6354 }
6355 }
6356}
6357
6358static void set_rq_offline(struct rq *rq)
6359{
6360 if (rq->online) {
6361 const struct sched_class *class;
6362
6363 for_each_class(class) {
6364 if (class->rq_offline)
6365 class->rq_offline(rq);
6366 }
6367
6368 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6369 rq->online = 0;
6370 }
6371}
6372
6373
6374
6375
6376
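/*
 * migration_call - CPU hotplug callback: bring the runqueue online or
 * offline, push tasks off a dying CPU and fold its load on CPU_DEAD.
 */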
6377static int
6378migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6379{
6380 int cpu = (long)hcpu;
6381 unsigned long flags;
6382 struct rq *rq = cpu_rq(cpu);
6383
6384 switch (action & ~CPU_TASKS_FROZEN) {
6385
6386 case CPU_UP_PREPARE:
6387 rq->calc_load_update = calc_load_update;
6388 break;
6389
6390 case CPU_ONLINE:
6391
6392 raw_spin_lock_irqsave(&rq->lock, flags);
6393 if (rq->rd) {
6394 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6395
6396 set_rq_online(rq);
6397 }
6398 raw_spin_unlock_irqrestore(&rq->lock, flags);
6399 break;
6400
6401#ifdef CONFIG_HOTPLUG_CPU
6402 case CPU_DYING:
6403 sched_ttwu_pending();
6404
6405 raw_spin_lock_irqsave(&rq->lock, flags);
6406 if (rq->rd) {
6407 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6408 set_rq_offline(rq);
6409 }
6410 migrate_tasks(cpu);
6411 BUG_ON(rq->nr_running != 1);
6412 raw_spin_unlock_irqrestore(&rq->lock, flags);
6413 break;
6414
6415 case CPU_DEAD:
6416 calc_load_migrate(rq);
6417 break;
6418#endif
6419 }
6420
6421 update_max_interval();
6422
6423 return NOTIFY_OK;
6424}
6425
6426
6427
6428
6429
6430
6431static struct notifier_block migration_notifier = {
6432 .notifier_call = migration_call,
6433 .priority = CPU_PRI_MIGRATION,
6434};
6435
6436static int sched_cpu_active(struct notifier_block *nfb,
6437 unsigned long action, void *hcpu)
6438{
6439 switch (action & ~CPU_TASKS_FROZEN) {
6440 case CPU_STARTING:
6441 case CPU_DOWN_FAILED:
6442 set_cpu_active((long)hcpu, true);
6443 return NOTIFY_OK;
6444 default:
6445 return NOTIFY_DONE;
6446 }
6447}
6448
6449static int sched_cpu_inactive(struct notifier_block *nfb,
6450 unsigned long action, void *hcpu)
6451{
6452 switch (action & ~CPU_TASKS_FROZEN) {
6453 case CPU_DOWN_PREPARE:
6454 set_cpu_active((long)hcpu, false);
6455 return NOTIFY_OK;
6456 default:
6457 return NOTIFY_DONE;
6458 }
6459}
6460
6461#endif
6462
6463#ifdef CONFIG_SMP
6464
6465
6466static cpumask_var_t sched_domains_tmpmask;
6467static cpumask_var_t sched_domains_tmpmask2;
6468
6469#ifdef CONFIG_SCHED_DEBUG
6470
6471static __read_mostly int sched_debug_enabled;
6472
6473static int __init sched_debug_setup(char *str)
6474{
6475 sched_debug_enabled = 1;
6476
6477 return 0;
6478}
6479early_param("sched_debug", sched_debug_setup);
6480
6481static inline bool sched_debug(void)
6482{
6483 return sched_debug_enabled;
6484}
6485
6486static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6487 struct cpumask *groupmask)
6488{
6489 struct sched_group *group = sd->groups;
6490
6491 cpumask_clear(groupmask);
6492
6493 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6494
6495 if (!(sd->flags & SD_LOAD_BALANCE)) {
6496 printk("does not load-balance\n");
6497 if (sd->parent)
6498 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6499 " has parent");
6500 return -1;
6501 }
6502
6503 printk(KERN_CONT "span %*pbl level %s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name);
6504
6505 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6506 printk(KERN_ERR "ERROR: domain->span does not contain "
6507 "CPU%d\n", cpu);
6508 }
6509 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6510 printk(KERN_ERR "ERROR: domain->groups does not contain"
6511 " CPU%d\n", cpu);
6512 }
6513
6514 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6515 do {
6516 if (!group) {
6517 printk("\n");
6518 printk(KERN_ERR "ERROR: group is NULL\n");
6519 break;
6520 }
6521
6522
6523
6524
6525
6526
6527 if (!group->sgp->power_orig) {
6528 printk(KERN_CONT "\n");
6529 printk(KERN_ERR "ERROR: domain->cpu_power not "
6530 "set\n");
6531 break;
6532 }
6533
6534 if (!cpumask_weight(sched_group_cpus(group))) {
6535 printk(KERN_CONT "\n");
6536 printk(KERN_ERR "ERROR: empty group\n");
6537 break;
6538 }
6539
6540 if (!(sd->flags & SD_OVERLAP) &&
6541 cpumask_intersects(groupmask, sched_group_cpus(group))) {
6542 printk(KERN_CONT "\n");
6543 printk(KERN_ERR "ERROR: repeated CPUs\n");
6544 break;
6545 }
6546
6547 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6548
6549 printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group)));
6550 if (group->sgp->power != SCHED_POWER_SCALE) {
6551 printk(KERN_CONT " (cpu_power = %d)",
6552 group->sgp->power);
6553 }
6554
6555 if (group == sd->groups && sd->child &&
6556 !cpumask_equal(sched_domain_span(sd->child),
6557 sched_group_cpus(group))) {
6558 printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
6559 }
6560
6561 group = group->next;
6562 } while (group != sd->groups);
6563 printk(KERN_CONT "\n");
6564
6565 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6566 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6567
6568 if (sd->parent &&
6569 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6570 printk(KERN_ERR "ERROR: parent span is not a superset "
6571 "of domain->span\n");
6572 return 0;
6573}
6574
6575static void sched_domain_debug(struct sched_domain *sd, int cpu)
6576{
6577 int level = 0;
6578
6579 if (!sched_debug_enabled)
6580 return;
6581
6582 if (!sd) {
6583 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6584 return;
6585 }
6586
6587 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6588
6589 for (;;) {
6590 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6591 break;
6592 level++;
6593 sd = sd->parent;
6594 if (!sd)
6595 break;
6596 }
6597}
6598#else
6599# define sched_domain_debug(sd, cpu) do { } while (0)
6600static inline bool sched_debug(void)
6601{
6602 return false;
6603}
6604#endif
6605
6606static int sd_degenerate(struct sched_domain *sd)
6607{
6608 if (cpumask_weight(sched_domain_span(sd)) == 1)
6609 return 1;
6610
6611
6612 if (sd->flags & (SD_LOAD_BALANCE |
6613 SD_BALANCE_NEWIDLE |
6614 SD_BALANCE_FORK |
6615 SD_BALANCE_EXEC |
6616 SD_SHARE_CPUPOWER |
6617 SD_SHARE_PKG_RESOURCES)) {
6618 if (sd->groups != sd->groups->next)
6619 return 0;
6620 }
6621
6622
6623 if (sd->flags & (SD_WAKE_AFFINE))
6624 return 0;
6625
6626 return 1;
6627}
6628
6629static int
6630sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6631{
6632 unsigned long cflags = sd->flags, pflags = parent->flags;
6633
6634 if (sd_degenerate(parent))
6635 return 1;
6636
6637 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6638 return 0;
6639
6640
6641 if (parent->groups == parent->groups->next) {
6642 pflags &= ~(SD_LOAD_BALANCE |
6643 SD_BALANCE_NEWIDLE |
6644 SD_BALANCE_FORK |
6645 SD_BALANCE_EXEC |
6646 SD_SHARE_CPUPOWER |
6647 SD_SHARE_PKG_RESOURCES |
6648 SD_PREFER_SIBLING);
6649 if (nr_node_ids == 1)
6650 pflags &= ~SD_SERIALIZE;
6651 }
6652 if (~cflags & pflags)
6653 return 0;
6654
6655 return 1;
6656}
6657
6658static void free_rootdomain(struct rcu_head *rcu)
6659{
6660 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6661
6662 cpupri_cleanup(&rd->cpupri);
6663 cpudl_cleanup(&rd->cpudl);
6664 free_cpumask_var(rd->dlo_mask);
6665 free_cpumask_var(rd->rto_mask);
6666 free_cpumask_var(rd->online);
6667 free_cpumask_var(rd->span);
6668 kfree(rd);
6669}
6670
6671static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6672{
6673 struct root_domain *old_rd = NULL;
6674 unsigned long flags;
6675
6676 raw_spin_lock_irqsave(&rq->lock, flags);
6677
6678 if (rq->rd) {
6679 old_rd = rq->rd;
6680
6681 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6682 set_rq_offline(rq);
6683
6684 cpumask_clear_cpu(rq->cpu, old_rd->span);
6685
6686
6687
6688
6689
6690
6691 if (!atomic_dec_and_test(&old_rd->refcount))
6692 old_rd = NULL;
6693 }
6694
6695 atomic_inc(&rd->refcount);
6696 rq->rd = rd;
6697
6698 cpumask_set_cpu(rq->cpu, rd->span);
6699 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
6700 set_rq_online(rq);
6701
6702 raw_spin_unlock_irqrestore(&rq->lock, flags);
6703
6704 if (old_rd)
6705 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6706}
6707
6708static int init_rootdomain(struct root_domain *rd)
6709{
6710 memset(rd, 0, sizeof(*rd));
6711
6712 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
6713 goto out;
6714 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
6715 goto free_span;
6716 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
6717 goto free_online;
6718 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6719 goto free_dlo_mask;
6720
6721 init_dl_bw(&rd->dl_bw);
6722 if (cpudl_init(&rd->cpudl) != 0)
6723 goto free_dlo_mask;
6724
6725 if (cpupri_init(&rd->cpupri) != 0)
6726 goto free_rto_mask;
6727 return 0;
6728
6729free_rto_mask:
6730 free_cpumask_var(rd->rto_mask);
6731free_dlo_mask:
6732 free_cpumask_var(rd->dlo_mask);
6733free_online:
6734 free_cpumask_var(rd->online);
6735free_span:
6736 free_cpumask_var(rd->span);
6737out:
6738 return -ENOMEM;
6739}
6740
6741
6742
6743
6744
6745struct root_domain def_root_domain;
6746
6747static void init_defrootdomain(void)
6748{
6749 init_rootdomain(&def_root_domain);
6750
6751 atomic_set(&def_root_domain.refcount, 1);
6752}
6753
6754static struct root_domain *alloc_rootdomain(void)
6755{
6756 struct root_domain *rd;
6757
6758 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6759 if (!rd)
6760 return NULL;
6761
6762 if (init_rootdomain(rd) != 0) {
6763 kfree(rd);
6764 return NULL;
6765 }
6766
6767 return rd;
6768}
6769
6770static void free_sched_groups(struct sched_group *sg, int free_sgp)
6771{
6772 struct sched_group *tmp, *first;
6773
6774 if (!sg)
6775 return;
6776
6777 first = sg;
6778 do {
6779 tmp = sg->next;
6780
6781 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
6782 kfree(sg->sgp);
6783
6784 if (atomic_dec_and_test(&sg->ref))
6785 kfree(sg);
6786 sg = tmp;
6787 } while (sg != first);
6788}
6789
6790static void free_sched_domain(struct rcu_head *rcu)
6791{
6792 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6793
6794
6795
6796
6797
6798 free_sched_groups(sd->groups, 1);
6799
6800 kfree(sd);
6801}
6802
6803static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6804{
6805 call_rcu(&sd->rcu, free_sched_domain);
6806}
6807
6808static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6809{
6810 for (; sd; sd = sd->parent)
6811 destroy_sched_domain(sd, cpu);
6812}
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
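/*
 * Per-CPU shortcut pointers to frequently used sched domains: the highest
 * domain sharing the last-level cache (sd_llc, plus its size and id), the
 * lowest NUMA domain, the parent of the LLC domain (sd_busy) and the
 * highest SD_ASYM_PACKING domain.  Kept up to date by
 * update_top_cache_domain().
 */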
6823DEFINE_PER_CPU(struct sched_domain *, sd_llc);
6824DEFINE_PER_CPU(int, sd_llc_size);
6825DEFINE_PER_CPU(int, sd_llc_id);
6826DEFINE_PER_CPU(struct sched_domain *, sd_numa);
6827DEFINE_PER_CPU(struct sched_domain *, sd_busy);
6828DEFINE_PER_CPU(struct sched_domain *, sd_asym);
6829
6830static void update_top_cache_domain(int cpu)
6831{
6832 struct sched_domain *sd;
6833 struct sched_domain *busy_sd = NULL;
6834 int id = cpu;
6835 int size = 1;
6836
6837 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6838 if (sd) {
6839 id = cpumask_first(sched_domain_span(sd));
6840 size = cpumask_weight(sched_domain_span(sd));
6841 busy_sd = sd->parent;
6842 }
6843 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
6844
6845 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6846 per_cpu(sd_llc_size, cpu) = size;
6847 per_cpu(sd_llc_id, cpu) = id;
6848
6849 sd = lowest_flag_domain(cpu, SD_NUMA);
6850 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
6851
6852 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
6853 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
6854}
6855
6856
6857
6858
6859
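/*
 * Attach the domain hierarchy @sd to @cpu as its base domain, collapsing
 * degenerate levels first, and switch the runqueue over to root domain
 * @rd.  The old hierarchy is freed via RCU.
 */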
6860static void
6861cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6862{
6863 struct rq *rq = cpu_rq(cpu);
6864 struct sched_domain *tmp;
6865
6866
6867 for (tmp = sd; tmp; ) {
6868 struct sched_domain *parent = tmp->parent;
6869 if (!parent)
6870 break;
6871
6872 if (sd_parent_degenerate(tmp, parent)) {
6873 tmp->parent = parent->parent;
6874 if (parent->parent)
6875 parent->parent->child = tmp;
6876
6877
6878
6879
6880
6881 if (parent->flags & SD_PREFER_SIBLING)
6882 tmp->flags |= SD_PREFER_SIBLING;
6883 destroy_sched_domain(parent, cpu);
6884 } else
6885 tmp = tmp->parent;
6886 }
6887
6888 if (sd && sd_degenerate(sd)) {
6889 tmp = sd;
6890 sd = sd->parent;
6891 destroy_sched_domain(tmp, cpu);
6892 if (sd)
6893 sd->child = NULL;
6894 }
6895
6896 sched_domain_debug(sd, cpu);
6897
6898 rq_attach_root(rq, rd);
6899 tmp = rq->sd;
6900 rcu_assign_pointer(rq->sd, sd);
6901 destroy_sched_domains(tmp, cpu);
6902
6903 update_top_cache_domain(cpu);
6904}
6905
6906
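/* Set up the mask of CPUs configured for isolated domains ("isolcpus="). */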
6907static int __init isolated_cpu_setup(char *str)
6908{
6909 int ret;
6910
6911 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6912 ret = cpulist_parse(str, cpu_isolated_map);
6913 if (ret) {
6914 pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
6915 return 0;
6916 }
6917 return 1;
6918}
6919__setup("isolcpus=", isolated_cpu_setup);
6920
6921struct s_data {
6922 struct sched_domain ** __percpu sd;
6923 struct root_domain *rd;
6924};
6925
6926enum s_alloc {
6927 sa_rootdomain,
6928 sa_sd,
6929 sa_sd_storage,
6930 sa_none,
6931};
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
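/*
 * Return the canonical balance CPU for this group: the first CPU of the
 * group that is also set in the group's balance mask.
 */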
6942int group_balance_cpu(struct sched_group *sg)
6943{
6944 return cpumask_first(sched_group_mask(sg));
6945}
6946
7055
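/*
 * Build the balance mask for an overlapping group: the CPUs in @sg whose
 * sibling domain's child at this level spans exactly the group, i.e. the
 * CPUs that are allowed to do load balancing on behalf of this group.
 */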
7056static void
7057build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
7058{
7059 const struct cpumask *sg_span = sched_group_cpus(sg);
7060 struct sd_data *sdd = sd->private;
7061 struct sched_domain *sibling;
7062 int i;
7063
7064 cpumask_clear(mask);
7065
7066 for_each_cpu(i, sg_span) {
7067 sibling = *per_cpu_ptr(sdd->sd, i);
7068
7069
7070
7071
7072
7073
7074 if (!sibling->child)
7075 continue;
7076
7077
7078 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
7079 continue;
7080
7081 cpumask_set_cpu(i, mask);
7082 }
7083
7084
7085 WARN_ON_ONCE(cpumask_empty(mask));
7086}
7087
7088
7089
7090
7091
7092
7093static struct sched_group *
7094build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
7095{
7096 struct sched_group *sg;
7097 struct cpumask *sg_span;
7098
7099 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7100 GFP_KERNEL, cpu_to_node(cpu));
7101
7102 if (!sg)
7103 return NULL;
7104
7105 sg_span = sched_group_cpus(sg);
7106 if (sd->child)
7107 cpumask_copy(sg_span, sched_domain_span(sd->child));
7108 else
7109 cpumask_copy(sg_span, sched_domain_span(sd));
7110
7111 atomic_inc(&sg->ref);
7112 return sg;
7113}
7114
7115static void init_overlap_sched_group(struct sched_domain *sd,
7116 struct sched_group *sg)
7117{
7118 struct cpumask *mask = sched_domains_tmpmask2;
7119 struct sd_data *sdd = sd->private;
7120 struct cpumask *sg_span;
7121 int cpu;
7122
7123 build_group_mask(sd, sg, mask);
7124 cpu = cpumask_first_and(sched_group_cpus(sg), mask);
7125
7126 sg->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7127 if (atomic_inc_return(&sg->sgp->ref) == 1)
7128 cpumask_copy(sched_group_mask(sg), mask);
7129 else
7130 WARN_ON_ONCE(!cpumask_equal(sched_group_mask(sg), mask));
7131
7132
7133
7134
7135
7136
7137 sg_span = sched_group_cpus(sg);
7138 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
7139 sg->sgp->power_orig = sg->sgp->power;
7140}
7141
7142static int
7143build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7144{
7145 struct sched_group *first = NULL, *last = NULL, *sg;
7146 const struct cpumask *span = sched_domain_span(sd);
7147 struct cpumask *covered = sched_domains_tmpmask;
7148 struct sd_data *sdd = sd->private;
7149 struct sched_domain *sibling;
7150 int i;
7151
7152 cpumask_clear(covered);
7153
7154 for_each_cpu_wrap(i, span, cpu) {
7155 struct cpumask *sg_span;
7156
7157 if (cpumask_test_cpu(i, covered))
7158 continue;
7159
7160 sibling = *per_cpu_ptr(sdd->sd, i);
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
7173 continue;
7174
7175 sg = build_group_from_child_sched_domain(sibling, cpu);
7176 if (!sg)
7177 goto fail;
7178
7179 sg_span = sched_group_cpus(sg);
7180 cpumask_or(covered, covered, sg_span);
7181
7182 init_overlap_sched_group(sd, sg);
7183
7184 if (!first)
7185 first = sg;
7186 if (last)
7187 last->next = sg;
7188 last = sg;
7189 last->next = first;
7190 }
7191 sd->groups = first;
7192
7193 return 0;
7194
7195fail:
7196 free_sched_groups(first, 0);
7197
7198 return -ENOMEM;
7199}
7200
7272
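/*
 * Grab the pre-allocated, non-overlapping group (and its sched_group_power)
 * for @cpu from the per-CPU sd_data, take references, fill in its cpumasks
 * and give it a default power.
 */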
7273static struct sched_group *get_group(int cpu, struct sd_data *sdd)
7274{
7275 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
7276 struct sched_domain *child = sd->child;
7277 struct sched_group *sg;
7278
7279 if (child)
7280 cpu = cpumask_first(sched_domain_span(child));
7281
7282 sg = *per_cpu_ptr(sdd->sg, cpu);
7283 sg->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7284
7285
7286 atomic_inc(&sg->ref);
7287 atomic_inc(&sg->sgp->ref);
7288
7289 if (child) {
7290 cpumask_copy(sched_group_cpus(sg), sched_domain_span(child));
7291 cpumask_copy(sched_group_mask(sg), sched_group_cpus(sg));
7292 } else {
7293 cpumask_set_cpu(cpu, sched_group_cpus(sg));
7294 cpumask_set_cpu(cpu, sched_group_mask(sg));
7295 }
7296
7297 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sched_group_cpus(sg));
7298 sg->sgp->power_orig = sg->sgp->power;
7299
7300 return sg;
7301}
7302
7303
7304
7305
7306
7307
7308
7309
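/*
 * build_sched_groups() links the groups of a non-overlapping domain into
 * a circular list covering the domain's span, starting with the group
 * that contains @cpu.
 */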
7310static int
7311build_sched_groups(struct sched_domain *sd, int cpu)
7312{
7313 struct sched_group *first = NULL, *last = NULL;
7314 struct sd_data *sdd = sd->private;
7315 const struct cpumask *span = sched_domain_span(sd);
7316 struct cpumask *covered;
7317 int i;
7318
7319 lockdep_assert_held(&sched_domains_mutex);
7320 covered = sched_domains_tmpmask;
7321
7322 cpumask_clear(covered);
7323
7324 for_each_cpu_wrap(i, span, cpu) {
7325 struct sched_group *sg;
7326
7327 if (cpumask_test_cpu(i, covered))
7328 continue;
7329
7330 sg = get_group(i, sdd);
7331
7332 cpumask_or(covered, covered, sched_group_cpus(sg));
7333
7334 if (!first)
7335 first = sg;
7336 if (last)
7337 last->next = sg;
7338 last = sg;
7339 }
7340 last->next = first;
7341 sd->groups = first;
7342
7343 return 0;
7344}
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
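/*
 * Initialize each group's weight and, for SD_ASYM_PACKING domains, its
 * preferred CPU.  The group power itself is only (re)computed by the
 * group's balance CPU via update_group_power().
 */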
7356static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7357{
7358 struct sched_group *sg = sd->groups;
7359
7360 WARN_ON(!sd || !sg);
7361
7362 do {
7363 int cpu, max_cpu = -1;
7364
7365 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7366
7367 if (!(sd->flags & SD_ASYM_PACKING))
7368 goto next;
7369
7370 for_each_cpu(cpu, sched_group_cpus(sg)) {
7371 if (max_cpu < 0)
7372 max_cpu = cpu;
7373 else if (sched_asym_prefer(cpu, max_cpu))
7374 max_cpu = cpu;
7375 }
7376 sg->asym_prefer_cpu = max_cpu;
7377
7378next:
7379 sg = sg->next;
7380 } while (sg != sd->groups);
7381
7382 if (cpu != group_balance_cpu(sg))
7383 return;
7384
7385 update_group_power(sd, cpu);
7386 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
7387}
7388
7389int __weak arch_sd_sibling_asym_packing(void)
7390{
7391 return 0*SD_ASYM_PACKING;
7392}
7393
7394
7395
7396
7397
7398
7399static int default_relax_domain_level = -1;
7400int sched_domain_level_max;
7401
7402static int __init setup_relax_domain_level(char *str)
7403{
7404 if (kstrtoint(str, 0, &default_relax_domain_level))
7405 pr_warn("Unable to set relax_domain_level\n");
7406
7407 return 1;
7408}
7409__setup("relax_domain_level=", setup_relax_domain_level);
7410
7411static void set_domain_attribute(struct sched_domain *sd,
7412 struct sched_domain_attr *attr)
7413{
7414 int request;
7415
7416 if (!attr || attr->relax_domain_level < 0) {
7417 if (default_relax_domain_level < 0)
7418 return;
7419 else
7420 request = default_relax_domain_level;
7421 } else
7422 request = attr->relax_domain_level;
7423 if (request < sd->level) {
7424
7425 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7426 } else {
7427
7428 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7429 }
7430}
7431
7432static void __sdt_free(const struct cpumask *cpu_map);
7433static int __sdt_alloc(const struct cpumask *cpu_map);
7434
7435static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7436 const struct cpumask *cpu_map)
7437{
7438 switch (what) {
7439 case sa_rootdomain:
7440 if (!atomic_read(&d->rd->refcount))
7441 free_rootdomain(&d->rd->rcu);
7442 case sa_sd:
7443 free_percpu(d->sd);
7444 case sa_sd_storage:
7445 __sdt_free(cpu_map);
7446 case sa_none:
7447 break;
7448 }
7449}
7450
7451static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7452 const struct cpumask *cpu_map)
7453{
7454 memset(d, 0, sizeof(*d));
7455
7456 if (__sdt_alloc(cpu_map))
7457 return sa_sd_storage;
7458 d->sd = alloc_percpu(struct sched_domain *);
7459 if (!d->sd)
7460 return sa_sd_storage;
7461 d->rd = alloc_rootdomain();
7462 if (!d->rd)
7463 return sa_sd;
7464 return sa_rootdomain;
7465}
7466
7467
7468
7469
7470
7471
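/*
 * NULL out the sd_data slots that ended up being used by the final
 * domain/group tree so that __sdt_free() won't free what is still in use.
 */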
7472static void claim_allocations(int cpu, struct sched_domain *sd)
7473{
7474 struct sd_data *sdd = sd->private;
7475
7476 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7477 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7478
7479 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
7480 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7481
7482 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
7483 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
7484}
7485
7486#ifdef CONFIG_NUMA
7487static int sched_domains_numa_levels;
7488static int *sched_domains_numa_distance;
7489static struct cpumask ***sched_domains_numa_masks;
7490static int sched_domains_curr_level;
7491#endif
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
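/*
 * Topology levels may only pass these SD flags through their sd_flags()
 * callback; sd_init() warns about and strips anything else.
 */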
7503#define TOPOLOGY_SD_FLAGS \
7504 (SD_SHARE_CPUPOWER | \
7505 SD_SHARE_PKG_RESOURCES | \
7506 SD_NUMA | \
7507 SD_ASYM_PACKING)
7508
7509static struct sched_domain *
7510sd_init(struct sched_domain_topology_level *tl, int cpu)
7511{
7512 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
7513 int sd_weight, sd_flags = 0;
7514
7515#ifdef CONFIG_NUMA
7516
7517
7518
7519 sched_domains_curr_level = tl->numa_level;
7520#endif
7521
7522 sd_weight = cpumask_weight(tl->mask(cpu));
7523
7524 if (tl->sd_flags)
7525 sd_flags = (*tl->sd_flags)();
7526 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
7527 "wrong sd_flags in topology description\n"))
7528 sd_flags &= ~TOPOLOGY_SD_FLAGS;
7529
7530 *sd = (struct sched_domain){
7531 .min_interval = sd_weight,
7532 .max_interval = 2*sd_weight,
7533 .busy_factor = 32,
7534 .imbalance_pct = 125,
7535
7536 .cache_nice_tries = 0,
7537 .busy_idx = 0,
7538 .idle_idx = 0,
7539 .newidle_idx = 0,
7540 .wake_idx = 0,
7541 .forkexec_idx = 0,
7542
7543 .flags = 1*SD_LOAD_BALANCE
7544 | 1*SD_BALANCE_NEWIDLE
7545 | 1*SD_BALANCE_EXEC
7546 | 1*SD_BALANCE_FORK
7547 | 0*SD_BALANCE_WAKE
7548 | 1*SD_WAKE_AFFINE
7549 | 0*SD_SHARE_CPUPOWER
7550 | 0*SD_SHARE_PKG_RESOURCES
7551 | 0*SD_SERIALIZE
7552 | 0*SD_PREFER_SIBLING
7553 | 0*SD_NUMA
7554 | sd_flags
7555 ,
7556
7557 .last_balance = jiffies,
7558 .balance_interval = sd_weight,
7559 .smt_gain = 0,
7560 .max_newidle_lb_cost = 0,
7561 .next_decay_max_lb_cost = jiffies,
7562#ifdef CONFIG_SCHED_DEBUG
7563 .name = tl->name,
7564#endif
7565 };
7566
7567
7568
7569
7570
7571 if (sd->flags & SD_SHARE_CPUPOWER) {
7572 sd->imbalance_pct = 110;
7573 sd->smt_gain = 1178;
7574 sd->flags |= arch_sd_sibling_asym_packing();
7575
7576 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
7577 sd->imbalance_pct = 117;
7578 sd->cache_nice_tries = 1;
7579 sd->busy_idx = 2;
7580
7581#ifdef CONFIG_NUMA
7582 } else if (sd->flags & SD_NUMA) {
7583 sd->cache_nice_tries = 2;
7584 sd->busy_idx = 3;
7585 sd->idle_idx = 2;
7586
7587 sd->flags |= SD_SERIALIZE;
7588 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
7589 sd->flags &= ~(SD_BALANCE_EXEC |
7590 SD_BALANCE_FORK |
7591 SD_WAKE_AFFINE);
7592 }
7593
7594#endif
7595 } else {
7596 sd->flags |= SD_PREFER_SIBLING;
7597 sd->cache_nice_tries = 1;
7598 sd->busy_idx = 2;
7599 sd->idle_idx = 1;
7600 }
7601
7602 sd->private = &tl->data;
7603
7604 return sd;
7605}
7606
7607
7608
7609
7610static struct sched_domain_topology_level default_topology[] = {
7611#ifdef CONFIG_SCHED_SMT
7612 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
7613#endif
7614#ifdef CONFIG_SCHED_MC
7615 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
7616#endif
7617#ifdef CONFIG_SCHED_BOOK
7618 { cpu_book_mask, SD_INIT_NAME(BOOK) },
7619#endif
7620#ifdef CONFIG_SCHED_DRAWER
7621 { cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
7622#endif
7623 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
7624 { NULL, },
7625};
7626
7627struct sched_domain_topology_level *sched_domain_topology = default_topology;
7628
7629#define for_each_sd_topology(tl) \
7630 for (tl = sched_domain_topology; tl->mask; tl++)
7631
7632void set_sched_topology(struct sched_domain_topology_level *tl)
7633{
7634 if (WARN_ON_ONCE(sched_smp_initialized))
7635 return;
7636
7637 sched_domain_topology = tl;
7638}
7639
7640#ifdef CONFIG_NUMA
7641
7642static const struct cpumask *sd_numa_mask(int cpu)
7643{
7644 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
7645}
7646
7647static void sched_numa_warn(const char *str)
7648{
7649 static int done = false;
7650 int i,j;
7651
7652 if (done)
7653 return;
7654
7655 done = true;
7656
7657 printk(KERN_WARNING "ERROR: %s\n\n", str);
7658
7659 for (i = 0; i < nr_node_ids; i++) {
7660 printk(KERN_WARNING " ");
7661 for (j = 0; j < nr_node_ids; j++)
7662 printk(KERN_CONT "%02d ", node_distance(i,j));
7663 printk(KERN_CONT "\n");
7664 }
7665 printk(KERN_WARNING "\n");
7666}
7667
7668static bool find_numa_distance(int distance)
7669{
7670 int i;
7671
7672 if (distance == node_distance(0, 0))
7673 return true;
7674
7675 for (i = 0; i < sched_domains_numa_levels; i++) {
7676 if (sched_domains_numa_distance[i] == distance)
7677 return true;
7678 }
7679
7680 return false;
7681}
7682
7683static void sched_init_numa(void)
7684{
7685 int next_distance, curr_distance = node_distance(0, 0);
7686 struct sched_domain_topology_level *tl;
7687 int level = 0;
7688 int i, j, k;
7689
7690 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
7691 if (!sched_domains_numa_distance)
7692 return;
7693
7694
7695
7696
7697
7698
7699
7700
7701 next_distance = curr_distance;
7702 for (i = 0; i < nr_node_ids; i++) {
7703 for (j = 0; j < nr_node_ids; j++) {
7704 for (k = 0; k < nr_node_ids; k++) {
7705 int distance = node_distance(i, k);
7706
7707 if (distance > curr_distance &&
7708 (distance < next_distance ||
7709 next_distance == curr_distance))
7710 next_distance = distance;
7711
7712
7713
7714
7715
7716
7717 if (sched_debug() && node_distance(k, i) != distance)
7718 sched_numa_warn("Node-distance not symmetric");
7719
7720 if (sched_debug() && i && !find_numa_distance(distance))
7721 sched_numa_warn("Node-0 not representative");
7722 }
7723 if (next_distance != curr_distance) {
7724 sched_domains_numa_distance[level++] = next_distance;
7725 sched_domains_numa_levels = level;
7726 curr_distance = next_distance;
7727 } else break;
7728 }
7729
7730
7731
7732
7733 if (!sched_debug())
7734 break;
7735 }
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753 sched_domains_numa_levels = 0;
7754
7755 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
7756 if (!sched_domains_numa_masks)
7757 return;
7758
7759
7760
7761
7762
7763 for (i = 0; i < level; i++) {
7764 sched_domains_numa_masks[i] =
7765 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
7766 if (!sched_domains_numa_masks[i])
7767 return;
7768
7769 for (j = 0; j < nr_node_ids; j++) {
7770 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
7771 if (!mask)
7772 return;
7773
7774 sched_domains_numa_masks[i][j] = mask;
7775
7776 for (k = 0; k < nr_node_ids; k++) {
7777 if (node_distance(j, k) > sched_domains_numa_distance[i])
7778 continue;
7779
7780 cpumask_or(mask, mask, cpumask_of_node(k));
7781 }
7782 }
7783 }
7784
7785
7786 for (i = 0; sched_domain_topology[i].mask; i++);
7787
7788 tl = kzalloc((i + level + 1) *
7789 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
7790 if (!tl)
7791 return;
7792
7793
7794
7795
7796 for (i = 0; sched_domain_topology[i].mask; i++)
7797 tl[i] = sched_domain_topology[i];
7798
7799
7800
7801
7802 for (j = 0; j < level; i++, j++) {
7803 tl[i] = (struct sched_domain_topology_level){
7804 .mask = sd_numa_mask,
7805 .sd_flags = cpu_numa_flags,
7806 .flags = SDTL_OVERLAP,
7807 .numa_level = j,
7808 SD_INIT_NAME(NUMA)
7809 };
7810 }
7811
7812 sched_domain_topology = tl;
7813
7814 sched_domains_numa_levels = level;
7815}
7816
7817static void sched_domains_numa_masks_set(int cpu)
7818{
7819 int i, j;
7820 int node = cpu_to_node(cpu);
7821
7822 for (i = 0; i < sched_domains_numa_levels; i++) {
7823 for (j = 0; j < nr_node_ids; j++) {
7824 if (node_distance(j, node) <= sched_domains_numa_distance[i])
7825 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
7826 }
7827 }
7828}
7829
7830static void sched_domains_numa_masks_clear(int cpu)
7831{
7832 int i, j;
7833 for (i = 0; i < sched_domains_numa_levels; i++) {
7834 for (j = 0; j < nr_node_ids; j++)
7835 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
7836 }
7837}
7838
7839
7840
7841
7842
7843static int sched_domains_numa_masks_update(struct notifier_block *nfb,
7844 unsigned long action,
7845 void *hcpu)
7846{
7847 int cpu = (long)hcpu;
7848
7849 if (!sched_smp_initialized)
7850 return NOTIFY_DONE;
7851
7852 switch (action & ~CPU_TASKS_FROZEN) {
7853 case CPU_ONLINE:
7854 sched_domains_numa_masks_set(cpu);
7855 break;
7856
7857 case CPU_DEAD:
7858 sched_domains_numa_masks_clear(cpu);
7859 break;
7860
7861 default:
7862 return NOTIFY_DONE;
7863 }
7864
7865 return NOTIFY_OK;
7866}
7867#else
7868static inline void sched_init_numa(void)
7869{
7870}
7871
7872static int sched_domains_numa_masks_update(struct notifier_block *nfb,
7873 unsigned long action,
7874 void *hcpu)
7875{
7876 return 0;
7877}
7878#endif
7879
7880static int __sdt_alloc(const struct cpumask *cpu_map)
7881{
7882 struct sched_domain_topology_level *tl;
7883 int j;
7884
7885 for_each_sd_topology(tl) {
7886 struct sd_data *sdd = &tl->data;
7887
7888 sdd->sd = alloc_percpu(struct sched_domain *);
7889 if (!sdd->sd)
7890 return -ENOMEM;
7891
7892 sdd->sg = alloc_percpu(struct sched_group *);
7893 if (!sdd->sg)
7894 return -ENOMEM;
7895
7896 sdd->sgp = alloc_percpu(struct sched_group_power *);
7897 if (!sdd->sgp)
7898 return -ENOMEM;
7899
7900 for_each_cpu(j, cpu_map) {
7901 struct sched_domain *sd;
7902 struct sched_group *sg;
7903 struct sched_group_power *sgp;
7904
7905 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7906 GFP_KERNEL, cpu_to_node(j));
7907 if (!sd)
7908 return -ENOMEM;
7909
7910 *per_cpu_ptr(sdd->sd, j) = sd;
7911
7912 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7913 GFP_KERNEL, cpu_to_node(j));
7914 if (!sg)
7915 return -ENOMEM;
7916
7917 sg->next = sg;
7918
7919 *per_cpu_ptr(sdd->sg, j) = sg;
7920
7921 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
7922 GFP_KERNEL, cpu_to_node(j));
7923 if (!sgp)
7924 return -ENOMEM;
7925
7926 *per_cpu_ptr(sdd->sgp, j) = sgp;
7927 }
7928 }
7929
7930 return 0;
7931}
7932
7933static void __sdt_free(const struct cpumask *cpu_map)
7934{
7935 struct sched_domain_topology_level *tl;
7936 int j;
7937
7938 for_each_sd_topology(tl) {
7939 struct sd_data *sdd = &tl->data;
7940
7941 for_each_cpu(j, cpu_map) {
7942 struct sched_domain *sd;
7943
7944 if (sdd->sd) {
7945 sd = *per_cpu_ptr(sdd->sd, j);
7946 if (sd && (sd->flags & SD_OVERLAP))
7947 free_sched_groups(sd->groups, 0);
7948 kfree(*per_cpu_ptr(sdd->sd, j));
7949 }
7950
7951 if (sdd->sg)
7952 kfree(*per_cpu_ptr(sdd->sg, j));
7953 if (sdd->sgp)
7954 kfree(*per_cpu_ptr(sdd->sgp, j));
7955 }
7956 free_percpu(sdd->sd);
7957 sdd->sd = NULL;
7958 free_percpu(sdd->sg);
7959 sdd->sg = NULL;
7960 free_percpu(sdd->sgp);
7961 sdd->sgp = NULL;
7962 }
7963}
7964
7965struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7966 struct s_data *d, const struct cpumask *cpu_map,
7967 struct sched_domain_attr *attr, struct sched_domain *child,
7968 int cpu)
7969{
7970 struct sched_domain *sd = sd_init(tl, cpu);
7971 if (!sd)
7972 return child;
7973
7974 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7975 if (child) {
7976 sd->level = child->level + 1;
7977 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7978 child->parent = sd;
7979 sd->child = child;
7980
7981 if (!cpumask_subset(sched_domain_span(child),
7982 sched_domain_span(sd))) {
7983 pr_err("BUG: arch topology broken\n");
7984#ifdef CONFIG_SCHED_DEBUG
7985 pr_err(" the %s domain not a subset of the %s domain\n",
7986 child->name, sd->name);
7987#endif
7988
7989 cpumask_or(sched_domain_span(sd),
7990 sched_domain_span(sd),
7991 sched_domain_span(child));
7992 }
7993
7994 }
7995 set_domain_attribute(sd, attr);
7996
7997 return sd;
7998}
7999
8000
8001
8002
8003
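/*
 * Build the sched domains for the given CPU map and attach them to the
 * individual CPUs' runqueues; on failure everything allocated so far is
 * torn down again.
 */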
8004static int build_sched_domains(const struct cpumask *cpu_map,
8005 struct sched_domain_attr *attr)
8006{
8007 enum s_alloc alloc_state = sa_none;
8008 struct sched_domain *sd;
8009 struct s_data d;
8010 int i, ret = -ENOMEM;
8011
8012 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8013 if (alloc_state != sa_rootdomain)
8014 goto error;
8015
8016
8017 for_each_cpu(i, cpu_map) {
8018 struct sched_domain_topology_level *tl;
8019
8020 sd = NULL;
8021 for_each_sd_topology(tl) {
8022 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
8023 if (tl->flags & SDTL_OVERLAP)
8024 sd->flags |= SD_OVERLAP;
8025 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
8026 break;
8027 }
8028
8029 while (sd->child)
8030 sd = sd->child;
8031
8032 *per_cpu_ptr(d.sd, i) = sd;
8033 }
8034
8035
8036 for_each_cpu(i, cpu_map) {
8037 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
8038 sd->span_weight = cpumask_weight(sched_domain_span(sd));
8039 if (sd->flags & SD_OVERLAP) {
8040 if (build_overlap_sched_groups(sd, i))
8041 goto error;
8042 } else {
8043 if (build_sched_groups(sd, i))
8044 goto error;
8045 }
8046 }
8047 }
8048
8049
8050 for (i = nr_cpumask_bits-1; i >= 0; i--) {
8051 if (!cpumask_test_cpu(i, cpu_map))
8052 continue;
8053
8054 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
8055 claim_allocations(i, sd);
8056 init_sched_groups_power(i, sd);
8057 }
8058 }
8059
8060
8061 rcu_read_lock();
8062 for_each_cpu(i, cpu_map) {
8063 sd = *per_cpu_ptr(d.sd, i);
8064 cpu_attach_domain(sd, d.rd, i);
8065 }
8066 rcu_read_unlock();
8067
8068 ret = 0;
8069error:
8070 __free_domain_allocs(&d, alloc_state, cpu_map);
8071 return ret;
8072}
8073
8074static cpumask_var_t *doms_cur;
8075static int ndoms_cur;
8076static struct sched_domain_attr *dattr_cur;
8077
8078
8079
8080
8081
8082
8083
8084static cpumask_var_t fallback_doms;
8085
8086
8087
8088
8089
8090
8091int __weak arch_update_cpu_topology(void)
8092{
8093 return 0;
8094}
8095
8096cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
8097{
8098 int i;
8099 cpumask_var_t *doms;
8100
8101 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
8102 if (!doms)
8103 return NULL;
8104 for (i = 0; i < ndoms; i++) {
8105 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
8106 free_sched_domains(doms, i);
8107 return NULL;
8108 }
8109 }
8110 return doms;
8111}
8112
8113void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
8114{
8115 unsigned int i;
8116 for (i = 0; i < ndoms; i++)
8117 free_cpumask_var(doms[i]);
8118 kfree(doms);
8119}
8120
8121
8122
8123
8124
8125
8126static int sched_init_domains(const struct cpumask *cpu_map)
8127{
8128 int err;
8129
8130 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
8131 zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
8132 zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8133
8134 arch_update_cpu_topology();
8135 ndoms_cur = 1;
8136 doms_cur = alloc_sched_domains(ndoms_cur);
8137 if (!doms_cur)
8138 doms_cur = &fallback_doms;
8139 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
8140 err = build_sched_domains(doms_cur[0], NULL);
8141 register_sched_domain_sysctl();
8142
8143 return err;
8144}
8145
8146
8147
8148
8149
8150static void detach_destroy_domains(const struct cpumask *cpu_map)
8151{
8152 int i;
8153
8154 rcu_read_lock();
8155 for_each_cpu(i, cpu_map)
8156 cpu_attach_domain(NULL, &def_root_domain, i);
8157 rcu_read_unlock();
8158}
8159
8160
8161static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8162 struct sched_domain_attr *new, int idx_new)
8163{
8164 struct sched_domain_attr tmp;
8165
8166
8167 if (!new && !cur)
8168 return 1;
8169
8170 tmp = SD_ATTR_INIT;
8171 return !memcmp(cur ? (cur + idx_cur) : &tmp,
8172 new ? (new + idx_new) : &tmp,
8173 sizeof(struct sched_domain_attr));
8174}
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
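/*
 * partition_sched_domains - rebuild the scheduler's domain partitioning.
 * @ndoms_new: number of entries in @doms_new
 * @doms_new:  cpumasks describing the new partitions (may be NULL)
 * @dattr_new: per-partition attributes (may be NULL)
 *
 * Partitions unchanged relative to the current setup are kept; removed
 * ones are destroyed and new ones are built.  A NULL @doms_new falls back
 * to a single partition of all active, non-isolated CPUs.
 */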
8202void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
8203 struct sched_domain_attr *dattr_new)
8204{
8205 int i, j, n;
8206 int new_topology;
8207
8208 mutex_lock(&sched_domains_mutex);
8209
8210
8211 unregister_sched_domain_sysctl();
8212
8213
8214 new_topology = arch_update_cpu_topology();
8215
8216 n = doms_new ? ndoms_new : 0;
8217
8218
8219 for (i = 0; i < ndoms_cur; i++) {
8220 for (j = 0; j < n && !new_topology; j++) {
8221 if (cpumask_equal(doms_cur[i], doms_new[j])
8222 && dattrs_equal(dattr_cur, i, dattr_new, j))
8223 goto match1;
8224 }
8225
8226 detach_destroy_domains(doms_cur[i]);
8227match1:
8228 ;
8229 }
8230
8231 n = ndoms_cur;
8232 if (doms_new == NULL) {
8233 n = 0;
8234 doms_new = &fallback_doms;
8235 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
8236 WARN_ON_ONCE(dattr_new);
8237 }
8238
8239
8240 for (i = 0; i < ndoms_new; i++) {
8241 for (j = 0; j < n && !new_topology; j++) {
8242 if (cpumask_equal(doms_new[i], doms_cur[j])
8243 && dattrs_equal(dattr_new, i, dattr_cur, j))
8244 goto match2;
8245 }
8246
8247 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
8248match2:
8249 ;
8250 }
8251
8252
8253 if (doms_cur != &fallback_doms)
8254 free_sched_domains(doms_cur, ndoms_cur);
8255 kfree(dattr_cur);
8256 doms_cur = doms_new;
8257 dattr_cur = dattr_new;
8258 ndoms_cur = ndoms_new;
8259
8260 register_sched_domain_sysctl();
8261
8262 mutex_unlock(&sched_domains_mutex);
8263}
8264
8265static int num_cpus_frozen;
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
8276 void *hcpu)
8277{
8278 if (!sched_smp_initialized)
8279 return NOTIFY_DONE;
8280
8281 switch (action) {
8282 case CPU_ONLINE_FROZEN:
8283 case CPU_DOWN_FAILED_FROZEN:
8284
8285
8286
8287
8288
8289
8290
8291 num_cpus_frozen--;
8292 if (likely(num_cpus_frozen)) {
8293 partition_sched_domains(1, NULL, NULL);
8294 break;
8295 }
8296
8297
8298
8299
8300
8301
8302
8303 case CPU_ONLINE:
8304 cpuset_update_active_cpus(true);
8305 break;
8306 default:
8307 return NOTIFY_DONE;
8308 }
8309 return NOTIFY_OK;
8310}
8311
8312static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8313 void *hcpu)
8314{
8315 unsigned long flags;
8316 long cpu = (long)hcpu;
8317 struct dl_bw *dl_b;
8318 bool overflow;
8319 int cpus;
8320
8321 if (!sched_smp_initialized)
8322 return NOTIFY_DONE;
8323
8324 switch (action) {
8325 case CPU_DOWN_PREPARE:
8326 rcu_read_lock_sched();
8327 dl_b = dl_bw_of(cpu);
8328
8329 raw_spin_lock_irqsave(&dl_b->lock, flags);
8330 cpus = dl_bw_cpus(cpu);
8331 overflow = __dl_overflow(dl_b, cpus, 0, 0);
8332 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
8333
8334 rcu_read_unlock_sched();
8335
8336 if (overflow)
8337 return notifier_from_errno(-EBUSY);
8338 cpuset_update_active_cpus(false);
8339 break;
8340 case CPU_DOWN_PREPARE_FROZEN:
8341 num_cpus_frozen++;
8342 partition_sched_domains(1, NULL, NULL);
8343 break;
8344 default:
8345 return NOTIFY_DONE;
8346 }
8347 return NOTIFY_OK;
8348}
8349
8350void __init sched_init_smp(void)
8351{
8352 cpumask_var_t non_isolated_cpus;
8353
8354 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
8355
8356 sched_init_numa();
8357
8358
8359
8360
8361
8362
8363 mutex_lock(&sched_domains_mutex);
8364 sched_init_domains(cpu_active_mask);
8365 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
8366 if (cpumask_empty(non_isolated_cpus))
8367 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8368 mutex_unlock(&sched_domains_mutex);
8369
8370 init_hrtick();
8371
8372
8373 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8374 BUG();
8375 sched_init_granularity();
8376 free_cpumask_var(non_isolated_cpus);
8377
8378 init_sched_rt_class();
8379 init_sched_dl_class();
8380 sched_smp_initialized = true;
8381}
8382
8383static int __init migration_init(void)
8384{
8385 void *cpu = (void *)(long)smp_processor_id();
8386 int err;
8387
8388
8389 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
8390 BUG_ON(err == NOTIFY_BAD);
8391 migration_call(&migration_notifier, CPU_ONLINE, cpu);
8392 register_cpu_notifier(&migration_notifier);
8393
8394
8395 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
8396 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
8397
8398 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
8399 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
8400 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
8401
8402 return 0;
8403}
8404early_initcall(migration_init);
8405
8406#else
8407void __init sched_init_smp(void)
8408{
8409 sched_init_granularity();
8410}
8411#endif
8412
8413int in_sched_functions(unsigned long addr)
8414{
8415 return in_lock_functions(addr) ||
8416 (addr >= (unsigned long)__sched_text_start
8417 && addr < (unsigned long)__sched_text_end);
8418}
8419
8420#ifdef CONFIG_CGROUP_SCHED
8421
8422
8423
8424
8425struct task_group root_task_group;
8426LIST_HEAD(task_groups);
8427#endif
8428
8429DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
8430
8431void __init sched_init(void)
8432{
8433 int i, j;
8434 unsigned long alloc_size = 0, ptr;
8435
8436 wait_bit_init();
8437
8438#ifdef CONFIG_FAIR_GROUP_SCHED
8439 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8440#endif
8441#ifdef CONFIG_RT_GROUP_SCHED
8442 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8443#endif
8444 if (alloc_size) {
8445 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8446
8447#ifdef CONFIG_FAIR_GROUP_SCHED
8448 root_task_group.se = (struct sched_entity **)ptr;
8449 ptr += nr_cpu_ids * sizeof(void **);
8450
8451 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8452 ptr += nr_cpu_ids * sizeof(void **);
8453
8454#endif
8455#ifdef CONFIG_RT_GROUP_SCHED
8456 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8457 ptr += nr_cpu_ids * sizeof(void **);
8458
8459 root_task_group.rt_rq = (struct rt_rq **)ptr;
8460 ptr += nr_cpu_ids * sizeof(void **);
8461
8462#endif
8463 }
8464#ifdef CONFIG_CPUMASK_OFFSTACK
8465 for_each_possible_cpu(i) {
8466 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
8467 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
8468 }
8469#endif
8470
8471 init_rt_bandwidth(&def_rt_bandwidth,
8472 global_rt_period(), global_rt_runtime());
8473 init_dl_bandwidth(&def_dl_bandwidth,
8474 global_rt_period(), global_rt_runtime());
8475
8476#ifdef CONFIG_SMP
8477 init_defrootdomain();
8478#endif
8479
8480#ifdef CONFIG_RT_GROUP_SCHED
8481 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8482 global_rt_period(), global_rt_runtime());
8483#endif
8484
8485#ifdef CONFIG_CGROUP_SCHED
8486 list_add(&root_task_group.list, &task_groups);
8487 INIT_LIST_HEAD(&root_task_group.children);
8488 INIT_LIST_HEAD(&root_task_group.siblings);
8489 autogroup_init(&init_task);
8490
8491#endif
8492
8493 for_each_possible_cpu(i) {
8494 struct rq *rq;
8495
8496 rq = cpu_rq(i);
8497 raw_spin_lock_init(&rq->lock);
8498 rq->nr_running = 0;
8499 rq->calc_load_active = 0;
8500 rq->calc_load_update = jiffies + LOAD_FREQ;
8501 init_cfs_rq(&rq->cfs);
8502 init_rt_rq(&rq->rt, rq);
8503 init_dl_rq(&rq->dl, rq);
8504#ifdef CONFIG_FAIR_GROUP_SCHED
8505 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8506 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8527 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8528#endif
8529
8530 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8531#ifdef CONFIG_RT_GROUP_SCHED
8532 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8533 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8534#endif
8535
8536 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8537 rq->cpu_load[j] = 0;
8538
8539 rq->last_load_update_tick = jiffies;
8540
8541#ifdef CONFIG_SMP
8542 rq->sd = NULL;
8543 rq->rd = NULL;
8544 rq->cpu_power = rq->cpu_capacity_orig = SCHED_POWER_SCALE;
8545 rq->post_schedule = 0;
8546 rq->active_balance = 0;
8547 rq->next_balance = jiffies;
8548 rq->push_cpu = 0;
8549 rq->cpu = i;
8550 rq->online = 0;
8551 rq->idle_stamp = 0;
8552 rq->avg_idle = 2*sysctl_sched_migration_cost;
8553 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
8554
8555 INIT_LIST_HEAD(&rq->cfs_tasks);
8556
8557 rq_attach_root(rq, &def_root_domain);
8558#ifdef CONFIG_NO_HZ_COMMON
8559 rq->nohz_flags = 0;
8560#endif
8561#ifdef CONFIG_NO_HZ_FULL
8562 rq->last_sched_tick = 0;
8563#endif
8564#endif
8565 init_rq_hrtick(rq);
8566 atomic_set(&rq->nr_iowait, 0);
8567 }
8568
8569 set_load_weight(&init_task);
8570
8571#ifdef CONFIG_PREEMPT_NOTIFIERS
8572 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8573#endif
8574
8575
8576
8577
8578 atomic_inc(&init_mm.mm_count);
8579 enter_lazy_tlb(&init_mm, current);
8580
8581
8582
8583
8584
8585
8586
8587 init_idle(current, smp_processor_id());
8588
8589 calc_load_update = jiffies + LOAD_FREQ;
8590
8591
8592
8593
8594 current->sched_class = &fair_sched_class;
8595
8596#ifdef CONFIG_SMP
8597
8598 if (cpu_isolated_map == NULL)
8599 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8600 idle_thread_set_boot_cpu();
8601#endif
8602 init_sched_fair_class();
8603
8604 init_schedstats();
8605
8606 scheduler_running = 1;
8607}
8608
8609#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8610static inline int preempt_count_equals(int preempt_offset)
8611{
8612 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8613
8614 return (nested == preempt_offset);
8615}
8616
8617void __might_sleep(const char *file, int line, int preempt_offset)
8618{
8619 static unsigned long prev_jiffy;
8620
8621 rcu_sleep_check();
8622 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8623 system_state != SYSTEM_RUNNING || oops_in_progress)
8624 return;
8625 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8626 return;
8627 prev_jiffy = jiffies;
8628
8629 printk(KERN_ERR
8630 "BUG: sleeping function called from invalid context at %s:%d\n",
8631 file, line);
8632 printk(KERN_ERR
8633 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8634 in_atomic(), irqs_disabled(),
8635 current->pid, current->comm);
8636
8637 debug_show_held_locks(current);
8638 if (irqs_disabled())
8639 print_irqtrace_events(current);
8640 dump_stack();
8641}
8642EXPORT_SYMBOL(__might_sleep);
8643#endif
8644
8645#ifdef CONFIG_MAGIC_SYSRQ
8646static void normalize_task(struct rq *rq, struct task_struct *p)
8647{
8648 const struct sched_class *prev_class = p->sched_class;
8649 struct sched_attr attr = {
8650 .sched_policy = SCHED_NORMAL,
8651 };
8652 int old_prio = p->prio;
8653 int queued;
8654
8655 queued = task_on_rq_queued(p);
8656 if (queued)
8657 dequeue_task(rq, p, DEQUEUE_SAVE);
8658 __setscheduler(rq, p, &attr);
8659 if (queued) {
8660 enqueue_task(rq, p, ENQUEUE_RESTORE);
8661 resched_curr(rq);
8662 }
8663
8664 check_class_changed(rq, p, prev_class, old_prio);
8665}
8666
8667void normalize_rt_tasks(void)
8668{
8669 struct task_struct *g, *p;
8670 unsigned long flags;
8671 struct rq *rq;
8672
8673 read_lock_irqsave(&tasklist_lock, flags);
8674 do_each_thread(g, p) {
8675
8676
8677
8678 if (!p->mm)
8679 continue;
8680
8681 p->se.exec_start = 0;
8682#ifdef CONFIG_SCHEDSTATS
8683 p->se.statistics.wait_start = 0;
8684 p->se.statistics.sleep_start = 0;
8685 p->se.statistics.block_start = 0;
8686#endif
8687
8688 if (!dl_task(p) && !rt_task(p)) {
8689
8690
8691
8692
8693 if (TASK_NICE(p) < 0 && p->mm)
8694 set_user_nice(p, 0);
8695 continue;
8696 }
8697
8698 raw_spin_lock(&p->pi_lock);
8699 rq = __task_rq_lock(p);
8700
8701 normalize_task(rq, p);
8702
8703 __task_rq_unlock(rq);
8704 raw_spin_unlock(&p->pi_lock);
8705 } while_each_thread(g, p);
8706
8707 read_unlock_irqrestore(&tasklist_lock, flags);
8708}
8709
8710#endif
8711
8712#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731struct task_struct *curr_task(int cpu)
8732{
8733 return cpu_curr(cpu);
8734}
8735
8736#endif
8737
8738#ifdef CONFIG_IA64
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754void set_curr_task(int cpu, struct task_struct *p)
8755{
8756 cpu_curr(cpu) = p;
8757}
8758
8759#endif
8760
8761#ifdef CONFIG_CGROUP_SCHED
8762
8763static DEFINE_SPINLOCK(task_group_lock);
8764
8765static void free_sched_group(struct task_group *tg)
8766{
8767 free_fair_sched_group(tg);
8768 free_rt_sched_group(tg);
8769 autogroup_free(tg);
8770 kfree(tg);
8771}
8772
8773
8774struct task_group *sched_create_group(struct task_group *parent)
8775{
8776 struct task_group *tg;
8777
8778 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8779 if (!tg)
8780 return ERR_PTR(-ENOMEM);
8781
8782 if (!alloc_fair_sched_group(tg, parent))
8783 goto err;
8784
8785 if (!alloc_rt_sched_group(tg, parent))
8786 goto err;
8787
8788 return tg;
8789
8790err:
8791 free_sched_group(tg);
8792 return ERR_PTR(-ENOMEM);
8793}
8794
8795void sched_online_group(struct task_group *tg, struct task_group *parent)
8796{
8797 unsigned long flags;
8798
8799 spin_lock_irqsave(&task_group_lock, flags);
8800 list_add_rcu(&tg->list, &task_groups);
8801
8802 WARN_ON(!parent);
8803
8804 tg->parent = parent;
8805 INIT_LIST_HEAD(&tg->children);
8806 list_add_rcu(&tg->siblings, &parent->children);
8807 spin_unlock_irqrestore(&task_group_lock, flags);
8808
8809 online_fair_sched_group(tg);
8810}
8811
8812
8813static void free_sched_group_rcu(struct rcu_head *rhp)
8814{
8815
8816 free_sched_group(container_of(rhp, struct task_group, rcu));
8817}
8818
8819
8820void sched_destroy_group(struct task_group *tg)
8821{
8822
8823 call_rcu(&tg->rcu, free_sched_group_rcu);
8824}
8825
8826void sched_offline_group(struct task_group *tg)
8827{
8828 unsigned long flags;
8829 int i;
8830
	/* End participation in shares distribution: */
8832 for_each_possible_cpu(i)
8833 unregister_fair_sched_group(tg, i);
8834
8835 spin_lock_irqsave(&task_group_lock, flags);
8836 list_del_rcu(&tg->list);
8837 list_del_rcu(&tg->siblings);
8838 spin_unlock_irqrestore(&task_group_lock, flags);
8839}
8840
/*
 * Change a task's runqueue/group association when it moves between
 * task groups.  The caller must already have placed the task in its
 * new group; this function only updates the scheduler-side bookkeeping
 * to reflect that new group.
 */
8846void sched_move_task(struct task_struct *tsk)
8847{
8848 struct task_group *tg;
8849 int queued, running;
8850 unsigned long flags;
8851 struct rq *rq;
8852
8853 rq = task_rq_lock(tsk, &flags);
8854
8855 running = task_current(rq, tsk);
8856 queued = task_on_rq_queued(tsk);
8857
8858 if (queued)
8859 dequeue_task(rq, tsk, DEQUEUE_SAVE);
8860 if (unlikely(running))
8861 tsk->sched_class->put_prev_task(rq, tsk);
8862
	/*
	 * All callers are synchronized by task_rq_lock(); we do not use RCU
	 * which is pointless here.  Thus, we pass "true" to
	 * task_subsys_state_check() to avoid a lockdep warning.
	 */
8868 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, true),
8869 struct task_group, css);
8870 tg = autogroup_task_group(tsk, tg);
8871 tsk->sched_task_group = tg;
8872
8873#ifdef CONFIG_FAIR_GROUP_SCHED
8874 if (tsk->sched_class->task_move_group)
8875 tsk->sched_class->task_move_group(tsk, queued);
8876 else
8877#endif
8878 set_task_rq(tsk, task_cpu(tsk));
8879
8880 if (unlikely(running))
8881 tsk->sched_class->set_curr_task(rq);
8882 if (queued)
8883 enqueue_task(rq, tsk, ENQUEUE_RESTORE);
8884
8885 task_rq_unlock(rq, tsk, &flags);
8886}
8887#endif
8888
8889#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
8893static DEFINE_MUTEX(rt_constraints_mutex);
8894
/* Must be called with tasklist_lock held */
8896static inline int tg_has_rt_tasks(struct task_group *tg)
8897{
8898 struct task_struct *g, *p;
8899
8900 do_each_thread(g, p) {
8901 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8902 return 1;
8903 } while_each_thread(g, p);
8904
8905 return 0;
8906}
8907
8908struct rt_schedulable_data {
8909 struct task_group *tg;
8910 u64 rt_period;
8911 u64 rt_runtime;
8912};
8913
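/*
 * Walk callback checking that @tg remains schedulable with the
 * period/runtime proposed in @data: runtime must fit in the period,
 * the resulting utilization (to_ratio(period, runtime)) must not
 * exceed the global RT limit, and the children's summed utilization
 * must not exceed the group's own.  E.g. the default global setting
 * of 950000us runtime per 1000000us period caps RT usage at 95%.
 */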
8914static int tg_rt_schedulable(struct task_group *tg, void *data)
8915{
8916 struct rt_schedulable_data *d = data;
8917 struct task_group *child;
8918 unsigned long total, sum = 0;
8919 u64 period, runtime;
8920
8921 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8922 runtime = tg->rt_bandwidth.rt_runtime;
8923
8924 if (tg == d->tg) {
8925 period = d->rt_period;
8926 runtime = d->rt_runtime;
8927 }
8928
	/*
	 * Cannot have more runtime than the period.
	 */
8932 if (runtime > period && runtime != RUNTIME_INF)
8933 return -EINVAL;
8934
	/*
	 * Ensure we don't starve existing RT tasks.
	 */
8938 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8939 return -EBUSY;
8940
8941 total = to_ratio(period, runtime);
8942
	/*
	 * Nobody can have more than the global setting allows.
	 */
8946 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8947 return -EINVAL;
8948
	/*
	 * The sum of our children's runtime should not exceed our own.
	 */
8952 list_for_each_entry_rcu(child, &tg->children, siblings) {
8953 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8954 runtime = child->rt_bandwidth.rt_runtime;
8955
8956 if (child == d->tg) {
8957 period = d->rt_period;
8958 runtime = d->rt_runtime;
8959 }
8960
8961 sum += to_ratio(period, runtime);
8962 }
8963
8964 if (sum > total)
8965 return -EINVAL;
8966
8967 return 0;
8968}
8969
8970static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8971{
8972 int ret;
8973
8974 struct rt_schedulable_data data = {
8975 .tg = tg,
8976 .rt_period = period,
8977 .rt_runtime = runtime,
8978 };
8979
8980 rcu_read_lock();
8981 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8982 rcu_read_unlock();
8983
8984 return ret;
8985}
8986
8987static int tg_set_rt_bandwidth(struct task_group *tg,
8988 u64 rt_period, u64 rt_runtime)
8989{
8990 int i, err = 0;
8991
8992 mutex_lock(&rt_constraints_mutex);
	read_lock(&tasklist_lock);
8994 err = __rt_schedulable(tg, rt_period, rt_runtime);
8995 if (err)
8996 goto unlock;
8997
8998 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8999 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
9000 tg->rt_bandwidth.rt_runtime = rt_runtime;
9001
9002 for_each_possible_cpu(i) {
9003 struct rt_rq *rt_rq = tg->rt_rq[i];
9004
9005 raw_spin_lock(&rt_rq->rt_runtime_lock);
9006 rt_rq->rt_runtime = rt_runtime;
9007 raw_spin_unlock(&rt_rq->rt_runtime_lock);
9008 }
9009 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
9010unlock:
	read_unlock(&tasklist_lock);
9012 mutex_unlock(&rt_constraints_mutex);
9013
9014 return err;
9015}
9016
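/*
 * cgroup-facing setter behind cpu.rt_runtime_us: converts microseconds
 * to nanoseconds and maps any negative value to RUNTIME_INF (no limit)
 * before handing off to tg_set_rt_bandwidth().
 */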
9017static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
9018{
9019 u64 rt_runtime, rt_period;
9020
9021 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
9022 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
9023 if (rt_runtime_us < 0)
9024 rt_runtime = RUNTIME_INF;
9025
9026 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
9027}
9028
9029static long sched_group_rt_runtime(struct task_group *tg)
9030{
9031 u64 rt_runtime_us;
9032
9033 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
9034 return -1;
9035
9036 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
9037 do_div(rt_runtime_us, NSEC_PER_USEC);
9038 return rt_runtime_us;
9039}
9040
9041static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
9042{
9043 u64 rt_runtime, rt_period;
9044
9045 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
9046 rt_runtime = tg->rt_bandwidth.rt_runtime;
9047
9048 if (rt_period == 0)
9049 return -EINVAL;
9050
9051 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
9052}
9053
9054static long sched_group_rt_period(struct task_group *tg)
9055{
9056 u64 rt_period_us;
9057
9058 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
9059 do_div(rt_period_us, NSEC_PER_USEC);
9060 return rt_period_us;
9061}
9062#endif
9063
9064#ifdef CONFIG_RT_GROUP_SCHED
9065static int sched_rt_global_constraints(void)
9066{
9067 int ret = 0;
9068
9069 mutex_lock(&rt_constraints_mutex);
	read_lock(&tasklist_lock);
9071 ret = __rt_schedulable(NULL, 0, 0);
	read_unlock(&tasklist_lock);
9073 mutex_unlock(&rt_constraints_mutex);
9074
9075 return ret;
9076}
9077
9078static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
9079{
	/* Don't accept realtime tasks when there is no way for them to run */
9081 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
9082 return 0;
9083
9084 return 1;
9085}
9086
9087#else
9088static int sched_rt_global_constraints(void)
9089{
9090 unsigned long flags;
9091 int i, ret = 0;
9092
9093 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9094 for_each_possible_cpu(i) {
9095 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
9096
9097 raw_spin_lock(&rt_rq->rt_runtime_lock);
9098 rt_rq->rt_runtime = global_rt_runtime();
9099 raw_spin_unlock(&rt_rq->rt_runtime_lock);
9100 }
9101 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
9102
9103 return ret;
9104}
9105#endif
9106
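/*
 * Validate a new global RT/DL bandwidth setting against the deadline
 * bandwidth already admitted: if any CPU's allocated total_bw exceeds
 * the new ratio, reject the change with -EBUSY.
 */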
9107static int sched_dl_global_validate(void)
9108{
9109 u64 runtime = global_rt_runtime();
9110 u64 period = global_rt_period();
9111 u64 new_bw = to_ratio(period, runtime);
9112 struct dl_bw *dl_b;
9113 int cpu, ret = 0;
9114 unsigned long flags;
9115
	/*
	 * Here we want to check that the new bandwidth is not being set
	 * to some value smaller than the currently allocated bandwidth in
	 * any of the root domains.
	 *
	 * Cycling over all possible CPUs is overdoing it, but simpler than
	 * cycling over the root domains.
	 */
9125 for_each_possible_cpu(cpu) {
9126 rcu_read_lock_sched();
9127 dl_b = dl_bw_of(cpu);
9128
9129 raw_spin_lock_irqsave(&dl_b->lock, flags);
9130 if (new_bw < dl_b->total_bw)
9131 ret = -EBUSY;
9132 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
9133
9134 rcu_read_unlock_sched();
9135
9136 if (ret)
9137 break;
9138 }
9139
9140 return ret;
9141}
9142
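/*
 * Precompute the fixed-point ratios used when reclaiming unused
 * deadline bandwidth (GRUB): bw_ratio is period/runtime in RATIO_SHIFT
 * fixed point and extra_bw is runtime/period in BW_SHIFT fixed point;
 * an unlimited global runtime degenerates both to a ratio of 1.
 */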
9143void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
9144{
9145 if (global_rt_runtime() == RUNTIME_INF) {
9146 dl_rq->bw_ratio = 1 << RATIO_SHIFT;
9147 dl_rq->extra_bw = 1 << BW_SHIFT;
9148 } else {
9149 dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
9150 global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
9151 dl_rq->extra_bw = to_ratio(global_rt_period(),
9152 global_rt_runtime());
9153 }
9154}
9155
9156static void sched_dl_do_global(void)
9157{
9158 u64 new_bw = -1;
9159 struct dl_bw *dl_b;
9160 int cpu;
9161 unsigned long flags;
9162
9163 def_dl_bandwidth.dl_period = global_rt_period();
9164 def_dl_bandwidth.dl_runtime = global_rt_runtime();
9165
9166 if (global_rt_runtime() != RUNTIME_INF)
9167 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
9168
	/*
	 * Propagate the new bandwidth (-1 meaning "no limit") to the
	 * deadline bandwidth accounting of every CPU.
	 */
9172 for_each_possible_cpu(cpu) {
9173 rcu_read_lock_sched();
9174 dl_b = dl_bw_of(cpu);
9175
9176 raw_spin_lock_irqsave(&dl_b->lock, flags);
9177 dl_b->bw = new_bw;
9178 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
9179
9180 rcu_read_unlock_sched();
9181 init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
9182 }
9183}
9184
9185static int sched_rt_global_validate(void)
9186{
9187 if (sysctl_sched_rt_period <= 0)
9188 return -EINVAL;
9189
9190 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
9191 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
9192 return -EINVAL;
9193
9194 return 0;
9195}
9196
9197static void sched_rt_do_global(void)
9198{
9199 def_rt_bandwidth.rt_runtime = global_rt_runtime();
9200 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
9201}
9202
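/*
 * proc handler for the sched_rt_period_us / sched_rt_runtime_us
 * sysctls.  On write, the new values are checked against both the RT
 * and the deadline admission rules before being applied; any failure
 * rolls back to the previous settings.  Writing -1 as the runtime
 * lifts the global RT throttling limit (RUNTIME_INF).
 */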
9203int sched_rt_handler(struct ctl_table *table, int write,
9204 void __user *buffer, size_t *lenp,
9205 loff_t *ppos)
9206{
9207 int old_period, old_runtime;
9208 static DEFINE_MUTEX(mutex);
9209 int ret;
9210
9211 mutex_lock(&mutex);
9212 old_period = sysctl_sched_rt_period;
9213 old_runtime = sysctl_sched_rt_runtime;
9214
9215 ret = proc_dointvec(table, write, buffer, lenp, ppos);
9216
9217 if (!ret && write) {
9218 ret = sched_rt_global_validate();
9219 if (ret)
9220 goto undo;
9221
9222 ret = sched_dl_global_validate();
9223 if (ret)
9224 goto undo;
9225
9226 ret = sched_rt_global_constraints();
9227 if (ret)
9228 goto undo;
9229
9230 sched_rt_do_global();
9231 sched_dl_do_global();
9232 }
9233 if (0) {
9234undo:
9235 sysctl_sched_rt_period = old_period;
9236 sysctl_sched_rt_runtime = old_runtime;
9237 }
9238 mutex_unlock(&mutex);
9239
9240 return ret;
9241}
9242
9243int sched_rr_handler(struct ctl_table *table, int write,
9244 void __user *buffer, size_t *lenp,
9245 loff_t *ppos)
9246{
9247 int ret;
9248 static DEFINE_MUTEX(mutex);
9249
9250 mutex_lock(&mutex);
9251 ret = proc_dointvec(table, write, buffer, lenp, ppos);
	/* make sure that internally we keep jiffies */
	/* also, writing zero resets the timeslice to default */
9254 if (!ret && write) {
9255 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
9256 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
9257 }
9258 mutex_unlock(&mutex);
9259 return ret;
9260}
9261
9262#ifdef CONFIG_CGROUP_SCHED
9263
9264
9265static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
9266{
9267 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
9268 struct task_group, css);
9269}
9270
9271static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
9272{
9273 struct task_group *tg, *parent;
9274
9275 if (!cgrp->parent) {
		/* This is early initialization for the top cgroup */
9277 return &root_task_group.css;
9278 }
9279
9280 parent = cgroup_tg(cgrp->parent);
9281 tg = sched_create_group(parent);
9282 if (IS_ERR(tg))
9283 return ERR_PTR(-ENOMEM);
9284
9285 return &tg->css;
9286}
9287
9288static int cpu_cgroup_css_online(struct cgroup *cgrp)
9289{
9290 struct task_group *tg = cgroup_tg(cgrp);
9291 struct task_group *parent;
9292
9293 if (!cgrp->parent)
9294 return 0;
9295
9296 parent = cgroup_tg(cgrp->parent);
9297 sched_online_group(tg, parent);
9298 return 0;
9299}
9300
9301static void cpu_cgroup_css_free(struct cgroup *cgrp)
9302{
9303 struct task_group *tg = cgroup_tg(cgrp);
9304
9305 sched_destroy_group(tg);
9306}
9307
9308static void cpu_cgroup_css_offline(struct cgroup *cgrp)
9309{
9310 struct task_group *tg = cgroup_tg(cgrp);
9311
9312 sched_offline_group(tg);
9313}
9314
9315static int cpu_cgroup_can_attach(struct cgroup *cgrp,
9316 struct cgroup_taskset *tset)
9317{
9318 struct task_struct *task;
9319
9320 cgroup_taskset_for_each(task, cgrp, tset) {
9321#ifdef CONFIG_RT_GROUP_SCHED
9322 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
9323 return -EINVAL;
9324#else
		/* We don't support RT-tasks being in separate groups */
9326 if (task->sched_class != &fair_sched_class)
9327 return -EINVAL;
9328#endif
9329 }
9330 return 0;
9331}
9332
9333static void cpu_cgroup_attach(struct cgroup *cgrp,
9334 struct cgroup_taskset *tset)
9335{
9336 struct task_struct *task;
9337
9338 cgroup_taskset_for_each(task, cgrp, tset)
9339 sched_move_task(task);
9340}
9341
9342static void
9343cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
9344 struct task_struct *task)
9345{
	/*
	 * cgroup_exit() is called in the copy_process() failure path.
	 * Ignore this case since the task hasn't run yet, this avoids
	 * trying to poke a half freed task state from generic code.
	 */
9351 if (!(task->flags & PF_EXITING))
9352 return;
9353
9354 sched_move_task(task);
9355}
9356
9357#ifdef CONFIG_FAIR_GROUP_SCHED
9358static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9359 u64 shareval)
9360{
9361 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
9362}
9363
9364static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9365{
9366 struct task_group *tg = cgroup_tg(cgrp);
9367
9368 return (u64) scale_load_down(tg->shares);
9369}
9370
9371#ifdef CONFIG_CFS_BANDWIDTH
9372static DEFINE_MUTEX(cfs_constraints_mutex);
9373
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9376
9377static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9378
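/*
 * Apply a new CFS bandwidth (period, quota) pair to @tg.  Values are
 * in nanoseconds here; the cpu.cfs_quota_us and cpu.cfs_period_us
 * cgroup files convert from microseconds, and a quota of RUNTIME_INF
 * disables throttling.  For example, quota = 50ms with period = 100ms
 * lets the group consume at most half a CPU's worth of time per period
 * before its cfs_rqs are throttled.
 */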
9379static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9380{
9381 int i, ret = 0, runtime_enabled, runtime_was_enabled;
9382 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9383
9384 if (tg == &root_task_group)
9385 return -EINVAL;
9386
	/*
	 * Ensure we have at least some amount of bandwidth in every period:
	 * both the quota and the period must be at least
	 * min_cfs_quota_period, which prevents building up large arrears
	 * while throttled.
	 */
9392 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9393 return -EINVAL;
9394
	/*
	 * Likewise, bound things on the other side by preventing insanely
	 * large quota periods.  This also allows us to normalize in
	 * computing quota feasibility.
	 */
9400 if (period > max_cfs_quota_period)
9401 return -EINVAL;
9402
9403 mutex_lock(&cfs_constraints_mutex);
9404 ret = __cfs_schedulable(tg, period, quota);
9405 if (ret)
9406 goto out_unlock;
9407
9408 runtime_enabled = quota != RUNTIME_INF;
9409 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
9410
	/*
	 * If we need to toggle cfs_bandwidth_used, off->on must occur
	 * before making related changes, and on->off must occur afterwards.
	 */
9414 if (runtime_enabled && !runtime_was_enabled)
9415 cfs_bandwidth_usage_inc();
9416 raw_spin_lock_irq(&cfs_b->lock);
9417 cfs_b->period = ns_to_ktime(period);
9418 cfs_b->quota = quota;
9419
9420 __refill_cfs_bandwidth_runtime(cfs_b);
9421
9422 if (runtime_enabled && cfs_b->timer_active) {
		/* restart the period timer (if active) to handle new period expiry */
9424 __start_cfs_bandwidth(cfs_b, true);
9425 }
9426 raw_spin_unlock_irq(&cfs_b->lock);
9427
9428 for_each_possible_cpu(i) {
9429 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9430 struct rq *rq = cfs_rq->rq;
9431
9432 raw_spin_lock_irq(&rq->lock);
9433 cfs_rq->runtime_enabled = runtime_enabled;
9434 cfs_rq->runtime_remaining = 0;
9435
9436 if (cfs_rq->throttled)
9437 unthrottle_cfs_rq(cfs_rq);
9438 raw_spin_unlock_irq(&rq->lock);
9439 }
9440 if (runtime_was_enabled && !runtime_enabled)
9441 cfs_bandwidth_usage_dec();
9442out_unlock:
9443 mutex_unlock(&cfs_constraints_mutex);
9444
9445 return ret;
9446}
9447
9448int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9449{
9450 u64 quota, period;
9451
9452 period = ktime_to_ns(tg->cfs_bandwidth.period);
9453 if (cfs_quota_us < 0)
9454 quota = RUNTIME_INF;
9455 else
9456 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9457
9458 return tg_set_cfs_bandwidth(tg, period, quota);
9459}
9460
9461long tg_get_cfs_quota(struct task_group *tg)
9462{
9463 u64 quota_us;
9464
9465 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9466 return -1;
9467
9468 quota_us = tg->cfs_bandwidth.quota;
9469 do_div(quota_us, NSEC_PER_USEC);
9470
9471 return quota_us;
9472}
9473
9474int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9475{
9476 u64 quota, period;
9477
9478 period = (u64)cfs_period_us * NSEC_PER_USEC;
9479 quota = tg->cfs_bandwidth.quota;
9480
9481 return tg_set_cfs_bandwidth(tg, period, quota);
9482}
9483
9484long tg_get_cfs_period(struct task_group *tg)
9485{
9486 u64 cfs_period_us;
9487
9488 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9489 do_div(cfs_period_us, NSEC_PER_USEC);
9490
9491 return cfs_period_us;
9492}
9493
9494static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
9495{
9496 return tg_get_cfs_quota(cgroup_tg(cgrp));
9497}
9498
9499static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
9500 s64 cfs_quota_us)
9501{
9502 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
9503}
9504
9505static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
9506{
9507 return tg_get_cfs_period(cgroup_tg(cgrp));
9508}
9509
9510static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9511 u64 cfs_period_us)
9512{
9513 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
9514}
9515
9516struct cfs_schedulable_data {
9517 struct task_group *tg;
9518 u64 period, quota;
9519};
9520
/*
 * Normalize group quota/period to be quota/max_period;
 * note: units are usecs.
 */
9525static u64 normalize_cfs_quota(struct task_group *tg,
9526 struct cfs_schedulable_data *d)
9527{
9528 u64 quota, period;
9529
9530 if (tg == d->tg) {
9531 period = d->period;
9532 quota = d->quota;
9533 } else {
9534 period = tg_get_cfs_period(tg);
9535 quota = tg_get_cfs_quota(tg);
9536 }
9537
	/* note: these should typically be equivalent */
9539 if (quota == RUNTIME_INF || quota == -1)
9540 return RUNTIME_INF;
9541
9542 return to_ratio(period, quota);
9543}
9544
9545static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9546{
9547 struct cfs_schedulable_data *d = data;
9548 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9549 s64 quota = 0, parent_quota = -1;
9550
9551 if (!tg->parent) {
9552 quota = RUNTIME_INF;
9553 } else {
9554 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9555
9556 quota = normalize_cfs_quota(tg, d);
9557 parent_quota = parent_b->hierarchal_quota;
9558
		/*
		 * Ensure max(child_quota) <= parent_quota, inherit when no
		 * limit is set:
		 */
9563 if (quota == RUNTIME_INF)
9564 quota = parent_quota;
9565 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9566 return -EINVAL;
9567 }
9568 cfs_b->hierarchal_quota = quota;
9569
9570 return 0;
9571}
9572
9573static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9574{
9575 int ret;
9576 struct cfs_schedulable_data data = {
9577 .tg = tg,
9578 .period = period,
9579 .quota = quota,
9580 };
9581
9582 if (quota != RUNTIME_INF) {
9583 do_div(data.period, NSEC_PER_USEC);
9584 do_div(data.quota, NSEC_PER_USEC);
9585 }
9586
9587 rcu_read_lock();
9588 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9589 rcu_read_unlock();
9590
9591 return ret;
9592}
9593
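/*
 * Export the bandwidth statistics behind the cgroup "cpu.stat" file:
 * nr_periods enforcement intervals elapsed, nr_throttled of those in
 * which the group hit its quota, and the cumulative throttled_time in
 * nanoseconds.
 */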
9594static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9595 struct cgroup_map_cb *cb)
9596{
9597 struct task_group *tg = cgroup_tg(cgrp);
9598 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9599
9600 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9601 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
9602 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
9603
9604 return 0;
9605}
9606#endif
9607#endif
9608
9609#ifdef CONFIG_RT_GROUP_SCHED
9610static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9611 s64 val)
9612{
9613 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9614}
9615
9616static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9617{
9618 return sched_group_rt_runtime(cgroup_tg(cgrp));
9619}
9620
9621static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9622 u64 rt_period_us)
9623{
9624 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9625}
9626
9627static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9628{
9629 return sched_group_rt_period(cgroup_tg(cgrp));
9630}
9631#endif
9632
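/*
 * Control files exported by the cpu cgroup controller; which entries
 * exist depends on CONFIG_FAIR_GROUP_SCHED, CONFIG_CFS_BANDWIDTH and
 * CONFIG_RT_GROUP_SCHED.
 */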
9633static struct cftype cpu_files[] = {
9634#ifdef CONFIG_FAIR_GROUP_SCHED
9635 {
9636 .name = "shares",
9637 .read_u64 = cpu_shares_read_u64,
9638 .write_u64 = cpu_shares_write_u64,
9639 },
9640#endif
9641#ifdef CONFIG_CFS_BANDWIDTH
9642 {
9643 .name = "cfs_quota_us",
9644 .read_s64 = cpu_cfs_quota_read_s64,
9645 .write_s64 = cpu_cfs_quota_write_s64,
9646 },
9647 {
9648 .name = "cfs_period_us",
9649 .read_u64 = cpu_cfs_period_read_u64,
9650 .write_u64 = cpu_cfs_period_write_u64,
9651 },
9652 {
9653 .name = "stat",
9654 .read_map = cpu_stats_show,
9655 },
9656#endif
9657#ifdef CONFIG_RT_GROUP_SCHED
9658 {
9659 .name = "rt_runtime_us",
9660 .read_s64 = cpu_rt_runtime_read,
9661 .write_s64 = cpu_rt_runtime_write,
9662 },
9663 {
9664 .name = "rt_period_us",
9665 .read_u64 = cpu_rt_period_read_uint,
9666 .write_u64 = cpu_rt_period_write_uint,
9667 },
9668#endif
9669 { }
9670};
9671
9672struct cgroup_subsys cpu_cgroup_subsys = {
9673 .name = "cpu",
9674 .css_alloc = cpu_cgroup_css_alloc,
9675 .css_free = cpu_cgroup_css_free,
9676 .css_online = cpu_cgroup_css_online,
9677 .css_offline = cpu_cgroup_css_offline,
9678 .can_attach = cpu_cgroup_can_attach,
9679 .attach = cpu_cgroup_attach,
9680 .exit = cpu_cgroup_exit,
9681 .subsys_id = cpu_cgroup_subsys_id,
9682 .base_cftypes = cpu_files,
9683 .early_init = 1,
9684};
9685
9686#endif
9687
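/*
 * Print the identity and stack of the task currently running on @cpu;
 * used by watchdog-style diagnostics such as RCU CPU stall warnings.
 */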
9688void dump_cpu_task(int cpu)
9689{
9690 pr_info("Task dump for CPU %d:\n", cpu);
9691 sched_show_task(cpu_curr(cpu));
9692}
9693