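/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls.
 */
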
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/unistd.h>
66#include <linux/pagemap.h>
67#include <linux/hrtimer.h>
68#include <linux/tick.h>
69#include <linux/debugfs.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/binfmts.h>
75
76#include <asm/switch_to.h>
77#include <asm/tlb.h>
78#include <asm/irq_regs.h>
79#include <asm/mutex.h>
80#ifdef CONFIG_PARAVIRT
81#include <asm/paravirt.h>
82#endif
83
84#include "sched.h"
85#include "../workqueue_sched.h"
86#include "../smpboot.h"
87
88#define CREATE_TRACE_POINTS
89#include <trace/events/sched.h>
90
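/*
 * (Re)arm a bandwidth period timer: if it is not already active, forward
 * it past the current time by whole periods and start it as an absolute,
 * CPU-pinned hrtimer, preserving its existing soft/hard expiry range.
 */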
91void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
92{
93 unsigned long delta;
94 ktime_t soft, hard, now;
95
96 for (;;) {
97 if (hrtimer_active(period_timer))
98 break;
99
100 now = hrtimer_cb_get_time(period_timer);
101 hrtimer_forward(period_timer, now, period);
102
103 soft = hrtimer_get_softexpires(period_timer);
104 hard = hrtimer_get_expires(period_timer);
105 delta = ktime_to_ns(ktime_sub(hard, soft));
106 __hrtimer_start_range_ns(period_timer, soft, delta,
107 HRTIMER_MODE_ABS_PINNED, 0);
108 }
109}
110
111DEFINE_MUTEX(sched_domains_mutex);
112DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
113
114static void update_rq_clock_task(struct rq *rq, s64 delta);
115
116void update_rq_clock(struct rq *rq)
117{
118 s64 delta;
119
120 if (rq->skip_clock_update > 0)
121 return;
122
123 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
124 rq->clock += delta;
125 update_rq_clock_task(rq, delta);
126}
127
128
129
130
131
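/*
 * Debugging: various scheduler feature bits.  sysctl_sched_features is
 * assembled from the SCHED_FEAT() entries in features.h; each feature
 * contributes one bit, set when that feature defaults to enabled.
 */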
132#define SCHED_FEAT(name, enabled) \
133 (1UL << __SCHED_FEAT_##name) * enabled |
134
135const_debug unsigned int sysctl_sched_features =
136#include "features.h"
137 0;
138
139#undef SCHED_FEAT
140
141#ifdef CONFIG_SCHED_DEBUG
142#define SCHED_FEAT(name, enabled) \
143 #name ,
144
145static const char * const sched_feat_names[] = {
146#include "features.h"
147};
148
149#undef SCHED_FEAT
150
151static int sched_feat_show(struct seq_file *m, void *v)
152{
153 int i;
154
155 for (i = 0; i < __SCHED_FEAT_NR; i++) {
156 if (!(sysctl_sched_features & (1UL << i)))
157 seq_puts(m, "NO_");
158 seq_printf(m, "%s ", sched_feat_names[i]);
159 }
160 seq_puts(m, "\n");
161
162 return 0;
163}
164
165#ifdef HAVE_JUMP_LABEL
166
167#define jump_label_key__true STATIC_KEY_INIT_TRUE
168#define jump_label_key__false STATIC_KEY_INIT_FALSE
169
170#define SCHED_FEAT(name, enabled) \
171 jump_label_key__##enabled ,
172
173struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
174#include "features.h"
175};
176
177#undef SCHED_FEAT
178
179static void sched_feat_disable(int i)
180{
181 if (static_key_enabled(&sched_feat_keys[i]))
182 static_key_slow_dec(&sched_feat_keys[i]);
183}
184
185static void sched_feat_enable(int i)
186{
187 if (!static_key_enabled(&sched_feat_keys[i]))
188 static_key_slow_inc(&sched_feat_keys[i]);
189}
190#else
191static void sched_feat_disable(int i) { };
192static void sched_feat_enable(int i) { };
193#endif
194
195static ssize_t
196sched_feat_write(struct file *filp, const char __user *ubuf,
197 size_t cnt, loff_t *ppos)
198{
199 char buf[64];
200 char *cmp;
201 int neg = 0;
202 int i;
203
204 if (cnt > 63)
205 cnt = 63;
206
207 if (copy_from_user(&buf, ubuf, cnt))
208 return -EFAULT;
209
210 buf[cnt] = 0;
211 cmp = strstrip(buf);
212
213 if (strncmp(cmp, "NO_", 3) == 0) {
214 neg = 1;
215 cmp += 3;
216 }
217
218 for (i = 0; i < __SCHED_FEAT_NR; i++) {
219 if (strcmp(cmp, sched_feat_names[i]) == 0) {
220 if (neg) {
221 sysctl_sched_features &= ~(1UL << i);
222 sched_feat_disable(i);
223 } else {
224 sysctl_sched_features |= (1UL << i);
225 sched_feat_enable(i);
226 }
227 break;
228 }
229 }
230
231 if (i == __SCHED_FEAT_NR)
232 return -EINVAL;
233
234 *ppos += cnt;
235
236 return cnt;
237}
238
239static int sched_feat_open(struct inode *inode, struct file *filp)
240{
241 return single_open(filp, sched_feat_show, NULL);
242}
243
244static const struct file_operations sched_feat_fops = {
245 .open = sched_feat_open,
246 .write = sched_feat_write,
247 .read = seq_read,
248 .llseek = seq_lseek,
249 .release = single_release,
250};
251
252static __init int sched_init_debug(void)
253{
254 debugfs_create_file("sched_features", 0644, NULL, NULL,
255 &sched_feat_fops);
256
257 return 0;
258}
259late_initcall(sched_init_debug);
260#endif
261
262
263
264
265
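/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */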
266const_debug unsigned int sysctl_sched_nr_migrate = 32;
267
268
269
270
271
272
273
274const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
275
276
277
278
279
280unsigned int sysctl_sched_rt_period = 1000000;
281
282__read_mostly int scheduler_running;
283
284
285
286
287
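/*
 * Part of the RT period in which realtime tasks may run, in microseconds:
 * by default 950000us out of the 1000000us sysctl_sched_rt_period above,
 * i.e. RT tasks may consume at most 95% of each period.
 */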
288int sysctl_sched_rt_runtime = 950000;
289
290
291
292
293
294
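/*
 * __task_rq_lock - lock the rq @p resides on.  The caller must hold
 * p->pi_lock; we retry until the task has not migrated between looking
 * up its rq and acquiring rq->lock.
 */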
295static inline struct rq *__task_rq_lock(struct task_struct *p)
296 __acquires(rq->lock)
297{
298 struct rq *rq;
299
300 lockdep_assert_held(&p->pi_lock);
301
302 for (;;) {
303 rq = task_rq(p);
304 raw_spin_lock(&rq->lock);
305 if (likely(rq == task_rq(p)))
306 return rq;
307 raw_spin_unlock(&rq->lock);
308 }
309}
310
311
312
313
314static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
315 __acquires(p->pi_lock)
316 __acquires(rq->lock)
317{
318 struct rq *rq;
319
320 for (;;) {
321 raw_spin_lock_irqsave(&p->pi_lock, *flags);
322 rq = task_rq(p);
323 raw_spin_lock(&rq->lock);
324 if (likely(rq == task_rq(p)))
325 return rq;
326 raw_spin_unlock(&rq->lock);
327 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
328 }
329}
330
331static void __task_rq_unlock(struct rq *rq)
332 __releases(rq->lock)
333{
334 raw_spin_unlock(&rq->lock);
335}
336
337static inline void
338task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
339 __releases(rq->lock)
340 __releases(p->pi_lock)
341{
342 raw_spin_unlock(&rq->lock);
343 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
344}
345
346
347
348
349static struct rq *this_rq_lock(void)
350 __acquires(rq->lock)
351{
352 struct rq *rq;
353
354 local_irq_disable();
355 rq = this_rq();
356 raw_spin_lock(&rq->lock);
357
358 return rq;
359}
360
361#ifdef CONFIG_SCHED_HRTICK
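/*
 * High-resolution scheduler tick (CONFIG_SCHED_HRTICK): use hrtimers to
 * deliver accurate preemption points.
 */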
373static void hrtick_clear(struct rq *rq)
374{
375 if (hrtimer_active(&rq->hrtick_timer))
376 hrtimer_cancel(&rq->hrtick_timer);
377}
378
379
380
381
382
383static enum hrtimer_restart hrtick(struct hrtimer *timer)
384{
385 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
386
387 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
388
389 raw_spin_lock(&rq->lock);
390 update_rq_clock(rq);
391 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
392 raw_spin_unlock(&rq->lock);
393
394 return HRTIMER_NORESTART;
395}
396
397#ifdef CONFIG_SMP
398
399
400
401static void __hrtick_start(void *arg)
402{
403 struct rq *rq = arg;
404
405 raw_spin_lock(&rq->lock);
406 hrtimer_restart(&rq->hrtick_timer);
407 rq->hrtick_csd_pending = 0;
408 raw_spin_unlock(&rq->lock);
409}
410
411
412
413
414
415
416void hrtick_start(struct rq *rq, u64 delay)
417{
418 struct hrtimer *timer = &rq->hrtick_timer;
419 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
420
421 hrtimer_set_expires(timer, time);
422
423 if (rq == this_rq()) {
424 hrtimer_restart(timer);
425 } else if (!rq->hrtick_csd_pending) {
426 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
427 rq->hrtick_csd_pending = 1;
428 }
429}
430
431static int
432hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
433{
434 int cpu = (int)(long)hcpu;
435
436 switch (action) {
437 case CPU_UP_CANCELED:
438 case CPU_UP_CANCELED_FROZEN:
439 case CPU_DOWN_PREPARE:
440 case CPU_DOWN_PREPARE_FROZEN:
441 case CPU_DEAD:
442 case CPU_DEAD_FROZEN:
443 hrtick_clear(cpu_rq(cpu));
444 return NOTIFY_OK;
445 }
446
447 return NOTIFY_DONE;
448}
449
450static __init void init_hrtick(void)
451{
452 hotcpu_notifier(hotplug_hrtick, 0);
453}
454#else
455
456
457
458
459
460void hrtick_start(struct rq *rq, u64 delay)
461{
462 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
463 HRTIMER_MODE_REL_PINNED, 0);
464}
465
466static inline void init_hrtick(void)
467{
468}
469#endif
470
471static void init_rq_hrtick(struct rq *rq)
472{
473#ifdef CONFIG_SMP
474 rq->hrtick_csd_pending = 0;
475
476 rq->hrtick_csd.flags = 0;
477 rq->hrtick_csd.func = __hrtick_start;
478 rq->hrtick_csd.info = rq;
479#endif
480
481 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
482 rq->hrtick_timer.function = hrtick;
483}
484#else
485static inline void hrtick_clear(struct rq *rq)
486{
487}
488
489static inline void init_rq_hrtick(struct rq *rq)
490{
491}
492
493static inline void init_hrtick(void)
494{
495}
496#endif
497
498
499
500
501
502
503
504
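/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means setting the need_resched flag; on SMP it might also
 * involve a cross-CPU call to trigger the scheduler on the target CPU.
 */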
505#ifdef CONFIG_SMP
506
507#ifndef tsk_is_polling
508#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
509#endif
510
511void resched_task(struct task_struct *p)
512{
513 int cpu;
514
515 assert_raw_spin_locked(&task_rq(p)->lock);
516
517 if (test_tsk_need_resched(p))
518 return;
519
520 set_tsk_need_resched(p);
521
522 cpu = task_cpu(p);
523 if (cpu == smp_processor_id())
524 return;
525
526
527 smp_mb();
528 if (!tsk_is_polling(p))
529 smp_send_reschedule(cpu);
530}
531
532void resched_cpu(int cpu)
533{
534 struct rq *rq = cpu_rq(cpu);
535 unsigned long flags;
536
537 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
538 return;
539 resched_task(cpu_curr(cpu));
540 raw_spin_unlock_irqrestore(&rq->lock, flags);
541}
542
543#ifdef CONFIG_NO_HZ
544
545
546
547
548
549
550
551
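/*
 * Pick a CPU to target a timer at while this CPU is (going) idle:
 * prefer the nearest non-idle CPU in this CPU's sched domains, which
 * helps power savings, and fall back to the current CPU when every
 * candidate is idle.
 */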
552int get_nohz_timer_target(void)
553{
554 int cpu = smp_processor_id();
555 int i;
556 struct sched_domain *sd;
557
558 rcu_read_lock();
559 for_each_domain(cpu, sd) {
560 for_each_cpu(i, sched_domain_span(sd)) {
561 if (!idle_cpu(i)) {
562 cpu = i;
563 goto unlock;
564 }
565 }
566 }
567unlock:
568 rcu_read_unlock();
569 return cpu;
570}
571
572
573
574
575
576
577
578
579
580
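/*
 * Kick an idle remote CPU out of its tickless sleep so it re-evaluates
 * pending work: set need_resched on its idle task and send a reschedule
 * IPI unless that CPU is polling for the flag.
 */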
581void wake_up_idle_cpu(int cpu)
582{
583 struct rq *rq = cpu_rq(cpu);
584
585 if (cpu == smp_processor_id())
586 return;
587
588
589
590
591
592
593
594
595 if (rq->curr != rq->idle)
596 return;
597
598
599
600
601
602
603 set_tsk_need_resched(rq->idle);
604
605
606 smp_mb();
607 if (!tsk_is_polling(rq->idle))
608 smp_send_reschedule(cpu);
609}
610
611static inline bool got_nohz_idle_kick(void)
612{
613 int cpu = smp_processor_id();
614 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
615}
616
617#else
618
619static inline bool got_nohz_idle_kick(void)
620{
621 return false;
622}
623
624#endif
625
626void sched_avg_update(struct rq *rq)
627{
628 s64 period = sched_avg_period();
629
630 while ((s64)(rq->clock - rq->age_stamp) > period) {
631
632
633
634
635
636 asm("" : "+rm" (rq->age_stamp));
637 rq->age_stamp += period;
638 rq->rt_avg /= 2;
639 }
640}
641
642#else
643void resched_task(struct task_struct *p)
644{
645 assert_raw_spin_locked(&task_rq(p)->lock);
646 set_tsk_need_resched(p);
647}
648#endif
649
650#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
651 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
652
653
654
655
656
657
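/*
 * Iterate the task_group tree rooted at *from, calling @down when first
 * visiting a group and @up when leaving it.  A non-zero return from
 * either callback aborts the walk.  The children list is RCU protected,
 * so the caller must hold rcu_read_lock() or an equivalent.
 */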
658int walk_tg_tree_from(struct task_group *from,
659 tg_visitor down, tg_visitor up, void *data)
660{
661 struct task_group *parent, *child;
662 int ret;
663
664 parent = from;
665
666down:
667 ret = (*down)(parent, data);
668 if (ret)
669 goto out;
670 list_for_each_entry_rcu(child, &parent->children, siblings) {
671 parent = child;
672 goto down;
673
674up:
675 continue;
676 }
677 ret = (*up)(parent, data);
678 if (ret || parent == from)
679 goto out;
680
681 child = parent;
682 parent = parent->parent;
683 if (parent)
684 goto up;
685out:
686 return ret;
687}
688
689int tg_nop(struct task_group *tg, void *data)
690{
691 return 0;
692}
693#endif
694
695static void set_load_weight(struct task_struct *p)
696{
697 int prio = p->static_prio - MAX_RT_PRIO;
698 struct load_weight *load = &p->se.load;
699
700
701
702
703 if (p->policy == SCHED_IDLE) {
704 load->weight = scale_load(WEIGHT_IDLEPRIO);
705 load->inv_weight = WMULT_IDLEPRIO;
706 return;
707 }
708
709 load->weight = scale_load(prio_to_weight[prio]);
710 load->inv_weight = prio_to_wmult[prio];
711}
712
713static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
714{
715 update_rq_clock(rq);
716 sched_info_queued(p);
717 p->sched_class->enqueue_task(rq, p, flags);
718}
719
720static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
721{
722 update_rq_clock(rq);
723 sched_info_dequeued(p);
724 p->sched_class->dequeue_task(rq, p, flags);
725}
726
727void activate_task(struct rq *rq, struct task_struct *p, int flags)
728{
729 if (task_contributes_to_load(p))
730 rq->nr_uninterruptible--;
731
732 enqueue_task(rq, p, flags);
733}
734
735void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
736{
737 if (task_contributes_to_load(p))
738 rq->nr_uninterruptible++;
739
740 dequeue_task(rq, p, flags);
741}
742
743#ifdef CONFIG_IRQ_TIME_ACCOUNTING
744
745
746
747
748
749
750
751
752
753
754
755
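/*
 * Per-cpu hardirq/softirq time (CONFIG_IRQ_TIME_ACCOUNTING).  The
 * counters are only written by account_system_vtime() on the local CPU
 * with interrupts disabled; remote readers use irq_time_read(), which on
 * 32-bit is protected by a seqcount to avoid torn 64-bit reads.
 */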
756static DEFINE_PER_CPU(u64, cpu_hardirq_time);
757static DEFINE_PER_CPU(u64, cpu_softirq_time);
758
759static DEFINE_PER_CPU(u64, irq_start_time);
760static int sched_clock_irqtime;
761
762void enable_sched_clock_irqtime(void)
763{
764 sched_clock_irqtime = 1;
765}
766
767void disable_sched_clock_irqtime(void)
768{
769 sched_clock_irqtime = 0;
770}
771
772#ifndef CONFIG_64BIT
773static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
774
775static inline void irq_time_write_begin(void)
776{
777 __this_cpu_inc(irq_time_seq.sequence);
778 smp_wmb();
779}
780
781static inline void irq_time_write_end(void)
782{
783 smp_wmb();
784 __this_cpu_inc(irq_time_seq.sequence);
785}
786
787static inline u64 irq_time_read(int cpu)
788{
789 u64 irq_time;
790 unsigned seq;
791
792 do {
793 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
794 irq_time = per_cpu(cpu_softirq_time, cpu) +
795 per_cpu(cpu_hardirq_time, cpu);
796 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
797
798 return irq_time;
799}
800#else
801static inline void irq_time_write_begin(void)
802{
803}
804
805static inline void irq_time_write_end(void)
806{
807}
808
809static inline u64 irq_time_read(int cpu)
810{
811 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
812}
813#endif
814
815
816
817
818
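/*
 * Called on transitions into and out of hardirq/softirq context to
 * charge the time since the last transition to the appropriate per-cpu
 * irq-time counter.
 */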
819void account_system_vtime(struct task_struct *curr)
820{
821 unsigned long flags;
822 s64 delta;
823 int cpu;
824
825 if (!sched_clock_irqtime)
826 return;
827
828 local_irq_save(flags);
829
830 cpu = smp_processor_id();
831 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
832 __this_cpu_add(irq_start_time, delta);
833
834 irq_time_write_begin();
835
836
837
838
839
840
841 if (hardirq_count())
842 __this_cpu_add(cpu_hardirq_time, delta);
843 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
844 __this_cpu_add(cpu_softirq_time, delta);
845
846 irq_time_write_end();
847 local_irq_restore(flags);
848}
849EXPORT_SYMBOL_GPL(account_system_vtime);
850
851#endif
852
853#ifdef CONFIG_PARAVIRT
854static inline u64 steal_ticks(u64 steal)
855{
856 if (unlikely(steal > NSEC_PER_SEC))
857 return div_u64(steal, TICK_NSEC);
858
859 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
860}
861#endif
862
863static void update_rq_clock_task(struct rq *rq, s64 delta)
864{
865
866
867
868
869#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
870 s64 steal = 0, irq_delta = 0;
871#endif
872#ifdef CONFIG_IRQ_TIME_ACCOUNTING
873 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890 if (irq_delta > delta)
891 irq_delta = delta;
892
893 rq->prev_irq_time += irq_delta;
894 delta -= irq_delta;
895#endif
896#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
898 u64 st;
899
900 steal = paravirt_steal_clock(cpu_of(rq));
901 steal -= rq->prev_steal_time_rq;
902
903 if (unlikely(steal > delta))
904 steal = delta;
905
906 st = steal_ticks(steal);
907 steal = st * TICK_NSEC;
908
909 rq->prev_steal_time_rq += steal;
910
911 delta -= steal;
912 }
913#endif
914
915 rq->clock_task += delta;
916
917#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
918 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
919 sched_rt_avg_update(rq, irq_delta + steal);
920#endif
921}
922
923#ifdef CONFIG_IRQ_TIME_ACCOUNTING
924static int irqtime_account_hi_update(void)
925{
926 u64 *cpustat = kcpustat_this_cpu->cpustat;
927 unsigned long flags;
928 u64 latest_ns;
929 int ret = 0;
930
931 local_irq_save(flags);
932 latest_ns = this_cpu_read(cpu_hardirq_time);
933 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
934 ret = 1;
935 local_irq_restore(flags);
936 return ret;
937}
938
939static int irqtime_account_si_update(void)
940{
941 u64 *cpustat = kcpustat_this_cpu->cpustat;
942 unsigned long flags;
943 u64 latest_ns;
944 int ret = 0;
945
946 local_irq_save(flags);
947 latest_ns = this_cpu_read(cpu_softirq_time);
948 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
949 ret = 1;
950 local_irq_restore(flags);
951 return ret;
952}
953
954#else
955
956#define sched_clock_irqtime (0)
957
958#endif
959
960void sched_set_stop_task(int cpu, struct task_struct *stop)
961{
962 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
963 struct task_struct *old_stop = cpu_rq(cpu)->stop;
964
965 if (stop) {
966
967
968
969
970
971
972
973
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
975
976 stop->sched_class = &stop_sched_class;
977 }
978
979 cpu_rq(cpu)->stop = stop;
980
981 if (old_stop) {
982
983
984
985
986 old_stop->sched_class = &rt_sched_class;
987 }
988}
989
990
991
992
993static inline int __normal_prio(struct task_struct *p)
994{
995 return p->static_prio;
996}
997
998
999
1000
1001
1002
1003
1004
1005static inline int normal_prio(struct task_struct *p)
1006{
1007 int prio;
1008
1009 if (task_has_rt_policy(p))
1010 prio = MAX_RT_PRIO-1 - p->rt_priority;
1011 else
1012 prio = __normal_prio(p);
1013 return prio;
1014}
1015
1016
1017
1018
1019
1020
1021
1022
1023static int effective_prio(struct task_struct *p)
1024{
1025 p->normal_prio = normal_prio(p);
1026
1027
1028
1029
1030
1031 if (!rt_prio(p->prio))
1032 return p->normal_prio;
1033 return p->prio;
1034}
1035
1036
1037
1038
1039
1040inline int task_curr(const struct task_struct *p)
1041{
1042 return cpu_curr(task_cpu(p)) == p;
1043}
1044
1045static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1046 const struct sched_class *prev_class,
1047 int oldprio)
1048{
1049 if (prev_class != p->sched_class) {
1050 if (prev_class->switched_from)
1051 prev_class->switched_from(rq, p);
1052 p->sched_class->switched_to(rq, p);
1053 } else if (oldprio != p->prio)
1054 p->sched_class->prio_changed(rq, p, oldprio);
1055}
1056
1057void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1058{
1059 const struct sched_class *class;
1060
1061 if (p->sched_class == rq->curr->sched_class) {
1062 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1063 } else {
1064 for_each_class(class) {
1065 if (class == rq->curr->sched_class)
1066 break;
1067 if (class == p->sched_class) {
1068 resched_task(rq->curr);
1069 break;
1070 }
1071 }
1072 }
1073
1074
1075
1076
1077
1078 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
1079 rq->skip_clock_update = 1;
1080}
1081
1082#ifdef CONFIG_SMP
1083void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1084{
1085#ifdef CONFIG_SCHED_DEBUG
1086
1087
1088
1089
1090 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1091 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
1092
1093#ifdef CONFIG_LOCKDEP
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1105 lockdep_is_held(&task_rq(p)->lock)));
1106#endif
1107#endif
1108
1109 trace_sched_migrate_task(p, new_cpu);
1110
1111 if (task_cpu(p) != new_cpu) {
1112 p->se.nr_migrations++;
1113 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1114 }
1115
1116 __set_task_cpu(p, new_cpu);
1117}
1118
1119struct migration_arg {
1120 struct task_struct *task;
1121 int dest_cpu;
1122};
1123
1124static int migration_cpu_stop(void *data);
1125
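/*
 * wait_task_inactive - wait for a thread to unschedule.
 * @p: the to-be-waited-for task
 * @match_state: the state @p should be in (0 to ignore the state)
 *
 * Spins/sleeps until @p is no longer running on any CPU.  Returns 0 if
 * @p was not in @match_state; otherwise returns a non-zero context
 * switch count that callers can use to detect whether @p has been
 * scheduled again since this call.
 */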
1142unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1143{
1144 unsigned long flags;
1145 int running, on_rq;
1146 unsigned long ncsw;
1147 struct rq *rq;
1148
1149 for (;;) {
1150
1151
1152
1153
1154
1155
1156 rq = task_rq(p);
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169 while (task_running(rq, p)) {
1170 if (match_state && unlikely(p->state != match_state))
1171 return 0;
1172 cpu_relax();
1173 }
1174
1175
1176
1177
1178
1179
1180 rq = task_rq_lock(p, &flags);
1181 trace_sched_wait_task(p);
1182 running = task_running(rq, p);
1183 on_rq = p->on_rq;
1184 ncsw = 0;
1185 if (!match_state || p->state == match_state)
1186 ncsw = p->nvcsw | LONG_MIN;
1187 task_rq_unlock(rq, p, &flags);
1188
1189
1190
1191
1192 if (unlikely(!ncsw))
1193 break;
1194
1195
1196
1197
1198
1199
1200
1201 if (unlikely(running)) {
1202 cpu_relax();
1203 continue;
1204 }
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215 if (unlikely(on_rq)) {
1216 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1217
1218 set_current_state(TASK_UNINTERRUPTIBLE);
1219 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1220 continue;
1221 }
1222
1223
1224
1225
1226
1227
1228 break;
1229 }
1230
1231 return ncsw;
1232}
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
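/*
 * kick_process - send a reschedule IPI to the CPU a task is currently
 * running on (if that is not this CPU), forcing it through the kernel
 * entry/exit path so that pending state changes take effect.
 */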
1247void kick_process(struct task_struct *p)
1248{
1249 int cpu;
1250
1251 preempt_disable();
1252 cpu = task_cpu(p);
1253 if ((cpu != smp_processor_id()) && task_curr(p))
1254 smp_send_reschedule(cpu);
1255 preempt_enable();
1256}
1257EXPORT_SYMBOL_GPL(kick_process);
1258#endif
1259
1260#ifdef CONFIG_SMP
1261
1262
1263
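/*
 * Pick a fallback CPU for @p when its chosen CPU is not usable: prefer
 * an online+active CPU on the same node that is in its allowed mask,
 * then anything allowed, widening the mask via the cpuset fallback and
 * finally cpu_possible_mask before giving up.
 */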
1264static int select_fallback_rq(int cpu, struct task_struct *p)
1265{
1266 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1267 enum { cpuset, possible, fail } state = cpuset;
1268 int dest_cpu;
1269
1270
1271 for_each_cpu(dest_cpu, nodemask) {
1272 if (!cpu_online(dest_cpu))
1273 continue;
1274 if (!cpu_active(dest_cpu))
1275 continue;
1276 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1277 return dest_cpu;
1278 }
1279
1280 for (;;) {
1281
1282 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1283 if (!cpu_online(dest_cpu))
1284 continue;
1285 if (!cpu_active(dest_cpu))
1286 continue;
1287 goto out;
1288 }
1289
1290 switch (state) {
1291 case cpuset:
1292
1293 cpuset_cpus_allowed_fallback(p);
1294 state = possible;
1295 break;
1296
1297 case possible:
1298 do_set_cpus_allowed(p, cpu_possible_mask);
1299 state = fail;
1300 break;
1301
1302 case fail:
1303 BUG();
1304 break;
1305 }
1306 }
1307
1308out:
1309 if (state != cpuset) {
1310
1311
1312
1313
1314
1315 if (p->mm && printk_ratelimit()) {
1316 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1317 task_pid_nr(p), p->comm, cpu);
1318 }
1319 }
1320
1321 return dest_cpu;
1322}
1323
1324
1325
1326
1327static inline
1328int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1329{
1330 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1343 !cpu_online(cpu)))
1344 cpu = select_fallback_rq(task_cpu(p), p);
1345
1346 return cpu;
1347}
1348
1349static void update_avg(u64 *avg, u64 sample)
1350{
1351 s64 diff = sample - *avg;
1352 *avg += diff >> 3;
1353}
1354#endif
1355
1356static void
1357ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1358{
1359#ifdef CONFIG_SCHEDSTATS
1360 struct rq *rq = this_rq();
1361
1362#ifdef CONFIG_SMP
1363 int this_cpu = smp_processor_id();
1364
1365 if (cpu == this_cpu) {
1366 schedstat_inc(rq, ttwu_local);
1367 schedstat_inc(p, se.statistics.nr_wakeups_local);
1368 } else {
1369 struct sched_domain *sd;
1370
1371 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1372 rcu_read_lock();
1373 for_each_domain(this_cpu, sd) {
1374 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1375 schedstat_inc(sd, ttwu_wake_remote);
1376 break;
1377 }
1378 }
1379 rcu_read_unlock();
1380 }
1381
1382 if (wake_flags & WF_MIGRATED)
1383 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1384
1385#endif
1386
1387 schedstat_inc(rq, ttwu_count);
1388 schedstat_inc(p, se.statistics.nr_wakeups);
1389
1390 if (wake_flags & WF_SYNC)
1391 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1392
1393#endif
1394}
1395
1396static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1397{
1398 activate_task(rq, p, en_flags);
1399 p->on_rq = 1;
1400
1401
1402 if (p->flags & PF_WQ_WORKER)
1403 wq_worker_waking_up(p, cpu_of(rq));
1404}
1405
1406
1407
1408
1409static void
1410ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1411{
1412 trace_sched_wakeup(p, true);
1413 check_preempt_curr(rq, p, wake_flags);
1414
1415 p->state = TASK_RUNNING;
1416#ifdef CONFIG_SMP
1417 if (p->sched_class->task_woken)
1418 p->sched_class->task_woken(rq, p);
1419
1420 if (rq->idle_stamp) {
1421 u64 delta = rq->clock - rq->idle_stamp;
1422 u64 max = 2*sysctl_sched_migration_cost;
1423
1424 if (delta > max)
1425 rq->avg_idle = max;
1426 else
1427 update_avg(&rq->avg_idle, delta);
1428 rq->idle_stamp = 0;
1429 }
1430#endif
1431}
1432
1433static void
1434ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1435{
1436#ifdef CONFIG_SMP
1437 if (p->sched_contributes_to_load)
1438 rq->nr_uninterruptible--;
1439#endif
1440
1441 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1442 ttwu_do_wakeup(rq, p, wake_flags);
1443}
1444
1445
1446
1447
1448
1449
1450
1451static int ttwu_remote(struct task_struct *p, int wake_flags)
1452{
1453 struct rq *rq;
1454 int ret = 0;
1455
1456 rq = __task_rq_lock(p);
1457 if (p->on_rq) {
1458 ttwu_do_wakeup(rq, p, wake_flags);
1459 ret = 1;
1460 }
1461 __task_rq_unlock(rq);
1462
1463 return ret;
1464}
1465
1466#ifdef CONFIG_SMP
1467static void sched_ttwu_pending(void)
1468{
1469 struct rq *rq = this_rq();
1470 struct llist_node *llist = llist_del_all(&rq->wake_list);
1471 struct task_struct *p;
1472
1473 raw_spin_lock(&rq->lock);
1474
1475 while (llist) {
1476 p = llist_entry(llist, struct task_struct, wake_entry);
1477 llist = llist_next(llist);
1478 ttwu_do_activate(rq, p, 0);
1479 }
1480
1481 raw_spin_unlock(&rq->lock);
1482}
1483
1484void scheduler_ipi(void)
1485{
1486 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1487 return;
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502 irq_enter();
1503 sched_ttwu_pending();
1504
1505
1506
1507
1508 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1509 this_rq()->idle_balance = 1;
1510 raise_softirq_irqoff(SCHED_SOFTIRQ);
1511 }
1512 irq_exit();
1513}
1514
1515static void ttwu_queue_remote(struct task_struct *p, int cpu)
1516{
1517 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1518 smp_send_reschedule(cpu);
1519}
1520
1521#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1522static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1523{
1524 struct rq *rq;
1525 int ret = 0;
1526
1527 rq = __task_rq_lock(p);
1528 if (p->on_cpu) {
1529 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1530 ttwu_do_wakeup(rq, p, wake_flags);
1531 ret = 1;
1532 }
1533 __task_rq_unlock(rq);
1534
1535 return ret;
1536
1537}
1538#endif
1539
1540bool cpus_share_cache(int this_cpu, int that_cpu)
1541{
1542 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1543}
1544#endif
1545
1546static void ttwu_queue(struct task_struct *p, int cpu)
1547{
1548 struct rq *rq = cpu_rq(cpu);
1549
1550#if defined(CONFIG_SMP)
1551 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1552 sched_clock_cpu(cpu);
1553 ttwu_queue_remote(p, cpu);
1554 return;
1555 }
1556#endif
1557
1558 raw_spin_lock(&rq->lock);
1559 ttwu_do_activate(rq, p, 0);
1560 raw_spin_unlock(&rq->lock);
1561}
1562
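/*
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put @p on a runqueue if it is not already there, selecting a target
 * CPU via the task's sched_class.  Returns 1 if @p was woken up, 0 if
 * its state did not match @state.
 */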
1578static int
1579try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1580{
1581 unsigned long flags;
1582 int cpu, success = 0;
1583
1584 smp_wmb();
1585 raw_spin_lock_irqsave(&p->pi_lock, flags);
1586 if (!(p->state & state))
1587 goto out;
1588
1589 success = 1;
1590 cpu = task_cpu(p);
1591
1592 if (p->on_rq && ttwu_remote(p, wake_flags))
1593 goto stat;
1594
1595#ifdef CONFIG_SMP
1596
1597
1598
1599
1600 while (p->on_cpu) {
1601#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1602
1603
1604
1605
1606
1607
1608
1609 if (ttwu_activate_remote(p, wake_flags))
1610 goto stat;
1611#else
1612 cpu_relax();
1613#endif
1614 }
1615
1616
1617
1618 smp_rmb();
1619
1620 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1621 p->state = TASK_WAKING;
1622
1623 if (p->sched_class->task_waking)
1624 p->sched_class->task_waking(p);
1625
1626 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1627 if (task_cpu(p) != cpu) {
1628 wake_flags |= WF_MIGRATED;
1629 set_task_cpu(p, cpu);
1630 }
1631#endif
1632
1633 ttwu_queue(p, cpu);
1634stat:
1635 ttwu_stat(p, cpu, wake_flags);
1636out:
1637 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1638
1639 return success;
1640}
1641
1642
1643
1644
1645
1646
1647
1648
1649
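/*
 * try_to_wake_up_local - wake up a task that belongs to this runqueue.
 * Called with this rq's lock held and interrupts disabled; only takes
 * p->pi_lock, never another runqueue's lock.
 */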
1650static void try_to_wake_up_local(struct task_struct *p)
1651{
1652 struct rq *rq = task_rq(p);
1653
1654 BUG_ON(rq != this_rq());
1655 BUG_ON(p == current);
1656 lockdep_assert_held(&rq->lock);
1657
1658 if (!raw_spin_trylock(&p->pi_lock)) {
1659 raw_spin_unlock(&rq->lock);
1660 raw_spin_lock(&p->pi_lock);
1661 raw_spin_lock(&rq->lock);
1662 }
1663
1664 if (!(p->state & TASK_NORMAL))
1665 goto out;
1666
1667 if (!p->on_rq)
1668 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1669
1670 ttwu_do_wakeup(rq, p, 0);
1671 ttwu_stat(p, smp_processor_id(), 0);
1672out:
1673 raw_spin_unlock(&p->pi_lock);
1674}
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687int wake_up_process(struct task_struct *p)
1688{
1689 return try_to_wake_up(p, TASK_ALL, 0);
1690}
1691EXPORT_SYMBOL(wake_up_process);
1692
1693int wake_up_state(struct task_struct *p, unsigned int state)
1694{
1695 return try_to_wake_up(p, state, 0);
1696}
1697
1698
1699
1700
1701
1702
1703
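/*
 * Perform scheduler-related setup for a newly forked task @p: clear its
 * queueing state and scheduling statistics.
 */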
1704static void __sched_fork(struct task_struct *p)
1705{
1706 p->on_rq = 0;
1707
1708 p->se.on_rq = 0;
1709 p->se.exec_start = 0;
1710 p->se.sum_exec_runtime = 0;
1711 p->se.prev_sum_exec_runtime = 0;
1712 p->se.nr_migrations = 0;
1713 p->se.vruntime = 0;
1714 INIT_LIST_HEAD(&p->se.group_node);
1715
1716#ifdef CONFIG_SCHEDSTATS
1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1718#endif
1719
1720 INIT_LIST_HEAD(&p->rt.run_list);
1721
1722#ifdef CONFIG_PREEMPT_NOTIFIERS
1723 INIT_HLIST_HEAD(&p->preempt_notifiers);
1724#endif
1725}
1726
1727
1728
1729
1730void sched_fork(struct task_struct *p)
1731{
1732 unsigned long flags;
1733 int cpu = get_cpu();
1734
1735 __sched_fork(p);
1736
1737
1738
1739
1740
1741 p->state = TASK_RUNNING;
1742
1743
1744
1745
1746 p->prio = current->normal_prio;
1747
1748
1749
1750
1751 if (unlikely(p->sched_reset_on_fork)) {
1752 if (task_has_rt_policy(p)) {
1753 p->policy = SCHED_NORMAL;
1754 p->static_prio = NICE_TO_PRIO(0);
1755 p->rt_priority = 0;
1756 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1757 p->static_prio = NICE_TO_PRIO(0);
1758
1759 p->prio = p->normal_prio = __normal_prio(p);
1760 set_load_weight(p);
1761
1762
1763
1764
1765
1766 p->sched_reset_on_fork = 0;
1767 }
1768
1769 if (!rt_prio(p->prio))
1770 p->sched_class = &fair_sched_class;
1771
1772 if (p->sched_class->task_fork)
1773 p->sched_class->task_fork(p);
1774
1775
1776
1777
1778
1779
1780
1781
1782 raw_spin_lock_irqsave(&p->pi_lock, flags);
1783 set_task_cpu(p, cpu);
1784 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1785
1786#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1787 if (likely(sched_info_on()))
1788 memset(&p->sched_info, 0, sizeof(p->sched_info));
1789#endif
1790#if defined(CONFIG_SMP)
1791 p->on_cpu = 0;
1792#endif
1793#ifdef CONFIG_PREEMPT_COUNT
1794
1795 task_thread_info(p)->preempt_count = 1;
1796#endif
1797#ifdef CONFIG_SMP
1798 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1799#endif
1800
1801 put_cpu();
1802}
1803
1804
1805
1806
1807
1808
1809
1810
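/*
 * wake_up_new_task - wake up a newly created task for the first time:
 * select a CPU for it, activate it on that runqueue and let it preempt
 * the current task if appropriate.
 */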
1811void wake_up_new_task(struct task_struct *p)
1812{
1813 unsigned long flags;
1814 struct rq *rq;
1815
1816 raw_spin_lock_irqsave(&p->pi_lock, flags);
1817#ifdef CONFIG_SMP
1818
1819
1820
1821
1822
1823 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1824#endif
1825
1826 rq = __task_rq_lock(p);
1827 activate_task(rq, p, 0);
1828 p->on_rq = 1;
1829 trace_sched_wakeup_new(p, true);
1830 check_preempt_curr(rq, p, WF_FORK);
1831#ifdef CONFIG_SMP
1832 if (p->sched_class->task_woken)
1833 p->sched_class->task_woken(rq, p);
1834#endif
1835 task_rq_unlock(rq, p, &flags);
1836}
1837
1838#ifdef CONFIG_PREEMPT_NOTIFIERS
1839
1840
1841
1842
1843
1844void preempt_notifier_register(struct preempt_notifier *notifier)
1845{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
1847}
1848EXPORT_SYMBOL_GPL(preempt_notifier_register);
1849
1850
1851
1852
1853
1854
1855
1856void preempt_notifier_unregister(struct preempt_notifier *notifier)
1857{
	hlist_del(&notifier->link);
1859}
1860EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1861
1862static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1863{
1864 struct preempt_notifier *notifier;
1865 struct hlist_node *node;
1866
1867 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1868 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1869}
1870
1871static void
1872fire_sched_out_preempt_notifiers(struct task_struct *curr,
1873 struct task_struct *next)
1874{
1875 struct preempt_notifier *notifier;
1876 struct hlist_node *node;
1877
1878 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1879 notifier->ops->sched_out(notifier, next);
1880}
1881
1882#else
1883
1884static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1885{
1886}
1887
1888static void
1889fire_sched_out_preempt_notifiers(struct task_struct *curr,
1890 struct task_struct *next)
1891{
1892}
1893
1894#endif
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
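/*
 * prepare_task_switch - prepare to switch tasks.  Called with the rq
 * lock held and interrupts disabled; must be paired with a subsequent
 * finish_task_switch() after the context switch.
 */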
1909static inline void
1910prepare_task_switch(struct rq *rq, struct task_struct *prev,
1911 struct task_struct *next)
1912{
1913 trace_sched_switch(prev, next);
1914 sched_info_switch(prev, next);
1915 perf_event_task_sched_out(prev, next);
1916 fire_sched_out_preempt_notifiers(prev, next);
1917 prepare_lock_switch(rq, next);
1918 prepare_arch_switch(next);
1919}
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
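/*
 * finish_task_switch - clean up after a task switch; pairs with
 * prepare_task_switch().  Drops any mm reference that context_switch()
 * deferred, releases the rq lock taken before the switch and, if the
 * previous task is dead, puts its task_struct.
 */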
1936static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1937 __releases(rq->lock)
1938{
1939 struct mm_struct *mm = rq->prev_mm;
1940 long prev_state;
1941
1942 rq->prev_mm = NULL;
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955 prev_state = prev->state;
1956 finish_arch_switch(prev);
1957#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1958 local_irq_disable();
1959#endif
1960 perf_event_task_sched_in(prev, current);
1961#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1962 local_irq_enable();
1963#endif
1964 finish_lock_switch(rq, prev);
1965 finish_arch_post_lock_switch();
1966
1967 fire_sched_in_preempt_notifiers(current);
1968 if (mm)
1969 mmdrop(mm);
1970 if (unlikely(prev_state == TASK_DEAD)) {
1971
1972
1973
1974
1975 kprobe_flush_task(prev);
1976 put_task_struct(prev);
1977 }
1978}
1979
1980#ifdef CONFIG_SMP
1981
1982
1983static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1984{
1985 if (prev->sched_class->pre_schedule)
1986 prev->sched_class->pre_schedule(rq, prev);
1987}
1988
1989
1990static inline void post_schedule(struct rq *rq)
1991{
1992 if (rq->post_schedule) {
1993 unsigned long flags;
1994
1995 raw_spin_lock_irqsave(&rq->lock, flags);
1996 if (rq->curr->sched_class->post_schedule)
1997 rq->curr->sched_class->post_schedule(rq);
1998 raw_spin_unlock_irqrestore(&rq->lock, flags);
1999
2000 rq->post_schedule = 0;
2001 }
2002}
2003
2004#else
2005
2006static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2007{
2008}
2009
2010static inline void post_schedule(struct rq *rq)
2011{
2012}
2013
2014#endif
2015
2016
2017
2018
2019
2020asmlinkage void schedule_tail(struct task_struct *prev)
2021 __releases(rq->lock)
2022{
2023 struct rq *rq = this_rq();
2024
2025 finish_task_switch(rq, prev);
2026
2027
2028
2029
2030
2031 post_schedule(rq);
2032
2033#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2034
2035 preempt_enable();
2036#endif
2037 if (current->set_child_tid)
2038 put_user(task_pid_vnr(current), current->set_child_tid);
2039}
2040
2041
2042
2043
2044
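/*
 * context_switch - switch to the new task's mm (or borrow the old one
 * for kernel threads) and to its register state / stack.
 */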
2045static inline void
2046context_switch(struct rq *rq, struct task_struct *prev,
2047 struct task_struct *next)
2048{
2049 struct mm_struct *mm, *oldmm;
2050
2051 prepare_task_switch(rq, prev, next);
2052
2053 mm = next->mm;
2054 oldmm = prev->active_mm;
2055
2056
2057
2058
2059
2060 arch_start_context_switch(prev);
2061
2062 if (!mm) {
2063 next->active_mm = oldmm;
2064 atomic_inc(&oldmm->mm_count);
2065 enter_lazy_tlb(oldmm, next);
2066 } else
2067 switch_mm(oldmm, mm, next);
2068
2069 if (!prev->mm) {
2070 prev->active_mm = NULL;
2071 rq->prev_mm = oldmm;
2072 }
2073
2074
2075
2076
2077
2078
2079#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2080 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2081#endif
2082
2083
2084 switch_to(prev, next, prev);
2085
2086 barrier();
2087
2088
2089
2090
2091
2092 finish_task_switch(this_rq(), prev);
2093}
2094
2095
2096
2097
2098
2099
2100
2101
2102unsigned long nr_running(void)
2103{
2104 unsigned long i, sum = 0;
2105
2106 for_each_online_cpu(i)
2107 sum += cpu_rq(i)->nr_running;
2108
2109 return sum;
2110}
2111
2112unsigned long nr_uninterruptible(void)
2113{
2114 unsigned long i, sum = 0;
2115
2116 for_each_possible_cpu(i)
2117 sum += cpu_rq(i)->nr_uninterruptible;
2118
2119
2120
2121
2122
2123 if (unlikely((long)sum < 0))
2124 sum = 0;
2125
2126 return sum;
2127}
2128
2129unsigned long long nr_context_switches(void)
2130{
2131 int i;
2132 unsigned long long sum = 0;
2133
2134 for_each_possible_cpu(i)
2135 sum += cpu_rq(i)->nr_switches;
2136
2137 return sum;
2138}
2139
2140unsigned long nr_iowait(void)
2141{
2142 unsigned long i, sum = 0;
2143
2144 for_each_possible_cpu(i)
2145 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2146
2147 return sum;
2148}
2149
2150unsigned long nr_iowait_cpu(int cpu)
2151{
2152 struct rq *this = cpu_rq(cpu);
2153 return atomic_read(&this->nr_iowait);
2154}
2155
2156unsigned long this_cpu_load(void)
2157{
2158 struct rq *this = this_rq();
2159 return this->cpu_load[0];
2160}
2161
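/*
 * Global load-average calculation.
 *
 * avenrun[] is an exponentially decaying average of
 * nr_running + nr_uninterruptible, updated every LOAD_FREQ interval
 * from calc_global_load().  Each CPU folds its local delta into
 * calc_load_tasks; CPUs that are tickless idle fold theirs into a
 * separate idle array (see the NO_HZ code below) so that a sleeping
 * CPU cannot stall or distort the fold.
 */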
2211static atomic_long_t calc_load_tasks;
2212static unsigned long calc_load_update;
2213unsigned long avenrun[3];
2214EXPORT_SYMBOL(avenrun);
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
2230
2231static long calc_load_fold_active(struct rq *this_rq)
2232{
2233 long nr_active, delta = 0;
2234
2235 nr_active = this_rq->nr_running;
2236 nr_active += (long) this_rq->nr_uninterruptible;
2237
2238 if (nr_active != this_rq->calc_load_active) {
2239 delta = nr_active - this_rq->calc_load_active;
2240 this_rq->calc_load_active = nr_active;
2241 }
2242
2243 return delta;
2244}
2245
2246
2247
2248
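/*
 * a1 = a0 * e + a * (1 - e)
 *
 * where 'e' is one of the EXP_1/EXP_5/EXP_15 fixed-point (FSHIFT)
 * decay factors and 'a' is the active task count scaled by FIXED_1.
 * The "+ 1UL << (FSHIFT - 1)" term rounds to nearest.
 */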
2249static unsigned long
2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2251{
2252 load *= exp;
2253 load += active * (FIXED_1 - exp);
2254 load += 1UL << (FSHIFT - 1);
2255 return load >> FSHIFT;
2256}
2257
2258#ifdef CONFIG_NO_HZ
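/*
 * Handle NO_HZ for the global load-average.
 *
 * A tickless-idle CPU can miss LOAD_FREQ sample points, so its
 * contribution is folded into one of two calc_load_idle slots,
 * double-buffered by calc_load_idx: writers pick the slot belonging to
 * the sample window their fold is for, and the global update reads and
 * clears the slot of the window that just closed.  This keeps idle CPUs
 * from being missed or counted twice.
 */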
2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2303
2304static inline int calc_load_write_idx(void)
2305{
2306 int idx = calc_load_idx;
2307
2308
2309
2310
2311
2312 smp_rmb();
2313
2314
2315
2316
2317
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2320
2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2332 long delta;
2333
2334
2335
2336
2337
2338 delta = calc_load_fold_active(this_rq);
2339 if (delta) {
2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2343}
2344
2345void calc_load_exit_idle(void)
2346{
2347 struct rq *this_rq = this_rq();
2348
2349
2350
2351
2352 if (time_before(jiffies, this_rq->calc_load_update))
2353 return;
2354
2355
2356
2357
2358
2359
2360 this_rq->calc_load_update = calc_load_update;
2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2372
2373 return delta;
2374}
2375
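/*
 * fixed_power_int - compute x^n for a fixed-point x (frac_bits fraction
 * bits) in O(log n) multiplications by repeated squaring over the bits
 * of n: x^n is the product of x^(2^i) for every set bit i.  Each
 * intermediate product is rounded back to frac_bits.
 */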
2391static unsigned long
2392fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2393{
2394 unsigned long result = 1UL << frac_bits;
2395
2396 if (n) for (;;) {
2397 if (n & 1) {
2398 result *= x;
2399 result += 1UL << (frac_bits - 1);
2400 result >>= frac_bits;
2401 }
2402 n >>= 1;
2403 if (!n)
2404 break;
2405 x *= x;
2406 x += 1UL << (frac_bits - 1);
2407 x >>= frac_bits;
2408 }
2409
2410 return result;
2411}
2412
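/*
 * calc_load_n - iterate the load-average recurrence n times at once:
 *
 *   a_n = a_0 * e^n + a * (1 - e^n)
 *
 * which follows from unrolling a_{i+1} = a_i * e + a * (1 - e) and
 * summing the resulting geometric series.  e^n is computed with
 * fixed_power_int().
 */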
2436static unsigned long
2437calc_load_n(unsigned long load, unsigned long exp,
2438 unsigned long active, unsigned int n)
2439{
2440
2441 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2442}
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453static void calc_global_nohz(void)
2454{
2455 long delta, active, n;
2456
2457 if (!time_before(jiffies, calc_load_update + 10)) {
2458
2459
2460
2461 delta = jiffies - calc_load_update - 10;
2462 n = 1 + (delta / LOAD_FREQ);
2463
2464 active = atomic_long_read(&calc_load_tasks);
2465 active = active > 0 ? active * FIXED_1 : 0;
2466
2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2470
2471 calc_load_update += n * LOAD_FREQ;
2472 }
2473
2474
2475
2476
2477
2478
2479
2480
2481 smp_wmb();
2482 calc_load_idx++;
2483}
2484#else
2485
2486static inline long calc_load_fold_idle(void) { return 0; }
2487static inline void calc_global_nohz(void) { }
2488
2489#endif
2490
2491
2492
2493
2494
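/*
 * calc_global_load - update the avenrun load estimates 10 ticks after
 * the CPUs have updated calc_load_tasks.
 */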
2495void calc_global_load(unsigned long ticks)
2496{
2497 long active, delta;
2498
2499 if (time_before(jiffies, calc_load_update + 10))
2500 return;
2501
2502
2503
2504
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2509 active = atomic_long_read(&calc_load_tasks);
2510 active = active > 0 ? active * FIXED_1 : 0;
2511
2512 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2513 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2514 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2515
2516 calc_load_update += LOAD_FREQ;
2517
2518
2519
2520
2521 calc_global_nohz();
2522}
2523
2524
2525
2526
2527
2528static void calc_load_account_active(struct rq *this_rq)
2529{
2530 long delta;
2531
2532 if (time_before(jiffies, this_rq->calc_load_update))
2533 return;
2534
2535 delta = calc_load_fold_active(this_rq);
2536 if (delta)
2537 atomic_long_add(delta, &calc_load_tasks);
2538
2539 this_rq->calc_load_update += LOAD_FREQ;
2540}
2541
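/*
 * Per-index cpu_load decay for tickless idle.
 *
 * At every tick the cpu_load[] entries are updated as
 *
 *   load[idx] = ((2^idx - 1) * load[idx] + cur_load) / 2^idx
 *
 * When a CPU misses n-1 ticks while idle (NO_HZ), the old component
 * must be decayed by ((2^idx - 1) / 2^idx)^(n-1).  decay_load_missed()
 * does this in O(log n) steps using the precomputed degrade_factor[]
 * table of ((2^idx - 1) / 2^idx)^(2^j) factors scaled by
 * 2^DEGRADE_SHIFT, while degrade_zero_ticks[] gives the miss count
 * beyond which the old load has decayed to zero anyway.
 */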
2573#define DEGRADE_SHIFT 7
2574static const unsigned char
2575 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2576static const unsigned char
2577 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2578 {0, 0, 0, 0, 0, 0, 0, 0},
2579 {64, 32, 8, 0, 0, 0, 0, 0},
2580 {96, 72, 40, 12, 1, 0, 0},
2581 {112, 98, 75, 43, 15, 1, 0},
2582 {120, 112, 98, 76, 45, 16, 2} };
2583
2584
2585
2586
2587
2588
2589static unsigned long
2590decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2591{
2592 int j = 0;
2593
2594 if (!missed_updates)
2595 return load;
2596
2597 if (missed_updates >= degrade_zero_ticks[idx])
2598 return 0;
2599
2600 if (idx == 1)
2601 return load >> missed_updates;
2602
2603 while (missed_updates) {
2604 if (missed_updates % 2)
2605 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2606
2607 missed_updates >>= 1;
2608 j++;
2609 }
2610 return load;
2611}
2612
2613
2614
2615
2616
2617
2618static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2619 unsigned long pending_updates)
2620{
2621 int i, scale;
2622
2623 this_rq->nr_load_updates++;
2624
2625
2626 this_rq->cpu_load[0] = this_load;
2627 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2628 unsigned long old_load, new_load;
2629
2630
2631
2632 old_load = this_rq->cpu_load[i];
2633 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2634 new_load = this_load;
2635
2636
2637
2638
2639
2640 if (new_load > old_load)
2641 new_load += scale - 1;
2642
2643 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2644 }
2645
2646 sched_avg_update(this_rq);
2647}
2648
2649#ifdef CONFIG_NO_HZ
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667void update_idle_cpu_load(struct rq *this_rq)
2668{
2669 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2670 unsigned long load = this_rq->load.weight;
2671 unsigned long pending_updates;
2672
2673
2674
2675
2676 if (load || curr_jiffies == this_rq->last_load_update_tick)
2677 return;
2678
2679 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2680 this_rq->last_load_update_tick = curr_jiffies;
2681
2682 __update_cpu_load(this_rq, load, pending_updates);
2683}
2684
2685
2686
2687
2688void update_cpu_load_nohz(void)
2689{
2690 struct rq *this_rq = this_rq();
2691 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2692 unsigned long pending_updates;
2693
2694 if (curr_jiffies == this_rq->last_load_update_tick)
2695 return;
2696
2697 raw_spin_lock(&this_rq->lock);
2698 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2699 if (pending_updates) {
2700 this_rq->last_load_update_tick = curr_jiffies;
2701
2702
2703
2704
2705 __update_cpu_load(this_rq, 0, pending_updates);
2706 }
2707 raw_spin_unlock(&this_rq->lock);
2708}
2709#endif
2710
2711
2712
2713
2714static void update_cpu_load_active(struct rq *this_rq)
2715{
2716
2717
2718
2719 this_rq->last_load_update_tick = jiffies;
2720 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2721
2722 calc_load_account_active(this_rq);
2723}
2724
2725#ifdef CONFIG_SMP
2726
2727
2728
2729
2730
2731void sched_exec(void)
2732{
2733 struct task_struct *p = current;
2734 unsigned long flags;
2735 int dest_cpu;
2736
2737 raw_spin_lock_irqsave(&p->pi_lock, flags);
2738 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2739 if (dest_cpu == smp_processor_id())
2740 goto unlock;
2741
2742 if (likely(cpu_active(dest_cpu))) {
2743 struct migration_arg arg = { p, dest_cpu };
2744
2745 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2746 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2747 return;
2748 }
2749unlock:
2750 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2751}
2752
2753#endif
2754
2755DEFINE_PER_CPU(struct kernel_stat, kstat);
2756DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2757
2758EXPORT_PER_CPU_SYMBOL(kstat);
2759EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2760
2761
2762
2763
2764
2765
2766
2767static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2768{
2769 u64 ns = 0;
2770
2771 if (task_current(rq, p)) {
2772 update_rq_clock(rq);
2773 ns = rq->clock_task - p->se.exec_start;
2774 if ((s64)ns < 0)
2775 ns = 0;
2776 }
2777
2778 return ns;
2779}
2780
2781unsigned long long task_delta_exec(struct task_struct *p)
2782{
2783 unsigned long flags;
2784 struct rq *rq;
2785 u64 ns = 0;
2786
2787 rq = task_rq_lock(p, &flags);
2788 ns = do_task_delta_exec(p, rq);
2789 task_rq_unlock(rq, p, &flags);
2790
2791 return ns;
2792}
2793
2794
2795
2796
2797
2798
2799unsigned long long task_sched_runtime(struct task_struct *p)
2800{
2801 unsigned long flags;
2802 struct rq *rq;
2803 u64 ns = 0;
2804
2805 rq = task_rq_lock(p, &flags);
2806 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2807 task_rq_unlock(rq, p, &flags);
2808
2809 return ns;
2810}
2811
2812#ifdef CONFIG_CGROUP_CPUACCT
2813struct cgroup_subsys cpuacct_subsys;
2814struct cpuacct root_cpuacct;
2815#endif
2816
2817static inline void task_group_account_field(struct task_struct *p, int index,
2818 u64 tmp)
2819{
2820#ifdef CONFIG_CGROUP_CPUACCT
2821 struct kernel_cpustat *kcpustat;
2822 struct cpuacct *ca;
2823#endif
2824
2825
2826
2827
2828
2829
2830 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2831
2832#ifdef CONFIG_CGROUP_CPUACCT
2833 if (unlikely(!cpuacct_subsys.active))
2834 return;
2835
2836 rcu_read_lock();
2837 ca = task_ca(p);
2838 while (ca && (ca != &root_cpuacct)) {
2839 kcpustat = this_cpu_ptr(ca->cpustat);
2840 kcpustat->cpustat[index] += tmp;
2841 ca = parent_ca(ca);
2842 }
2843 rcu_read_unlock();
2844#endif
2845}
2846
2847
2848
2849
2850
2851
2852
2853
2854void account_user_time(struct task_struct *p, cputime_t cputime,
2855 cputime_t cputime_scaled)
2856{
2857 int index;
2858
2859
2860 p->utime += cputime;
2861 p->utimescaled += cputime_scaled;
2862 account_group_user_time(p, cputime);
2863
2864 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2865
2866
2867 task_group_account_field(p, index, (__force u64) cputime);
2868
2869
2870 acct_update_integrals(p);
2871}
2872
2873
2874
2875
2876
2877
2878
2879static void account_guest_time(struct task_struct *p, cputime_t cputime,
2880 cputime_t cputime_scaled)
2881{
2882 u64 *cpustat = kcpustat_this_cpu->cpustat;
2883
2884
2885 p->utime += cputime;
2886 p->utimescaled += cputime_scaled;
2887 account_group_user_time(p, cputime);
2888 p->gtime += cputime;
2889
2890
2891 if (TASK_NICE(p) > 0) {
2892 cpustat[CPUTIME_NICE] += (__force u64) cputime;
2893 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
2894 } else {
2895 cpustat[CPUTIME_USER] += (__force u64) cputime;
2896 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
2897 }
2898}
2899
2900
2901
2902
2903
2904
2905
2906
2907static inline
2908void __account_system_time(struct task_struct *p, cputime_t cputime,
2909 cputime_t cputime_scaled, int index)
2910{
2911
2912 p->stime += cputime;
2913 p->stimescaled += cputime_scaled;
2914 account_group_system_time(p, cputime);
2915
2916
2917 task_group_account_field(p, index, (__force u64) cputime);
2918
2919
2920 acct_update_integrals(p);
2921}
2922
2923
2924
2925
2926
2927
2928
2929
2930void account_system_time(struct task_struct *p, int hardirq_offset,
2931 cputime_t cputime, cputime_t cputime_scaled)
2932{
2933 int index;
2934
2935 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
2936 account_guest_time(p, cputime, cputime_scaled);
2937 return;
2938 }
2939
2940 if (hardirq_count() - hardirq_offset)
2941 index = CPUTIME_IRQ;
2942 else if (in_serving_softirq())
2943 index = CPUTIME_SOFTIRQ;
2944 else
2945 index = CPUTIME_SYSTEM;
2946
2947 __account_system_time(p, cputime, cputime_scaled, index);
2948}
2949
2950
2951
2952
2953
2954void account_steal_time(cputime_t cputime)
2955{
2956 u64 *cpustat = kcpustat_this_cpu->cpustat;
2957
2958 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
2959}
2960
2961
2962
2963
2964
2965void account_idle_time(cputime_t cputime)
2966{
2967 u64 *cpustat = kcpustat_this_cpu->cpustat;
2968 struct rq *rq = this_rq();
2969
2970 if (atomic_read(&rq->nr_iowait) > 0)
2971 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
2972 else
2973 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
2974}
2975
2976static __always_inline bool steal_account_process_tick(void)
2977{
2978#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
2980 u64 steal, st = 0;
2981
2982 steal = paravirt_steal_clock(smp_processor_id());
2983 steal -= this_rq()->prev_steal_time;
2984
2985 st = steal_ticks(steal);
2986 this_rq()->prev_steal_time += st * TICK_NSEC;
2987
2988 account_steal_time(st);
2989 return st;
2990 }
2991#endif
2992 return false;
2993}
2994
2995#ifndef CONFIG_VIRT_CPU_ACCOUNTING
2996
2997#ifdef CONFIG_IRQ_TIME_ACCOUNTING
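/*
 * Account a single tick when IRQ time accounting is enabled.  Paravirt
 * steal time, if any, is folded out first; then any not-yet-accounted
 * hardirq/softirq time is charged, and only after that is the tick
 * attributed to ksoftirqd, user, idle, guest or system context, so that
 * a tick spent mostly in interrupts is not billed to the interrupted
 * task.
 */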
3019static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3020 struct rq *rq)
3021{
3022 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3023 u64 *cpustat = kcpustat_this_cpu->cpustat;
3024
3025 if (steal_account_process_tick())
3026 return;
3027
3028 if (irqtime_account_hi_update()) {
3029 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
3030 } else if (irqtime_account_si_update()) {
3031 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
3032 } else if (this_cpu_ksoftirqd() == p) {
3033
3034
3035
3036
3037
3038 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3039 CPUTIME_SOFTIRQ);
3040 } else if (user_tick) {
3041 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3042 } else if (p == rq->idle) {
3043 account_idle_time(cputime_one_jiffy);
3044 } else if (p->flags & PF_VCPU) {
3045 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3046 } else {
3047 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3048 CPUTIME_SYSTEM);
3049 }
3050}
3051
3052static void irqtime_account_idle_ticks(int ticks)
3053{
3054 int i;
3055 struct rq *rq = this_rq();
3056
3057 for (i = 0; i < ticks; i++)
3058 irqtime_account_process_tick(current, 0, rq);
3059}
3060#else
3061static void irqtime_account_idle_ticks(int ticks) {}
3062static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3063 struct rq *rq) {}
3064#endif
3065
3066
3067
3068
3069
3070
3071void account_process_tick(struct task_struct *p, int user_tick)
3072{
3073 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3074 struct rq *rq = this_rq();
3075
3076 if (sched_clock_irqtime) {
3077 irqtime_account_process_tick(p, user_tick, rq);
3078 return;
3079 }
3080
3081 if (steal_account_process_tick())
3082 return;
3083
3084 if (user_tick)
3085 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3086 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3087 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3088 one_jiffy_scaled);
3089 else
3090 account_idle_time(cputime_one_jiffy);
3091}
3092
3093
3094
3095
3096
3097
3098void account_steal_ticks(unsigned long ticks)
3099{
3100 account_steal_time(jiffies_to_cputime(ticks));
3101}
3102
3103
3104
3105
3106
3107void account_idle_ticks(unsigned long ticks)
3108{
3109
3110 if (sched_clock_irqtime) {
3111 irqtime_account_idle_ticks(ticks);
3112 return;
3113 }
3114
3115 account_idle_time(jiffies_to_cputime(ticks));
3116}
3117
3118#endif
3119
3120
3121
3122
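/*
 * Use precise platform statistics if available:
 */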
3123#ifdef CONFIG_VIRT_CPU_ACCOUNTING
3124void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3125{
3126 *ut = p->utime;
3127 *st = p->stime;
3128}
3129
3130void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3131{
3132 struct task_cputime cputime;
3133
3134 thread_group_cputime(p, &cputime);
3135
3136 *ut = cputime.utime;
3137 *st = cputime.stime;
3138}
3139#else
3140
3141#ifndef nsecs_to_cputime
3142# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3143#endif
3144
3145static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
3146{
3147 u64 temp = (__force u64) rtime;
3148
3149 temp *= (__force u64) utime;
3150
3151 if (sizeof(cputime_t) == 4)
3152 temp = div_u64(temp, (__force u32) total);
3153 else
3154 temp = div64_u64(temp, (__force u64) total);
3155
3156 return (__force cputime_t) temp;
3157}
3158
3159void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3160{
3161 cputime_t rtime, utime = p->utime, total = utime + p->stime;
3162
3163
3164
3165
3166 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3167
3168 if (total)
3169 utime = scale_utime(utime, rtime, total);
3170 else
3171 utime = rtime;
3172
3173
3174
3175
3176 p->prev_utime = max(p->prev_utime, utime);
3177 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
3178
3179 *ut = p->prev_utime;
3180 *st = p->prev_stime;
3181}
3182
3183
3184
3185
3186void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3187{
3188 struct signal_struct *sig = p->signal;
3189 struct task_cputime cputime;
3190 cputime_t rtime, utime, total;
3191
3192 thread_group_cputime(p, &cputime);
3193
3194 total = cputime.utime + cputime.stime;
3195 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3196
3197 if (total)
3198 utime = scale_utime(cputime.utime, rtime, total);
3199 else
3200 utime = rtime;
3201
3202 sig->prev_utime = max(sig->prev_utime, utime);
3203 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
3204
3205 *ut = sig->prev_utime;
3206 *st = sig->prev_stime;
3207}
3208#endif
3209
3210
3211
3212
3213
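/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */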
3214void scheduler_tick(void)
3215{
3216 int cpu = smp_processor_id();
3217 struct rq *rq = cpu_rq(cpu);
3218 struct task_struct *curr = rq->curr;
3219
3220 sched_clock_tick();
3221
3222 raw_spin_lock(&rq->lock);
3223 update_rq_clock(rq);
3224 update_cpu_load_active(rq);
3225 curr->sched_class->task_tick(rq, curr, 0);
3226 raw_spin_unlock(&rq->lock);
3227
3228 perf_event_task_tick();
3229
3230#ifdef CONFIG_SMP
3231 rq->idle_balance = idle_cpu(cpu);
3232 trigger_load_balance(rq, cpu);
3233#endif
3234}
3235
3236notrace unsigned long get_parent_ip(unsigned long addr)
3237{
3238 if (in_lock_functions(addr)) {
3239 addr = CALLER_ADDR2;
3240 if (in_lock_functions(addr))
3241 addr = CALLER_ADDR3;
3242 }
3243 return addr;
3244}
3245
3246#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3247 defined(CONFIG_PREEMPT_TRACER))
3248
3249void __kprobes add_preempt_count(int val)
3250{
3251#ifdef CONFIG_DEBUG_PREEMPT
3252
3253
3254
3255 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3256 return;
3257#endif
3258 preempt_count() += val;
3259#ifdef CONFIG_DEBUG_PREEMPT
3260
3261
3262
3263 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3264 PREEMPT_MASK - 10);
3265#endif
3266 if (preempt_count() == val)
3267 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3268}
3269EXPORT_SYMBOL(add_preempt_count);
3270
3271void __kprobes sub_preempt_count(int val)
3272{
3273#ifdef CONFIG_DEBUG_PREEMPT
3274
3275
3276
3277 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3278 return;
3279
3280
3281
3282 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3283 !(preempt_count() & PREEMPT_MASK)))
3284 return;
3285#endif
3286
3287 if (preempt_count() == val)
3288 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3289 preempt_count() -= val;
3290}
3291EXPORT_SYMBOL(sub_preempt_count);
3292
3293#endif
3294
3295
3296
3297
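/*
 * Print a "scheduling while atomic" bug report:
 */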
3298static noinline void __schedule_bug(struct task_struct *prev)
3299{
3300 if (oops_in_progress)
3301 return;
3302
3303 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3304 prev->comm, prev->pid, preempt_count());
3305
3306 debug_show_held_locks(prev);
3307 print_modules();
3308 if (irqs_disabled())
3309 print_irqtrace_events(prev);
3310 dump_stack();
3311 add_taint(TAINT_WARN);
3312}
3313
3314
3315
3316
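/*
 * Various schedule()-time debugging checks and statistics:
 */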
3317static inline void schedule_debug(struct task_struct *prev)
3318{
3319
3320
3321
3322
3323
3324 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3325 __schedule_bug(prev);
3326 rcu_sleep_check();
3327
3328 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3329
3330 schedstat_inc(this_rq(), sched_count);
3331}
3332
3333static void put_prev_task(struct rq *rq, struct task_struct *prev)
3334{
3335 if (prev->on_rq || rq->skip_clock_update < 0)
3336 update_rq_clock(rq);
3337 prev->sched_class->put_prev_task(rq, prev);
3338}
3339
3340
3341
3342
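/*
 * Pick up the highest-prio task:
 */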
3343static inline struct task_struct *
3344pick_next_task(struct rq *rq)
3345{
3346 const struct sched_class *class;
3347 struct task_struct *p;
3348
3349
3350
3351
3352
3353 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
3354 p = fair_sched_class.pick_next_task(rq);
3355 if (likely(p))
3356 return p;
3357 }
3358
3359 for_each_class(class) {
3360 p = class->pick_next_task(rq);
3361 if (p)
3362 return p;
3363 }
3364
3365 BUG();
3366}
3367
3368
3369
3370
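/*
 * __schedule() is the main scheduler function: pick the highest-prio
 * runnable task and switch to it.
 */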
3371static void __sched __schedule(void)
3372{
3373 struct task_struct *prev, *next;
3374 unsigned long *switch_count;
3375 struct rq *rq;
3376 int cpu;
3377
3378need_resched:
3379 preempt_disable();
3380 cpu = smp_processor_id();
3381 rq = cpu_rq(cpu);
3382 rcu_note_context_switch(cpu);
3383 prev = rq->curr;
3384
3385 schedule_debug(prev);
3386
3387 if (sched_feat(HRTICK))
3388 hrtick_clear(rq);
3389
3390 raw_spin_lock_irq(&rq->lock);
3391
3392 switch_count = &prev->nivcsw;
3393 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3394 if (unlikely(signal_pending_state(prev->state, prev))) {
3395 prev->state = TASK_RUNNING;
3396 } else {
3397 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3398 prev->on_rq = 0;
3399
3400
3401
3402
3403
3404
3405 if (prev->flags & PF_WQ_WORKER) {
3406 struct task_struct *to_wakeup;
3407
3408 to_wakeup = wq_worker_sleeping(prev, cpu);
3409 if (to_wakeup)
3410 try_to_wake_up_local(to_wakeup);
3411 }
3412 }
3413 switch_count = &prev->nvcsw;
3414 }
3415
3416 pre_schedule(rq, prev);
3417
3418 if (unlikely(!rq->nr_running))
3419 idle_balance(cpu, rq);
3420
3421 put_prev_task(rq, prev);
3422 next = pick_next_task(rq);
3423 clear_tsk_need_resched(prev);
3424 rq->skip_clock_update = 0;
3425
3426 if (likely(prev != next)) {
3427 rq->nr_switches++;
3428 rq->curr = next;
3429 ++*switch_count;
3430
3431 context_switch(rq, prev, next);
3432
3433
3434
3435
3436
3437
3438 cpu = smp_processor_id();
3439 rq = cpu_rq(cpu);
3440 } else
3441 raw_spin_unlock_irq(&rq->lock);
3442
3443 post_schedule(rq);
3444
3445 sched_preempt_enable_no_resched();
3446 if (need_resched())
3447 goto need_resched;
3448}
3449
3450static inline void sched_submit_work(struct task_struct *tsk)
3451{
3452 if (!tsk->state || tsk_is_pi_blocked(tsk))
3453 return;
3454
3455
3456
3457
3458 if (blk_needs_flush_plug(tsk))
3459 blk_schedule_flush_plug(tsk);
3460}
3461
3462asmlinkage void __sched schedule(void)
3463{
3464 struct task_struct *tsk = current;
3465
3466 sched_submit_work(tsk);
3467 __schedule();
3468}
3469EXPORT_SYMBOL(schedule);
3470
3471
3472
3473
3474
3475
3476void __sched schedule_preempt_disabled(void)
3477{
3478 sched_preempt_enable_no_resched();
3479 schedule();
3480 preempt_disable();
3481}
3482
3483#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3484
3485static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3486{
3487 if (lock->owner != owner)
3488 return false;
3489
3490
3491
3492
3493
3494
3495
3496 barrier();
3497
3498 return owner->on_cpu;
3499}
3500
3501
3502
3503
3504
3505int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3506{
3507 if (!sched_feat(OWNER_SPIN))
3508 return 0;
3509
3510 rcu_read_lock();
3511 while (owner_running(lock, owner)) {
3512 if (need_resched())
3513 break;
3514
3515 arch_mutex_cpu_relax();
3516 }
3517 rcu_read_unlock();
3518
3519
3520
3521
3522
3523
3524 return lock->owner == NULL;
3525}
3526#endif
3527
3528#ifdef CONFIG_PREEMPT
3529
3530
3531
3532
3533
3534asmlinkage void __sched notrace preempt_schedule(void)
3535{
3536 struct thread_info *ti = current_thread_info();
3537
3538
3539
3540
3541
3542 if (likely(ti->preempt_count || irqs_disabled()))
3543 return;
3544
3545 do {
3546 add_preempt_count_notrace(PREEMPT_ACTIVE);
3547 __schedule();
3548 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3549
3550
3551
3552
3553
3554 barrier();
3555 } while (need_resched());
3556}
3557EXPORT_SYMBOL(preempt_schedule);
3558
3559
3560
3561
3562
3563
3564
3565asmlinkage void __sched preempt_schedule_irq(void)
3566{
3567 struct thread_info *ti = current_thread_info();
3568
3569
3570 BUG_ON(ti->preempt_count || !irqs_disabled());
3571
3572 do {
3573 add_preempt_count(PREEMPT_ACTIVE);
3574 local_irq_enable();
3575 __schedule();
3576 local_irq_disable();
3577 sub_preempt_count(PREEMPT_ACTIVE);
3578
3579
3580
3581
3582
3583 barrier();
3584 } while (need_resched());
3585}
3586
3587#endif
3588
3589int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3590 void *key)
3591{
3592 return try_to_wake_up(curr->private, mode, wake_flags);
3593}
3594EXPORT_SYMBOL(default_wake_function);
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
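/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small
 * positive number) then we wake all the non-exclusive tasks and one
 * exclusive task.
 */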
3605static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3606 int nr_exclusive, int wake_flags, void *key)
3607{
3608 wait_queue_t *curr, *next;
3609
3610 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3611 unsigned flags = curr->flags;
3612
3613 if (curr->func(curr, mode, wake_flags, key) &&
3614 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3615 break;
3616 }
3617}
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
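/*
 * __wake_up - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: is directly passed to the wakeup function
 */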
3629void __wake_up(wait_queue_head_t *q, unsigned int mode,
3630 int nr_exclusive, void *key)
3631{
3632 unsigned long flags;
3633
3634 spin_lock_irqsave(&q->lock, flags);
3635 __wake_up_common(q, mode, nr_exclusive, 0, key);
3636 spin_unlock_irqrestore(&q->lock, flags);
3637}
3638EXPORT_SYMBOL(__wake_up);
3639
3640
3641
3642
3643void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3644{
3645 __wake_up_common(q, mode, nr, 0, NULL);
3646}
3647EXPORT_SYMBOL_GPL(__wake_up_locked);
3648
3649void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3650{
3651 __wake_up_common(q, mode, 1, 0, key);
3652}
3653EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3673 int nr_exclusive, void *key)
3674{
3675 unsigned long flags;
3676 int wake_flags = WF_SYNC;
3677
3678 if (unlikely(!q))
3679 return;
3680
3681 if (unlikely(!nr_exclusive))
3682 wake_flags = 0;
3683
3684 spin_lock_irqsave(&q->lock, flags);
3685 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3686 spin_unlock_irqrestore(&q->lock, flags);
3687}
3688EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3689
3690
3691
3692
3693void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3694{
3695 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3696}
3697EXPORT_SYMBOL_GPL(__wake_up_sync);
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
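/*
 * complete - signals a single thread waiting on this completion
 * @x: holds the state of this particular completion
 *
 * This will wake up a single thread waiting on this completion.
 */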
3711void complete(struct completion *x)
3712{
3713 unsigned long flags;
3714
3715 spin_lock_irqsave(&x->wait.lock, flags);
3716 x->done++;
3717 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3718 spin_unlock_irqrestore(&x->wait.lock, flags);
3719}
3720EXPORT_SYMBOL(complete);
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731void complete_all(struct completion *x)
3732{
3733 unsigned long flags;
3734
3735 spin_lock_irqsave(&x->wait.lock, flags);
3736 x->done += UINT_MAX/2;
3737 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3738 spin_unlock_irqrestore(&x->wait.lock, flags);
3739}
3740EXPORT_SYMBOL(complete_all);
3741
3742static inline long __sched
3743do_wait_for_common(struct completion *x, long timeout, int state)
3744{
3745 if (!x->done) {
3746 DECLARE_WAITQUEUE(wait, current);
3747
3748 __add_wait_queue_tail_exclusive(&x->wait, &wait);
3749 do {
3750 if (signal_pending_state(state, current)) {
3751 timeout = -ERESTARTSYS;
3752 break;
3753 }
3754 __set_current_state(state);
3755 spin_unlock_irq(&x->wait.lock);
3756 timeout = schedule_timeout(timeout);
3757 spin_lock_irq(&x->wait.lock);
3758 } while (!x->done && timeout);
3759 __remove_wait_queue(&x->wait, &wait);
3760 if (!x->done)
3761 return timeout;
3762 }
3763 x->done--;
3764 return timeout ?: 1;
3765}
3766
3767static long __sched
3768wait_for_common(struct completion *x, long timeout, int state)
3769{
3770 might_sleep();
3771
3772 spin_lock_irq(&x->wait.lock);
3773 timeout = do_wait_for_common(x, timeout, state);
3774 spin_unlock_irq(&x->wait.lock);
3775 return timeout;
3776}
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
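/*
 * wait_for_completion - waits for completion of a task
 * @x: holds the state of this particular completion
 *
 * This waits to be signaled for completion of a specific task. It is NOT
 * interruptible and there is no timeout.
 */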
3788void __sched wait_for_completion(struct completion *x)
3789{
3790 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3791}
3792EXPORT_SYMBOL(wait_for_completion);
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806unsigned long __sched
3807wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3808{
3809 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3810}
3811EXPORT_SYMBOL(wait_for_completion_timeout);
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822int __sched wait_for_completion_interruptible(struct completion *x)
3823{
3824 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3825 if (t == -ERESTARTSYS)
3826 return t;
3827 return 0;
3828}
3829EXPORT_SYMBOL(wait_for_completion_interruptible);
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842long __sched
3843wait_for_completion_interruptible_timeout(struct completion *x,
3844 unsigned long timeout)
3845{
3846 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3847}
3848EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859int __sched wait_for_completion_killable(struct completion *x)
3860{
3861 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3862 if (t == -ERESTARTSYS)
3863 return t;
3864 return 0;
3865}
3866EXPORT_SYMBOL(wait_for_completion_killable);
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880long __sched
3881wait_for_completion_killable_timeout(struct completion *x,
3882 unsigned long timeout)
3883{
3884 return wait_for_common(x, timeout, TASK_KILLABLE);
3885}
3886EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900bool try_wait_for_completion(struct completion *x)
3901{
3902 unsigned long flags;
3903 int ret = 1;
3904
3905 spin_lock_irqsave(&x->wait.lock, flags);
3906 if (!x->done)
3907 ret = 0;
3908 else
3909 x->done--;
3910 spin_unlock_irqrestore(&x->wait.lock, flags);
3911 return ret;
3912}
3913EXPORT_SYMBOL(try_wait_for_completion);
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923bool completion_done(struct completion *x)
3924{
3925 unsigned long flags;
3926 int ret = 1;
3927
3928 spin_lock_irqsave(&x->wait.lock, flags);
3929 if (!x->done)
3930 ret = 0;
3931 spin_unlock_irqrestore(&x->wait.lock, flags);
3932 return ret;
3933}
3934EXPORT_SYMBOL(completion_done);
3935
3936static long __sched
3937sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3938{
3939 unsigned long flags;
3940 wait_queue_t wait;
3941
3942 init_waitqueue_entry(&wait, current);
3943
3944 __set_current_state(state);
3945
3946 spin_lock_irqsave(&q->lock, flags);
3947 __add_wait_queue(q, &wait);
3948 spin_unlock(&q->lock);
3949 timeout = schedule_timeout(timeout);
3950 spin_lock_irq(&q->lock);
3951 __remove_wait_queue(q, &wait);
3952 spin_unlock_irqrestore(&q->lock, flags);
3953
3954 return timeout;
3955}
3956
3957void __sched interruptible_sleep_on(wait_queue_head_t *q)
3958{
3959 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3960}
3961EXPORT_SYMBOL(interruptible_sleep_on);
3962
3963long __sched
3964interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3965{
3966 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3967}
3968EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3969
3970void __sched sleep_on(wait_queue_head_t *q)
3971{
3972 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3973}
3974EXPORT_SYMBOL(sleep_on);
3975
3976long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3977{
3978 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3979}
3980EXPORT_SYMBOL(sleep_on_timeout);
3981
3982#ifdef CONFIG_RT_MUTEXES
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
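/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */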
3994void rt_mutex_setprio(struct task_struct *p, int prio)
3995{
3996 int oldprio, on_rq, running;
3997 struct rq *rq;
3998 const struct sched_class *prev_class;
3999
4000 BUG_ON(prio < 0 || prio > MAX_PRIO);
4001
4002 rq = __task_rq_lock(p);
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016 if (unlikely(p == rq->idle)) {
4017 WARN_ON(p != rq->curr);
4018 WARN_ON(p->pi_blocked_on);
4019 goto out_unlock;
4020 }
4021
4022 trace_sched_pi_setprio(p, prio);
4023 oldprio = p->prio;
4024 prev_class = p->sched_class;
4025 on_rq = p->on_rq;
4026 running = task_current(rq, p);
4027 if (on_rq)
4028 dequeue_task(rq, p, 0);
4029 if (running)
4030 p->sched_class->put_prev_task(rq, p);
4031
4032 if (rt_prio(prio))
4033 p->sched_class = &rt_sched_class;
4034 else
4035 p->sched_class = &fair_sched_class;
4036
4037 p->prio = prio;
4038
4039 if (running)
4040 p->sched_class->set_curr_task(rq);
4041 if (on_rq)
4042 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4043
4044 check_class_changed(rq, p, prev_class, oldprio);
4045out_unlock:
4046 __task_rq_unlock(rq);
4047}
4048#endif
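
/*
 * set_user_nice - update the static (nice) priority of a task and
 * requeue/resched it as needed.
 */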
4049void set_user_nice(struct task_struct *p, long nice)
4050{
4051 int old_prio, delta, on_rq;
4052 unsigned long flags;
4053 struct rq *rq;
4054
4055 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4056 return;
4057
4058
4059
4060
4061 rq = task_rq_lock(p, &flags);
4062
4063
4064
4065
4066
4067
4068 if (task_has_rt_policy(p)) {
4069 p->static_prio = NICE_TO_PRIO(nice);
4070 goto out_unlock;
4071 }
4072 on_rq = p->on_rq;
4073 if (on_rq)
4074 dequeue_task(rq, p, 0);
4075
4076 p->static_prio = NICE_TO_PRIO(nice);
4077 set_load_weight(p);
4078 old_prio = p->prio;
4079 p->prio = effective_prio(p);
4080 delta = p->prio - old_prio;
4081
4082 if (on_rq) {
4083 enqueue_task(rq, p, 0);
4084
4085
4086
4087
4088 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4089 resched_task(rq->curr);
4090 }
4091out_unlock:
4092 task_rq_unlock(rq, p, &flags);
4093}
4094EXPORT_SYMBOL(set_user_nice);
4095
4096
4097
4098
4099
4100
4101int can_nice(const struct task_struct *p, const int nice)
4102{
4103
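	/* Convert nice value [19,-20] to rlimit style value [1,40]: */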
4104 int nice_rlim = 20 - nice;
4105
4106 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4107 capable(CAP_SYS_NICE));
4108}
4109
4110#ifdef __ARCH_WANT_SYS_NICE
4111
4112
4113
4114
4115
4116
4117
4118
4119SYSCALL_DEFINE1(nice, int, increment)
4120{
4121 long nice, retval;
4122
4123
4124
4125
4126
4127
4128 if (increment < -40)
4129 increment = -40;
4130 if (increment > 40)
4131 increment = 40;
4132
4133 nice = TASK_NICE(current) + increment;
4134 if (nice < -20)
4135 nice = -20;
4136 if (nice > 19)
4137 nice = 19;
4138
4139 if (increment < 0 && !can_nice(current, nice))
4140 return -EPERM;
4141
4142 retval = security_task_setnice(current, nice);
4143 if (retval)
4144 return retval;
4145
4146 set_user_nice(current, nice);
4147 return 0;
4148}
4149
4150#endif
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160int task_prio(const struct task_struct *p)
4161{
4162 return p->prio - MAX_RT_PRIO;
4163}
4164
4165
4166
4167
4168
4169int task_nice(const struct task_struct *p)
4170{
4171 return TASK_NICE(p);
4172}
4173EXPORT_SYMBOL(task_nice);
4174
4175
4176
4177
4178
4179int idle_cpu(int cpu)
4180{
4181 struct rq *rq = cpu_rq(cpu);
4182
4183 if (rq->curr != rq->idle)
4184 return 0;
4185
4186 if (rq->nr_running)
4187 return 0;
4188
4189#ifdef CONFIG_SMP
4190 if (!llist_empty(&rq->wake_list))
4191 return 0;
4192#endif
4193
4194 return 1;
4195}
4196
4197
4198
4199
4200
4201struct task_struct *idle_task(int cpu)
4202{
4203 return cpu_rq(cpu)->idle;
4204}
4205
4206
4207
4208
4209
4210static struct task_struct *find_process_by_pid(pid_t pid)
4211{
4212 return pid ? find_task_by_vpid(pid) : current;
4213}
4214
4215
4216static void
4217__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4218{
4219 p->policy = policy;
4220 p->rt_priority = prio;
4221 p->normal_prio = normal_prio(p);
4222
4223 p->prio = rt_mutex_getprio(p);
4224 if (rt_prio(p->prio))
4225 p->sched_class = &rt_sched_class;
4226 else
4227 p->sched_class = &fair_sched_class;
4228 set_load_weight(p);
4229}
4230
4231
4232
4233
4234static bool check_same_owner(struct task_struct *p)
4235{
4236 const struct cred *cred = current_cred(), *pcred;
4237 bool match;
4238
4239 rcu_read_lock();
4240 pcred = __task_cred(p);
4241 match = (uid_eq(cred->euid, pcred->euid) ||
4242 uid_eq(cred->euid, pcred->uid));
4243 rcu_read_unlock();
4244 return match;
4245}
4246
4247static int __sched_setscheduler(struct task_struct *p, int policy,
4248 const struct sched_param *param, bool user)
4249{
4250 int retval, oldprio, oldpolicy = -1, on_rq, running;
4251 unsigned long flags;
4252 const struct sched_class *prev_class;
4253 struct rq *rq;
4254 int reset_on_fork;
4255
4256
4257 BUG_ON(in_interrupt());
4258recheck:
4259
4260 if (policy < 0) {
4261 reset_on_fork = p->sched_reset_on_fork;
4262 policy = oldpolicy = p->policy;
4263 } else {
4264 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4265 policy &= ~SCHED_RESET_ON_FORK;
4266
4267 if (policy != SCHED_FIFO && policy != SCHED_RR &&
4268 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4269 policy != SCHED_IDLE)
4270 return -EINVAL;
4271 }
4272
4273
4274
4275
4276
4277
4278 if (param->sched_priority < 0 ||
4279 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4280 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4281 return -EINVAL;
4282 if (rt_policy(policy) != (param->sched_priority != 0))
4283 return -EINVAL;
4284
4285
4286
4287
4288 if (user && !capable(CAP_SYS_NICE)) {
4289 if (rt_policy(policy)) {
4290 unsigned long rlim_rtprio =
4291 task_rlimit(p, RLIMIT_RTPRIO);
4292
4293
4294 if (policy != p->policy && !rlim_rtprio)
4295 return -EPERM;
4296
4297
4298 if (param->sched_priority > p->rt_priority &&
4299 param->sched_priority > rlim_rtprio)
4300 return -EPERM;
4301 }
4302
4303
4304
4305
4306
4307 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4308 if (!can_nice(p, TASK_NICE(p)))
4309 return -EPERM;
4310 }
4311
4312
4313 if (!check_same_owner(p))
4314 return -EPERM;
4315
4316
4317 if (p->sched_reset_on_fork && !reset_on_fork)
4318 return -EPERM;
4319 }
4320
4321 if (user) {
4322 retval = security_task_setscheduler(p);
4323 if (retval)
4324 return retval;
4325 }
4326
4327
4328
4329
4330
4331
4332
4333
4334 rq = task_rq_lock(p, &flags);
4335
4336
4337
4338
4339 if (p == rq->stop) {
4340 task_rq_unlock(rq, p, &flags);
4341 return -EINVAL;
4342 }
4343
4344
4345
4346
4347 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
4348 param->sched_priority == p->rt_priority))) {
4349 task_rq_unlock(rq, p, &flags);
4350 return 0;
4351 }
4352
4353#ifdef CONFIG_RT_GROUP_SCHED
4354 if (user) {
4355
4356
4357
4358
4359 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4360 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4361 !task_group_is_autogroup(task_group(p))) {
4362 task_rq_unlock(rq, p, &flags);
4363 return -EPERM;
4364 }
4365 }
4366#endif
4367
4368
4369 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4370 policy = oldpolicy = -1;
4371 task_rq_unlock(rq, p, &flags);
4372 goto recheck;
4373 }
4374 on_rq = p->on_rq;
4375 running = task_current(rq, p);
4376 if (on_rq)
4377 dequeue_task(rq, p, 0);
4378 if (running)
4379 p->sched_class->put_prev_task(rq, p);
4380
4381 p->sched_reset_on_fork = reset_on_fork;
4382
4383 oldprio = p->prio;
4384 prev_class = p->sched_class;
4385 __setscheduler(rq, p, policy, param->sched_priority);
4386
4387 if (running)
4388 p->sched_class->set_curr_task(rq);
4389 if (on_rq)
4390 enqueue_task(rq, p, 0);
4391
4392 check_class_changed(rq, p, prev_class, oldprio);
4393 task_rq_unlock(rq, p, &flags);
4394
4395 rt_mutex_adjust_pi(p);
4396
4397 return 0;
4398}
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408int sched_setscheduler(struct task_struct *p, int policy,
4409 const struct sched_param *param)
4410{
4411 return __sched_setscheduler(p, policy, param, true);
4412}
4413EXPORT_SYMBOL_GPL(sched_setscheduler);
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4427 const struct sched_param *param)
4428{
4429 return __sched_setscheduler(p, policy, param, false);
4430}
4431
4432static int
4433do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4434{
4435 struct sched_param lparam;
4436 struct task_struct *p;
4437 int retval;
4438
4439 if (!param || pid < 0)
4440 return -EINVAL;
4441 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4442 return -EFAULT;
4443
4444 rcu_read_lock();
4445 retval = -ESRCH;
4446 p = find_process_by_pid(pid);
4447 if (p != NULL)
4448 retval = sched_setscheduler(p, policy, &lparam);
4449 rcu_read_unlock();
4450
4451 return retval;
4452}
4453
4454
4455
4456
4457
4458
4459
4460SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4461 struct sched_param __user *, param)
4462{
4463
4464 if (policy < 0)
4465 return -EINVAL;
4466
4467 return do_sched_setscheduler(pid, policy, param);
4468}
4469
4470
4471
4472
4473
4474
4475SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4476{
4477 return do_sched_setscheduler(pid, -1, param);
4478}
4479
4480
4481
4482
4483
4484SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4485{
4486 struct task_struct *p;
4487 int retval;
4488
4489 if (pid < 0)
4490 return -EINVAL;
4491
4492 retval = -ESRCH;
4493 rcu_read_lock();
4494 p = find_process_by_pid(pid);
4495 if (p) {
4496 retval = security_task_getscheduler(p);
4497 if (!retval)
4498 retval = p->policy
4499 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4500 }
4501 rcu_read_unlock();
4502 return retval;
4503}
4504
4505
4506
4507
4508
4509
4510SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4511{
4512 struct sched_param lp;
4513 struct task_struct *p;
4514 int retval;
4515
4516 if (!param || pid < 0)
4517 return -EINVAL;
4518
4519 rcu_read_lock();
4520 p = find_process_by_pid(pid);
4521 retval = -ESRCH;
4522 if (!p)
4523 goto out_unlock;
4524
4525 retval = security_task_getscheduler(p);
4526 if (retval)
4527 goto out_unlock;
4528
4529 lp.sched_priority = p->rt_priority;
4530 rcu_read_unlock();
4531
4532
4533
4534
4535 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4536
4537 return retval;
4538
4539out_unlock:
4540 rcu_read_unlock();
4541 return retval;
4542}
4543
4544long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4545{
4546 cpumask_var_t cpus_allowed, new_mask;
4547 struct task_struct *p;
4548 int retval;
4549
4550 get_online_cpus();
4551 rcu_read_lock();
4552
4553 p = find_process_by_pid(pid);
4554 if (!p) {
4555 rcu_read_unlock();
4556 put_online_cpus();
4557 return -ESRCH;
4558 }
4559
4560
4561 get_task_struct(p);
4562 rcu_read_unlock();
4563
4564 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4565 retval = -ENOMEM;
4566 goto out_put_task;
4567 }
4568 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4569 retval = -ENOMEM;
4570 goto out_free_cpus_allowed;
4571 }
4572 retval = -EPERM;
4573 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
4574 goto out_unlock;
4575
4576 retval = security_task_setscheduler(p);
4577 if (retval)
4578 goto out_unlock;
4579
4580 cpuset_cpus_allowed(p, cpus_allowed);
4581 cpumask_and(new_mask, in_mask, cpus_allowed);
4582again:
4583 retval = set_cpus_allowed_ptr(p, new_mask);
4584
4585 if (!retval) {
4586 cpuset_cpus_allowed(p, cpus_allowed);
4587 if (!cpumask_subset(new_mask, cpus_allowed)) {
4588
4589
4590
4591
4592
4593 cpumask_copy(new_mask, cpus_allowed);
4594 goto again;
4595 }
4596 }
4597out_unlock:
4598 free_cpumask_var(new_mask);
4599out_free_cpus_allowed:
4600 free_cpumask_var(cpus_allowed);
4601out_put_task:
4602 put_task_struct(p);
4603 put_online_cpus();
4604 return retval;
4605}
4606
4607static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4608 struct cpumask *new_mask)
4609{
4610 if (len < cpumask_size())
4611 cpumask_clear(new_mask);
4612 else if (len > cpumask_size())
4613 len = cpumask_size();
4614
4615 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4616}
4617
4618
4619
4620
4621
4622
4623
4624SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4625 unsigned long __user *, user_mask_ptr)
4626{
4627 cpumask_var_t new_mask;
4628 int retval;
4629
4630 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4631 return -ENOMEM;
4632
4633 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4634 if (retval == 0)
4635 retval = sched_setaffinity(pid, new_mask);
4636 free_cpumask_var(new_mask);
4637 return retval;
4638}
4639
4640long sched_getaffinity(pid_t pid, struct cpumask *mask)
4641{
4642 struct task_struct *p;
4643 unsigned long flags;
4644 int retval;
4645
4646 get_online_cpus();
4647 rcu_read_lock();
4648
4649 retval = -ESRCH;
4650 p = find_process_by_pid(pid);
4651 if (!p)
4652 goto out_unlock;
4653
4654 retval = security_task_getscheduler(p);
4655 if (retval)
4656 goto out_unlock;
4657
4658 raw_spin_lock_irqsave(&p->pi_lock, flags);
4659 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4660 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4661
4662out_unlock:
4663 rcu_read_unlock();
4664 put_online_cpus();
4665
4666 return retval;
4667}
4668
4669
4670
4671
4672
4673
4674
4675SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4676 unsigned long __user *, user_mask_ptr)
4677{
4678 int ret;
4679 cpumask_var_t mask;
4680
4681 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4682 return -EINVAL;
4683 if (len & (sizeof(unsigned long)-1))
4684 return -EINVAL;
4685
4686 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4687 return -ENOMEM;
4688
4689 ret = sched_getaffinity(pid, mask);
4690 if (ret == 0) {
4691 size_t retlen = min_t(size_t, len, cpumask_size());
4692
4693 if (copy_to_user(user_mask_ptr, mask, retlen))
4694 ret = -EFAULT;
4695 else
4696 ret = retlen;
4697 }
4698 free_cpumask_var(mask);
4699
4700 return ret;
4701}
4702
4703
4704
4705
4706
4707
4708
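/*
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 */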
4709SYSCALL_DEFINE0(sched_yield)
4710{
4711 struct rq *rq = this_rq_lock();
4712
4713 schedstat_inc(rq, yld_count);
4714 current->sched_class->yield_task(rq);
4715
4716
4717
4718
4719
4720 __release(rq->lock);
4721 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4722 do_raw_spin_unlock(&rq->lock);
4723 sched_preempt_enable_no_resched();
4724
4725 schedule();
4726
4727 return 0;
4728}
4729
4730static inline int should_resched(void)
4731{
4732 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4733}
4734
4735static void __cond_resched(void)
4736{
4737 add_preempt_count(PREEMPT_ACTIVE);
4738 __schedule();
4739 sub_preempt_count(PREEMPT_ACTIVE);
4740}
4741
4742int __sched _cond_resched(void)
4743{
4744 if (should_resched()) {
4745 __cond_resched();
4746 return 1;
4747 }
4748 return 0;
4749}
4750EXPORT_SYMBOL(_cond_resched);
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760int __cond_resched_lock(spinlock_t *lock)
4761{
4762 int resched = should_resched();
4763 int ret = 0;
4764
4765 lockdep_assert_held(lock);
4766
4767 if (spin_needbreak(lock) || resched) {
4768 spin_unlock(lock);
4769 if (resched)
4770 __cond_resched();
4771 else
4772 cpu_relax();
4773 ret = 1;
4774 spin_lock(lock);
4775 }
4776 return ret;
4777}
4778EXPORT_SYMBOL(__cond_resched_lock);
4779
4780int __sched __cond_resched_softirq(void)
4781{
4782 BUG_ON(!in_softirq());
4783
4784 if (should_resched()) {
4785 local_bh_enable();
4786 __cond_resched();
4787 local_bh_disable();
4788 return 1;
4789 }
4790 return 0;
4791}
4792EXPORT_SYMBOL(__cond_resched_softirq);
4793
4815
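/*
 * yield - yield the current processor to other threads.
 *
 * Do not use this as a progress guarantee: the scheduler is free to pick
 * the calling task again immediately. Use wait_event() to wait for an
 * event and cond_resched() to be nice to other tasks.
 */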
4816void __sched yield(void)
4817{
4818 set_current_state(TASK_RUNNING);
4819 sys_sched_yield();
4820}
4821EXPORT_SYMBOL(yield);
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
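/*
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Returns true if we indeed boosted the target task.
 */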
4835bool __sched yield_to(struct task_struct *p, bool preempt)
4836{
4837 struct task_struct *curr = current;
4838 struct rq *rq, *p_rq;
4839 unsigned long flags;
	bool yielded = false;
4841
4842 local_irq_save(flags);
4843 rq = this_rq();
4844
4845again:
4846 p_rq = task_rq(p);
4847 double_rq_lock(rq, p_rq);
4848 while (task_rq(p) != p_rq) {
4849 double_rq_unlock(rq, p_rq);
4850 goto again;
4851 }
4852
4853 if (!curr->sched_class->yield_to_task)
4854 goto out;
4855
4856 if (curr->sched_class != p->sched_class)
4857 goto out;
4858
4859 if (task_running(p_rq, p) || p->state)
4860 goto out;
4861
4862 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4863 if (yielded) {
4864 schedstat_inc(rq, yld_count);
4865
4866
4867
4868
4869 if (preempt && rq != p_rq)
4870 resched_task(p_rq->curr);
4871 } else {
4872
4873
4874
4875
4876
4877 rq->skip_clock_update = 0;
4878 }
4879
4880out:
4881 double_rq_unlock(rq, p_rq);
4882 local_irq_restore(flags);
4883
4884 if (yielded)
4885 schedule();
4886
4887 return yielded;
4888}
4889EXPORT_SYMBOL_GPL(yield_to);
4890
4891
4892
4893
4894
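/*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 */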
4895void __sched io_schedule(void)
4896{
4897 struct rq *rq = raw_rq();
4898
4899 delayacct_blkio_start();
4900 atomic_inc(&rq->nr_iowait);
4901 blk_flush_plug(current);
4902 current->in_iowait = 1;
4903 schedule();
4904 current->in_iowait = 0;
4905 atomic_dec(&rq->nr_iowait);
4906 delayacct_blkio_end();
4907}
4908EXPORT_SYMBOL(io_schedule);
4909
4910long __sched io_schedule_timeout(long timeout)
4911{
4912 struct rq *rq = raw_rq();
4913 long ret;
4914
4915 delayacct_blkio_start();
4916 atomic_inc(&rq->nr_iowait);
4917 blk_flush_plug(current);
4918 current->in_iowait = 1;
4919 ret = schedule_timeout(timeout);
4920 current->in_iowait = 0;
4921 atomic_dec(&rq->nr_iowait);
4922 delayacct_blkio_end();
4923 return ret;
4924}
4925
4926
4927
4928
4929
4930
4931
4932
4933SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4934{
4935 int ret = -EINVAL;
4936
4937 switch (policy) {
4938 case SCHED_FIFO:
4939 case SCHED_RR:
4940 ret = MAX_USER_RT_PRIO-1;
4941 break;
4942 case SCHED_NORMAL:
4943 case SCHED_BATCH:
4944 case SCHED_IDLE:
4945 ret = 0;
4946 break;
4947 }
4948 return ret;
4949}
4950
4951
4952
4953
4954
4955
4956
4957
4958SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4959{
4960 int ret = -EINVAL;
4961
4962 switch (policy) {
4963 case SCHED_FIFO:
4964 case SCHED_RR:
4965 ret = 1;
4966 break;
4967 case SCHED_NORMAL:
4968 case SCHED_BATCH:
4969 case SCHED_IDLE:
4970 ret = 0;
4971 }
4972 return ret;
4973}
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4984 struct timespec __user *, interval)
4985{
4986 struct task_struct *p;
4987 unsigned int time_slice;
4988 unsigned long flags;
4989 struct rq *rq;
4990 int retval;
4991 struct timespec t;
4992
4993 if (pid < 0)
4994 return -EINVAL;
4995
4996 retval = -ESRCH;
4997 rcu_read_lock();
4998 p = find_process_by_pid(pid);
4999 if (!p)
5000 goto out_unlock;
5001
5002 retval = security_task_getscheduler(p);
5003 if (retval)
5004 goto out_unlock;
5005
5006 rq = task_rq_lock(p, &flags);
5007 time_slice = p->sched_class->get_rr_interval(rq, p);
5008 task_rq_unlock(rq, p, &flags);
5009
5010 rcu_read_unlock();
5011 jiffies_to_timespec(time_slice, &t);
5012 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5013 return retval;
5014
5015out_unlock:
5016 rcu_read_unlock();
5017 return retval;
5018}
5019
5020static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5021
5022void sched_show_task(struct task_struct *p)
5023{
5024 unsigned long free = 0;
5025 unsigned state;
5026
5027 state = p->state ? __ffs(p->state) + 1 : 0;
5028 printk(KERN_INFO "%-15.15s %c", p->comm,
5029 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5030#if BITS_PER_LONG == 32
5031 if (state == TASK_RUNNING)
5032 printk(KERN_CONT " running ");
5033 else
5034 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5035#else
5036 if (state == TASK_RUNNING)
5037 printk(KERN_CONT " running task ");
5038 else
5039 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5040#endif
5041#ifdef CONFIG_DEBUG_STACK_USAGE
5042 free = stack_not_used(p);
5043#endif
5044 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5045 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
5046 (unsigned long)task_thread_info(p)->flags);
5047
5048 show_stack(p, NULL);
5049}
5050
5051void show_state_filter(unsigned long state_filter)
5052{
5053 struct task_struct *g, *p;
5054
5055#if BITS_PER_LONG == 32
	printk(KERN_INFO
		"  task                PC stack   pid father\n");
#else
	printk(KERN_INFO
		"  task                        PC stack   pid father\n");
5061#endif
5062 rcu_read_lock();
5063 do_each_thread(g, p) {
5064
5065
5066
5067
5068 touch_nmi_watchdog();
5069 if (!state_filter || (p->state & state_filter))
5070 sched_show_task(p);
5071 } while_each_thread(g, p);
5072
5073 touch_all_softlockup_watchdogs();
5074
5075#ifdef CONFIG_SCHED_DEBUG
5076 sysrq_sched_debug_show();
5077#endif
5078 rcu_read_unlock();
5079
5080
5081
5082 if (!state_filter)
5083 debug_show_all_locks();
5084}
5085
5086void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5087{
5088 idle->sched_class = &idle_sched_class;
5089}
5090
5091
5092
5093
5094
5095
5096
5097
5098
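/*
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */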
5099void __cpuinit init_idle(struct task_struct *idle, int cpu)
5100{
5101 struct rq *rq = cpu_rq(cpu);
5102 unsigned long flags;
5103
5104 raw_spin_lock_irqsave(&rq->lock, flags);
5105
5106 __sched_fork(idle);
5107 idle->state = TASK_RUNNING;
5108 idle->se.exec_start = sched_clock();
5109
5110 do_set_cpus_allowed(idle, cpumask_of(cpu));
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121 rcu_read_lock();
5122 __set_task_cpu(idle, cpu);
5123 rcu_read_unlock();
5124
5125 rq->curr = rq->idle = idle;
5126#if defined(CONFIG_SMP)
5127 idle->on_cpu = 1;
5128#endif
5129 raw_spin_unlock_irqrestore(&rq->lock, flags);
5130
5131
5132 task_thread_info(idle)->preempt_count = 0;
5133
5134
5135
5136
5137 idle->sched_class = &idle_sched_class;
5138 ftrace_graph_init_idle_task(idle, cpu);
5139#if defined(CONFIG_SMP)
5140 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5141#endif
5142}
5143
5144#ifdef CONFIG_SMP
5145void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
5146{
5147 if (p->sched_class && p->sched_class->set_cpus_allowed)
5148 p->sched_class->set_cpus_allowed(p, new_mask);
5149
5150 cpumask_copy(&p->cpus_allowed, new_mask);
5151 p->nr_cpus_allowed = cpumask_weight(new_mask);
5152}
5153
5176
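/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */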
5177int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5178{
5179 unsigned long flags;
5180 struct rq *rq;
5181 unsigned int dest_cpu;
5182 int ret = 0;
5183
5184 rq = task_rq_lock(p, &flags);
5185
5186 if (cpumask_equal(&p->cpus_allowed, new_mask))
5187 goto out;
5188
5189 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5190 ret = -EINVAL;
5191 goto out;
5192 }
5193
5194 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5195 ret = -EINVAL;
5196 goto out;
5197 }
5198
5199 do_set_cpus_allowed(p, new_mask);
5200
5201
5202 if (cpumask_test_cpu(task_cpu(p), new_mask))
5203 goto out;
5204
5205 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5206 if (p->on_rq) {
5207 struct migration_arg arg = { p, dest_cpu };
5208
5209 task_rq_unlock(rq, p, &flags);
5210 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5211 tlb_migrate_finish(p->mm);
5212 return 0;
5213 }
5214out:
5215 task_rq_unlock(rq, p, &flags);
5216
5217 return ret;
5218}
5219EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
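/*
 * Move (not current) task off this cpu, onto dest cpu. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * Returns non-zero if the task was successfully migrated.
 */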
5232static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5233{
5234 struct rq *rq_dest, *rq_src;
5235 int ret = 0;
5236
5237 if (unlikely(!cpu_active(dest_cpu)))
5238 return ret;
5239
5240 rq_src = cpu_rq(src_cpu);
5241 rq_dest = cpu_rq(dest_cpu);
5242
5243 raw_spin_lock(&p->pi_lock);
5244 double_rq_lock(rq_src, rq_dest);
5245
5246 if (task_cpu(p) != src_cpu)
5247 goto done;
5248
5249 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
5250 goto fail;
5251
5252
5253
5254
5255
5256 if (p->on_rq) {
5257 dequeue_task(rq_src, p, 0);
5258 set_task_cpu(p, dest_cpu);
5259 enqueue_task(rq_dest, p, 0);
5260 check_preempt_curr(rq_dest, p, 0);
5261 }
5262done:
5263 ret = 1;
5264fail:
5265 double_rq_unlock(rq_src, rq_dest);
5266 raw_spin_unlock(&p->pi_lock);
5267 return ret;
5268}
5269
5270
5271
5272
5273
5274
5275static int migration_cpu_stop(void *data)
5276{
5277 struct migration_arg *arg = data;
5278
5279
5280
5281
5282
5283 local_irq_disable();
5284 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5285 local_irq_enable();
5286 return 0;
5287}
5288
5289#ifdef CONFIG_HOTPLUG_CPU
5290
5291
5292
5293
5294
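/*
 * Ensure that the idle task is using init_mm right before its cpu goes
 * offline.
 */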
5295void idle_task_exit(void)
5296{
5297 struct mm_struct *mm = current->active_mm;
5298
5299 BUG_ON(cpu_online(smp_processor_id()));
5300
5301 if (mm != &init_mm)
5302 switch_mm(mm, &init_mm, current);
5303 mmdrop(mm);
5304}
5305
5306
5307
5308
5309
5310
5311
5312
5313static void calc_load_migrate(struct rq *rq)
5314{
5315 long delta = calc_load_fold_active(rq);
5316 if (delta)
5317 atomic_long_add(delta, &calc_load_tasks);
5318}
5319
5320
5321
5322
5323
5324
5325
5326
5327
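/*
 * Migrate all tasks from the rq; sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we're in stop_machine() and
 * there's no concurrency possible; we hold the required locks anyway
 * because of lock validation efforts.
 */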
5328static void migrate_tasks(unsigned int dead_cpu)
5329{
5330 struct rq *rq = cpu_rq(dead_cpu);
5331 struct task_struct *next, *stop = rq->stop;
5332 int dest_cpu;
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343 rq->stop = NULL;
5344
5345 for ( ; ; ) {
5346
5347
5348
5349
5350 if (rq->nr_running == 1)
5351 break;
5352
5353 next = pick_next_task(rq);
5354 BUG_ON(!next);
5355 next->sched_class->put_prev_task(rq, next);
5356
5357
5358 dest_cpu = select_fallback_rq(dead_cpu, next);
5359 raw_spin_unlock(&rq->lock);
5360
5361 __migrate_task(next, dead_cpu, dest_cpu);
5362
5363 raw_spin_lock(&rq->lock);
5364 }
5365
5366 rq->stop = stop;
5367}
5368
5369#endif
5370
5371#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5372
5373static struct ctl_table sd_ctl_dir[] = {
5374 {
5375 .procname = "sched_domain",
5376 .mode = 0555,
5377 },
5378 {}
5379};
5380
5381static struct ctl_table sd_ctl_root[] = {
5382 {
5383 .procname = "kernel",
5384 .mode = 0555,
5385 .child = sd_ctl_dir,
5386 },
5387 {}
5388};
5389
5390static struct ctl_table *sd_alloc_ctl_entry(int n)
5391{
5392 struct ctl_table *entry =
5393 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5394
5395 return entry;
5396}
5397
5398static void sd_free_ctl_entry(struct ctl_table **tablep)
5399{
5400 struct ctl_table *entry;
5401
5402
5403
5404
5405
5406
5407
5408 for (entry = *tablep; entry->mode; entry++) {
5409 if (entry->child)
5410 sd_free_ctl_entry(&entry->child);
5411 if (entry->proc_handler == NULL)
5412 kfree(entry->procname);
5413 }
5414
5415 kfree(*tablep);
5416 *tablep = NULL;
5417}
5418
5419static void
5420set_table_entry(struct ctl_table *entry,
5421 const char *procname, void *data, int maxlen,
5422 umode_t mode, proc_handler *proc_handler)
5423{
5424 entry->procname = procname;
5425 entry->data = data;
5426 entry->maxlen = maxlen;
5427 entry->mode = mode;
5428 entry->proc_handler = proc_handler;
5429}
5430
5431static struct ctl_table *
5432sd_alloc_ctl_domain_table(struct sched_domain *sd)
5433{
5434 struct ctl_table *table = sd_alloc_ctl_entry(13);
5435
5436 if (table == NULL)
5437 return NULL;
5438
5439 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5440 sizeof(long), 0644, proc_doulongvec_minmax);
5441 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5442 sizeof(long), 0644, proc_doulongvec_minmax);
5443 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5444 sizeof(int), 0644, proc_dointvec_minmax);
5445 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5446 sizeof(int), 0644, proc_dointvec_minmax);
5447 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5448 sizeof(int), 0644, proc_dointvec_minmax);
5449 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5450 sizeof(int), 0644, proc_dointvec_minmax);
5451 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5452 sizeof(int), 0644, proc_dointvec_minmax);
5453 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5454 sizeof(int), 0644, proc_dointvec_minmax);
5455 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5456 sizeof(int), 0644, proc_dointvec_minmax);
5457 set_table_entry(&table[9], "cache_nice_tries",
5458 &sd->cache_nice_tries,
5459 sizeof(int), 0644, proc_dointvec_minmax);
5460 set_table_entry(&table[10], "flags", &sd->flags,
5461 sizeof(int), 0644, proc_dointvec_minmax);
5462 set_table_entry(&table[11], "name", sd->name,
5463 CORENAME_MAX_SIZE, 0444, proc_dostring);
5464
5465
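	/* &table[12] is the terminator */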
5466 return table;
5467}
5468
static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5470{
5471 struct ctl_table *entry, *table;
5472 struct sched_domain *sd;
5473 int domain_num = 0, i;
5474 char buf[32];
5475
5476 for_each_domain(cpu, sd)
5477 domain_num++;
5478 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5479 if (table == NULL)
5480 return NULL;
5481
5482 i = 0;
5483 for_each_domain(cpu, sd) {
5484 snprintf(buf, 32, "domain%d", i);
5485 entry->procname = kstrdup(buf, GFP_KERNEL);
5486 entry->mode = 0555;
5487 entry->child = sd_alloc_ctl_domain_table(sd);
5488 entry++;
5489 i++;
5490 }
5491 return table;
5492}
5493
5494static struct ctl_table_header *sd_sysctl_header;
5495static void register_sched_domain_sysctl(void)
5496{
5497 int i, cpu_num = num_possible_cpus();
5498 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5499 char buf[32];
5500
5501 WARN_ON(sd_ctl_dir[0].child);
5502 sd_ctl_dir[0].child = entry;
5503
5504 if (entry == NULL)
5505 return;
5506
5507 for_each_possible_cpu(i) {
5508 snprintf(buf, 32, "cpu%d", i);
5509 entry->procname = kstrdup(buf, GFP_KERNEL);
5510 entry->mode = 0555;
5511 entry->child = sd_alloc_ctl_cpu_table(i);
5512 entry++;
5513 }
5514
5515 WARN_ON(sd_sysctl_header);
5516 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5517}
5518
5519
5520static void unregister_sched_domain_sysctl(void)
5521{
5522 if (sd_sysctl_header)
5523 unregister_sysctl_table(sd_sysctl_header);
5524 sd_sysctl_header = NULL;
5525 if (sd_ctl_dir[0].child)
5526 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5527}
5528#else
5529static void register_sched_domain_sysctl(void)
5530{
5531}
5532static void unregister_sched_domain_sysctl(void)
5533{
5534}
5535#endif
5536
5537static void set_rq_online(struct rq *rq)
5538{
5539 if (!rq->online) {
5540 const struct sched_class *class;
5541
5542 cpumask_set_cpu(rq->cpu, rq->rd->online);
5543 rq->online = 1;
5544
5545 for_each_class(class) {
5546 if (class->rq_online)
5547 class->rq_online(rq);
5548 }
5549 }
5550}
5551
5552static void set_rq_offline(struct rq *rq)
5553{
5554 if (rq->online) {
5555 const struct sched_class *class;
5556
5557 for_each_class(class) {
5558 if (class->rq_offline)
5559 class->rq_offline(rq);
5560 }
5561
5562 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5563 rq->online = 0;
5564 }
5565}
5566
5567
5568
5569
5570
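/*
 * migration_call - notifier callback invoked as CPUs come and go; keeps
 * the runqueue's root-domain online state and load accounting in sync.
 */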
5571static int __cpuinit
5572migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5573{
5574 int cpu = (long)hcpu;
5575 unsigned long flags;
5576 struct rq *rq = cpu_rq(cpu);
5577
5578 switch (action & ~CPU_TASKS_FROZEN) {
5579
5580 case CPU_UP_PREPARE:
5581 rq->calc_load_update = calc_load_update;
5582 break;
5583
5584 case CPU_ONLINE:
5585
5586 raw_spin_lock_irqsave(&rq->lock, flags);
5587 if (rq->rd) {
5588 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5589
5590 set_rq_online(rq);
5591 }
5592 raw_spin_unlock_irqrestore(&rq->lock, flags);
5593 break;
5594
5595#ifdef CONFIG_HOTPLUG_CPU
5596 case CPU_DYING:
5597 sched_ttwu_pending();
5598
5599 raw_spin_lock_irqsave(&rq->lock, flags);
5600 if (rq->rd) {
5601 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5602 set_rq_offline(rq);
5603 }
5604 migrate_tasks(cpu);
5605 BUG_ON(rq->nr_running != 1);
5606 raw_spin_unlock_irqrestore(&rq->lock, flags);
5607
5608 calc_load_migrate(rq);
5609 break;
5610#endif
5611 }
5612
5613 update_max_interval();
5614
5615 return NOTIFY_OK;
5616}
5617
5618
5619
5620
5621
5622
5623static struct notifier_block __cpuinitdata migration_notifier = {
5624 .notifier_call = migration_call,
5625 .priority = CPU_PRI_MIGRATION,
5626};
5627
5628static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5629 unsigned long action, void *hcpu)
5630{
5631 switch (action & ~CPU_TASKS_FROZEN) {
5632 case CPU_STARTING:
5633 case CPU_DOWN_FAILED:
5634 set_cpu_active((long)hcpu, true);
5635 return NOTIFY_OK;
5636 default:
5637 return NOTIFY_DONE;
5638 }
5639}
5640
5641static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5642 unsigned long action, void *hcpu)
5643{
5644 switch (action & ~CPU_TASKS_FROZEN) {
5645 case CPU_DOWN_PREPARE:
5646 set_cpu_active((long)hcpu, false);
5647 return NOTIFY_OK;
5648 default:
5649 return NOTIFY_DONE;
5650 }
5651}
5652
5653static int __init migration_init(void)
5654{
5655 void *cpu = (void *)(long)smp_processor_id();
5656 int err;
5657
5658
5659 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5660 BUG_ON(err == NOTIFY_BAD);
5661 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5662 register_cpu_notifier(&migration_notifier);
5663
5664
5665 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5666 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5667
5668 return 0;
5669}
5670early_initcall(migration_init);
5671#endif
5672
5673#ifdef CONFIG_SMP
5674
5675static cpumask_var_t sched_domains_tmpmask;
5676
5677#ifdef CONFIG_SCHED_DEBUG
5678
5679static __read_mostly int sched_debug_enabled;
5680
5681static int __init sched_debug_setup(char *str)
5682{
5683 sched_debug_enabled = 1;
5684
5685 return 0;
5686}
5687early_param("sched_debug", sched_debug_setup);
5688
5689static inline bool sched_debug(void)
5690{
5691 return sched_debug_enabled;
5692}
5693
5694static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5695 struct cpumask *groupmask)
5696{
5697 struct sched_group *group = sd->groups;
5698 char str[256];
5699
5700 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5701 cpumask_clear(groupmask);
5702
5703 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5704
5705 if (!(sd->flags & SD_LOAD_BALANCE)) {
5706 printk("does not load-balance\n");
5707 if (sd->parent)
5708 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5709 " has parent");
5710 return -1;
5711 }
5712
5713 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5714
5715 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5716 printk(KERN_ERR "ERROR: domain->span does not contain "
5717 "CPU%d\n", cpu);
5718 }
5719 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5720 printk(KERN_ERR "ERROR: domain->groups does not contain"
5721 " CPU%d\n", cpu);
5722 }
5723
5724 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5725 do {
5726 if (!group) {
5727 printk("\n");
5728 printk(KERN_ERR "ERROR: group is NULL\n");
5729 break;
5730 }
5731
5732
5733
5734
5735
5736
5737 if (!group->sgp->power_orig) {
5738 printk(KERN_CONT "\n");
5739 printk(KERN_ERR "ERROR: domain->cpu_power not "
5740 "set\n");
5741 break;
5742 }
5743
5744 if (!cpumask_weight(sched_group_cpus(group))) {
5745 printk(KERN_CONT "\n");
5746 printk(KERN_ERR "ERROR: empty group\n");
5747 break;
5748 }
5749
5750 if (!(sd->flags & SD_OVERLAP) &&
5751 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5752 printk(KERN_CONT "\n");
5753 printk(KERN_ERR "ERROR: repeated CPUs\n");
5754 break;
5755 }
5756
5757 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5758
5759 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5760
5761 printk(KERN_CONT " %s", str);
5762 if (group->sgp->power != SCHED_POWER_SCALE) {
5763 printk(KERN_CONT " (cpu_power = %d)",
5764 group->sgp->power);
5765 }
5766
5767 group = group->next;
5768 } while (group != sd->groups);
5769 printk(KERN_CONT "\n");
5770
5771 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5772 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5773
5774 if (sd->parent &&
5775 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5776 printk(KERN_ERR "ERROR: parent span is not a superset "
5777 "of domain->span\n");
5778 return 0;
5779}
5780
5781static void sched_domain_debug(struct sched_domain *sd, int cpu)
5782{
5783 int level = 0;
5784
5785 if (!sched_debug_enabled)
5786 return;
5787
5788 if (!sd) {
5789 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5790 return;
5791 }
5792
5793 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5794
5795 for (;;) {
5796 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5797 break;
5798 level++;
5799 sd = sd->parent;
5800 if (!sd)
5801 break;
5802 }
5803}
5804#else
5805# define sched_domain_debug(sd, cpu) do { } while (0)
5806static inline bool sched_debug(void)
5807{
5808 return false;
5809}
5810#endif
5811
5812static int sd_degenerate(struct sched_domain *sd)
5813{
5814 if (cpumask_weight(sched_domain_span(sd)) == 1)
5815 return 1;
5816
5817
5818 if (sd->flags & (SD_LOAD_BALANCE |
5819 SD_BALANCE_NEWIDLE |
5820 SD_BALANCE_FORK |
5821 SD_BALANCE_EXEC |
5822 SD_SHARE_CPUPOWER |
5823 SD_SHARE_PKG_RESOURCES)) {
5824 if (sd->groups != sd->groups->next)
5825 return 0;
5826 }
5827
5828
5829 if (sd->flags & (SD_WAKE_AFFINE))
5830 return 0;
5831
5832 return 1;
5833}
5834
5835static int
5836sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5837{
5838 unsigned long cflags = sd->flags, pflags = parent->flags;
5839
5840 if (sd_degenerate(parent))
5841 return 1;
5842
5843 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5844 return 0;
5845
5846
5847 if (parent->groups == parent->groups->next) {
5848 pflags &= ~(SD_LOAD_BALANCE |
5849 SD_BALANCE_NEWIDLE |
5850 SD_BALANCE_FORK |
5851 SD_BALANCE_EXEC |
5852 SD_SHARE_CPUPOWER |
5853 SD_SHARE_PKG_RESOURCES);
5854 if (nr_node_ids == 1)
5855 pflags &= ~SD_SERIALIZE;
5856 }
5857 if (~cflags & pflags)
5858 return 0;
5859
5860 return 1;
5861}
5862
5863static void free_rootdomain(struct rcu_head *rcu)
5864{
5865 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5866
5867 cpupri_cleanup(&rd->cpupri);
5868 free_cpumask_var(rd->rto_mask);
5869 free_cpumask_var(rd->online);
5870 free_cpumask_var(rd->span);
5871 kfree(rd);
5872}
5873
5874static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5875{
5876 struct root_domain *old_rd = NULL;
5877 unsigned long flags;
5878
5879 raw_spin_lock_irqsave(&rq->lock, flags);
5880
5881 if (rq->rd) {
5882 old_rd = rq->rd;
5883
5884 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5885 set_rq_offline(rq);
5886
5887 cpumask_clear_cpu(rq->cpu, old_rd->span);
5888
5889
5890
5891
5892
5893
5894 if (!atomic_dec_and_test(&old_rd->refcount))
5895 old_rd = NULL;
5896 }
5897
5898 atomic_inc(&rd->refcount);
5899 rq->rd = rd;
5900
5901 cpumask_set_cpu(rq->cpu, rd->span);
5902 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5903 set_rq_online(rq);
5904
5905 raw_spin_unlock_irqrestore(&rq->lock, flags);
5906
5907 if (old_rd)
5908 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5909}
5910
5911static int init_rootdomain(struct root_domain *rd)
5912{
5913 memset(rd, 0, sizeof(*rd));
5914
5915 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5916 goto out;
5917 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5918 goto free_span;
5919 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5920 goto free_online;
5921
5922 if (cpupri_init(&rd->cpupri) != 0)
5923 goto free_rto_mask;
5924 return 0;
5925
5926free_rto_mask:
5927 free_cpumask_var(rd->rto_mask);
5928free_online:
5929 free_cpumask_var(rd->online);
5930free_span:
5931 free_cpumask_var(rd->span);
5932out:
5933 return -ENOMEM;
5934}
5935
5936
5937
5938
5939
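/*
 * By default the system creates a single root-domain with all CPUs as
 * members (mimicking the global state we have today).
 */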
5940struct root_domain def_root_domain;
5941
5942static void init_defrootdomain(void)
5943{
5944 init_rootdomain(&def_root_domain);
5945
5946 atomic_set(&def_root_domain.refcount, 1);
5947}
5948
5949static struct root_domain *alloc_rootdomain(void)
5950{
5951 struct root_domain *rd;
5952
5953 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5954 if (!rd)
5955 return NULL;
5956
5957 if (init_rootdomain(rd) != 0) {
5958 kfree(rd);
5959 return NULL;
5960 }
5961
5962 return rd;
5963}
5964
5965static void free_sched_groups(struct sched_group *sg, int free_sgp)
5966{
5967 struct sched_group *tmp, *first;
5968
5969 if (!sg)
5970 return;
5971
5972 first = sg;
5973 do {
5974 tmp = sg->next;
5975
5976 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5977 kfree(sg->sgp);
5978
5979 kfree(sg);
5980 sg = tmp;
5981 } while (sg != first);
5982}
5983
5984static void free_sched_domain(struct rcu_head *rcu)
5985{
5986 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5987
5988
5989
5990
5991
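/*
 * If it's an overlapping domain it has private groups; iterate and
 * free them all.
 */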
5992 if (sd->flags & SD_OVERLAP) {
5993 free_sched_groups(sd->groups, 1);
5994 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5995 kfree(sd->groups->sgp);
5996 kfree(sd->groups);
5997 }
5998 kfree(sd);
5999}
6000
6001static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6002{
6003 call_rcu(&sd->rcu, free_sched_domain);
6004}
6005
6006static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6007{
6008 for (; sd; sd = sd->parent)
6009 destroy_sched_domain(sd, cpu);
6010}
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
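/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this allows us to
 * avoid some pointer chasing in select_idle_sibling().
 *
 * Also keep a unique ID per domain (we use the first CPU number in the
 * cpumask of the domain), this allows us to quickly tell if two CPUs are
 * in the same cache domain, see cpus_share_cache().
 */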
6021DEFINE_PER_CPU(struct sched_domain *, sd_llc);
6022DEFINE_PER_CPU(int, sd_llc_id);
6023
6024static void update_top_cache_domain(int cpu)
6025{
6026 struct sched_domain *sd;
6027 int id = cpu;
6028
6029 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6030 if (sd)
6031 id = cpumask_first(sched_domain_span(sd));
6032
6033 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6034 per_cpu(sd_llc_id, cpu) = id;
6035}
6036
6037
6038
6039
6040
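/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */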
6041static void
6042cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6043{
6044 struct rq *rq = cpu_rq(cpu);
6045 struct sched_domain *tmp;
6046
6047
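/* Remove the sched domains which do not contribute to scheduling: */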
6048 for (tmp = sd; tmp; ) {
6049 struct sched_domain *parent = tmp->parent;
6050 if (!parent)
6051 break;
6052
6053 if (sd_parent_degenerate(tmp, parent)) {
6054 tmp->parent = parent->parent;
6055 if (parent->parent)
6056 parent->parent->child = tmp;
6057 destroy_sched_domain(parent, cpu);
6058 } else
6059 tmp = tmp->parent;
6060 }
6061
6062 if (sd && sd_degenerate(sd)) {
6063 tmp = sd;
6064 sd = sd->parent;
6065 destroy_sched_domain(tmp, cpu);
6066 if (sd)
6067 sd->child = NULL;
6068 }
6069
6070 sched_domain_debug(sd, cpu);
6071
6072 rq_attach_root(rq, rd);
6073 tmp = rq->sd;
6074 rcu_assign_pointer(rq->sd, sd);
6075 destroy_sched_domains(tmp, cpu);
6076
6077 update_top_cache_domain(cpu);
6078}
6079
6080
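/* CPUs with isolated domains */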
6081static cpumask_var_t cpu_isolated_map;
6082
6083
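/*
 * Setup the mask of CPUs configured for isolated domains, e.g. booting with
 * "isolcpus=2,3" keeps CPUs 2 and 3 out of the general scheduler domains.
 */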
6084static int __init isolated_cpu_setup(char *str)
6085{
6086 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6087 cpulist_parse(str, cpu_isolated_map);
6088 return 1;
6089}
6090
6091__setup("isolcpus=", isolated_cpu_setup);
6092
6093static const struct cpumask *cpu_cpu_mask(int cpu)
6094{
6095 return cpumask_of_node(cpu_to_node(cpu));
6096}
6097
6098struct sd_data {
6099 struct sched_domain **__percpu sd;
6100 struct sched_group **__percpu sg;
6101 struct sched_group_power **__percpu sgp;
6102};
6103
6104struct s_data {
6105 struct sched_domain ** __percpu sd;
6106 struct root_domain *rd;
6107};
6108
6109enum s_alloc {
6110 sa_rootdomain,
6111 sa_sd,
6112 sa_sd_storage,
6113 sa_none,
6114};
6115
6116struct sched_domain_topology_level;
6117
6118typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6119typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6120
6121#define SDTL_OVERLAP 0x01
6122
6123struct sched_domain_topology_level {
6124 sched_domain_init_f init;
6125 sched_domain_mask_f mask;
6126 int flags;
6127 int numa_level;
6128 struct sd_data data;
6129};
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
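/*
 * Build an iteration mask for @sg that can exclude certain CPUs from the
 * upwards domain traversal: only CPUs whose own sched_domain at this
 * topology level actually spans them are added to the group mask.
 */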
6144static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6145{
6146 const struct cpumask *span = sched_domain_span(sd);
6147 struct sd_data *sdd = sd->private;
6148 struct sched_domain *sibling;
6149 int i;
6150
6151 for_each_cpu(i, span) {
6152 sibling = *per_cpu_ptr(sdd->sd, i);
6153 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6154 continue;
6155
6156 cpumask_set_cpu(i, sched_group_mask(sg));
6157 }
6158}
6159
6160
6161
6162
6163
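/*
 * Return the canonical balance CPU for this group: the first CPU of the
 * group that is also set in the group's iteration mask.
 */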
6164int group_balance_cpu(struct sched_group *sg)
6165{
6166 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6167}
6168
6169static int
6170build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6171{
6172 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6173 const struct cpumask *span = sched_domain_span(sd);
6174 struct cpumask *covered = sched_domains_tmpmask;
6175 struct sd_data *sdd = sd->private;
6176 struct sched_domain *child;
6177 int i;
6178
6179 cpumask_clear(covered);
6180
6181 for_each_cpu(i, span) {
6182 struct cpumask *sg_span;
6183
6184 if (cpumask_test_cpu(i, covered))
6185 continue;
6186
6187 child = *per_cpu_ptr(sdd->sd, i);
6188
6189
6190 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6191 continue;
6192
6193 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6194 GFP_KERNEL, cpu_to_node(cpu));
6195
6196 if (!sg)
6197 goto fail;
6198
6199 sg_span = sched_group_cpus(sg);
6200 if (child->child) {
6201 child = child->child;
6202 cpumask_copy(sg_span, sched_domain_span(child));
6203 } else
6204 cpumask_set_cpu(i, sg_span);
6205
6206 cpumask_or(covered, covered, sg_span);
6207
6208 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6209 if (atomic_inc_return(&sg->sgp->ref) == 1)
6210 build_group_mask(sd, sg);
6211
6212
6213
6214
6215
6216
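/*
 * Initialize sgp->power such that even if we mess up the domains and
 * no possible iteration would get us here, we won't trash the cpu_power.
 */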
6217 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6218
6219
6220
6221
6222
6223
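/*
 * Make sure the first group of this domain contains the requested CPU
 * (or its balance CPU); otherwise the sched_domain iteration breaks.
 * See update_group_power().
 */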
6224 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6225 group_balance_cpu(sg) == cpu)
6226 groups = sg;
6227
6228 if (!first)
6229 first = sg;
6230 if (last)
6231 last->next = sg;
6232 last = sg;
6233 last->next = first;
6234 }
6235 sd->groups = groups;
6236
6237 return 0;
6238
6239fail:
6240 free_sched_groups(first, 0);
6241
6242 return -ENOMEM;
6243}
6244
6245static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6246{
6247 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6248 struct sched_domain *child = sd->child;
6249
6250 if (child)
6251 cpu = cpumask_first(sched_domain_span(child));
6252
6253 if (sg) {
6254 *sg = *per_cpu_ptr(sdd->sg, cpu);
6255 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
6256 atomic_set(&(*sg)->sgp->ref, 1);
6257 }
6258
6259 return cpu;
6260}
6261
6262
6263
6264
6265
6266
6267
6268
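/*
 * build_sched_groups() builds a circular linked list of the groups covered
 * by the given span and sets each group's ->cpumask correctly; the group
 * power is filled in later by init_sched_groups_power().
 *
 * Assumes the sched_domain tree is fully constructed.
 */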
6269static int
6270build_sched_groups(struct sched_domain *sd, int cpu)
6271{
6272 struct sched_group *first = NULL, *last = NULL;
6273 struct sd_data *sdd = sd->private;
6274 const struct cpumask *span = sched_domain_span(sd);
6275 struct cpumask *covered;
6276 int i;
6277
6278 get_group(cpu, sdd, &sd->groups);
6279 atomic_inc(&sd->groups->ref);
6280
6281 if (cpu != cpumask_first(sched_domain_span(sd)))
6282 return 0;
6283
6284 lockdep_assert_held(&sched_domains_mutex);
6285 covered = sched_domains_tmpmask;
6286
6287 cpumask_clear(covered);
6288
6289 for_each_cpu(i, span) {
6290 struct sched_group *sg;
6291 int group = get_group(i, sdd, &sg);
6292 int j;
6293
6294 if (cpumask_test_cpu(i, covered))
6295 continue;
6296
6297 cpumask_clear(sched_group_cpus(sg));
6298 sg->sgp->power = 0;
6299 cpumask_setall(sched_group_mask(sg));
6300
6301 for_each_cpu(j, span) {
6302 if (get_group(j, sdd, NULL) != group)
6303 continue;
6304
6305 cpumask_set_cpu(j, covered);
6306 cpumask_set_cpu(j, sched_group_cpus(sg));
6307 }
6308
6309 if (!first)
6310 first = sg;
6311 if (last)
6312 last->next = sg;
6313 last = sg;
6314 }
6315 last->next = first;
6316
6317 return 0;
6318}
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
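/*
 * Initialize sched groups cpu_power.
 *
 * cpu_power indicates the capacity of a sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_power for all the groups in a sched domain will be the same
 * unless there are asymmetries in the properties, as governed by
 * SD_ASYM_PACKING.
 */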
6330static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6331{
6332 struct sched_group *sg = sd->groups;
6333
6334 WARN_ON(!sd || !sg);
6335
6336 do {
6337 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
6338 sg = sg->next;
6339 } while (sg != sd->groups);
6340
6341 if (cpu != group_balance_cpu(sg))
6342 return;
6343
6344 update_group_power(sd, cpu);
6345 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6346}
6347
6348int __weak arch_sd_sibling_asym_packing(void)
6349{
6350 return 0*SD_ASYM_PACKING;
6351}
6352
6353
6354
6355
6356
6357
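/*
 * Initializers for sched domains. Non-inlined to reduce accumulated stack
 * pressure in build_sched_domains().
 */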
6358#ifdef CONFIG_SCHED_DEBUG
6359# define SD_INIT_NAME(sd, type) sd->name = #type
6360#else
6361# define SD_INIT_NAME(sd, type) do { } while (0)
6362#endif
6363
6364#define SD_INIT_FUNC(type) \
6365static noinline struct sched_domain * \
6366sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6367{ \
6368 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
6369 *sd = SD_##type##_INIT; \
6370 SD_INIT_NAME(sd, type); \
6371 sd->private = &tl->data; \
6372 return sd; \
6373}
6374
6375SD_INIT_FUNC(CPU)
6376#ifdef CONFIG_SCHED_SMT
6377 SD_INIT_FUNC(SIBLING)
6378#endif
6379#ifdef CONFIG_SCHED_MC
6380 SD_INIT_FUNC(MC)
6381#endif
6382#ifdef CONFIG_SCHED_BOOK
6383 SD_INIT_FUNC(BOOK)
6384#endif
6385
6386static int default_relax_domain_level = -1;
6387int sched_domain_level_max;
6388
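/*
 * Boot-time override for the default relax_domain_level used by
 * set_domain_attribute() below, e.g. "relax_domain_level=2".
 */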
6389static int __init setup_relax_domain_level(char *str)
6390{
6391 if (kstrtoint(str, 0, &default_relax_domain_level))
6392 pr_warn("Unable to set relax_domain_level\n");
6393
6394 return 1;
6395}
6396__setup("relax_domain_level=", setup_relax_domain_level);
6397
6398static void set_domain_attribute(struct sched_domain *sd,
6399 struct sched_domain_attr *attr)
6400{
6401 int request;
6402
6403 if (!attr || attr->relax_domain_level < 0) {
6404 if (default_relax_domain_level < 0)
6405 return;
6406 else
6407 request = default_relax_domain_level;
6408 } else
6409 request = attr->relax_domain_level;
6410 if (request < sd->level) {
6411
6412 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6413 } else {
6414
6415 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6416 }
6417}
6418
6419static void __sdt_free(const struct cpumask *cpu_map);
6420static int __sdt_alloc(const struct cpumask *cpu_map);
6421
6422static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6423 const struct cpumask *cpu_map)
6424{
6425 switch (what) {
6426 case sa_rootdomain:
6427 if (!atomic_read(&d->rd->refcount))
6428 free_rootdomain(&d->rd->rcu);
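/* fall through */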
6429 case sa_sd:
6430 free_percpu(d->sd);
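/* fall through */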
6431 case sa_sd_storage:
6432 __sdt_free(cpu_map);
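/* fall through */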
6433 case sa_none:
6434 break;
6435 }
6436}
6437
6438static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6439 const struct cpumask *cpu_map)
6440{
6441 memset(d, 0, sizeof(*d));
6442
6443 if (__sdt_alloc(cpu_map))
6444 return sa_sd_storage;
6445 d->sd = alloc_percpu(struct sched_domain *);
6446 if (!d->sd)
6447 return sa_sd_storage;
6448 d->rd = alloc_rootdomain();
6449 if (!d->rd)
6450 return sa_sd;
6451 return sa_rootdomain;
6452}
6453
6454
6455
6456
6457
6458
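/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */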
6459static void claim_allocations(int cpu, struct sched_domain *sd)
6460{
6461 struct sd_data *sdd = sd->private;
6462
6463 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6464 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6465
6466 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6467 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6468
6469 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
6470 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
6471}
6472
6473#ifdef CONFIG_SCHED_SMT
6474static const struct cpumask *cpu_smt_mask(int cpu)
6475{
6476 return topology_thread_cpumask(cpu);
6477}
6478#endif
6479
6480
6481
6482
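/* Topology list, bottom-up. */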
6483static struct sched_domain_topology_level default_topology[] = {
6484#ifdef CONFIG_SCHED_SMT
6485 { sd_init_SIBLING, cpu_smt_mask, },
6486#endif
6487#ifdef CONFIG_SCHED_MC
6488 { sd_init_MC, cpu_coregroup_mask, },
6489#endif
6490#ifdef CONFIG_SCHED_BOOK
6491 { sd_init_BOOK, cpu_book_mask, },
6492#endif
6493 { sd_init_CPU, cpu_cpu_mask, },
6494 { NULL, },
6495};
6496
6497static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6498
6499#ifdef CONFIG_NUMA
6500
6501static int sched_domains_numa_levels;
6502static int *sched_domains_numa_distance;
6503static struct cpumask ***sched_domains_numa_masks;
6504static int sched_domains_curr_level;
6505
6506static inline int sd_local_flags(int level)
6507{
6508 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6509 return 0;
6510
6511 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6512}
6513
6514static struct sched_domain *
6515sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6516{
6517 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6518 int level = tl->numa_level;
6519 int sd_weight = cpumask_weight(
6520 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6521
6522 *sd = (struct sched_domain){
6523 .min_interval = sd_weight,
6524 .max_interval = 2*sd_weight,
6525 .busy_factor = 32,
6526 .imbalance_pct = 125,
6527 .cache_nice_tries = 2,
6528 .busy_idx = 3,
6529 .idle_idx = 2,
6530 .newidle_idx = 0,
6531 .wake_idx = 0,
6532 .forkexec_idx = 0,
6533
6534 .flags = 1*SD_LOAD_BALANCE
6535 | 1*SD_BALANCE_NEWIDLE
6536 | 0*SD_BALANCE_EXEC
6537 | 0*SD_BALANCE_FORK
6538 | 0*SD_BALANCE_WAKE
6539 | 0*SD_WAKE_AFFINE
6540 | 0*SD_PREFER_LOCAL
6541 | 0*SD_SHARE_CPUPOWER
6542 | 0*SD_SHARE_PKG_RESOURCES
6543 | 1*SD_SERIALIZE
6544 | 0*SD_PREFER_SIBLING
6545 | sd_local_flags(level)
6546 ,
6547 .last_balance = jiffies,
6548 .balance_interval = sd_weight,
6549 };
6550 SD_INIT_NAME(sd, NUMA);
6551 sd->private = &tl->data;
6552
6553
6554
6555
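/*
 * Ugly hack to pass state to sd_numa_mask(): record the NUMA level we
 * are currently building domains for.
 */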
6556 sched_domains_curr_level = tl->numa_level;
6557
6558 return sd;
6559}
6560
6561static const struct cpumask *sd_numa_mask(int cpu)
6562{
6563 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6564}
6565
6566static void sched_numa_warn(const char *str)
6567{
6568 static bool done = false;
6569 int i, j;
6570
6571 if (done)
6572 return;
6573
6574 done = true;
6575
6576 printk(KERN_WARNING "ERROR: %s\n\n", str);
6577
6578 for (i = 0; i < nr_node_ids; i++) {
6579 printk(KERN_WARNING " ");
6580 for (j = 0; j < nr_node_ids; j++)
6581 printk(KERN_CONT "%02d ", node_distance(i,j));
6582 printk(KERN_CONT "\n");
6583 }
6584 printk(KERN_WARNING "\n");
6585}
6586
6587static bool find_numa_distance(int distance)
6588{
6589 int i;
6590
6591 if (distance == node_distance(0, 0))
6592 return true;
6593
6594 for (i = 0; i < sched_domains_numa_levels; i++) {
6595 if (sched_domains_numa_distance[i] == distance)
6596 return true;
6597 }
6598
6599 return false;
6600}
6601
6602static void sched_init_numa(void)
6603{
6604 int next_distance, curr_distance = node_distance(0, 0);
6605 struct sched_domain_topology_level *tl;
6606 int level = 0;
6607 int i, j, k;
6608
6609 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6610 if (!sched_domains_numa_distance)
6611 return;
6612
6613
6614
6615
6616
6617
6618
6619
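/*
 * De-duplication pass over the node_distance() table: repeatedly pick the
 * smallest distance larger than the current one and record each unique
 * distance, in increasing order, in sched_domains_numa_distance[];
 * 'level' counts the unique distances found.
 */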
6620 next_distance = curr_distance;
6621 for (i = 0; i < nr_node_ids; i++) {
6622 for (j = 0; j < nr_node_ids; j++) {
6623 for (k = 0; k < nr_node_ids; k++) {
6624 int distance = node_distance(i, k);
6625
6626 if (distance > curr_distance &&
6627 (distance < next_distance ||
6628 next_distance == curr_distance))
6629 next_distance = distance;
6630
6631
6632
6633
6634
6635
6636 if (sched_debug() && node_distance(k, i) != distance)
6637 sched_numa_warn("Node-distance not symmetric");
6638
6639 if (sched_debug() && i && !find_numa_distance(distance))
6640 sched_numa_warn("Node-0 not representative");
6641 }
6642 if (next_distance != curr_distance) {
6643 sched_domains_numa_distance[level++] = next_distance;
6644 sched_domains_numa_levels = level;
6645 curr_distance = next_distance;
6646 } else break;
6647 }
6648
6649
6650
6651
6652 if (!sched_debug())
6653 break;
6654 }
6655
6656
6657
6658
6659
6660
6661
6662
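/*
 * 'level' now holds the number of unique distances; allocate one array
 * of per-node cpumasks for each level.
 */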
6663 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6664 if (!sched_domains_numa_masks)
6665 return;
6666
6667
6668
6669
6670
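/*
 * Now for each level, construct a mask per node which contains all
 * CPUs of nodes that are that many hops away from us.
 */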
6671 for (i = 0; i < level; i++) {
6672 sched_domains_numa_masks[i] =
6673 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6674 if (!sched_domains_numa_masks[i])
6675 return;
6676
6677 for (j = 0; j < nr_node_ids; j++) {
6678 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6679 if (!mask)
6680 return;
6681
6682 sched_domains_numa_masks[i][j] = mask;
6683
6684 for (k = 0; k < nr_node_ids; k++) {
6685 if (node_distance(j, k) > sched_domains_numa_distance[i])
6686 continue;
6687
6688 cpumask_or(mask, mask, cpumask_of_node(k));
6689 }
6690 }
6691 }
6692
6693 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6694 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6695 if (!tl)
6696 return;
6697
6698
6699
6700
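/* Copy the default topology bits.. */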
6701 for (i = 0; default_topology[i].init; i++)
6702 tl[i] = default_topology[i];
6703
6704
6705
6706
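/* .. and append one topology level for each unique NUMA distance. */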
6707 for (j = 0; j < level; i++, j++) {
6708 tl[i] = (struct sched_domain_topology_level){
6709 .init = sd_numa_init,
6710 .mask = sd_numa_mask,
6711 .flags = SDTL_OVERLAP,
6712 .numa_level = j,
6713 };
6714 }
6715
6716 sched_domain_topology = tl;
6717}
6718#else
6719static inline void sched_init_numa(void)
6720{
6721}
6722#endif
6723
6724static int __sdt_alloc(const struct cpumask *cpu_map)
6725{
6726 struct sched_domain_topology_level *tl;
6727 int j;
6728
6729 for (tl = sched_domain_topology; tl->init; tl++) {
6730 struct sd_data *sdd = &tl->data;
6731
6732 sdd->sd = alloc_percpu(struct sched_domain *);
6733 if (!sdd->sd)
6734 return -ENOMEM;
6735
6736 sdd->sg = alloc_percpu(struct sched_group *);
6737 if (!sdd->sg)
6738 return -ENOMEM;
6739
6740 sdd->sgp = alloc_percpu(struct sched_group_power *);
6741 if (!sdd->sgp)
6742 return -ENOMEM;
6743
6744 for_each_cpu(j, cpu_map) {
6745 struct sched_domain *sd;
6746 struct sched_group *sg;
6747 struct sched_group_power *sgp;
6748
6749 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6750 GFP_KERNEL, cpu_to_node(j));
6751 if (!sd)
6752 return -ENOMEM;
6753
6754 *per_cpu_ptr(sdd->sd, j) = sd;
6755
6756 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6757 GFP_KERNEL, cpu_to_node(j));
6758 if (!sg)
6759 return -ENOMEM;
6760
6761 sg->next = sg;
6762
6763 *per_cpu_ptr(sdd->sg, j) = sg;
6764
6765 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6766 GFP_KERNEL, cpu_to_node(j));
6767 if (!sgp)
6768 return -ENOMEM;
6769
6770 *per_cpu_ptr(sdd->sgp, j) = sgp;
6771 }
6772 }
6773
6774 return 0;
6775}
6776
6777static void __sdt_free(const struct cpumask *cpu_map)
6778{
6779 struct sched_domain_topology_level *tl;
6780 int j;
6781
6782 for (tl = sched_domain_topology; tl->init; tl++) {
6783 struct sd_data *sdd = &tl->data;
6784
6785 for_each_cpu(j, cpu_map) {
6786 struct sched_domain *sd;
6787
6788 if (sdd->sd) {
6789 sd = *per_cpu_ptr(sdd->sd, j);
6790 if (sd && (sd->flags & SD_OVERLAP))
6791 free_sched_groups(sd->groups, 0);
6792 kfree(*per_cpu_ptr(sdd->sd, j));
6793 }
6794
6795 if (sdd->sg)
6796 kfree(*per_cpu_ptr(sdd->sg, j));
6797 if (sdd->sgp)
6798 kfree(*per_cpu_ptr(sdd->sgp, j));
6799 }
6800 free_percpu(sdd->sd);
6801 sdd->sd = NULL;
6802 free_percpu(sdd->sg);
6803 sdd->sg = NULL;
6804 free_percpu(sdd->sgp);
6805 sdd->sgp = NULL;
6806 }
6807}
6808
6809struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6810 struct s_data *d, const struct cpumask *cpu_map,
6811 struct sched_domain_attr *attr, struct sched_domain *child,
6812 int cpu)
6813{
6814 struct sched_domain *sd = tl->init(tl, cpu);
6815 if (!sd)
6816 return child;
6817
6818 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6819 if (child) {
6820 sd->level = child->level + 1;
6821 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6822 child->parent = sd;
6823 }
6824 sd->child = child;
6825 set_domain_attribute(sd, attr);
6826
6827 return sd;
6828}
6829
6830
6831
6832
6833
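/*
 * Build sched domains for a given set of CPUs and attach the sched domains
 * to the individual CPUs.
 */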
6834static int build_sched_domains(const struct cpumask *cpu_map,
6835 struct sched_domain_attr *attr)
6836{
6837 enum s_alloc alloc_state = sa_none;
6838 struct sched_domain *sd;
6839 struct s_data d;
6840 int i, ret = -ENOMEM;
6841
6842 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6843 if (alloc_state != sa_rootdomain)
6844 goto error;
6845
6846
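/* Set up domains for the CPUs specified by cpu_map: */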
6847 for_each_cpu(i, cpu_map) {
6848 struct sched_domain_topology_level *tl;
6849
6850 sd = NULL;
6851 for (tl = sched_domain_topology; tl->init; tl++) {
6852 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
6853 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6854 sd->flags |= SD_OVERLAP;
6855 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6856 break;
6857 }
6858
6859 while (sd->child)
6860 sd = sd->child;
6861
6862 *per_cpu_ptr(d.sd, i) = sd;
6863 }
6864
6865
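/* Build the groups for the domains: */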
6866 for_each_cpu(i, cpu_map) {
6867 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6868 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6869 if (sd->flags & SD_OVERLAP) {
6870 if (build_overlap_sched_groups(sd, i))
6871 goto error;
6872 } else {
6873 if (build_sched_groups(sd, i))
6874 goto error;
6875 }
6876 }
6877 }
6878
6879
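/* Calculate CPU power for physical packages and nodes: */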
6880 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6881 if (!cpumask_test_cpu(i, cpu_map))
6882 continue;
6883
6884 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6885 claim_allocations(i, sd);
6886 init_sched_groups_power(i, sd);
6887 }
6888 }
6889
6890
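/* Attach the domains: */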
6891 rcu_read_lock();
6892 for_each_cpu(i, cpu_map) {
6893 sd = *per_cpu_ptr(d.sd, i);
6894 cpu_attach_domain(sd, d.rd, i);
6895 }
6896 rcu_read_unlock();
6897
6898 ret = 0;
6899error:
6900 __free_domain_allocs(&d, alloc_state, cpu_map);
6901 return ret;
6902}
6903
6904 static cpumask_var_t *doms_cur;	/* current sched domains */
6905 static int ndoms_cur;	/* number of sched domains in 'doms_cur' */
6906 static struct sched_domain_attr *dattr_cur;	/* attributes of custom domains in 'doms_cur' */
6907
6908
6909
6910
6911
6912
6913
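/*
 * Special case: if a kmalloc() of a doms_cur partition (array of cpumasks)
 * fails, then fall back to a single sched domain, as determined by the
 * single cpumask fallback_doms.
 */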
6914static cpumask_var_t fallback_doms;
6915
6916
6917
6918
6919
6920
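/*
 * arch_update_cpu_topology() lets virtualized architectures update the
 * CPU core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */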
6921int __attribute__((weak)) arch_update_cpu_topology(void)
6922{
6923 return 0;
6924}
6925
6926cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6927{
6928 int i;
6929 cpumask_var_t *doms;
6930
6931 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6932 if (!doms)
6933 return NULL;
6934 for (i = 0; i < ndoms; i++) {
6935 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6936 free_sched_domains(doms, i);
6937 return NULL;
6938 }
6939 }
6940 return doms;
6941}
6942
6943void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6944{
6945 unsigned int i;
6946 for (i = 0; i < ndoms; i++)
6947 free_cpumask_var(doms[i]);
6948 kfree(doms);
6949}
6950
6951
6952
6953
6954
6955
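/*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 * For now this just excludes isolated CPUs, but could be used to exclude
 * other special cases in the future.
 */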
6956static int init_sched_domains(const struct cpumask *cpu_map)
6957{
6958 int err;
6959
6960 arch_update_cpu_topology();
6961 ndoms_cur = 1;
6962 doms_cur = alloc_sched_domains(ndoms_cur);
6963 if (!doms_cur)
6964 doms_cur = &fallback_doms;
6965 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6966 err = build_sched_domains(doms_cur[0], NULL);
6967 register_sched_domain_sysctl();
6968
6969 return err;
6970}
6971
6972
6973
6974
6975
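/*
 * Detach sched domains from a group of CPUs specified in cpu_map.
 * These CPUs will now be attached to the NULL domain.
 */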
6976static void detach_destroy_domains(const struct cpumask *cpu_map)
6977{
6978 int i;
6979
6980 rcu_read_lock();
6981 for_each_cpu(i, cpu_map)
6982 cpu_attach_domain(NULL, &def_root_domain, i);
6983 rcu_read_unlock();
6984}
6985
6986
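/* NULL attribute sets are treated as the default SD_ATTR_INIT. */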
6987static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6988 struct sched_domain_attr *new, int idx_new)
6989{
6990 struct sched_domain_attr tmp;
6991
6992
6993 if (!new && !cur)
6994 return 1;
6995
6996 tmp = SD_ATTR_INIT;
6997 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6998 new ? (new + idx_new) : &tmp,
6999 sizeof(struct sched_domain_attr));
7000}
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
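/*
 * Partition sched domains as specified by the 'ndoms_new' cpumasks in the
 * array doms_new[]. This compares doms_new[] to the current partitioning in
 * doms_cur[], destroys each deleted domain and builds each new one, reusing
 * any domain whose cpumask and attributes are unchanged.
 *
 * The passed-in doms_new[] should be allocated with alloc_sched_domains();
 * this routine takes ownership of it and frees the old partitioning.
 * Passing doms_new == NULL && ndoms_new == 1 falls back to the single
 * partition 'fallback_doms', built from cpu_active_mask minus the isolated
 * CPUs.
 *
 * Call with the hotplug lock held.
 */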
7028void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7029 struct sched_domain_attr *dattr_new)
7030{
7031 int i, j, n;
7032 int new_topology;
7033
7034 mutex_lock(&sched_domains_mutex);
7035
7036
7037 unregister_sched_domain_sysctl();
7038
7039
7040 new_topology = arch_update_cpu_topology();
7041
7042 n = doms_new ? ndoms_new : 0;
7043
7044
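/* Destroy deleted domains: */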
7045 for (i = 0; i < ndoms_cur; i++) {
7046 for (j = 0; j < n && !new_topology; j++) {
7047 if (cpumask_equal(doms_cur[i], doms_new[j])
7048 && dattrs_equal(dattr_cur, i, dattr_new, j))
7049 goto match1;
7050 }
7051
7052 detach_destroy_domains(doms_cur[i]);
7053match1:
7054 ;
7055 }
7056
7057 if (doms_new == NULL) {
7058 ndoms_cur = 0;
7059 doms_new = &fallback_doms;
7060 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7061 WARN_ON_ONCE(dattr_new);
7062 }
7063
7064
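/* Build new domains: */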
7065 for (i = 0; i < ndoms_new; i++) {
7066 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7067 if (cpumask_equal(doms_new[i], doms_cur[j])
7068 && dattrs_equal(dattr_new, i, dattr_cur, j))
7069 goto match2;
7070 }
7071
7072 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7073match2:
7074 ;
7075 }
7076
7077
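/* Remember the new sched domains: */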
7078 if (doms_cur != &fallback_doms)
7079 free_sched_domains(doms_cur, ndoms_cur);
7080 kfree(dattr_cur);
7081 doms_cur = doms_new;
7082 dattr_cur = dattr_new;
7083 ndoms_cur = ndoms_new;
7084
7085 register_sched_domain_sysctl();
7086
7087 mutex_unlock(&sched_domains_mutex);
7088}
7089
7090 static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
7091
7092
7093
7094
7095
7096
7097
7098
7099
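/*
 * Update the sched domains when a CPU comes (back) online. When cpusets are
 * in use this defers to cpuset_update_active_cpus(); during suspend/resume
 * the cpuset state is left alone and a single sched domain is rebuilt
 * instead.
 */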
7100static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7101 void *hcpu)
7102{
7103 switch (action) {
7104 case CPU_ONLINE_FROZEN:
7105 case CPU_DOWN_FAILED_FROZEN:
7106
7107
7108
7109
7110
7111
7112
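/*
 * num_cpus_frozen tracks how many CPUs are involved in the suspend/resume
 * sequence. As long as this is not the last online operation in the resume
 * sequence, just build a single sched domain, ignoring cpusets.
 */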
7113 num_cpus_frozen--;
7114 if (likely(num_cpus_frozen)) {
7115 partition_sched_domains(1, NULL, NULL);
7116 break;
7117 }
7118
7119
7120
7121
7122
7123
7124
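/*
 * This was the last CPU online operation, so fall through and restore the
 * original sched domains by considering the cpuset configurations.
 */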
7125 case CPU_ONLINE:
7126 case CPU_DOWN_FAILED:
7127 cpuset_update_active_cpus(true);
7128 break;
7129 default:
7130 return NOTIFY_DONE;
7131 }
7132 return NOTIFY_OK;
7133}
7134
7135static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7136 void *hcpu)
7137{
7138 switch (action) {
7139 case CPU_DOWN_PREPARE:
7140 cpuset_update_active_cpus(false);
7141 break;
7142 case CPU_DOWN_PREPARE_FROZEN:
7143 num_cpus_frozen++;
7144 partition_sched_domains(1, NULL, NULL);
7145 break;
7146 default:
7147 return NOTIFY_DONE;
7148 }
7149 return NOTIFY_OK;
7150}
7151
7152void __init sched_init_smp(void)
7153{
7154 cpumask_var_t non_isolated_cpus;
7155
7156 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7157 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7158
7159 sched_init_numa();
7160
7161 get_online_cpus();
7162 mutex_lock(&sched_domains_mutex);
7163 init_sched_domains(cpu_active_mask);
7164 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7165 if (cpumask_empty(non_isolated_cpus))
7166 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7167 mutex_unlock(&sched_domains_mutex);
7168 put_online_cpus();
7169
7170 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7171 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7172
7173
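/* The RT runtime code needs to handle some hotplug events: */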
7174 hotcpu_notifier(update_runtime, 0);
7175
7176 init_hrtick();
7177
7178
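/* Move init over to a non-isolated CPU: */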
7179 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
7180 BUG();
7181 sched_init_granularity();
7182 free_cpumask_var(non_isolated_cpus);
7183
7184 init_sched_rt_class();
7185}
7186#else
7187void __init sched_init_smp(void)
7188{
7189 sched_init_granularity();
7190}
7191#endif
7192
7193const_debug unsigned int sysctl_timer_migration = 1;
7194
7195int in_sched_functions(unsigned long addr)
7196{
7197 return in_lock_functions(addr) ||
7198 (addr >= (unsigned long)__sched_text_start
7199 && addr < (unsigned long)__sched_text_end);
7200}
7201
7202#ifdef CONFIG_CGROUP_SCHED
7203struct task_group root_task_group;
7204LIST_HEAD(task_groups);
7205#endif
7206
7207DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
7208
7209void __init sched_init(void)
7210{
7211 int i, j;
7212 unsigned long alloc_size = 0, ptr;
7213
7214#ifdef CONFIG_FAIR_GROUP_SCHED
7215 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7216#endif
7217#ifdef CONFIG_RT_GROUP_SCHED
7218 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7219#endif
7220#ifdef CONFIG_CPUMASK_OFFSTACK
7221 alloc_size += num_possible_cpus() * cpumask_size();
7222#endif
7223 if (alloc_size) {
7224 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7225
7226#ifdef CONFIG_FAIR_GROUP_SCHED
7227 root_task_group.se = (struct sched_entity **)ptr;
7228 ptr += nr_cpu_ids * sizeof(void **);
7229
7230 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7231 ptr += nr_cpu_ids * sizeof(void **);
7232
7233#endif
7234#ifdef CONFIG_RT_GROUP_SCHED
7235 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7236 ptr += nr_cpu_ids * sizeof(void **);
7237
7238 root_task_group.rt_rq = (struct rt_rq **)ptr;
7239 ptr += nr_cpu_ids * sizeof(void **);
7240
7241#endif
7242#ifdef CONFIG_CPUMASK_OFFSTACK
7243 for_each_possible_cpu(i) {
7244 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
7245 ptr += cpumask_size();
7246 }
7247#endif
7248 }
7249
7250#ifdef CONFIG_SMP
7251 init_defrootdomain();
7252#endif
7253
7254 init_rt_bandwidth(&def_rt_bandwidth,
7255 global_rt_period(), global_rt_runtime());
7256
7257#ifdef CONFIG_RT_GROUP_SCHED
7258 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7259 global_rt_period(), global_rt_runtime());
7260#endif
7261
7262#ifdef CONFIG_CGROUP_SCHED
7263 list_add(&root_task_group.list, &task_groups);
7264 INIT_LIST_HEAD(&root_task_group.children);
7265 INIT_LIST_HEAD(&root_task_group.siblings);
7266 autogroup_init(&init_task);
7267
7268#endif
7269
7270#ifdef CONFIG_CGROUP_CPUACCT
7271 root_cpuacct.cpustat = &kernel_cpustat;
7272 root_cpuacct.cpuusage = alloc_percpu(u64);
7273
7274 BUG_ON(!root_cpuacct.cpuusage);
7275#endif
7276 for_each_possible_cpu(i) {
7277 struct rq *rq;
7278
7279 rq = cpu_rq(i);
7280 raw_spin_lock_init(&rq->lock);
7281 rq->nr_running = 0;
7282 rq->calc_load_active = 0;
7283 rq->calc_load_update = jiffies + LOAD_FREQ;
7284 init_cfs_rq(&rq->cfs);
7285 init_rt_rq(&rq->rt, rq);
7286#ifdef CONFIG_FAIR_GROUP_SCHED
7287 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7288 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
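/*
 * How much CPU bandwidth does root_task_group get?
 *
 * In case of task-groups formed through the cgroup filesystem, it gets
 * 100% of the CPU resources in the system. This overall system CPU
 * resource is divided among the tasks of root_task_group and its child
 * task-groups in a fair manner, based on each entity's (task or
 * task-group's) weight (se->load.weight).
 *
 * For example, if root_task_group has 10 tasks of weight 1024 and two
 * child groups A0 and A1 (of weight 1024 each), then A0's share of the
 * CPU resource is:
 *
 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
 *
 * We achieve this by letting root_task_group's tasks sit directly in
 * rq->cfs (i.e. root_task_group->se[] = NULL).
 */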
7308 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
7309 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7310#endif
7311
7312 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7313#ifdef CONFIG_RT_GROUP_SCHED
7314 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7315 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7316#endif
7317
7318 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7319 rq->cpu_load[j] = 0;
7320
7321 rq->last_load_update_tick = jiffies;
7322
7323#ifdef CONFIG_SMP
7324 rq->sd = NULL;
7325 rq->rd = NULL;
7326 rq->cpu_power = SCHED_POWER_SCALE;
7327 rq->post_schedule = 0;
7328 rq->active_balance = 0;
7329 rq->next_balance = jiffies;
7330 rq->push_cpu = 0;
7331 rq->cpu = i;
7332 rq->online = 0;
7333 rq->idle_stamp = 0;
7334 rq->avg_idle = 2*sysctl_sched_migration_cost;
7335
7336 INIT_LIST_HEAD(&rq->cfs_tasks);
7337
7338 rq_attach_root(rq, &def_root_domain);
7339#ifdef CONFIG_NO_HZ
7340 rq->nohz_flags = 0;
7341#endif
7342#endif
7343 init_rq_hrtick(rq);
7344 atomic_set(&rq->nr_iowait, 0);
7345 }
7346
7347 set_load_weight(&init_task);
7348
7349#ifdef CONFIG_PREEMPT_NOTIFIERS
7350 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7351#endif
7352
7353#ifdef CONFIG_RT_MUTEXES
7354 plist_head_init(&init_task.pi_waiters);
7355#endif
7356
7357
7358
7359
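/* The boot idle thread does lazy MMU switching as well: */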
7360 atomic_inc(&init_mm.mm_count);
7361 enter_lazy_tlb(&init_mm, current);
7362
7363
7364
7365
7366
7367
7368
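/*
 * Make us the idle thread. Technically, schedule() should not be called
 * from this thread, however somewhere below it might be, but because we
 * are the idle thread, we just pick up running again when this runqueue
 * becomes "idle".
 */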
7369 init_idle(current, smp_processor_id());
7370
7371 calc_load_update = jiffies + LOAD_FREQ;
7372
7373
7374
7375
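/* During early bootup we pretend to be a normal task: */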
7376 current->sched_class = &fair_sched_class;
7377
7378#ifdef CONFIG_SMP
7379 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7380
7381 if (cpu_isolated_map == NULL)
7382 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7383 idle_thread_set_boot_cpu();
7384#endif
7385 init_sched_fair_class();
7386
7387 scheduler_running = 1;
7388}
7389
7390#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7391static inline int preempt_count_equals(int preempt_offset)
7392{
7393 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7394
7395 return (nested == preempt_offset);
7396}
7397
7398void __might_sleep(const char *file, int line, int preempt_offset)
7399{
7400 static unsigned long prev_jiffy;
7401
7402 rcu_sleep_check();
7403 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7404 system_state != SYSTEM_RUNNING || oops_in_progress)
7405 return;
7406 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7407 return;
7408 prev_jiffy = jiffies;
7409
7410 printk(KERN_ERR
7411 "BUG: sleeping function called from invalid context at %s:%d\n",
7412 file, line);
7413 printk(KERN_ERR
7414 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7415 in_atomic(), irqs_disabled(),
7416 current->pid, current->comm);
7417
7418 debug_show_held_locks(current);
7419 if (irqs_disabled())
7420 print_irqtrace_events(current);
7421 dump_stack();
7422}
7423EXPORT_SYMBOL(__might_sleep);
7424#endif
7425
7426#ifdef CONFIG_MAGIC_SYSRQ
7427static void normalize_task(struct rq *rq, struct task_struct *p)
7428{
7429 const struct sched_class *prev_class = p->sched_class;
7430 int old_prio = p->prio;
7431 int on_rq;
7432
7433 on_rq = p->on_rq;
7434 if (on_rq)
7435 dequeue_task(rq, p, 0);
7436 __setscheduler(rq, p, SCHED_NORMAL, 0);
7437 if (on_rq) {
7438 enqueue_task(rq, p, 0);
7439 resched_task(rq->curr);
7440 }
7441
7442 check_class_changed(rq, p, prev_class, old_prio);
7443}
7444
7445void normalize_rt_tasks(void)
7446{
7447 struct task_struct *g, *p;
7448 unsigned long flags;
7449 struct rq *rq;
7450
7451 read_lock_irqsave(&tasklist_lock, flags);
7452 do_each_thread(g, p) {
7453
7454
7455
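/* Only normalize user tasks: */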
7456 if (!p->mm)
7457 continue;
7458
7459 p->se.exec_start = 0;
7460#ifdef CONFIG_SCHEDSTATS
7461 p->se.statistics.wait_start = 0;
7462 p->se.statistics.sleep_start = 0;
7463 p->se.statistics.block_start = 0;
7464#endif
7465
7466 if (!rt_task(p)) {
7467
7468
7469
7470
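/*
 * Renice user tasks that are running at a negative nice level back to
 * the default nice level 0:
 */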
7471 if (TASK_NICE(p) < 0 && p->mm)
7472 set_user_nice(p, 0);
7473 continue;
7474 }
7475
7476 raw_spin_lock(&p->pi_lock);
7477 rq = __task_rq_lock(p);
7478
7479 normalize_task(rq, p);
7480
7481 __task_rq_unlock(rq);
7482 raw_spin_unlock(&p->pi_lock);
7483 } while_each_thread(g, p);
7484
7485 read_unlock_irqrestore(&tasklist_lock, flags);
7486}
7487
7488#endif
7489
7490#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
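/*
 * These functions are only useful for IA64 MCA handling or kdb. They can
 * only be called when the whole system has been stopped: every CPU needs
 * to be quiescent and no scheduling activity can be taking place.
 *
 * curr_task - return the current task for a given CPU.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */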
7507struct task_struct *curr_task(int cpu)
7508{
7509 return cpu_curr(cpu);
7510}
7511
7512#endif
7513
7514#ifdef CONFIG_IA64
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
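/*
 * set_curr_task - set the current task for a given CPU.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */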
7530void set_curr_task(int cpu, struct task_struct *p)
7531{
7532 cpu_curr(cpu) = p;
7533}
7534
7535#endif
7536
7537#ifdef CONFIG_CGROUP_SCHED
7538
7539static DEFINE_SPINLOCK(task_group_lock);
7540
7541static void free_sched_group(struct task_group *tg)
7542{
7543 free_fair_sched_group(tg);
7544 free_rt_sched_group(tg);
7545 autogroup_free(tg);
7546 kfree(tg);
7547}
7548
7549
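/* Allocate runqueue structures etc. for a new task group. */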
7550struct task_group *sched_create_group(struct task_group *parent)
7551{
7552 struct task_group *tg;
7553 unsigned long flags;
7554
7555 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7556 if (!tg)
7557 return ERR_PTR(-ENOMEM);
7558
7559 if (!alloc_fair_sched_group(tg, parent))
7560 goto err;
7561
7562 if (!alloc_rt_sched_group(tg, parent))
7563 goto err;
7564
7565 spin_lock_irqsave(&task_group_lock, flags);
7566 list_add_rcu(&tg->list, &task_groups);
7567
7568 WARN_ON(!parent);
7569
7570 tg->parent = parent;
7571 INIT_LIST_HEAD(&tg->children);
7572 list_add_rcu(&tg->siblings, &parent->children);
7573 spin_unlock_irqrestore(&task_group_lock, flags);
7574
7575 return tg;
7576
7577err:
7578 free_sched_group(tg);
7579 return ERR_PTR(-ENOMEM);
7580}
7581
7582
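/* RCU callback to free the structures associated with a task group. */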
7583static void free_sched_group_rcu(struct rcu_head *rhp)
7584{
7585
7586 free_sched_group(container_of(rhp, struct task_group, rcu));
7587}
7588
7589
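/* Destroy the runqueue structures etc. associated with a task group. */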
7590void sched_destroy_group(struct task_group *tg)
7591{
7592 unsigned long flags;
7593 int i;
7594
7595
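/* End participation in shares distribution: */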
7596 for_each_possible_cpu(i)
7597 unregister_fair_sched_group(tg, i);
7598
7599 spin_lock_irqsave(&task_group_lock, flags);
7600 list_del_rcu(&tg->list);
7601 list_del_rcu(&tg->siblings);
7602 spin_unlock_irqrestore(&task_group_lock, flags);
7603
7604
7605 call_rcu(&tg->rcu, free_sched_group_rcu);
7606}
7607
7608
7609
7610
7611
7612
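/*
 * sched_move_task - move a task to another task group.
 *
 * The caller is expected to have placed the task in its destination cgroup
 * already; this just updates tsk->sched_task_group and re-queues the task
 * on its runqueue if it was queued or running.
 */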
7613void sched_move_task(struct task_struct *tsk)
7614{
7615 struct task_group *tg;
7616 int on_rq, running;
7617 unsigned long flags;
7618 struct rq *rq;
7619
7620 rq = task_rq_lock(tsk, &flags);
7621
7622 running = task_current(rq, tsk);
7623 on_rq = tsk->on_rq;
7624
7625 if (on_rq)
7626 dequeue_task(rq, tsk, 0);
7627 if (unlikely(running))
7628 tsk->sched_class->put_prev_task(rq, tsk);
7629
7630 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
7631 lockdep_is_held(&tsk->sighand->siglock)),
7632 struct task_group, css);
7633 tg = autogroup_task_group(tsk, tg);
7634 tsk->sched_task_group = tg;
7635
7636#ifdef CONFIG_FAIR_GROUP_SCHED
7637 if (tsk->sched_class->task_move_group)
7638 tsk->sched_class->task_move_group(tsk, on_rq);
7639 else
7640#endif
7641 set_task_rq(tsk, task_cpu(tsk));
7642
7643 if (unlikely(running))
7644 tsk->sched_class->set_curr_task(rq);
7645 if (on_rq)
7646 enqueue_task(rq, tsk, 0);
7647
7648 task_rq_unlock(rq, tsk, &flags);
7649}
7650#endif
7651
7652#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
7653static unsigned long to_ratio(u64 period, u64 runtime)
7654{
7655 if (runtime == RUNTIME_INF)
7656 return 1ULL << 20;
7657
7658 return div64_u64(runtime << 20, period);
7659}
7660#endif
7661
7662#ifdef CONFIG_RT_GROUP_SCHED
7663
7664
7665
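/* Mutex serializing changes to the RT bandwidth constraints below. */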
7666static DEFINE_MUTEX(rt_constraints_mutex);
7667
7668
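/* Must be called with tasklist_lock held. */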
7669static inline int tg_has_rt_tasks(struct task_group *tg)
7670{
7671 struct task_struct *g, *p;
7672
7673 do_each_thread(g, p) {
7674 if (rt_task(p) && task_rq(p)->rt.tg == tg)
7675 return 1;
7676 } while_each_thread(g, p);
7677
7678 return 0;
7679}
7680
7681struct rt_schedulable_data {
7682 struct task_group *tg;
7683 u64 rt_period;
7684 u64 rt_runtime;
7685};
7686
7687static int tg_rt_schedulable(struct task_group *tg, void *data)
7688{
7689 struct rt_schedulable_data *d = data;
7690 struct task_group *child;
7691 unsigned long total, sum = 0;
7692 u64 period, runtime;
7693
7694 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7695 runtime = tg->rt_bandwidth.rt_runtime;
7696
7697 if (tg == d->tg) {
7698 period = d->rt_period;
7699 runtime = d->rt_runtime;
7700 }
7701
7702
7703
7704
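/* Cannot have more runtime than the period: */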
7705 if (runtime > period && runtime != RUNTIME_INF)
7706 return -EINVAL;
7707
7708
7709
7710
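/* Ensure we don't starve existing RT tasks: */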
7711 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7712 return -EBUSY;
7713
7714 total = to_ratio(period, runtime);
7715
7716
7717
7718
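/* Nobody can have more than the global setting allows: */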
7719 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7720 return -EINVAL;
7721
7722
7723
7724
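/* The sum of our children's runtime should not exceed our own: */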
7725 list_for_each_entry_rcu(child, &tg->children, siblings) {
7726 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7727 runtime = child->rt_bandwidth.rt_runtime;
7728
7729 if (child == d->tg) {
7730 period = d->rt_period;
7731 runtime = d->rt_runtime;
7732 }
7733
7734 sum += to_ratio(period, runtime);
7735 }
7736
7737 if (sum > total)
7738 return -EINVAL;
7739
7740 return 0;
7741}
7742
7743static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7744{
7745 int ret;
7746
7747 struct rt_schedulable_data data = {
7748 .tg = tg,
7749 .rt_period = period,
7750 .rt_runtime = runtime,
7751 };
7752
7753 rcu_read_lock();
7754 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7755 rcu_read_unlock();
7756
7757 return ret;
7758}
7759
7760static int tg_set_rt_bandwidth(struct task_group *tg,
7761 u64 rt_period, u64 rt_runtime)
7762{
7763 int i, err = 0;
7764
7765 mutex_lock(&rt_constraints_mutex);
7766 read_lock(&tasklist_lock);
7767 err = __rt_schedulable(tg, rt_period, rt_runtime);
7768 if (err)
7769 goto unlock;
7770
7771 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7772 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7773 tg->rt_bandwidth.rt_runtime = rt_runtime;
7774
7775 for_each_possible_cpu(i) {
7776 struct rt_rq *rt_rq = tg->rt_rq[i];
7777
7778 raw_spin_lock(&rt_rq->rt_runtime_lock);
7779 rt_rq->rt_runtime = rt_runtime;
7780 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7781 }
7782 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7783unlock:
7784 read_unlock(&tasklist_lock);
7785 mutex_unlock(&rt_constraints_mutex);
7786
7787 return err;
7788}
7789
7790int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7791{
7792 u64 rt_runtime, rt_period;
7793
7794 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7795 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7796 if (rt_runtime_us < 0)
7797 rt_runtime = RUNTIME_INF;
7798
7799 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7800}
7801
7802long sched_group_rt_runtime(struct task_group *tg)
7803{
7804 u64 rt_runtime_us;
7805
7806 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7807 return -1;
7808
7809 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7810 do_div(rt_runtime_us, NSEC_PER_USEC);
7811 return rt_runtime_us;
7812}
7813
7814int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7815{
7816 u64 rt_runtime, rt_period;
7817
7818 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7819 rt_runtime = tg->rt_bandwidth.rt_runtime;
7820
7821 if (rt_period == 0)
7822 return -EINVAL;
7823
7824 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7825}
7826
7827long sched_group_rt_period(struct task_group *tg)
7828{
7829 u64 rt_period_us;
7830
7831 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7832 do_div(rt_period_us, NSEC_PER_USEC);
7833 return rt_period_us;
7834}
7835
7836static int sched_rt_global_constraints(void)
7837{
7838 u64 runtime, period;
7839 int ret = 0;
7840
7841 if (sysctl_sched_rt_period <= 0)
7842 return -EINVAL;
7843
7844 runtime = global_rt_runtime();
7845 period = global_rt_period();
7846
7847
7848
7849
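/* Sanity check on the sysctl variables: */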
7850 if (runtime > period && runtime != RUNTIME_INF)
7851 return -EINVAL;
7852
7853 mutex_lock(&rt_constraints_mutex);
7854 read_lock(&tasklist_lock);
7855 ret = __rt_schedulable(NULL, 0, 0);
7856 read_unlock(&tasklist_lock);
7857 mutex_unlock(&rt_constraints_mutex);
7858
7859 return ret;
7860}
7861
7862int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7863{
7864
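/* Don't accept realtime tasks when there is no way for them to run: */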
7865 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7866 return 0;
7867
7868 return 1;
7869}
7870
7871#else
7872static int sched_rt_global_constraints(void)
7873{
7874 unsigned long flags;
7875 int i;
7876
7877 if (sysctl_sched_rt_period <= 0)
7878 return -EINVAL;
7879
7880
7881
7882
7883
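/*
 * There are always some RT tasks in the root group
 * -- migration, kstopmachine etc..
 */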
7884 if (sysctl_sched_rt_runtime == 0)
7885 return -EBUSY;
7886
7887 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7888 for_each_possible_cpu(i) {
7889 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7890
7891 raw_spin_lock(&rt_rq->rt_runtime_lock);
7892 rt_rq->rt_runtime = global_rt_runtime();
7893 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7894 }
7895 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7896
7897 return 0;
7898}
7899#endif
7900
7901int sched_rt_handler(struct ctl_table *table, int write,
7902 void __user *buffer, size_t *lenp,
7903 loff_t *ppos)
7904{
7905 int ret;
7906 int old_period, old_runtime;
7907 static DEFINE_MUTEX(mutex);
7908
7909 mutex_lock(&mutex);
7910 old_period = sysctl_sched_rt_period;
7911 old_runtime = sysctl_sched_rt_runtime;
7912
7913 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7914
7915 if (!ret && write) {
7916 ret = sched_rt_global_constraints();
7917 if (ret) {
7918 sysctl_sched_rt_period = old_period;
7919 sysctl_sched_rt_runtime = old_runtime;
7920 } else {
7921 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7922 def_rt_bandwidth.rt_period =
7923 ns_to_ktime(global_rt_period());
7924 }
7925 }
7926 mutex_unlock(&mutex);
7927
7928 return ret;
7929}
7930
7931#ifdef CONFIG_CGROUP_SCHED
7932
7933
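/* Return the task_group corresponding to a cpu cgroup. */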
7934static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7935{
7936 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7937 struct task_group, css);
7938}
7939
7940static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7941{
7942 struct task_group *tg, *parent;
7943
7944 if (!cgrp->parent) {
7945
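/* This is early initialization for the top cgroup: */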
7946 return &root_task_group.css;
7947 }
7948
7949 parent = cgroup_tg(cgrp->parent);
7950 tg = sched_create_group(parent);
7951 if (IS_ERR(tg))
7952 return ERR_PTR(-ENOMEM);
7953
7954 return &tg->css;
7955}
7956
7957static void cpu_cgroup_destroy(struct cgroup *cgrp)
7958{
7959 struct task_group *tg = cgroup_tg(cgrp);
7960
7961 sched_destroy_group(tg);
7962}
7963
7964static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7965 struct cgroup_taskset *tset)
7966{
7967 struct task_struct *task;
7968
7969 cgroup_taskset_for_each(task, cgrp, tset) {
7970#ifdef CONFIG_RT_GROUP_SCHED
7971 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7972 return -EINVAL;
7973#else
7974
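/* We don't support RT-tasks being in separate groups: */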
7975 if (task->sched_class != &fair_sched_class)
7976 return -EINVAL;
7977#endif
7978 }
7979 return 0;
7980}
7981
7982static void cpu_cgroup_attach(struct cgroup *cgrp,
7983 struct cgroup_taskset *tset)
7984{
7985 struct task_struct *task;
7986
7987 cgroup_taskset_for_each(task, cgrp, tset)
7988 sched_move_task(task);
7989}
7990
7991static void
7992cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7993 struct task_struct *task)
7994{
7995
7996
7997
7998
7999
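/*
 * cgroup_exit() is called in the copy_process() failure path. Ignore this
 * case since the task hasn't run yet; this avoids trying to poke a half
 * freed task state from generic accessor functions.
 */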
8000 if (!(task->flags & PF_EXITING))
8001 return;
8002
8003 sched_move_task(task);
8004}
8005
8006#ifdef CONFIG_FAIR_GROUP_SCHED
8007static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8008 u64 shareval)
8009{
8010 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
8011}
8012
8013static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8014{
8015 struct task_group *tg = cgroup_tg(cgrp);
8016
8017 return (u64) scale_load_down(tg->shares);
8018}
8019
8020#ifdef CONFIG_CFS_BANDWIDTH
8021static DEFINE_MUTEX(cfs_constraints_mutex);
8022
8023const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
8024const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
8025
8026static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
8027
8028static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
8029{
8030 int i, ret = 0, runtime_enabled, runtime_was_enabled;
8031 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8032
8033 if (tg == &root_task_group)
8034 return -EINVAL;
8035
8036
8037
8038
8039
8040
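/*
 * Ensure we have at least some amount of bandwidth every period. This is
 * to prevent reaching a state of large arrears when throttled via
 * entity_tick(), resulting in prolonged exit starvation.
 */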
8041 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
8042 return -EINVAL;
8043
8044
8045
8046
8047
8048
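/*
 * Likewise, bound things on the other side by preventing insane quota
 * periods. This also allows us to normalize in computing quota
 * feasibility.
 */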
8049 if (period > max_cfs_quota_period)
8050 return -EINVAL;
8051
8052 mutex_lock(&cfs_constraints_mutex);
8053 ret = __cfs_schedulable(tg, period, quota);
8054 if (ret)
8055 goto out_unlock;
8056
8057 runtime_enabled = quota != RUNTIME_INF;
8058 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
8059 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
8060 raw_spin_lock_irq(&cfs_b->lock);
8061 cfs_b->period = ns_to_ktime(period);
8062 cfs_b->quota = quota;
8063
8064 __refill_cfs_bandwidth_runtime(cfs_b);
8065
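/* Restart the period timer (if active) to handle the new period expiry: */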
8066 if (runtime_enabled && cfs_b->timer_active) {
8067
8068 cfs_b->timer_active = 0;
8069 __start_cfs_bandwidth(cfs_b);
8070 }
8071 raw_spin_unlock_irq(&cfs_b->lock);
8072
8073 for_each_possible_cpu(i) {
8074 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
8075 struct rq *rq = cfs_rq->rq;
8076
8077 raw_spin_lock_irq(&rq->lock);
8078 cfs_rq->runtime_enabled = runtime_enabled;
8079 cfs_rq->runtime_remaining = 0;
8080
8081 if (cfs_rq->throttled)
8082 unthrottle_cfs_rq(cfs_rq);
8083 raw_spin_unlock_irq(&rq->lock);
8084 }
8085out_unlock:
8086 mutex_unlock(&cfs_constraints_mutex);
8087
8088 return ret;
8089}
8090
8091int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
8092{
8093 u64 quota, period;
8094
8095 period = ktime_to_ns(tg->cfs_bandwidth.period);
8096 if (cfs_quota_us < 0)
8097 quota = RUNTIME_INF;
8098 else
8099 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
8100
8101 return tg_set_cfs_bandwidth(tg, period, quota);
8102}
8103
8104long tg_get_cfs_quota(struct task_group *tg)
8105{
8106 u64 quota_us;
8107
8108 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
8109 return -1;
8110
8111 quota_us = tg->cfs_bandwidth.quota;
8112 do_div(quota_us, NSEC_PER_USEC);
8113
8114 return quota_us;
8115}
8116
8117int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8118{
8119 u64 quota, period;
8120
8121 period = (u64)cfs_period_us * NSEC_PER_USEC;
8122 quota = tg->cfs_bandwidth.quota;
8123
8124 return tg_set_cfs_bandwidth(tg, period, quota);
8125}
8126
8127long tg_get_cfs_period(struct task_group *tg)
8128{
8129 u64 cfs_period_us;
8130
8131 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
8132 do_div(cfs_period_us, NSEC_PER_USEC);
8133
8134 return cfs_period_us;
8135}
8136
8137static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
8138{
8139 return tg_get_cfs_quota(cgroup_tg(cgrp));
8140}
8141
8142static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
8143 s64 cfs_quota_us)
8144{
8145 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
8146}
8147
8148static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
8149{
8150 return tg_get_cfs_period(cgroup_tg(cgrp));
8151}
8152
8153static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8154 u64 cfs_period_us)
8155{
8156 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
8157}
8158
8159struct cfs_schedulable_data {
8160 struct task_group *tg;
8161 u64 period, quota;
8162};
8163
8164
8165
8166
8167
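/*
 * Normalize a group's quota/period pair into a comparable bandwidth ratio;
 * RUNTIME_INF means the group is unconstrained.
 */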
8168static u64 normalize_cfs_quota(struct task_group *tg,
8169 struct cfs_schedulable_data *d)
8170{
8171 u64 quota, period;
8172
8173 if (tg == d->tg) {
8174 period = d->period;
8175 quota = d->quota;
8176 } else {
8177 period = tg_get_cfs_period(tg);
8178 quota = tg_get_cfs_quota(tg);
8179 }
8180
8181
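/* Note: these should typically be equivalent. */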
8182 if (quota == RUNTIME_INF || quota == -1)
8183 return RUNTIME_INF;
8184
8185 return to_ratio(period, quota);
8186}
8187
8188static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8189{
8190 struct cfs_schedulable_data *d = data;
8191 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8192 s64 quota = 0, parent_quota = -1;
8193
8194 if (!tg->parent) {
8195 quota = RUNTIME_INF;
8196 } else {
8197 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
8198
8199 quota = normalize_cfs_quota(tg, d);
8200 parent_quota = parent_b->hierarchal_quota;
8201
8202
8203
8204
8205
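/*
 * Ensure max(child_quota) <= parent_quota, inheriting the parent's quota
 * when no limit is set:
 */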
8206 if (quota == RUNTIME_INF)
8207 quota = parent_quota;
8208 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8209 return -EINVAL;
8210 }
8211 cfs_b->hierarchal_quota = quota;
8212
8213 return 0;
8214}
8215
8216static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
8217{
8218 int ret;
8219 struct cfs_schedulable_data data = {
8220 .tg = tg,
8221 .period = period,
8222 .quota = quota,
8223 };
8224
8225 if (quota != RUNTIME_INF) {
8226 do_div(data.period, NSEC_PER_USEC);
8227 do_div(data.quota, NSEC_PER_USEC);
8228 }
8229
8230 rcu_read_lock();
8231 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
8232 rcu_read_unlock();
8233
8234 return ret;
8235}
8236
8237static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
8238 struct cgroup_map_cb *cb)
8239{
8240 struct task_group *tg = cgroup_tg(cgrp);
8241 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8242
8243 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
8244 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
8245 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
8246
8247 return 0;
8248}
8249#endif
8250#endif
8251
8252#ifdef CONFIG_RT_GROUP_SCHED
8253static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8254 s64 val)
8255{
8256 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8257}
8258
8259static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
8260{
8261 return sched_group_rt_runtime(cgroup_tg(cgrp));
8262}
8263
8264static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
8265 u64 rt_period_us)
8266{
8267 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
8268}
8269
8270static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8271{
8272 return sched_group_rt_period(cgroup_tg(cgrp));
8273}
8274#endif
8275
8276static struct cftype cpu_files[] = {
8277#ifdef CONFIG_FAIR_GROUP_SCHED
8278 {
8279 .name = "shares",
8280 .read_u64 = cpu_shares_read_u64,
8281 .write_u64 = cpu_shares_write_u64,
8282 },
8283#endif
8284#ifdef CONFIG_CFS_BANDWIDTH
8285 {
8286 .name = "cfs_quota_us",
8287 .read_s64 = cpu_cfs_quota_read_s64,
8288 .write_s64 = cpu_cfs_quota_write_s64,
8289 },
8290 {
8291 .name = "cfs_period_us",
8292 .read_u64 = cpu_cfs_period_read_u64,
8293 .write_u64 = cpu_cfs_period_write_u64,
8294 },
8295 {
8296 .name = "stat",
8297 .read_map = cpu_stats_show,
8298 },
8299#endif
8300#ifdef CONFIG_RT_GROUP_SCHED
8301 {
8302 .name = "rt_runtime_us",
8303 .read_s64 = cpu_rt_runtime_read,
8304 .write_s64 = cpu_rt_runtime_write,
8305 },
8306 {
8307 .name = "rt_period_us",
8308 .read_u64 = cpu_rt_period_read_uint,
8309 .write_u64 = cpu_rt_period_write_uint,
8310 },
8311#endif
8312 { }
8313};
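/*
 * Example usage of the files above (cgroup v1 "cpu" controller), assuming
 * the controller is mounted at /sys/fs/cgroup/cpu and a group "mygroup"
 * has been created there:
 *
 *	# limit the group to half a CPU:
 *	echo 100000 > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_period_us
 *	echo  50000 > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_quota_us
 *
 * The quota/period files are only present with CONFIG_CFS_BANDWIDTH.
 */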
8314
8315struct cgroup_subsys cpu_cgroup_subsys = {
8316 .name = "cpu",
8317 .create = cpu_cgroup_create,
8318 .destroy = cpu_cgroup_destroy,
8319 .can_attach = cpu_cgroup_can_attach,
8320 .attach = cpu_cgroup_attach,
8321 .exit = cpu_cgroup_exit,
8322 .subsys_id = cpu_cgroup_subsys_id,
8323 .base_cftypes = cpu_files,
8324 .early_init = 1,
8325};
8326
8327#endif
8328
8329#ifdef CONFIG_CGROUP_CPUACCT
8330
8331
8332
8333
8334
8335
8336
8337
8338
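/*
 * CPU accounting code for task groups (the "cpuacct" cgroup controller).
 */

/* Create a new CPU accounting group. */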
8339static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
8340{
8341 struct cpuacct *ca;
8342
8343 if (!cgrp->parent)
8344 return &root_cpuacct.css;
8345
8346 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8347 if (!ca)
8348 goto out;
8349
8350 ca->cpuusage = alloc_percpu(u64);
8351 if (!ca->cpuusage)
8352 goto out_free_ca;
8353
8354 ca->cpustat = alloc_percpu(struct kernel_cpustat);
8355 if (!ca->cpustat)
8356 goto out_free_cpuusage;
8357
8358 return &ca->css;
8359
8360out_free_cpuusage:
8361 free_percpu(ca->cpuusage);
8362out_free_ca:
8363 kfree(ca);
8364out:
8365 return ERR_PTR(-ENOMEM);
8366}
8367
8368
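/* Destroy an existing CPU accounting group. */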
8369static void cpuacct_destroy(struct cgroup *cgrp)
8370{
8371 struct cpuacct *ca = cgroup_ca(cgrp);
8372
8373 free_percpu(ca->cpustat);
8374 free_percpu(ca->cpuusage);
8375 kfree(ca);
8376}
8377
8378static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8379{
8380 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8381 u64 data;
8382
8383#ifndef CONFIG_64BIT
8384
8385
8386
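/* Take rq->lock to make the 64-bit read coherent on 32-bit platforms: */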
8387 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8388 data = *cpuusage;
8389 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8390#else
8391 data = *cpuusage;
8392#endif
8393
8394 return data;
8395}
8396
8397static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8398{
8399 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8400
8401#ifndef CONFIG_64BIT
8402
8403
8404
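/* Take rq->lock to make the 64-bit write coherent on 32-bit platforms: */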
8405 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8406 *cpuusage = val;
8407 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8408#else
8409 *cpuusage = val;
8410#endif
8411}
8412
8413
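/* Return total CPU usage (in nanoseconds) of a group. */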
8414static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8415{
8416 struct cpuacct *ca = cgroup_ca(cgrp);
8417 u64 totalcpuusage = 0;
8418 int i;
8419
8420 for_each_present_cpu(i)
8421 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8422
8423 return totalcpuusage;
8424}
8425
8426static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8427 u64 reset)
8428{
8429 struct cpuacct *ca = cgroup_ca(cgrp);
8430 int err = 0;
8431 int i;
8432
8433 if (reset) {
8434 err = -EINVAL;
8435 goto out;
8436 }
8437
8438 for_each_present_cpu(i)
8439 cpuacct_cpuusage_write(ca, i, 0);
8440
8441out:
8442 return err;
8443}
8444
8445static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8446 struct seq_file *m)
8447{
8448 struct cpuacct *ca = cgroup_ca(cgroup);
8449 u64 percpu;
8450 int i;
8451
8452 for_each_present_cpu(i) {
8453 percpu = cpuacct_cpuusage_read(ca, i);
8454 seq_printf(m, "%llu ", (unsigned long long) percpu);
8455 }
8456 seq_printf(m, "\n");
8457 return 0;
8458}
8459
8460static const char *cpuacct_stat_desc[] = {
8461 [CPUACCT_STAT_USER] = "user",
8462 [CPUACCT_STAT_SYSTEM] = "system",
8463};
8464
8465static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8466 struct cgroup_map_cb *cb)
8467{
8468 struct cpuacct *ca = cgroup_ca(cgrp);
8469 int cpu;
8470 s64 val = 0;
8471
8472 for_each_online_cpu(cpu) {
8473 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8474 val += kcpustat->cpustat[CPUTIME_USER];
8475 val += kcpustat->cpustat[CPUTIME_NICE];
8476 }
8477 val = cputime64_to_clock_t(val);
8478 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8479
8480 val = 0;
8481 for_each_online_cpu(cpu) {
8482 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8483 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8484 val += kcpustat->cpustat[CPUTIME_IRQ];
8485 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8486 }
8487
8488 val = cputime64_to_clock_t(val);
8489 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8490
8491 return 0;
8492}
8493
8494static struct cftype files[] = {
8495 {
8496 .name = "usage",
8497 .read_u64 = cpuusage_read,
8498 .write_u64 = cpuusage_write,
8499 },
8500 {
8501 .name = "usage_percpu",
8502 .read_seq_string = cpuacct_percpu_seq_read,
8503 },
8504 {
8505 .name = "stat",
8506 .read_map = cpuacct_stats_show,
8507 },
8508 { }
8509};
8510
8511
8512
8513
8514
8515
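/*
 * Charge this task's execution time to its accounting group and to all of
 * that group's ancestors.
 */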
8516void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8517{
8518 struct cpuacct *ca;
8519 int cpu;
8520
8521 if (unlikely(!cpuacct_subsys.active))
8522 return;
8523
8524 cpu = task_cpu(tsk);
8525
8526 rcu_read_lock();
8527
8528 ca = task_ca(tsk);
8529
8530 for (; ca; ca = parent_ca(ca)) {
8531 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8532 *cpuusage += cputime;
8533 }
8534
8535 rcu_read_unlock();
8536}
8537
8538struct cgroup_subsys cpuacct_subsys = {
8539 .name = "cpuacct",
8540 .create = cpuacct_create,
8541 .destroy = cpuacct_destroy,
8542 .subsys_id = cpuacct_subsys_id,
8543 .base_cftypes = files,
8544};
8545#endif
8546