29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/unistd.h>
66#include <linux/pagemap.h>
67#include <linux/hrtimer.h>
68#include <linux/tick.h>
69#include <linux/debugfs.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/binfmts.h>
75
76#include <asm/switch_to.h>
77#include <asm/tlb.h>
78#include <asm/irq_regs.h>
79#include <asm/mutex.h>
80#ifdef CONFIG_PARAVIRT
81#include <asm/paravirt.h>
82#endif
83
84#include "sched.h"
85#include "../workqueue_sched.h"
86#include "../smpboot.h"
87
88#define CREATE_TRACE_POINTS
89#include <trace/events/sched.h>
90
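/*
 * start_bandwidth_timer - (re)arm a bandwidth period timer
 *
 * If the timer is not already active, forward it past the current time
 * in whole periods and start it pinned to this CPU, preserving any slack
 * between its soft and hard expiry times.
 */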
91void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
92{
93 unsigned long delta;
94 ktime_t soft, hard, now;
95
96 for (;;) {
97 if (hrtimer_active(period_timer))
98 break;
99
100 now = hrtimer_cb_get_time(period_timer);
101 hrtimer_forward(period_timer, now, period);
102
103 soft = hrtimer_get_softexpires(period_timer);
104 hard = hrtimer_get_expires(period_timer);
105 delta = ktime_to_ns(ktime_sub(hard, soft));
106 __hrtimer_start_range_ns(period_timer, soft, delta,
107 HRTIMER_MODE_ABS_PINNED, 0);
108 }
109}
110
111DEFINE_MUTEX(sched_domains_mutex);
112DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
113
114static void update_rq_clock_task(struct rq *rq, s64 delta);
115
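/*
 * update_rq_clock - advance the runqueue clock
 *
 * Accumulate the time elapsed since the last update into rq->clock
 * (unless skip_clock_update asks us to skip this update) and let
 * update_rq_clock_task() fold out time not spent on the task itself.
 */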
116void update_rq_clock(struct rq *rq)
117{
118 s64 delta;
119
120 if (rq->skip_clock_update > 0)
121 return;
122
123 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
124 rq->clock += delta;
125 update_rq_clock_task(rq, delta);
126}
127
128
129
130
131
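/*
 * Each SCHED_FEAT(name, enabled) entry in features.h contributes one bit
 * to the sysctl_sched_features bitmask below; the "* enabled |" expansion
 * ORs the bit in only for features that default to on.
 */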
132#define SCHED_FEAT(name, enabled) \
133 (1UL << __SCHED_FEAT_##name) * enabled |
134
135const_debug unsigned int sysctl_sched_features =
136#include "features.h"
137 0;
138
139#undef SCHED_FEAT
140
141#ifdef CONFIG_SCHED_DEBUG
142#define SCHED_FEAT(name, enabled) \
143 #name ,
144
145static const char * const sched_feat_names[] = {
146#include "features.h"
147};
148
149#undef SCHED_FEAT
150
151static int sched_feat_show(struct seq_file *m, void *v)
152{
153 int i;
154
155 for (i = 0; i < __SCHED_FEAT_NR; i++) {
156 if (!(sysctl_sched_features & (1UL << i)))
157 seq_puts(m, "NO_");
158 seq_printf(m, "%s ", sched_feat_names[i]);
159 }
160 seq_puts(m, "\n");
161
162 return 0;
163}
164
165#ifdef HAVE_JUMP_LABEL
166
167#define jump_label_key__true STATIC_KEY_INIT_TRUE
168#define jump_label_key__false STATIC_KEY_INIT_FALSE
169
170#define SCHED_FEAT(name, enabled) \
171 jump_label_key__##enabled ,
172
173struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
174#include "features.h"
175};
176
177#undef SCHED_FEAT
178
179static void sched_feat_disable(int i)
180{
181 if (static_key_enabled(&sched_feat_keys[i]))
182 static_key_slow_dec(&sched_feat_keys[i]);
183}
184
185static void sched_feat_enable(int i)
186{
187 if (!static_key_enabled(&sched_feat_keys[i]))
188 static_key_slow_inc(&sched_feat_keys[i]);
189}
190#else
191static void sched_feat_disable(int i) { };
192static void sched_feat_enable(int i) { };
193#endif
194
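/*
 * Parse a feature name written to the debugfs "sched_features" file;
 * a "NO_" prefix clears the feature, otherwise it is set.  For example,
 * with debugfs mounted at /sys/kernel/debug:
 *
 *   echo NO_TTWU_QUEUE > /sys/kernel/debug/sched_features
 *
 * where TTWU_QUEUE is one of the names generated from features.h.
 */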
195static ssize_t
196sched_feat_write(struct file *filp, const char __user *ubuf,
197 size_t cnt, loff_t *ppos)
198{
199 char buf[64];
200 char *cmp;
201 int neg = 0;
202 int i;
203
204 if (cnt > 63)
205 cnt = 63;
206
207 if (copy_from_user(&buf, ubuf, cnt))
208 return -EFAULT;
209
210 buf[cnt] = 0;
211 cmp = strstrip(buf);
212
213 if (strncmp(cmp, "NO_", 3) == 0) {
214 neg = 1;
215 cmp += 3;
216 }
217
218 for (i = 0; i < __SCHED_FEAT_NR; i++) {
219 if (strcmp(cmp, sched_feat_names[i]) == 0) {
220 if (neg) {
221 sysctl_sched_features &= ~(1UL << i);
222 sched_feat_disable(i);
223 } else {
224 sysctl_sched_features |= (1UL << i);
225 sched_feat_enable(i);
226 }
227 break;
228 }
229 }
230
231 if (i == __SCHED_FEAT_NR)
232 return -EINVAL;
233
234 *ppos += cnt;
235
236 return cnt;
237}
238
239static int sched_feat_open(struct inode *inode, struct file *filp)
240{
241 return single_open(filp, sched_feat_show, NULL);
242}
243
244static const struct file_operations sched_feat_fops = {
245 .open = sched_feat_open,
246 .write = sched_feat_write,
247 .read = seq_read,
248 .llseek = seq_lseek,
249 .release = single_release,
250};
251
252static __init int sched_init_debug(void)
253{
254 debugfs_create_file("sched_features", 0644, NULL, NULL,
255 &sched_feat_fops);
256
257 return 0;
258}
259late_initcall(sched_init_debug);
260#endif
261
262
263
264
265
266const_debug unsigned int sysctl_sched_nr_migrate = 32;
267
268
269
270
271
272
273
274const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
275
276
277
278
279
280unsigned int sysctl_sched_rt_period = 1000000;
281
282__read_mostly int scheduler_running;
283
284
285
286
287
288int sysctl_sched_rt_runtime = 950000;
289
290
291
292
293
294
295static inline struct rq *__task_rq_lock(struct task_struct *p)
296 __acquires(rq->lock)
297{
298 struct rq *rq;
299
300 lockdep_assert_held(&p->pi_lock);
301
302 for (;;) {
303 rq = task_rq(p);
304 raw_spin_lock(&rq->lock);
305 if (likely(rq == task_rq(p)))
306 return rq;
307 raw_spin_unlock(&rq->lock);
308 }
309}
310
311
312
313
314static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
315 __acquires(p->pi_lock)
316 __acquires(rq->lock)
317{
318 struct rq *rq;
319
320 for (;;) {
321 raw_spin_lock_irqsave(&p->pi_lock, *flags);
322 rq = task_rq(p);
323 raw_spin_lock(&rq->lock);
324 if (likely(rq == task_rq(p)))
325 return rq;
326 raw_spin_unlock(&rq->lock);
327 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
328 }
329}
330
331static void __task_rq_unlock(struct rq *rq)
332 __releases(rq->lock)
333{
334 raw_spin_unlock(&rq->lock);
335}
336
337static inline void
338task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
339 __releases(rq->lock)
340 __releases(p->pi_lock)
341{
342 raw_spin_unlock(&rq->lock);
343 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
344}
345
346
347
348
349static struct rq *this_rq_lock(void)
350 __acquires(rq->lock)
351{
352 struct rq *rq;
353
354 local_irq_disable();
355 rq = this_rq();
356 raw_spin_lock(&rq->lock);
357
358 return rq;
359}
360
361#ifdef CONFIG_SCHED_HRTICK
362
363
364
365
366
367
368
369
370
371
372
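/*
 * High-resolution preemption tick (CONFIG_SCHED_HRTICK): drive the
 * scheduler tick from an hrtimer so a task's remaining slice can be
 * honoured with better than jiffy resolution.  hrtick_clear() cancels a
 * pending timer; hrtick() is the callback that reruns task_tick().
 */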
373static void hrtick_clear(struct rq *rq)
374{
375 if (hrtimer_active(&rq->hrtick_timer))
376 hrtimer_cancel(&rq->hrtick_timer);
377}
378
379
380
381
382
383static enum hrtimer_restart hrtick(struct hrtimer *timer)
384{
385 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
386
387 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
388
389 raw_spin_lock(&rq->lock);
390 update_rq_clock(rq);
391 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
392 raw_spin_unlock(&rq->lock);
393
394 return HRTIMER_NORESTART;
395}
396
397#ifdef CONFIG_SMP
398
399
400
401static void __hrtick_start(void *arg)
402{
403 struct rq *rq = arg;
404
405 raw_spin_lock(&rq->lock);
406 hrtimer_restart(&rq->hrtick_timer);
407 rq->hrtick_csd_pending = 0;
408 raw_spin_unlock(&rq->lock);
409}
410
411
412
413
414
415
416void hrtick_start(struct rq *rq, u64 delay)
417{
418 struct hrtimer *timer = &rq->hrtick_timer;
419 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
420
421 hrtimer_set_expires(timer, time);
422
423 if (rq == this_rq()) {
424 hrtimer_restart(timer);
425 } else if (!rq->hrtick_csd_pending) {
426 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
427 rq->hrtick_csd_pending = 1;
428 }
429}
430
431static int
432hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
433{
434 int cpu = (int)(long)hcpu;
435
436 switch (action) {
437 case CPU_UP_CANCELED:
438 case CPU_UP_CANCELED_FROZEN:
439 case CPU_DOWN_PREPARE:
440 case CPU_DOWN_PREPARE_FROZEN:
441 case CPU_DEAD:
442 case CPU_DEAD_FROZEN:
443 hrtick_clear(cpu_rq(cpu));
444 return NOTIFY_OK;
445 }
446
447 return NOTIFY_DONE;
448}
449
450static __init void init_hrtick(void)
451{
452 hotcpu_notifier(hotplug_hrtick, 0);
453}
454#else
455
456
457
458
459
460void hrtick_start(struct rq *rq, u64 delay)
461{
462 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
463 HRTIMER_MODE_REL_PINNED, 0);
464}
465
466static inline void init_hrtick(void)
467{
468}
469#endif
470
471static void init_rq_hrtick(struct rq *rq)
472{
473#ifdef CONFIG_SMP
474 rq->hrtick_csd_pending = 0;
475
476 rq->hrtick_csd.flags = 0;
477 rq->hrtick_csd.func = __hrtick_start;
478 rq->hrtick_csd.info = rq;
479#endif
480
481 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
482 rq->hrtick_timer.function = hrtick;
483}
484#else
485static inline void hrtick_clear(struct rq *rq)
486{
487}
488
489static inline void init_rq_hrtick(struct rq *rq)
490{
491}
492
493static inline void init_hrtick(void)
494{
495}
496#endif
497
498
499
500
501
502
503
504
505#ifdef CONFIG_SMP
506
507#ifndef tsk_is_polling
508#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
509#endif
510
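/*
 * resched_task - mark a task as needing reschedule
 *
 * Set TIF_NEED_RESCHED on @p and, if the task runs on another CPU that
 * is not already polling need_resched, send it a reschedule IPI.
 * The caller must hold the task's runqueue lock.
 */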
511void resched_task(struct task_struct *p)
512{
513 int cpu;
514
515 assert_raw_spin_locked(&task_rq(p)->lock);
516
517 if (test_tsk_need_resched(p))
518 return;
519
520 set_tsk_need_resched(p);
521
522 cpu = task_cpu(p);
523 if (cpu == smp_processor_id())
524 return;
525
526
527 smp_mb();
528 if (!tsk_is_polling(p))
529 smp_send_reschedule(cpu);
530}
531
532void resched_cpu(int cpu)
533{
534 struct rq *rq = cpu_rq(cpu);
535 unsigned long flags;
536
537 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
538 return;
539 resched_task(cpu_curr(cpu));
540 raw_spin_unlock_irqrestore(&rq->lock, flags);
541}
542
543#ifdef CONFIG_NO_HZ
544
545
546
547
548
549
550
551
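/*
 * In NO_HZ mode, prefer to run the next timer on an already busy CPU
 * instead of waking this one out of idle: walk the scheduling domains
 * outward and return the first non-idle CPU found, falling back to the
 * current CPU.
 */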
552int get_nohz_timer_target(void)
553{
554 int cpu = smp_processor_id();
555 int i;
556 struct sched_domain *sd;
557
558 rcu_read_lock();
559 for_each_domain(cpu, sd) {
560 for_each_cpu(i, sched_domain_span(sd)) {
561 if (!idle_cpu(i)) {
562 cpu = i;
563 goto unlock;
564 }
565 }
566 }
567unlock:
568 rcu_read_unlock();
569 return cpu;
570}
571
572
573
574
575
576
577
578
579
580
581void wake_up_idle_cpu(int cpu)
582{
583 struct rq *rq = cpu_rq(cpu);
584
585 if (cpu == smp_processor_id())
586 return;
587
588
589
590
591
592
593
594
595 if (rq->curr != rq->idle)
596 return;
597
598
599
600
601
602
603 set_tsk_need_resched(rq->idle);
604
605
606 smp_mb();
607 if (!tsk_is_polling(rq->idle))
608 smp_send_reschedule(cpu);
609}
610
611static inline bool got_nohz_idle_kick(void)
612{
613 int cpu = smp_processor_id();
614 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
615}
616
617#else
618
619static inline bool got_nohz_idle_kick(void)
620{
621 return false;
622}
623
624#endif
625
626void sched_avg_update(struct rq *rq)
627{
628 s64 period = sched_avg_period();
629
630 while ((s64)(rq->clock - rq->age_stamp) > period) {
631
632
633
634
635
636 asm("" : "+rm" (rq->age_stamp));
637 rq->age_stamp += period;
638 rq->rt_avg /= 2;
639 }
640}
641
642#else
643void resched_task(struct task_struct *p)
644{
645 assert_raw_spin_locked(&task_rq(p)->lock);
646 set_tsk_need_resched(p);
647}
648#endif
649
650#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
651 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
652
653
654
655
656
657
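/*
 * walk_tg_tree_from - iterate the task_group hierarchy rooted at @from
 *
 * Call @down for each group on the way down and @up on the way back up,
 * stopping early if either visitor returns non-zero.  The children lists
 * are walked under RCU, so the caller must hold rcu_read_lock() or an
 * equivalent guarantee.
 */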
658int walk_tg_tree_from(struct task_group *from,
659 tg_visitor down, tg_visitor up, void *data)
660{
661 struct task_group *parent, *child;
662 int ret;
663
664 parent = from;
665
666down:
667 ret = (*down)(parent, data);
668 if (ret)
669 goto out;
670 list_for_each_entry_rcu(child, &parent->children, siblings) {
671 parent = child;
672 goto down;
673
674up:
675 continue;
676 }
677 ret = (*up)(parent, data);
678 if (ret || parent == from)
679 goto out;
680
681 child = parent;
682 parent = parent->parent;
683 if (parent)
684 goto up;
685out:
686 return ret;
687}
688
689int tg_nop(struct task_group *tg, void *data)
690{
691 return 0;
692}
693#endif
694
695static void set_load_weight(struct task_struct *p)
696{
697 int prio = p->static_prio - MAX_RT_PRIO;
698 struct load_weight *load = &p->se.load;
699
700
701
702
703 if (p->policy == SCHED_IDLE) {
704 load->weight = scale_load(WEIGHT_IDLEPRIO);
705 load->inv_weight = WMULT_IDLEPRIO;
706 return;
707 }
708
709 load->weight = scale_load(prio_to_weight[prio]);
710 load->inv_weight = prio_to_wmult[prio];
711}
712
713static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
714{
715 update_rq_clock(rq);
716 sched_info_queued(p);
717 p->sched_class->enqueue_task(rq, p, flags);
718}
719
720static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
721{
722 update_rq_clock(rq);
723 sched_info_dequeued(p);
724 p->sched_class->dequeue_task(rq, p, flags);
725}
726
727void activate_task(struct rq *rq, struct task_struct *p, int flags)
728{
729 if (task_contributes_to_load(p))
730 rq->nr_uninterruptible--;
731
732 enqueue_task(rq, p, flags);
733}
734
735void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
736{
737 if (task_contributes_to_load(p))
738 rq->nr_uninterruptible++;
739
740 dequeue_task(rq, p, flags);
741}
742
743#ifdef CONFIG_IRQ_TIME_ACCOUNTING
744
745
746
747
748
749
750
751
752
753
754
755
756static DEFINE_PER_CPU(u64, cpu_hardirq_time);
757static DEFINE_PER_CPU(u64, cpu_softirq_time);
758
759static DEFINE_PER_CPU(u64, irq_start_time);
760static int sched_clock_irqtime;
761
762void enable_sched_clock_irqtime(void)
763{
764 sched_clock_irqtime = 1;
765}
766
767void disable_sched_clock_irqtime(void)
768{
769 sched_clock_irqtime = 0;
770}
771
772#ifndef CONFIG_64BIT
773static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
774
775static inline void irq_time_write_begin(void)
776{
777 __this_cpu_inc(irq_time_seq.sequence);
778 smp_wmb();
779}
780
781static inline void irq_time_write_end(void)
782{
783 smp_wmb();
784 __this_cpu_inc(irq_time_seq.sequence);
785}
786
787static inline u64 irq_time_read(int cpu)
788{
789 u64 irq_time;
790 unsigned seq;
791
792 do {
793 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
794 irq_time = per_cpu(cpu_softirq_time, cpu) +
795 per_cpu(cpu_hardirq_time, cpu);
796 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
797
798 return irq_time;
799}
800#else
801static inline void irq_time_write_begin(void)
802{
803}
804
805static inline void irq_time_write_end(void)
806{
807}
808
809static inline u64 irq_time_read(int cpu)
810{
811 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
812}
813#endif
814
815
816
817
818
819void account_system_vtime(struct task_struct *curr)
820{
821 unsigned long flags;
822 s64 delta;
823 int cpu;
824
825 if (!sched_clock_irqtime)
826 return;
827
828 local_irq_save(flags);
829
830 cpu = smp_processor_id();
831 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
832 __this_cpu_add(irq_start_time, delta);
833
834 irq_time_write_begin();
835
836
837
838
839
840
841 if (hardirq_count())
842 __this_cpu_add(cpu_hardirq_time, delta);
843 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
844 __this_cpu_add(cpu_softirq_time, delta);
845
846 irq_time_write_end();
847 local_irq_restore(flags);
848}
849EXPORT_SYMBOL_GPL(account_system_vtime);
850
851#endif
852
853#ifdef CONFIG_PARAVIRT
854static inline u64 steal_ticks(u64 steal)
855{
856 if (unlikely(steal > NSEC_PER_SEC))
857 return div_u64(steal, TICK_NSEC);
858
859 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
860}
861#endif
862
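/*
 * update_rq_clock_task - advance rq->clock_task
 *
 * rq->clock_task excludes time spent in hard/soft interrupts
 * (CONFIG_IRQ_TIME_ACCOUNTING) and time stolen by the hypervisor
 * (CONFIG_PARAVIRT_TIME_ACCOUNTING); both are clipped to the raw delta
 * and, when the NONTASK_POWER feature is set, folded into rt_avg as
 * non-task time instead.
 */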
863static void update_rq_clock_task(struct rq *rq, s64 delta)
864{
865
866
867
868
869#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
870 s64 steal = 0, irq_delta = 0;
871#endif
872#ifdef CONFIG_IRQ_TIME_ACCOUNTING
873 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890 if (irq_delta > delta)
891 irq_delta = delta;
892
893 rq->prev_irq_time += irq_delta;
894 delta -= irq_delta;
895#endif
896#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
        if (static_key_false((&paravirt_steal_rq_enabled))) {
898 u64 st;
899
900 steal = paravirt_steal_clock(cpu_of(rq));
901 steal -= rq->prev_steal_time_rq;
902
903 if (unlikely(steal > delta))
904 steal = delta;
905
906 st = steal_ticks(steal);
907 steal = st * TICK_NSEC;
908
909 rq->prev_steal_time_rq += steal;
910
911 delta -= steal;
912 }
913#endif
914
915 rq->clock_task += delta;
916
917#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
918 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
919 sched_rt_avg_update(rq, irq_delta + steal);
920#endif
921}
922
923#ifdef CONFIG_IRQ_TIME_ACCOUNTING
924static int irqtime_account_hi_update(void)
925{
926 u64 *cpustat = kcpustat_this_cpu->cpustat;
927 unsigned long flags;
928 u64 latest_ns;
929 int ret = 0;
930
931 local_irq_save(flags);
932 latest_ns = this_cpu_read(cpu_hardirq_time);
933 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
934 ret = 1;
935 local_irq_restore(flags);
936 return ret;
937}
938
939static int irqtime_account_si_update(void)
940{
941 u64 *cpustat = kcpustat_this_cpu->cpustat;
942 unsigned long flags;
943 u64 latest_ns;
944 int ret = 0;
945
946 local_irq_save(flags);
947 latest_ns = this_cpu_read(cpu_softirq_time);
948 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
949 ret = 1;
950 local_irq_restore(flags);
951 return ret;
952}
953
954#else
955
956#define sched_clock_irqtime (0)
957
958#endif
959
960void sched_set_stop_task(int cpu, struct task_struct *stop)
961{
962 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
963 struct task_struct *old_stop = cpu_rq(cpu)->stop;
964
965 if (stop) {
966
967
968
969
970
971
972
973
                sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
975
976 stop->sched_class = &stop_sched_class;
977 }
978
979 cpu_rq(cpu)->stop = stop;
980
981 if (old_stop) {
982
983
984
985
986 old_stop->sched_class = &rt_sched_class;
987 }
988}
989
990
991
992
993static inline int __normal_prio(struct task_struct *p)
994{
995 return p->static_prio;
996}
997
998
999
1000
1001
1002
1003
1004
1005static inline int normal_prio(struct task_struct *p)
1006{
1007 int prio;
1008
1009 if (task_has_rt_policy(p))
1010 prio = MAX_RT_PRIO-1 - p->rt_priority;
1011 else
1012 prio = __normal_prio(p);
1013 return prio;
1014}
1015
1016
1017
1018
1019
1020
1021
1022
1023static int effective_prio(struct task_struct *p)
1024{
1025 p->normal_prio = normal_prio(p);
1026
1027
1028
1029
1030
1031 if (!rt_prio(p->prio))
1032 return p->normal_prio;
1033 return p->prio;
1034}
1035
1036
1037
1038
1039
1040inline int task_curr(const struct task_struct *p)
1041{
1042 return cpu_curr(task_cpu(p)) == p;
1043}
1044
1045static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1046 const struct sched_class *prev_class,
1047 int oldprio)
1048{
1049 if (prev_class != p->sched_class) {
1050 if (prev_class->switched_from)
1051 prev_class->switched_from(rq, p);
1052 p->sched_class->switched_to(rq, p);
1053 } else if (oldprio != p->prio)
1054 p->sched_class->prio_changed(rq, p, oldprio);
1055}
1056
1057void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1058{
1059 const struct sched_class *class;
1060
1061 if (p->sched_class == rq->curr->sched_class) {
1062 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1063 } else {
1064 for_each_class(class) {
1065 if (class == rq->curr->sched_class)
1066 break;
1067 if (class == p->sched_class) {
1068 resched_task(rq->curr);
1069 break;
1070 }
1071 }
1072 }
1073
1074
1075
1076
1077
1078 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
1079 rq->skip_clock_update = 1;
1080}
1081
1082#ifdef CONFIG_SMP
1083void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1084{
1085#ifdef CONFIG_SCHED_DEBUG
1086
1087
1088
1089
1090 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1091 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
1092
1093#ifdef CONFIG_LOCKDEP
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1105 lockdep_is_held(&task_rq(p)->lock)));
1106#endif
1107#endif
1108
1109 trace_sched_migrate_task(p, new_cpu);
1110
1111 if (task_cpu(p) != new_cpu) {
1112 p->se.nr_migrations++;
1113 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1114 }
1115
1116 __set_task_cpu(p, new_cpu);
1117}
1118
1119struct migration_arg {
1120 struct task_struct *task;
1121 int dest_cpu;
1122};
1123
1124static int migration_cpu_stop(void *data);
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
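/*
 * wait_task_inactive - wait for a thread to unschedule
 *
 * Spin (and, while the task is still queued, sleep roughly a tick) until
 * @p is neither running on a CPU nor on a runqueue, then return a
 * non-zero switch count (nvcsw | LONG_MIN).  If @match_state is non-zero
 * and @p's state changes away from it, return 0 instead.
 */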
1142unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1143{
1144 unsigned long flags;
1145 int running, on_rq;
1146 unsigned long ncsw;
1147 struct rq *rq;
1148
1149 for (;;) {
1150
1151
1152
1153
1154
1155
1156 rq = task_rq(p);
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169 while (task_running(rq, p)) {
1170 if (match_state && unlikely(p->state != match_state))
1171 return 0;
1172 cpu_relax();
1173 }
1174
1175
1176
1177
1178
1179
1180 rq = task_rq_lock(p, &flags);
1181 trace_sched_wait_task(p);
1182 running = task_running(rq, p);
1183 on_rq = p->on_rq;
1184 ncsw = 0;
1185 if (!match_state || p->state == match_state)
1186 ncsw = p->nvcsw | LONG_MIN;
1187 task_rq_unlock(rq, p, &flags);
1188
1189
1190
1191
1192 if (unlikely(!ncsw))
1193 break;
1194
1195
1196
1197
1198
1199
1200
1201 if (unlikely(running)) {
1202 cpu_relax();
1203 continue;
1204 }
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215 if (unlikely(on_rq)) {
1216 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1217
1218 set_current_state(TASK_UNINTERRUPTIBLE);
1219 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1220 continue;
1221 }
1222
1223
1224
1225
1226
1227
1228 break;
1229 }
1230
1231 return ncsw;
1232}
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247void kick_process(struct task_struct *p)
1248{
1249 int cpu;
1250
1251 preempt_disable();
1252 cpu = task_cpu(p);
1253 if ((cpu != smp_processor_id()) && task_curr(p))
1254 smp_send_reschedule(cpu);
1255 preempt_enable();
1256}
1257EXPORT_SYMBOL_GPL(kick_process);
1258#endif
1259
1260#ifdef CONFIG_SMP
1261
1262
1263
1264static int select_fallback_rq(int cpu, struct task_struct *p)
1265{
1266 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1267 enum { cpuset, possible, fail } state = cpuset;
1268 int dest_cpu;
1269
1270
1271 for_each_cpu(dest_cpu, nodemask) {
1272 if (!cpu_online(dest_cpu))
1273 continue;
1274 if (!cpu_active(dest_cpu))
1275 continue;
1276 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1277 return dest_cpu;
1278 }
1279
1280 for (;;) {
1281
1282 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1283 if (!cpu_online(dest_cpu))
1284 continue;
1285 if (!cpu_active(dest_cpu))
1286 continue;
1287 goto out;
1288 }
1289
1290 switch (state) {
1291 case cpuset:
1292
1293 cpuset_cpus_allowed_fallback(p);
1294 state = possible;
1295 break;
1296
1297 case possible:
1298 do_set_cpus_allowed(p, cpu_possible_mask);
1299 state = fail;
1300 break;
1301
1302 case fail:
1303 BUG();
1304 break;
1305 }
1306 }
1307
1308out:
1309 if (state != cpuset) {
1310
1311
1312
1313
1314
1315 if (p->mm && printk_ratelimit()) {
1316 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1317 task_pid_nr(p), p->comm, cpu);
1318 }
1319 }
1320
1321 return dest_cpu;
1322}
1323
1324
1325
1326
1327static inline
1328int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1329{
1330 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1343 !cpu_online(cpu)))
1344 cpu = select_fallback_rq(task_cpu(p), p);
1345
1346 return cpu;
1347}
1348
1349static void update_avg(u64 *avg, u64 sample)
1350{
1351 s64 diff = sample - *avg;
1352 *avg += diff >> 3;
1353}
1354#endif
1355
1356static void
1357ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1358{
1359#ifdef CONFIG_SCHEDSTATS
1360 struct rq *rq = this_rq();
1361
1362#ifdef CONFIG_SMP
1363 int this_cpu = smp_processor_id();
1364
1365 if (cpu == this_cpu) {
1366 schedstat_inc(rq, ttwu_local);
1367 schedstat_inc(p, se.statistics.nr_wakeups_local);
1368 } else {
1369 struct sched_domain *sd;
1370
1371 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1372 rcu_read_lock();
1373 for_each_domain(this_cpu, sd) {
1374 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1375 schedstat_inc(sd, ttwu_wake_remote);
1376 break;
1377 }
1378 }
1379 rcu_read_unlock();
1380 }
1381
1382 if (wake_flags & WF_MIGRATED)
1383 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1384
1385#endif
1386
1387 schedstat_inc(rq, ttwu_count);
1388 schedstat_inc(p, se.statistics.nr_wakeups);
1389
1390 if (wake_flags & WF_SYNC)
1391 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1392
1393#endif
1394}
1395
1396static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1397{
1398 activate_task(rq, p, en_flags);
1399 p->on_rq = 1;
1400
1401
1402 if (p->flags & PF_WQ_WORKER)
1403 wq_worker_waking_up(p, cpu_of(rq));
1404}
1405
1406
1407
1408
1409static void
1410ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1411{
1412 trace_sched_wakeup(p, true);
1413 check_preempt_curr(rq, p, wake_flags);
1414
1415 p->state = TASK_RUNNING;
1416#ifdef CONFIG_SMP
1417 if (p->sched_class->task_woken)
1418 p->sched_class->task_woken(rq, p);
1419
1420 if (rq->idle_stamp) {
1421 u64 delta = rq->clock - rq->idle_stamp;
1422 u64 max = 2*sysctl_sched_migration_cost;
1423
1424 if (delta > max)
1425 rq->avg_idle = max;
1426 else
1427 update_avg(&rq->avg_idle, delta);
1428 rq->idle_stamp = 0;
1429 }
1430#endif
1431}
1432
1433static void
1434ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1435{
1436#ifdef CONFIG_SMP
1437 if (p->sched_contributes_to_load)
1438 rq->nr_uninterruptible--;
1439#endif
1440
1441 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1442 ttwu_do_wakeup(rq, p, wake_flags);
1443}
1444
1445
1446
1447
1448
1449
1450
1451static int ttwu_remote(struct task_struct *p, int wake_flags)
1452{
1453 struct rq *rq;
1454 int ret = 0;
1455
1456 rq = __task_rq_lock(p);
1457 if (p->on_rq) {
1458 ttwu_do_wakeup(rq, p, wake_flags);
1459 ret = 1;
1460 }
1461 __task_rq_unlock(rq);
1462
1463 return ret;
1464}
1465
1466#ifdef CONFIG_SMP
1467static void sched_ttwu_pending(void)
1468{
1469 struct rq *rq = this_rq();
1470 struct llist_node *llist = llist_del_all(&rq->wake_list);
1471 struct task_struct *p;
1472
1473 raw_spin_lock(&rq->lock);
1474
1475 while (llist) {
1476 p = llist_entry(llist, struct task_struct, wake_entry);
1477 llist = llist_next(llist);
1478 ttwu_do_activate(rq, p, 0);
1479 }
1480
1481 raw_spin_unlock(&rq->lock);
1482}
1483
1484void scheduler_ipi(void)
1485{
1486 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1487 return;
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502 irq_enter();
1503 sched_ttwu_pending();
1504
1505
1506
1507
1508 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1509 this_rq()->idle_balance = 1;
1510 raise_softirq_irqoff(SCHED_SOFTIRQ);
1511 }
1512 irq_exit();
1513}
1514
1515static void ttwu_queue_remote(struct task_struct *p, int cpu)
1516{
1517 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1518 smp_send_reschedule(cpu);
1519}
1520
1521#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1522static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1523{
1524 struct rq *rq;
1525 int ret = 0;
1526
1527 rq = __task_rq_lock(p);
1528 if (p->on_cpu) {
1529 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1530 ttwu_do_wakeup(rq, p, wake_flags);
1531 ret = 1;
1532 }
1533 __task_rq_unlock(rq);
1534
1535 return ret;
1536
1537}
1538#endif
1539
1540bool cpus_share_cache(int this_cpu, int that_cpu)
1541{
1542 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1543}
1544#endif
1545
1546static void ttwu_queue(struct task_struct *p, int cpu)
1547{
1548 struct rq *rq = cpu_rq(cpu);
1549
1550#if defined(CONFIG_SMP)
1551 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1552 sched_clock_cpu(cpu);
1553 ttwu_queue_remote(p, cpu);
1554 return;
1555 }
1556#endif
1557
1558 raw_spin_lock(&rq->lock);
1559 ttwu_do_activate(rq, p, 0);
1560 raw_spin_unlock(&rq->lock);
1561}
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
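/*
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: mask of task states that may be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put @p on the runqueue of a CPU chosen by select_task_rq() if it is
 * not already queued, mark it TASK_RUNNING and preempt the current task
 * if needed.  Returns 1 if @p was woken (its state matched @state),
 * 0 otherwise.
 */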
1578static int
1579try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1580{
1581 unsigned long flags;
1582 int cpu, success = 0;
1583
1584 smp_wmb();
1585 raw_spin_lock_irqsave(&p->pi_lock, flags);
1586 if (!(p->state & state))
1587 goto out;
1588
1589 success = 1;
1590 cpu = task_cpu(p);
1591
1592 if (p->on_rq && ttwu_remote(p, wake_flags))
1593 goto stat;
1594
1595#ifdef CONFIG_SMP
1596
1597
1598
1599
1600 while (p->on_cpu) {
1601#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1602
1603
1604
1605
1606
1607
1608
1609 if (ttwu_activate_remote(p, wake_flags))
1610 goto stat;
1611#else
1612 cpu_relax();
1613#endif
1614 }
1615
1616
1617
1618 smp_rmb();
1619
1620 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1621 p->state = TASK_WAKING;
1622
1623 if (p->sched_class->task_waking)
1624 p->sched_class->task_waking(p);
1625
1626 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1627 if (task_cpu(p) != cpu) {
1628 wake_flags |= WF_MIGRATED;
1629 set_task_cpu(p, cpu);
1630 }
1631#endif
1632
1633 ttwu_queue(p, cpu);
1634stat:
1635 ttwu_stat(p, cpu, wake_flags);
1636out:
1637 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1638
1639 return success;
1640}
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650static void try_to_wake_up_local(struct task_struct *p)
1651{
1652 struct rq *rq = task_rq(p);
1653
1654 BUG_ON(rq != this_rq());
1655 BUG_ON(p == current);
1656 lockdep_assert_held(&rq->lock);
1657
1658 if (!raw_spin_trylock(&p->pi_lock)) {
1659 raw_spin_unlock(&rq->lock);
1660 raw_spin_lock(&p->pi_lock);
1661 raw_spin_lock(&rq->lock);
1662 }
1663
1664 if (!(p->state & TASK_NORMAL))
1665 goto out;
1666
1667 if (!p->on_rq)
1668 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1669
1670 ttwu_do_wakeup(rq, p, 0);
1671 ttwu_stat(p, smp_processor_id(), 0);
1672out:
1673 raw_spin_unlock(&p->pi_lock);
1674}
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687int wake_up_process(struct task_struct *p)
1688{
1689 return try_to_wake_up(p, TASK_ALL, 0);
1690}
1691EXPORT_SYMBOL(wake_up_process);
1692
1693int wake_up_state(struct task_struct *p, unsigned int state)
1694{
1695 return try_to_wake_up(p, state, 0);
1696}
1697
1698
1699
1700
1701
1702
1703
1704static void __sched_fork(struct task_struct *p)
1705{
1706 p->on_rq = 0;
1707
1708 p->se.on_rq = 0;
1709 p->se.exec_start = 0;
1710 p->se.sum_exec_runtime = 0;
1711 p->se.prev_sum_exec_runtime = 0;
1712 p->se.nr_migrations = 0;
1713 p->se.vruntime = 0;
1714 INIT_LIST_HEAD(&p->se.group_node);
1715
1716#ifdef CONFIG_SCHEDSTATS
1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1718#endif
1719
1720 INIT_LIST_HEAD(&p->rt.run_list);
1721
1722#ifdef CONFIG_PREEMPT_NOTIFIERS
1723 INIT_HLIST_HEAD(&p->preempt_notifiers);
1724#endif
1725}
1726
1727
1728
1729
1730void sched_fork(struct task_struct *p)
1731{
1732 unsigned long flags;
1733 int cpu = get_cpu();
1734
1735 __sched_fork(p);
1736
1737
1738
1739
1740
1741 p->state = TASK_RUNNING;
1742
1743
1744
1745
1746 p->prio = current->normal_prio;
1747
1748
1749
1750
1751 if (unlikely(p->sched_reset_on_fork)) {
1752 if (task_has_rt_policy(p)) {
1753 p->policy = SCHED_NORMAL;
1754 p->static_prio = NICE_TO_PRIO(0);
1755 p->rt_priority = 0;
1756 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1757 p->static_prio = NICE_TO_PRIO(0);
1758
1759 p->prio = p->normal_prio = __normal_prio(p);
1760 set_load_weight(p);
1761
1762
1763
1764
1765
1766 p->sched_reset_on_fork = 0;
1767 }
1768
1769 if (!rt_prio(p->prio))
1770 p->sched_class = &fair_sched_class;
1771
1772 if (p->sched_class->task_fork)
1773 p->sched_class->task_fork(p);
1774
1775
1776
1777
1778
1779
1780
1781
1782 raw_spin_lock_irqsave(&p->pi_lock, flags);
1783 set_task_cpu(p, cpu);
1784 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1785
1786#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1787 if (likely(sched_info_on()))
1788 memset(&p->sched_info, 0, sizeof(p->sched_info));
1789#endif
1790#if defined(CONFIG_SMP)
1791 p->on_cpu = 0;
1792#endif
1793#ifdef CONFIG_PREEMPT_COUNT
1794
1795 task_thread_info(p)->preempt_count = 1;
1796#endif
1797#ifdef CONFIG_SMP
1798 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1799#endif
1800
1801 put_cpu();
1802}
1803
1804
1805
1806
1807
1808
1809
1810
1811void wake_up_new_task(struct task_struct *p)
1812{
1813 unsigned long flags;
1814 struct rq *rq;
1815
1816 raw_spin_lock_irqsave(&p->pi_lock, flags);
1817#ifdef CONFIG_SMP
1818
1819
1820
1821
1822
1823 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1824#endif
1825
1826 rq = __task_rq_lock(p);
1827 activate_task(rq, p, 0);
1828 p->on_rq = 1;
1829 trace_sched_wakeup_new(p, true);
1830 check_preempt_curr(rq, p, WF_FORK);
1831#ifdef CONFIG_SMP
1832 if (p->sched_class->task_woken)
1833 p->sched_class->task_woken(rq, p);
1834#endif
1835 task_rq_unlock(rq, p, &flags);
1836}
1837
1838#ifdef CONFIG_PREEMPT_NOTIFIERS
1839
1840
1841
1842
1843
1844void preempt_notifier_register(struct preempt_notifier *notifier)
1845{
        hlist_add_head(&notifier->link, &current->preempt_notifiers);
1847}
1848EXPORT_SYMBOL_GPL(preempt_notifier_register);
1849
1850
1851
1852
1853
1854
1855
1856void preempt_notifier_unregister(struct preempt_notifier *notifier)
1857{
        hlist_del(&notifier->link);
1859}
1860EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1861
1862static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1863{
1864 struct preempt_notifier *notifier;
1865 struct hlist_node *node;
1866
1867 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1868 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1869}
1870
1871static void
1872fire_sched_out_preempt_notifiers(struct task_struct *curr,
1873 struct task_struct *next)
1874{
1875 struct preempt_notifier *notifier;
1876 struct hlist_node *node;
1877
1878 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1879 notifier->ops->sched_out(notifier, next);
1880}
1881
1882#else
1883
1884static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1885{
1886}
1887
1888static void
1889fire_sched_out_preempt_notifiers(struct task_struct *curr,
1890 struct task_struct *next)
1891{
1892}
1893
1894#endif
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909static inline void
1910prepare_task_switch(struct rq *rq, struct task_struct *prev,
1911 struct task_struct *next)
1912{
1913 sched_info_switch(prev, next);
1914 perf_event_task_sched_out(prev, next);
1915 fire_sched_out_preempt_notifiers(prev, next);
1916 prepare_lock_switch(rq, next);
1917 prepare_arch_switch(next);
1918 trace_sched_switch(prev, next);
1919}
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
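/*
 * finish_task_switch - clean up after a context switch
 *
 * Runs on the new task's stack with the runqueue lock still held on
 * entry: finish the architecture-specific switch, release the runqueue
 * lock via finish_lock_switch(), drop the previous task's lazy mm
 * reference and, if the previous task is TASK_DEAD, free its task_struct.
 */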
1936static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1937 __releases(rq->lock)
1938{
1939 struct mm_struct *mm = rq->prev_mm;
1940 long prev_state;
1941
1942 rq->prev_mm = NULL;
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955 prev_state = prev->state;
1956 finish_arch_switch(prev);
1957#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1958 local_irq_disable();
1959#endif
1960 perf_event_task_sched_in(prev, current);
1961#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1962 local_irq_enable();
1963#endif
1964 finish_lock_switch(rq, prev);
1965 finish_arch_post_lock_switch();
1966
1967 fire_sched_in_preempt_notifiers(current);
1968 if (mm)
1969 mmdrop(mm);
1970 if (unlikely(prev_state == TASK_DEAD)) {
1971
1972
1973
1974
1975 kprobe_flush_task(prev);
1976 put_task_struct(prev);
1977 }
1978}
1979
1980#ifdef CONFIG_SMP
1981
1982
1983static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1984{
1985 if (prev->sched_class->pre_schedule)
1986 prev->sched_class->pre_schedule(rq, prev);
1987}
1988
1989
1990static inline void post_schedule(struct rq *rq)
1991{
1992 if (rq->post_schedule) {
1993 unsigned long flags;
1994
1995 raw_spin_lock_irqsave(&rq->lock, flags);
1996 if (rq->curr->sched_class->post_schedule)
1997 rq->curr->sched_class->post_schedule(rq);
1998 raw_spin_unlock_irqrestore(&rq->lock, flags);
1999
2000 rq->post_schedule = 0;
2001 }
2002}
2003
2004#else
2005
2006static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2007{
2008}
2009
2010static inline void post_schedule(struct rq *rq)
2011{
2012}
2013
2014#endif
2015
2016
2017
2018
2019
2020asmlinkage void schedule_tail(struct task_struct *prev)
2021 __releases(rq->lock)
2022{
2023 struct rq *rq = this_rq();
2024
2025 finish_task_switch(rq, prev);
2026
2027
2028
2029
2030
2031 post_schedule(rq);
2032
2033#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2034
2035 preempt_enable();
2036#endif
2037 if (current->set_child_tid)
2038 put_user(task_pid_vnr(current), current->set_child_tid);
2039}
2040
2041
2042
2043
2044
2045static inline void
2046context_switch(struct rq *rq, struct task_struct *prev,
2047 struct task_struct *next)
2048{
2049 struct mm_struct *mm, *oldmm;
2050
2051 prepare_task_switch(rq, prev, next);
2052
2053 mm = next->mm;
2054 oldmm = prev->active_mm;
2055
2056
2057
2058
2059
2060 arch_start_context_switch(prev);
2061
2062 if (!mm) {
2063 next->active_mm = oldmm;
2064 atomic_inc(&oldmm->mm_count);
2065 enter_lazy_tlb(oldmm, next);
2066 } else
2067 switch_mm(oldmm, mm, next);
2068
2069 if (!prev->mm) {
2070 prev->active_mm = NULL;
2071 rq->prev_mm = oldmm;
2072 }
2073
2074
2075
2076
2077
2078
2079#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2080 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2081#endif
2082
2083
2084 switch_to(prev, next, prev);
2085
2086 barrier();
2087
2088
2089
2090
2091
2092 finish_task_switch(this_rq(), prev);
2093}
2094
2095
2096
2097
2098
2099
2100
2101
2102unsigned long nr_running(void)
2103{
2104 unsigned long i, sum = 0;
2105
2106 for_each_online_cpu(i)
2107 sum += cpu_rq(i)->nr_running;
2108
2109 return sum;
2110}
2111
2112unsigned long nr_uninterruptible(void)
2113{
2114 unsigned long i, sum = 0;
2115
2116 for_each_possible_cpu(i)
2117 sum += cpu_rq(i)->nr_uninterruptible;
2118
2119
2120
2121
2122
2123 if (unlikely((long)sum < 0))
2124 sum = 0;
2125
2126 return sum;
2127}
2128
2129unsigned long long nr_context_switches(void)
2130{
2131 int i;
2132 unsigned long long sum = 0;
2133
2134 for_each_possible_cpu(i)
2135 sum += cpu_rq(i)->nr_switches;
2136
2137 return sum;
2138}
2139
2140unsigned long nr_iowait(void)
2141{
2142 unsigned long i, sum = 0;
2143
2144 for_each_possible_cpu(i)
2145 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2146
2147 return sum;
2148}
2149
2150unsigned long nr_iowait_cpu(int cpu)
2151{
2152 struct rq *this = cpu_rq(cpu);
2153 return atomic_read(&this->nr_iowait);
2154}
2155
2156unsigned long this_cpu_load(void)
2157{
2158 struct rq *this = this_rq();
2159 return this->cpu_load[0];
2160}
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
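/*
 * Global load-average accounting: calc_load_tasks holds the runnable +
 * uninterruptible task count folded in from each runqueue every
 * LOAD_FREQ, and avenrun[] holds the 1/5/15 minute fixed-point averages
 * reported through /proc/loadavg.
 */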
2211static atomic_long_t calc_load_tasks;
2212static unsigned long calc_load_update;
2213unsigned long avenrun[3];
2214EXPORT_SYMBOL(avenrun);
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
2230
2231static long calc_load_fold_active(struct rq *this_rq)
2232{
2233 long nr_active, delta = 0;
2234
2235 nr_active = this_rq->nr_running;
2236 nr_active += (long) this_rq->nr_uninterruptible;
2237
2238 if (nr_active != this_rq->calc_load_active) {
2239 delta = nr_active - this_rq->calc_load_active;
2240 this_rq->calc_load_active = nr_active;
2241 }
2242
2243 return delta;
2244}
2245
2246
2247
2248
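/*
 * calc_load - fold one sample into a load average:
 *
 *   load' = (load * exp + active * (FIXED_1 - exp) + FIXED_1/2) >> FSHIFT
 *
 * i.e. an exponentially decaying average in FSHIFT-bit fixed point,
 * rounded to nearest.
 */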
2249static unsigned long
2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2251{
2252 load *= exp;
2253 load += active * (FIXED_1 - exp);
2254 load += 1UL << (FSHIFT - 1);
2255 return load >> FSHIFT;
2256}
2257
2258#ifdef CONFIG_NO_HZ
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2303
2304static inline int calc_load_write_idx(void)
2305{
2306 int idx = calc_load_idx;
2307
2308
2309
2310
2311
2312 smp_rmb();
2313
2314
2315
2316
2317
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2320
2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2332 long delta;
2333
2334
2335
2336
2337
2338 delta = calc_load_fold_active(this_rq);
2339 if (delta) {
2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2343}
2344
2345void calc_load_exit_idle(void)
2346{
2347 struct rq *this_rq = this_rq();
2348
2349
2350
2351
2352 if (time_before(jiffies, this_rq->calc_load_update))
2353 return;
2354
2355
2356
2357
2358
2359
2360 this_rq->calc_load_update = calc_load_update;
2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2372
2373 return delta;
2374}
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
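/*
 * fixed_power_int - compute x^n for a fixed-point x with @frac_bits
 * fractional bits, using binary exponentiation (square-and-multiply) so
 * only O(log n) multiplications and roundings are needed.
 */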
2391static unsigned long
2392fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2393{
2394 unsigned long result = 1UL << frac_bits;
2395
2396 if (n) for (;;) {
2397 if (n & 1) {
2398 result *= x;
2399 result += 1UL << (frac_bits - 1);
2400 result >>= frac_bits;
2401 }
2402 n >>= 1;
2403 if (!n)
2404 break;
2405 x *= x;
2406 x += 1UL << (frac_bits - 1);
2407 x >>= frac_bits;
2408 }
2409
2410 return result;
2411}
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436static unsigned long
2437calc_load_n(unsigned long load, unsigned long exp,
2438 unsigned long active, unsigned int n)
2439{
2440
2441 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2442}
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453static void calc_global_nohz(void)
2454{
2455 long delta, active, n;
2456
2457 if (!time_before(jiffies, calc_load_update + 10)) {
2458
2459
2460
2461 delta = jiffies - calc_load_update - 10;
2462 n = 1 + (delta / LOAD_FREQ);
2463
2464 active = atomic_long_read(&calc_load_tasks);
2465 active = active > 0 ? active * FIXED_1 : 0;
2466
2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2470
2471 calc_load_update += n * LOAD_FREQ;
2472 }
2473
2474
2475
2476
2477
2478
2479
2480
2481 smp_wmb();
2482 calc_load_idx++;
2483}
2484#else
2485
2486static inline long calc_load_fold_idle(void) { return 0; }
2487static inline void calc_global_nohz(void) { }
2488
2489#endif
2490
2491
2492
2493
2494
2495void calc_global_load(unsigned long ticks)
2496{
2497 long active, delta;
2498
2499 if (time_before(jiffies, calc_load_update + 10))
2500 return;
2501
2502
2503
2504
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2509 active = atomic_long_read(&calc_load_tasks);
2510 active = active > 0 ? active * FIXED_1 : 0;
2511
2512 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2513 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2514 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2515
2516 calc_load_update += LOAD_FREQ;
2517
2518
2519
2520
2521 calc_global_nohz();
2522}
2523
2524
2525
2526
2527
2528static void calc_load_account_active(struct rq *this_rq)
2529{
2530 long delta;
2531
2532 if (time_before(jiffies, this_rq->calc_load_update))
2533 return;
2534
2535 delta = calc_load_fold_active(this_rq);
2536 if (delta)
2537 atomic_long_add(delta, &calc_load_tasks);
2538
2539 this_rq->calc_load_update += LOAD_FREQ;
2540}
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573#define DEGRADE_SHIFT 7
2574static const unsigned char
2575 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2576static const unsigned char
2577 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2578 {0, 0, 0, 0, 0, 0, 0, 0},
2579 {64, 32, 8, 0, 0, 0, 0, 0},
2580 {96, 72, 40, 12, 1, 0, 0},
2581 {112, 98, 75, 43, 15, 1, 0},
2582 {120, 112, 98, 76, 45, 16, 2} };
2583
2584
2585
2586
2587
2588
2589static unsigned long
2590decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2591{
2592 int j = 0;
2593
2594 if (!missed_updates)
2595 return load;
2596
2597 if (missed_updates >= degrade_zero_ticks[idx])
2598 return 0;
2599
2600 if (idx == 1)
2601 return load >> missed_updates;
2602
2603 while (missed_updates) {
2604 if (missed_updates % 2)
2605 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2606
2607 missed_updates >>= 1;
2608 j++;
2609 }
2610 return load;
2611}
2612
2613
2614
2615
2616
2617
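/*
 * Update this runqueue's cpu_load[] ring: index 0 tracks the raw load,
 * higher indexes are progressively slower-moving averages.  When ticks
 * were skipped (NO_HZ idle), the old contributions are first decayed by
 * decay_load_missed() for the missed updates.
 */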
2618static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2619 unsigned long pending_updates)
2620{
2621 int i, scale;
2622
2623 this_rq->nr_load_updates++;
2624
2625
2626 this_rq->cpu_load[0] = this_load;
2627 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2628 unsigned long old_load, new_load;
2629
2630
2631
2632 old_load = this_rq->cpu_load[i];
2633 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2634 new_load = this_load;
2635
2636
2637
2638
2639
2640 if (new_load > old_load)
2641 new_load += scale - 1;
2642
2643 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2644 }
2645
2646 sched_avg_update(this_rq);
2647}
2648
2649#ifdef CONFIG_NO_HZ
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667void update_idle_cpu_load(struct rq *this_rq)
2668{
2669 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2670 unsigned long load = this_rq->load.weight;
2671 unsigned long pending_updates;
2672
2673
2674
2675
2676 if (load || curr_jiffies == this_rq->last_load_update_tick)
2677 return;
2678
2679 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2680 this_rq->last_load_update_tick = curr_jiffies;
2681
2682 __update_cpu_load(this_rq, load, pending_updates);
2683}
2684
2685
2686
2687
2688void update_cpu_load_nohz(void)
2689{
2690 struct rq *this_rq = this_rq();
2691 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2692 unsigned long pending_updates;
2693
2694 if (curr_jiffies == this_rq->last_load_update_tick)
2695 return;
2696
2697 raw_spin_lock(&this_rq->lock);
2698 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2699 if (pending_updates) {
2700 this_rq->last_load_update_tick = curr_jiffies;
2701
2702
2703
2704
2705 __update_cpu_load(this_rq, 0, pending_updates);
2706 }
2707 raw_spin_unlock(&this_rq->lock);
2708}
2709#endif
2710
2711
2712
2713
2714static void update_cpu_load_active(struct rq *this_rq)
2715{
2716
2717
2718
2719 this_rq->last_load_update_tick = jiffies;
2720 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2721
2722 calc_load_account_active(this_rq);
2723}
2724
2725#ifdef CONFIG_SMP
2726
2727
2728
2729
2730
2731void sched_exec(void)
2732{
2733 struct task_struct *p = current;
2734 unsigned long flags;
2735 int dest_cpu;
2736
2737 raw_spin_lock_irqsave(&p->pi_lock, flags);
2738 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2739 if (dest_cpu == smp_processor_id())
2740 goto unlock;
2741
2742 if (likely(cpu_active(dest_cpu))) {
2743 struct migration_arg arg = { p, dest_cpu };
2744
2745 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2746 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2747 return;
2748 }
2749unlock:
2750 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2751}
2752
2753#endif
2754
2755DEFINE_PER_CPU(struct kernel_stat, kstat);
2756DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2757
2758EXPORT_PER_CPU_SYMBOL(kstat);
2759EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2760
2761
2762
2763
2764
2765
2766
2767static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2768{
2769 u64 ns = 0;
2770
2771 if (task_current(rq, p)) {
2772 update_rq_clock(rq);
2773 ns = rq->clock_task - p->se.exec_start;
2774 if ((s64)ns < 0)
2775 ns = 0;
2776 }
2777
2778 return ns;
2779}
2780
2781unsigned long long task_delta_exec(struct task_struct *p)
2782{
2783 unsigned long flags;
2784 struct rq *rq;
2785 u64 ns = 0;
2786
2787 rq = task_rq_lock(p, &flags);
2788 ns = do_task_delta_exec(p, rq);
2789 task_rq_unlock(rq, p, &flags);
2790
2791 return ns;
2792}
2793
2794
2795
2796
2797
2798
2799unsigned long long task_sched_runtime(struct task_struct *p)
2800{
2801 unsigned long flags;
2802 struct rq *rq;
2803 u64 ns = 0;
2804
2805 rq = task_rq_lock(p, &flags);
2806 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2807 task_rq_unlock(rq, p, &flags);
2808
2809 return ns;
2810}
2811
2812#ifdef CONFIG_CGROUP_CPUACCT
2813struct cgroup_subsys cpuacct_subsys;
2814struct cpuacct root_cpuacct;
2815#endif
2816
2817static inline void task_group_account_field(struct task_struct *p, int index,
2818 u64 tmp)
2819{
2820#ifdef CONFIG_CGROUP_CPUACCT
2821 struct kernel_cpustat *kcpustat;
2822 struct cpuacct *ca;
2823#endif
2824
2825
2826
2827
2828
2829
2830 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2831
2832#ifdef CONFIG_CGROUP_CPUACCT
2833 if (unlikely(!cpuacct_subsys.active))
2834 return;
2835
2836 rcu_read_lock();
2837 ca = task_ca(p);
2838 while (ca && (ca != &root_cpuacct)) {
2839 kcpustat = this_cpu_ptr(ca->cpustat);
2840 kcpustat->cpustat[index] += tmp;
2841 ca = parent_ca(ca);
2842 }
2843 rcu_read_unlock();
2844#endif
2845}
2846
2847
2848
2849
2850
2851
2852
2853
2854void account_user_time(struct task_struct *p, cputime_t cputime,
2855 cputime_t cputime_scaled)
2856{
2857 int index;
2858
2859
2860 p->utime += cputime;
2861 p->utimescaled += cputime_scaled;
2862 account_group_user_time(p, cputime);
2863
2864 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2865
2866
2867 task_group_account_field(p, index, (__force u64) cputime);
2868
2869
2870 acct_update_integrals(p);
2871}
2872
2873
2874
2875
2876
2877
2878
2879static void account_guest_time(struct task_struct *p, cputime_t cputime,
2880 cputime_t cputime_scaled)
2881{
2882 u64 *cpustat = kcpustat_this_cpu->cpustat;
2883
2884
2885 p->utime += cputime;
2886 p->utimescaled += cputime_scaled;
2887 account_group_user_time(p, cputime);
2888 p->gtime += cputime;
2889
2890
2891 if (TASK_NICE(p) > 0) {
2892 cpustat[CPUTIME_NICE] += (__force u64) cputime;
2893 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
2894 } else {
2895 cpustat[CPUTIME_USER] += (__force u64) cputime;
2896 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
2897 }
2898}
2899
2900
2901
2902
2903
2904
2905
2906
2907static inline
2908void __account_system_time(struct task_struct *p, cputime_t cputime,
2909 cputime_t cputime_scaled, int index)
2910{
2911
2912 p->stime += cputime;
2913 p->stimescaled += cputime_scaled;
2914 account_group_system_time(p, cputime);
2915
2916
2917 task_group_account_field(p, index, (__force u64) cputime);
2918
2919
2920 acct_update_integrals(p);
2921}
2922
2923
2924
2925
2926
2927
2928
2929
2930void account_system_time(struct task_struct *p, int hardirq_offset,
2931 cputime_t cputime, cputime_t cputime_scaled)
2932{
2933 int index;
2934
2935 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
2936 account_guest_time(p, cputime, cputime_scaled);
2937 return;
2938 }
2939
2940 if (hardirq_count() - hardirq_offset)
2941 index = CPUTIME_IRQ;
2942 else if (in_serving_softirq())
2943 index = CPUTIME_SOFTIRQ;
2944 else
2945 index = CPUTIME_SYSTEM;
2946
2947 __account_system_time(p, cputime, cputime_scaled, index);
2948}
2949
2950
2951
2952
2953
2954void account_steal_time(cputime_t cputime)
2955{
2956 u64 *cpustat = kcpustat_this_cpu->cpustat;
2957
2958 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
2959}
2960
2961
2962
2963
2964
2965void account_idle_time(cputime_t cputime)
2966{
2967 u64 *cpustat = kcpustat_this_cpu->cpustat;
2968 struct rq *rq = this_rq();
2969
2970 if (atomic_read(&rq->nr_iowait) > 0)
2971 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
2972 else
2973 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
2974}
2975
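/*
 * Fold paravirt steal time (time the hypervisor ran someone else) into
 * CPUTIME_STEAL; returns non-zero when whole stolen ticks were accounted,
 * in which case the caller skips normal user/system accounting for this
 * tick.
 */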
2976static __always_inline bool steal_account_process_tick(void)
2977{
2978#ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
2980 u64 steal, st = 0;
2981
2982 steal = paravirt_steal_clock(smp_processor_id());
2983 steal -= this_rq()->prev_steal_time;
2984
2985 st = steal_ticks(steal);
2986 this_rq()->prev_steal_time += st * TICK_NSEC;
2987
2988 account_steal_time(st);
2989 return st;
2990 }
2991#endif
2992 return false;
2993}
2994
2995#ifndef CONFIG_VIRT_CPU_ACCOUNTING
2996
2997#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
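/*
 * Per-tick accounting when IRQ time accounting is enabled: after folding
 * steal time, charge the tick to hardirq, softirq, ksoftirqd, user, idle,
 * guest or system time, checked in that order.
 */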
3019static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3020 struct rq *rq)
3021{
3022 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3023 u64 *cpustat = kcpustat_this_cpu->cpustat;
3024
3025 if (steal_account_process_tick())
3026 return;
3027
3028 if (irqtime_account_hi_update()) {
3029 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
3030 } else if (irqtime_account_si_update()) {
3031 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
3032 } else if (this_cpu_ksoftirqd() == p) {
3033
3034
3035
3036
3037
3038 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3039 CPUTIME_SOFTIRQ);
3040 } else if (user_tick) {
3041 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3042 } else if (p == rq->idle) {
3043 account_idle_time(cputime_one_jiffy);
3044 } else if (p->flags & PF_VCPU) {
3045 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3046 } else {
3047 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3048 CPUTIME_SYSTEM);
3049 }
3050}
3051
3052static void irqtime_account_idle_ticks(int ticks)
3053{
3054 int i;
3055 struct rq *rq = this_rq();
3056
3057 for (i = 0; i < ticks; i++)
3058 irqtime_account_process_tick(current, 0, rq);
3059}
3060#else
3061static void irqtime_account_idle_ticks(int ticks) {}
3062static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3063 struct rq *rq) {}
3064#endif
3065
3066
3067
3068
3069
3070
3071void account_process_tick(struct task_struct *p, int user_tick)
3072{
3073 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3074 struct rq *rq = this_rq();
3075
3076 if (sched_clock_irqtime) {
3077 irqtime_account_process_tick(p, user_tick, rq);
3078 return;
3079 }
3080
3081 if (steal_account_process_tick())
3082 return;
3083
3084 if (user_tick)
3085 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3086 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3087 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3088 one_jiffy_scaled);
3089 else
3090 account_idle_time(cputime_one_jiffy);
3091}
3092
3093
3094
3095
3096
3097
3098void account_steal_ticks(unsigned long ticks)
3099{
3100 account_steal_time(jiffies_to_cputime(ticks));
3101}
3102
/*
 * Account multiple ticks of idle time.
 * @ticks: number of ticks the cpu spent idle
 */
3107void account_idle_ticks(unsigned long ticks)
3108{
3109
3110 if (sched_clock_irqtime) {
3111 irqtime_account_idle_ticks(ticks);
3112 return;
3113 }
3114
3115 account_idle_time(jiffies_to_cputime(ticks));
3116}
3117
3118#endif
3119
/*
 * Use precise platform statistics if available:
 */
3123#ifdef CONFIG_VIRT_CPU_ACCOUNTING
3124void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3125{
3126 *ut = p->utime;
3127 *st = p->stime;
3128}
3129
3130void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3131{
3132 struct task_cputime cputime;
3133
3134 thread_group_cputime(p, &cputime);
3135
3136 *ut = cputime.utime;
3137 *st = cputime.stime;
3138}
3139#else
3140
3141#ifndef nsecs_to_cputime
3142# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3143#endif
3144
3145void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3146{
3147 cputime_t rtime, utime = p->utime, total = utime + p->stime;
3148
3149
3150
3151
3152 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3153
3154 if (total) {
3155 u64 temp = (__force u64) rtime;
3156
3157 temp *= (__force u64) utime;
3158 do_div(temp, (__force u32) total);
3159 utime = (__force cputime_t) temp;
3160 } else
3161 utime = rtime;
3162
3163
3164
3165
3166 p->prev_utime = max(p->prev_utime, utime);
3167 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
3168
3169 *ut = p->prev_utime;
3170 *st = p->prev_stime;
3171}
3172
3173
3174
3175
3176void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3177{
3178 struct signal_struct *sig = p->signal;
3179 struct task_cputime cputime;
3180 cputime_t rtime, utime, total;
3181
3182 thread_group_cputime(p, &cputime);
3183
3184 total = cputime.utime + cputime.stime;
3185 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3186
3187 if (total) {
3188 u64 temp = (__force u64) rtime;
3189
3190 temp *= (__force u64) cputime.utime;
3191 do_div(temp, (__force u32) total);
3192 utime = (__force cputime_t) temp;
3193 } else
3194 utime = rtime;
3195
3196 sig->prev_utime = max(sig->prev_utime, utime);
3197 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
3198
3199 *ut = sig->prev_utime;
3200 *st = sig->prev_stime;
3201}
3202#endif
3203
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
3208void scheduler_tick(void)
3209{
3210 int cpu = smp_processor_id();
3211 struct rq *rq = cpu_rq(cpu);
3212 struct task_struct *curr = rq->curr;
3213
3214 sched_clock_tick();
3215
3216 raw_spin_lock(&rq->lock);
3217 update_rq_clock(rq);
3218 update_cpu_load_active(rq);
3219 curr->sched_class->task_tick(rq, curr, 0);
3220 raw_spin_unlock(&rq->lock);
3221
3222 perf_event_task_tick();
3223
3224#ifdef CONFIG_SMP
3225 rq->idle_balance = idle_cpu(cpu);
3226 trigger_load_balance(rq, cpu);
3227#endif
3228}
3229
3230notrace unsigned long get_parent_ip(unsigned long addr)
3231{
3232 if (in_lock_functions(addr)) {
3233 addr = CALLER_ADDR2;
3234 if (in_lock_functions(addr))
3235 addr = CALLER_ADDR3;
3236 }
3237 return addr;
3238}
3239
3240#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3241 defined(CONFIG_PREEMPT_TRACER))
3242
3243void __kprobes add_preempt_count(int val)
3244{
3245#ifdef CONFIG_DEBUG_PREEMPT
3246
3247
3248
3249 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3250 return;
3251#endif
3252 preempt_count() += val;
3253#ifdef CONFIG_DEBUG_PREEMPT
3254
3255
3256
3257 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3258 PREEMPT_MASK - 10);
3259#endif
3260 if (preempt_count() == val)
3261 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3262}
3263EXPORT_SYMBOL(add_preempt_count);
3264
3265void __kprobes sub_preempt_count(int val)
3266{
3267#ifdef CONFIG_DEBUG_PREEMPT
3268
3269
3270
3271 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3272 return;
3273
3274
3275
3276 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3277 !(preempt_count() & PREEMPT_MASK)))
3278 return;
3279#endif
3280
3281 if (preempt_count() == val)
3282 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3283 preempt_count() -= val;
3284}
3285EXPORT_SYMBOL(sub_preempt_count);
3286
3287#endif
3288
/*
 * Print scheduling while atomic bug:
 */
3292static noinline void __schedule_bug(struct task_struct *prev)
3293{
3294 if (oops_in_progress)
3295 return;
3296
3297 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3298 prev->comm, prev->pid, preempt_count());
3299
3300 debug_show_held_locks(prev);
3301 print_modules();
3302 if (irqs_disabled())
3303 print_irqtrace_events(prev);
3304 dump_stack();
3305 add_taint(TAINT_WARN);
3306}
3307
/*
 * Various schedule()-time debugging checks and statistics:
 */
3311static inline void schedule_debug(struct task_struct *prev)
3312{
3313
3314
3315
3316
3317
3318 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3319 __schedule_bug(prev);
3320 rcu_sleep_check();
3321
3322 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3323
3324 schedstat_inc(this_rq(), sched_count);
3325}
3326
3327static void put_prev_task(struct rq *rq, struct task_struct *prev)
3328{
3329 if (prev->on_rq || rq->skip_clock_update < 0)
3330 update_rq_clock(rq);
3331 prev->sched_class->put_prev_task(rq, prev);
3332}
3333
/*
 * Pick up the highest-prio task:
 */
3337static inline struct task_struct *
3338pick_next_task(struct rq *rq)
3339{
3340 const struct sched_class *class;
3341 struct task_struct *p;
3342
 /*
  * Optimization: we know that if all tasks are in
  * the fair class we can call that function directly:
  */
3347 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
3348 p = fair_sched_class.pick_next_task(rq);
3349 if (likely(p))
3350 return p;
3351 }
3352
3353 for_each_class(class) {
3354 p = class->pick_next_task(rq);
3355 if (p)
3356 return p;
3357 }
3358
3359 BUG();
3360}
3361
/*
 * __schedule() is the main scheduler function.
 */
3365static void __sched __schedule(void)
3366{
3367 struct task_struct *prev, *next;
3368 unsigned long *switch_count;
3369 struct rq *rq;
3370 int cpu;
3371
3372need_resched:
3373 preempt_disable();
3374 cpu = smp_processor_id();
3375 rq = cpu_rq(cpu);
3376 rcu_note_context_switch(cpu);
3377 prev = rq->curr;
3378
3379 schedule_debug(prev);
3380
3381 if (sched_feat(HRTICK))
3382 hrtick_clear(rq);
3383
3384 raw_spin_lock_irq(&rq->lock);
3385
3386 switch_count = &prev->nivcsw;
3387 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3388 if (unlikely(signal_pending_state(prev->state, prev))) {
3389 prev->state = TASK_RUNNING;
3390 } else {
3391 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3392 prev->on_rq = 0;
3393
 /*
  * If a worker went to sleep, notify and ask the workqueue
  * whether it wants to wake up a task to maintain
  * concurrency.
  */
3399 if (prev->flags & PF_WQ_WORKER) {
3400 struct task_struct *to_wakeup;
3401
3402 to_wakeup = wq_worker_sleeping(prev, cpu);
3403 if (to_wakeup)
3404 try_to_wake_up_local(to_wakeup);
3405 }
3406 }
3407 switch_count = &prev->nvcsw;
3408 }
3409
3410 pre_schedule(rq, prev);
3411
3412 if (unlikely(!rq->nr_running))
3413 idle_balance(cpu, rq);
3414
3415 put_prev_task(rq, prev);
3416 next = pick_next_task(rq);
3417 clear_tsk_need_resched(prev);
3418 rq->skip_clock_update = 0;
3419
3420 if (likely(prev != next)) {
3421 rq->nr_switches++;
3422 rq->curr = next;
3423 ++*switch_count;
3424
3425 context_switch(rq, prev, next);
3426
 /*
  * The context switch has flipped the stack from under us and
  * restored the local variables that were saved when this task
  * called schedule() in the past.  prev == current is still
  * correct, but it may have moved to another cpu/rq.
  */
3432 cpu = smp_processor_id();
3433 rq = cpu_rq(cpu);
3434 } else
3435 raw_spin_unlock_irq(&rq->lock);
3436
3437 post_schedule(rq);
3438
3439 sched_preempt_enable_no_resched();
3440 if (need_resched())
3441 goto need_resched;
3442}
3443
3444static inline void sched_submit_work(struct task_struct *tsk)
3445{
3446 if (!tsk->state || tsk_is_pi_blocked(tsk))
3447 return;
3448
 /*
  * If we are going to sleep and we have plugged IO queued,
  * make sure to submit it to avoid deadlocks.
  */
3452 if (blk_needs_flush_plug(tsk))
3453 blk_schedule_flush_plug(tsk);
3454}
3455
3456asmlinkage void __sched schedule(void)
3457{
3458 struct task_struct *tsk = current;
3459
3460 sched_submit_work(tsk);
3461 __schedule();
3462}
3463EXPORT_SYMBOL(schedule);
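/*
 * Illustrative sketch (not part of this file): callers that sleep via
 * schedule() publish a task state first and re-check the wakeup condition
 * in a loop; "condition" below stands in for whatever the caller waits on:
 *
 *	for (;;) {
 *		set_current_state(TASK_INTERRUPTIBLE);
 *		if (condition)
 *			break;
 *		schedule();
 *	}
 *	__set_current_state(TASK_RUNNING);
 */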
3464
3465
3466
3467
3468
3469
3470void __sched schedule_preempt_disabled(void)
3471{
3472 sched_preempt_enable_no_resched();
3473 schedule();
3474 preempt_disable();
3475}
3476
3477#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3478
3479static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3480{
3481 if (lock->owner != owner)
3482 return false;
3483
3484
3485
3486
3487
3488
3489
3490 barrier();
3491
3492 return owner->on_cpu;
3493}
3494
/*
 * Look out! "owner" is an entirely speculative pointer
 * access and not reliable.
 */
3499int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3500{
3501 if (!sched_feat(OWNER_SPIN))
3502 return 0;
3503
3504 rcu_read_lock();
3505 while (owner_running(lock, owner)) {
3506 if (need_resched())
3507 break;
3508
3509 arch_mutex_cpu_relax();
3510 }
3511 rcu_read_unlock();
3512
3513
3514
3515
3516
3517
3518 return lock->owner == NULL;
3519}
3520#endif
3521
3522#ifdef CONFIG_PREEMPT
3523
3524
3525
3526
3527
3528asmlinkage void __sched notrace preempt_schedule(void)
3529{
3530 struct thread_info *ti = current_thread_info();
3531
3532
3533
3534
3535
3536 if (likely(ti->preempt_count || irqs_disabled()))
3537 return;
3538
3539 do {
3540 add_preempt_count_notrace(PREEMPT_ACTIVE);
3541 __schedule();
3542 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3543
3544
3545
3546
3547
3548 barrier();
3549 } while (need_resched());
3550}
3551EXPORT_SYMBOL(preempt_schedule);
3552
3553
3554
3555
3556
3557
3558
3559asmlinkage void __sched preempt_schedule_irq(void)
3560{
3561 struct thread_info *ti = current_thread_info();
3562
3563
3564 BUG_ON(ti->preempt_count || !irqs_disabled());
3565
3566 do {
3567 add_preempt_count(PREEMPT_ACTIVE);
3568 local_irq_enable();
3569 __schedule();
3570 local_irq_disable();
3571 sub_preempt_count(PREEMPT_ACTIVE);
3572
3573
3574
3575
3576
3577 barrier();
3578 } while (need_resched());
3579}
3580
3581#endif
3582
3583int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3584 void *key)
3585{
3586 return try_to_wake_up(curr->private, mode, wake_flags);
3587}
3588EXPORT_SYMBOL(default_wake_function);
3589
/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 */
3599static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3600 int nr_exclusive, int wake_flags, void *key)
3601{
3602 wait_queue_t *curr, *next;
3603
3604 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3605 unsigned flags = curr->flags;
3606
3607 if (curr->func(curr, mode, wake_flags, key) &&
3608 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3609 break;
3610 }
3611}
3612
/**
 * __wake_up - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: is directly passed to the wakeup function
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
3623void __wake_up(wait_queue_head_t *q, unsigned int mode,
3624 int nr_exclusive, void *key)
3625{
3626 unsigned long flags;
3627
3628 spin_lock_irqsave(&q->lock, flags);
3629 __wake_up_common(q, mode, nr_exclusive, 0, key);
3630 spin_unlock_irqrestore(&q->lock, flags);
3631}
3632EXPORT_SYMBOL(__wake_up);
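/*
 * Illustrative sketch (not part of this file): __wake_up() is normally
 * reached through the wake_up*() wrappers, paired with wait_event*() on the
 * sleeping side; "my_wq" and "data_ready" are hypothetical names:
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *
 *	sleeper:	wait_event_interruptible(my_wq, data_ready);
 *
 *	waker:		data_ready = 1;
 *			wake_up_interruptible(&my_wq);
 */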
3633
3634
3635
3636
3637void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3638{
3639 __wake_up_common(q, mode, nr, 0, NULL);
3640}
3641EXPORT_SYMBOL_GPL(__wake_up_locked);
3642
3643void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3644{
3645 __wake_up_common(q, mode, 1, 0, key);
3646}
3647EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3667 int nr_exclusive, void *key)
3668{
3669 unsigned long flags;
3670 int wake_flags = WF_SYNC;
3671
3672 if (unlikely(!q))
3673 return;
3674
3675 if (unlikely(!nr_exclusive))
3676 wake_flags = 0;
3677
3678 spin_lock_irqsave(&q->lock, flags);
3679 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3680 spin_unlock_irqrestore(&q->lock, flags);
3681}
3682EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3683
3684
3685
3686
3687void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3688{
3689 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3690}
3691EXPORT_SYMBOL_GPL(__wake_up_sync);
3692
/**
 * complete: - signals a single thread waiting on this completion
 * @x:  holds the state of this particular completion
 *
 * This will wake up a single thread waiting on this completion. Threads will
 * be awakened in the same order in which they were queued.
 *
 * See also complete_all(), wait_for_completion() and related routines.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
3705void complete(struct completion *x)
3706{
3707 unsigned long flags;
3708
3709 spin_lock_irqsave(&x->wait.lock, flags);
3710 x->done++;
3711 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3712 spin_unlock_irqrestore(&x->wait.lock, flags);
3713}
3714EXPORT_SYMBOL(complete);
3715
/**
 * complete_all: - signals all threads waiting on this completion
 * @x:  holds the state of this particular completion
 *
 * This will wake up all threads waiting on this particular completion event.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
3725void complete_all(struct completion *x)
3726{
3727 unsigned long flags;
3728
3729 spin_lock_irqsave(&x->wait.lock, flags);
3730 x->done += UINT_MAX/2;
3731 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3732 spin_unlock_irqrestore(&x->wait.lock, flags);
3733}
3734EXPORT_SYMBOL(complete_all);
3735
3736static inline long __sched
3737do_wait_for_common(struct completion *x, long timeout, int state)
3738{
3739 if (!x->done) {
3740 DECLARE_WAITQUEUE(wait, current);
3741
3742 __add_wait_queue_tail_exclusive(&x->wait, &wait);
3743 do {
3744 if (signal_pending_state(state, current)) {
3745 timeout = -ERESTARTSYS;
3746 break;
3747 }
3748 __set_current_state(state);
3749 spin_unlock_irq(&x->wait.lock);
3750 timeout = schedule_timeout(timeout);
3751 spin_lock_irq(&x->wait.lock);
3752 } while (!x->done && timeout);
3753 __remove_wait_queue(&x->wait, &wait);
3754 if (!x->done)
3755 return timeout;
3756 }
3757 x->done--;
3758 return timeout ?: 1;
3759}
3760
3761static long __sched
3762wait_for_common(struct completion *x, long timeout, int state)
3763{
3764 might_sleep();
3765
3766 spin_lock_irq(&x->wait.lock);
3767 timeout = do_wait_for_common(x, timeout, state);
3768 spin_unlock_irq(&x->wait.lock);
3769 return timeout;
3770}
3771
/**
 * wait_for_completion: - waits for completion of a task
 * @x:  holds the state of this particular completion
 *
 * This waits to be signaled for completion of a specific task. It is NOT
 * interruptible and there is no timeout.
 *
 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
 * and interrupt capability. Also see complete().
 */
3782void __sched wait_for_completion(struct completion *x)
3783{
3784 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3785}
3786EXPORT_SYMBOL(wait_for_completion);
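/*
 * Illustrative sketch (not part of this file): a typical completion
 * handshake between a submitter and a worker; queue_some_work() is a
 * hypothetical helper that eventually calls complete(&done):
 *
 *	DECLARE_COMPLETION_ONSTACK(done);
 *
 *	queue_some_work(&done);
 *	wait_for_completion(&done);
 */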
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800unsigned long __sched
3801wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3802{
3803 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3804}
3805EXPORT_SYMBOL(wait_for_completion_timeout);
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816int __sched wait_for_completion_interruptible(struct completion *x)
3817{
3818 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3819 if (t == -ERESTARTSYS)
3820 return t;
3821 return 0;
3822}
3823EXPORT_SYMBOL(wait_for_completion_interruptible);
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836long __sched
3837wait_for_completion_interruptible_timeout(struct completion *x,
3838 unsigned long timeout)
3839{
3840 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3841}
3842EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853int __sched wait_for_completion_killable(struct completion *x)
3854{
3855 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3856 if (t == -ERESTARTSYS)
3857 return t;
3858 return 0;
3859}
3860EXPORT_SYMBOL(wait_for_completion_killable);
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874long __sched
3875wait_for_completion_killable_timeout(struct completion *x,
3876 unsigned long timeout)
3877{
3878 return wait_for_common(x, timeout, TASK_KILLABLE);
3879}
3880EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894bool try_wait_for_completion(struct completion *x)
3895{
3896 unsigned long flags;
3897 int ret = 1;
3898
3899 spin_lock_irqsave(&x->wait.lock, flags);
3900 if (!x->done)
3901 ret = 0;
3902 else
3903 x->done--;
3904 spin_unlock_irqrestore(&x->wait.lock, flags);
3905 return ret;
3906}
3907EXPORT_SYMBOL(try_wait_for_completion);
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917bool completion_done(struct completion *x)
3918{
3919 unsigned long flags;
3920 int ret = 1;
3921
3922 spin_lock_irqsave(&x->wait.lock, flags);
3923 if (!x->done)
3924 ret = 0;
3925 spin_unlock_irqrestore(&x->wait.lock, flags);
3926 return ret;
3927}
3928EXPORT_SYMBOL(completion_done);
3929
3930static long __sched
3931sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3932{
3933 unsigned long flags;
3934 wait_queue_t wait;
3935
3936 init_waitqueue_entry(&wait, current);
3937
3938 __set_current_state(state);
3939
3940 spin_lock_irqsave(&q->lock, flags);
3941 __add_wait_queue(q, &wait);
3942 spin_unlock(&q->lock);
3943 timeout = schedule_timeout(timeout);
3944 spin_lock_irq(&q->lock);
3945 __remove_wait_queue(q, &wait);
3946 spin_unlock_irqrestore(&q->lock, flags);
3947
3948 return timeout;
3949}
3950
3951void __sched interruptible_sleep_on(wait_queue_head_t *q)
3952{
3953 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3954}
3955EXPORT_SYMBOL(interruptible_sleep_on);
3956
3957long __sched
3958interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3959{
3960 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3961}
3962EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3963
3964void __sched sleep_on(wait_queue_head_t *q)
3965{
3966 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3967}
3968EXPORT_SYMBOL(sleep_on);
3969
3970long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3971{
3972 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3973}
3974EXPORT_SYMBOL(sleep_on_timeout);
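/*
 * The sleep_on*() family above re-checks no condition between queueing and
 * sleeping, so a wakeup that arrives before the caller is on the queue can
 * be missed.  New code is generally expected to use the wait_event*()
 * macros instead, e.g. (illustrative sketch):
 *
 *	wait_event_timeout(wq, condition, timeout);
 */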
3975
3976#ifdef CONFIG_RT_MUTEXES
3977
/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */
3988void rt_mutex_setprio(struct task_struct *p, int prio)
3989{
3990 int oldprio, on_rq, running;
3991 struct rq *rq;
3992 const struct sched_class *prev_class;
3993
3994 BUG_ON(prio < 0 || prio > MAX_PRIO);
3995
3996 rq = __task_rq_lock(p);
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010 if (unlikely(p == rq->idle)) {
4011 WARN_ON(p != rq->curr);
4012 WARN_ON(p->pi_blocked_on);
4013 goto out_unlock;
4014 }
4015
4016 trace_sched_pi_setprio(p, prio);
4017 oldprio = p->prio;
4018 prev_class = p->sched_class;
4019 on_rq = p->on_rq;
4020 running = task_current(rq, p);
4021 if (on_rq)
4022 dequeue_task(rq, p, 0);
4023 if (running)
4024 p->sched_class->put_prev_task(rq, p);
4025
4026 if (rt_prio(prio))
4027 p->sched_class = &rt_sched_class;
4028 else
4029 p->sched_class = &fair_sched_class;
4030
4031 p->prio = prio;
4032
4033 if (running)
4034 p->sched_class->set_curr_task(rq);
4035 if (on_rq)
4036 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4037
4038 check_class_changed(rq, p, prev_class, oldprio);
4039out_unlock:
4040 __task_rq_unlock(rq);
4041}
4042#endif
4043void set_user_nice(struct task_struct *p, long nice)
4044{
4045 int old_prio, delta, on_rq;
4046 unsigned long flags;
4047 struct rq *rq;
4048
4049 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4050 return;
4051
4052
4053
4054
4055 rq = task_rq_lock(p, &flags);
4056
4057
4058
4059
4060
4061
4062 if (task_has_rt_policy(p)) {
4063 p->static_prio = NICE_TO_PRIO(nice);
4064 goto out_unlock;
4065 }
4066 on_rq = p->on_rq;
4067 if (on_rq)
4068 dequeue_task(rq, p, 0);
4069
4070 p->static_prio = NICE_TO_PRIO(nice);
4071 set_load_weight(p);
4072 old_prio = p->prio;
4073 p->prio = effective_prio(p);
4074 delta = p->prio - old_prio;
4075
4076 if (on_rq) {
4077 enqueue_task(rq, p, 0);
4078
4079
4080
4081
4082 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4083 resched_task(rq->curr);
4084 }
4085out_unlock:
4086 task_rq_unlock(rq, p, &flags);
4087}
4088EXPORT_SYMBOL(set_user_nice);
4089
4090
4091
4092
4093
4094
4095int can_nice(const struct task_struct *p, const int nice)
4096{
4097
4098 int nice_rlim = 20 - nice;
4099
4100 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4101 capable(CAP_SYS_NICE));
4102}
4103
4104#ifdef __ARCH_WANT_SYS_NICE
4105
/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */
4113SYSCALL_DEFINE1(nice, int, increment)
4114{
4115 long nice, retval;
4116
4117
4118
4119
4120
4121
4122 if (increment < -40)
4123 increment = -40;
4124 if (increment > 40)
4125 increment = 40;
4126
4127 nice = TASK_NICE(current) + increment;
4128 if (nice < -20)
4129 nice = -20;
4130 if (nice > 19)
4131 nice = 19;
4132
4133 if (increment < 0 && !can_nice(current, nice))
4134 return -EPERM;
4135
4136 retval = security_task_setnice(current, nice);
4137 if (retval)
4138 return retval;
4139
4140 set_user_nice(current, nice);
4141 return 0;
4142}
4143
4144#endif
4145
/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * This is the priority value as seen by users in /proc.
 * RT tasks are offset by -200. Normal tasks are centered
 * around 0, value goes from -16 to +15.
 */
4154int task_prio(const struct task_struct *p)
4155{
4156 return p->prio - MAX_RT_PRIO;
4157}
4158
4159
4160
4161
4162
4163int task_nice(const struct task_struct *p)
4164{
4165 return TASK_NICE(p);
4166}
4167EXPORT_SYMBOL(task_nice);
4168
/**
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 */
4173int idle_cpu(int cpu)
4174{
4175 struct rq *rq = cpu_rq(cpu);
4176
4177 if (rq->curr != rq->idle)
4178 return 0;
4179
4180 if (rq->nr_running)
4181 return 0;
4182
4183#ifdef CONFIG_SMP
4184 if (!llist_empty(&rq->wake_list))
4185 return 0;
4186#endif
4187
4188 return 1;
4189}
4190
4191
4192
4193
4194
4195struct task_struct *idle_task(int cpu)
4196{
4197 return cpu_rq(cpu)->idle;
4198}
4199
4200
4201
4202
4203
4204static struct task_struct *find_process_by_pid(pid_t pid)
4205{
4206 return pid ? find_task_by_vpid(pid) : current;
4207}
4208
4209
4210static void
4211__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4212{
4213 p->policy = policy;
4214 p->rt_priority = prio;
4215 p->normal_prio = normal_prio(p);
4216
4217 p->prio = rt_mutex_getprio(p);
4218 if (rt_prio(p->prio))
4219 p->sched_class = &rt_sched_class;
4220 else
4221 p->sched_class = &fair_sched_class;
4222 set_load_weight(p);
4223}
4224
4225
4226
4227
4228static bool check_same_owner(struct task_struct *p)
4229{
4230 const struct cred *cred = current_cred(), *pcred;
4231 bool match;
4232
4233 rcu_read_lock();
4234 pcred = __task_cred(p);
4235 match = (uid_eq(cred->euid, pcred->euid) ||
4236 uid_eq(cred->euid, pcred->uid));
4237 rcu_read_unlock();
4238 return match;
4239}
4240
4241static int __sched_setscheduler(struct task_struct *p, int policy,
4242 const struct sched_param *param, bool user)
4243{
4244 int retval, oldprio, oldpolicy = -1, on_rq, running;
4245 unsigned long flags;
4246 const struct sched_class *prev_class;
4247 struct rq *rq;
4248 int reset_on_fork;
4249
4250
4251 BUG_ON(in_interrupt());
4252recheck:
4253
4254 if (policy < 0) {
4255 reset_on_fork = p->sched_reset_on_fork;
4256 policy = oldpolicy = p->policy;
4257 } else {
4258 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4259 policy &= ~SCHED_RESET_ON_FORK;
4260
4261 if (policy != SCHED_FIFO && policy != SCHED_RR &&
4262 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4263 policy != SCHED_IDLE)
4264 return -EINVAL;
4265 }
4266
4267
4268
4269
4270
4271
4272 if (param->sched_priority < 0 ||
4273 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4274 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4275 return -EINVAL;
4276 if (rt_policy(policy) != (param->sched_priority != 0))
4277 return -EINVAL;
4278
4279
4280
4281
4282 if (user && !capable(CAP_SYS_NICE)) {
4283 if (rt_policy(policy)) {
4284 unsigned long rlim_rtprio =
4285 task_rlimit(p, RLIMIT_RTPRIO);
4286
4287
4288 if (policy != p->policy && !rlim_rtprio)
4289 return -EPERM;
4290
4291
4292 if (param->sched_priority > p->rt_priority &&
4293 param->sched_priority > rlim_rtprio)
4294 return -EPERM;
4295 }
4296
4297
4298
4299
4300
4301 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4302 if (!can_nice(p, TASK_NICE(p)))
4303 return -EPERM;
4304 }
4305
4306
4307 if (!check_same_owner(p))
4308 return -EPERM;
4309
4310
4311 if (p->sched_reset_on_fork && !reset_on_fork)
4312 return -EPERM;
4313 }
4314
4315 if (user) {
4316 retval = security_task_setscheduler(p);
4317 if (retval)
4318 return retval;
4319 }
4320
4321
4322
4323
4324
4325
4326
4327
4328 rq = task_rq_lock(p, &flags);
4329
4330
4331
4332
4333 if (p == rq->stop) {
4334 task_rq_unlock(rq, p, &flags);
4335 return -EINVAL;
4336 }
4337
4338
4339
4340
4341 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
4342 param->sched_priority == p->rt_priority))) {
4343
4344 __task_rq_unlock(rq);
4345 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4346 return 0;
4347 }
4348
4349#ifdef CONFIG_RT_GROUP_SCHED
4350 if (user) {
4351
4352
4353
4354
4355 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4356 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4357 !task_group_is_autogroup(task_group(p))) {
4358 task_rq_unlock(rq, p, &flags);
4359 return -EPERM;
4360 }
4361 }
4362#endif
4363
4364
4365 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4366 policy = oldpolicy = -1;
4367 task_rq_unlock(rq, p, &flags);
4368 goto recheck;
4369 }
4370 on_rq = p->on_rq;
4371 running = task_current(rq, p);
4372 if (on_rq)
4373 dequeue_task(rq, p, 0);
4374 if (running)
4375 p->sched_class->put_prev_task(rq, p);
4376
4377 p->sched_reset_on_fork = reset_on_fork;
4378
4379 oldprio = p->prio;
4380 prev_class = p->sched_class;
4381 __setscheduler(rq, p, policy, param->sched_priority);
4382
4383 if (running)
4384 p->sched_class->set_curr_task(rq);
4385 if (on_rq)
4386 enqueue_task(rq, p, 0);
4387
4388 check_class_changed(rq, p, prev_class, oldprio);
4389 task_rq_unlock(rq, p, &flags);
4390
4391 rt_mutex_adjust_pi(p);
4392
4393 return 0;
4394}
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404int sched_setscheduler(struct task_struct *p, int policy,
4405 const struct sched_param *param)
4406{
4407 return __sched_setscheduler(p, policy, param, true);
4408}
4409EXPORT_SYMBOL_GPL(sched_setscheduler);
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4423 const struct sched_param *param)
4424{
4425 return __sched_setscheduler(p, policy, param, false);
4426}
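/*
 * Illustrative sketch (not part of this file): kernel threads that need
 * real-time scheduling typically switch themselves over like this:
 *
 *	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 *
 *	sched_setscheduler_nocheck(task, SCHED_FIFO, &param);
 */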
4427
4428static int
4429do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4430{
4431 struct sched_param lparam;
4432 struct task_struct *p;
4433 int retval;
4434
4435 if (!param || pid < 0)
4436 return -EINVAL;
4437 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4438 return -EFAULT;
4439
4440 rcu_read_lock();
4441 retval = -ESRCH;
4442 p = find_process_by_pid(pid);
4443 if (p != NULL)
4444 retval = sched_setscheduler(p, policy, &lparam);
4445 rcu_read_unlock();
4446
4447 return retval;
4448}
4449
4450
4451
4452
4453
4454
4455
4456SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4457 struct sched_param __user *, param)
4458{
4459
4460 if (policy < 0)
4461 return -EINVAL;
4462
4463 return do_sched_setscheduler(pid, policy, param);
4464}
4465
4466
4467
4468
4469
4470
4471SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4472{
4473 return do_sched_setscheduler(pid, -1, param);
4474}
4475
4476
4477
4478
4479
4480SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4481{
4482 struct task_struct *p;
4483 int retval;
4484
4485 if (pid < 0)
4486 return -EINVAL;
4487
4488 retval = -ESRCH;
4489 rcu_read_lock();
4490 p = find_process_by_pid(pid);
4491 if (p) {
4492 retval = security_task_getscheduler(p);
4493 if (!retval)
4494 retval = p->policy
4495 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4496 }
4497 rcu_read_unlock();
4498 return retval;
4499}
4500
4501
4502
4503
4504
4505
4506SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4507{
4508 struct sched_param lp;
4509 struct task_struct *p;
4510 int retval;
4511
4512 if (!param || pid < 0)
4513 return -EINVAL;
4514
4515 rcu_read_lock();
4516 p = find_process_by_pid(pid);
4517 retval = -ESRCH;
4518 if (!p)
4519 goto out_unlock;
4520
4521 retval = security_task_getscheduler(p);
4522 if (retval)
4523 goto out_unlock;
4524
4525 lp.sched_priority = p->rt_priority;
4526 rcu_read_unlock();
4527
4528
4529
4530
4531 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4532
4533 return retval;
4534
4535out_unlock:
4536 rcu_read_unlock();
4537 return retval;
4538}
4539
4540long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4541{
4542 cpumask_var_t cpus_allowed, new_mask;
4543 struct task_struct *p;
4544 int retval;
4545
4546 get_online_cpus();
4547 rcu_read_lock();
4548
4549 p = find_process_by_pid(pid);
4550 if (!p) {
4551 rcu_read_unlock();
4552 put_online_cpus();
4553 return -ESRCH;
4554 }
4555
4556
4557 get_task_struct(p);
4558 rcu_read_unlock();
4559
4560 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4561 retval = -ENOMEM;
4562 goto out_put_task;
4563 }
4564 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4565 retval = -ENOMEM;
4566 goto out_free_cpus_allowed;
4567 }
4568 retval = -EPERM;
4569 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
4570 goto out_unlock;
4571
4572 retval = security_task_setscheduler(p);
4573 if (retval)
4574 goto out_unlock;
4575
4576 cpuset_cpus_allowed(p, cpus_allowed);
4577 cpumask_and(new_mask, in_mask, cpus_allowed);
4578again:
4579 retval = set_cpus_allowed_ptr(p, new_mask);
4580
4581 if (!retval) {
4582 cpuset_cpus_allowed(p, cpus_allowed);
4583 if (!cpumask_subset(new_mask, cpus_allowed)) {
4584
4585
4586
4587
4588
4589 cpumask_copy(new_mask, cpus_allowed);
4590 goto again;
4591 }
4592 }
4593out_unlock:
4594 free_cpumask_var(new_mask);
4595out_free_cpus_allowed:
4596 free_cpumask_var(cpus_allowed);
4597out_put_task:
4598 put_task_struct(p);
4599 put_online_cpus();
4600 return retval;
4601}
4602
4603static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4604 struct cpumask *new_mask)
4605{
4606 if (len < cpumask_size())
4607 cpumask_clear(new_mask);
4608 else if (len > cpumask_size())
4609 len = cpumask_size();
4610
4611 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4612}
4613
4614
4615
4616
4617
4618
4619
4620SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4621 unsigned long __user *, user_mask_ptr)
4622{
4623 cpumask_var_t new_mask;
4624 int retval;
4625
4626 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4627 return -ENOMEM;
4628
4629 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4630 if (retval == 0)
4631 retval = sched_setaffinity(pid, new_mask);
4632 free_cpumask_var(new_mask);
4633 return retval;
4634}
4635
4636long sched_getaffinity(pid_t pid, struct cpumask *mask)
4637{
4638 struct task_struct *p;
4639 unsigned long flags;
4640 int retval;
4641
4642 get_online_cpus();
4643 rcu_read_lock();
4644
4645 retval = -ESRCH;
4646 p = find_process_by_pid(pid);
4647 if (!p)
4648 goto out_unlock;
4649
4650 retval = security_task_getscheduler(p);
4651 if (retval)
4652 goto out_unlock;
4653
4654 raw_spin_lock_irqsave(&p->pi_lock, flags);
4655 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4656 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4657
4658out_unlock:
4659 rcu_read_unlock();
4660 put_online_cpus();
4661
4662 return retval;
4663}
4664
4665
4666
4667
4668
4669
4670
4671SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4672 unsigned long __user *, user_mask_ptr)
4673{
4674 int ret;
4675 cpumask_var_t mask;
4676
4677 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4678 return -EINVAL;
4679 if (len & (sizeof(unsigned long)-1))
4680 return -EINVAL;
4681
4682 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4683 return -ENOMEM;
4684
4685 ret = sched_getaffinity(pid, mask);
4686 if (ret == 0) {
4687 size_t retlen = min_t(size_t, len, cpumask_size());
4688
4689 if (copy_to_user(user_mask_ptr, mask, retlen))
4690 ret = -EFAULT;
4691 else
4692 ret = retlen;
4693 }
4694 free_cpumask_var(mask);
4695
4696 return ret;
4697}
4698
/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 */
4705SYSCALL_DEFINE0(sched_yield)
4706{
4707 struct rq *rq = this_rq_lock();
4708
4709 schedstat_inc(rq, yld_count);
4710 current->sched_class->yield_task(rq);
4711
4712
4713
4714
4715
4716 __release(rq->lock);
4717 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4718 do_raw_spin_unlock(&rq->lock);
4719 sched_preempt_enable_no_resched();
4720
4721 schedule();
4722
4723 return 0;
4724}
4725
4726static inline int should_resched(void)
4727{
4728 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4729}
4730
4731static void __cond_resched(void)
4732{
4733 add_preempt_count(PREEMPT_ACTIVE);
4734 __schedule();
4735 sub_preempt_count(PREEMPT_ACTIVE);
4736}
4737
4738int __sched _cond_resched(void)
4739{
4740 if (should_resched()) {
4741 __cond_resched();
4742 return 1;
4743 }
4744 return 0;
4745}
4746EXPORT_SYMBOL(_cond_resched);
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756int __cond_resched_lock(spinlock_t *lock)
4757{
4758 int resched = should_resched();
4759 int ret = 0;
4760
4761 lockdep_assert_held(lock);
4762
4763 if (spin_needbreak(lock) || resched) {
4764 spin_unlock(lock);
4765 if (resched)
4766 __cond_resched();
4767 else
4768 cpu_relax();
4769 ret = 1;
4770 spin_lock(lock);
4771 }
4772 return ret;
4773}
4774EXPORT_SYMBOL(__cond_resched_lock);
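/*
 * Illustrative sketch (not part of this file): long loops under a spinlock
 * usually reach this via the cond_resched_lock() wrapper; more_work() and
 * do_one_item() are hypothetical helpers:
 *
 *	spin_lock(&my_lock);
 *	while (more_work()) {
 *		do_one_item();
 *		cond_resched_lock(&my_lock);	// may drop and re-take my_lock
 *	}
 *	spin_unlock(&my_lock);
 */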
4775
4776int __sched __cond_resched_softirq(void)
4777{
4778 BUG_ON(!in_softirq());
4779
4780 if (should_resched()) {
4781 local_bh_enable();
4782 __cond_resched();
4783 local_bh_disable();
4784 return 1;
4785 }
4786 return 0;
4787}
4788EXPORT_SYMBOL(__cond_resched_softirq);
4789
/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run, if removing the yield() call from your code breaks
 * it, it's already broken.
 *
 * Typical broken usage is:
 *
 * while (!event)
 *	yield();
 *
 * where one assumes that yield() will let 'the other' process run that will
 * make event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still insist on using yield(), do not!
 */
4812void __sched yield(void)
4813{
4814 set_current_state(TASK_RUNNING);
4815 sys_sched_yield();
4816}
4817EXPORT_SYMBOL(yield);
4818
/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Returns true (>0) if we indeed boosted the target task.
 */
4831bool __sched yield_to(struct task_struct *p, bool preempt)
4832{
4833 struct task_struct *curr = current;
4834 struct rq *rq, *p_rq;
4835 unsigned long flags;
4836 bool yielded = 0;
4837
4838 local_irq_save(flags);
4839 rq = this_rq();
4840
4841again:
4842 p_rq = task_rq(p);
4843 double_rq_lock(rq, p_rq);
4844 while (task_rq(p) != p_rq) {
4845 double_rq_unlock(rq, p_rq);
4846 goto again;
4847 }
4848
4849 if (!curr->sched_class->yield_to_task)
4850 goto out;
4851
4852 if (curr->sched_class != p->sched_class)
4853 goto out;
4854
4855 if (task_running(p_rq, p) || p->state)
4856 goto out;
4857
4858 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4859 if (yielded) {
4860 schedstat_inc(rq, yld_count);
4861
4862
4863
4864
4865 if (preempt && rq != p_rq)
4866 resched_task(p_rq->curr);
4867 } else {
4868
4869
4870
4871
4872
4873 rq->skip_clock_update = 0;
4874 }
4875
4876out:
4877 double_rq_unlock(rq, p_rq);
4878 local_irq_restore(flags);
4879
4880 if (yielded)
4881 schedule();
4882
4883 return yielded;
4884}
4885EXPORT_SYMBOL_GPL(yield_to);
4886
4887
4888
4889
4890
4891void __sched io_schedule(void)
4892{
4893 struct rq *rq = raw_rq();
4894
4895 delayacct_blkio_start();
4896 atomic_inc(&rq->nr_iowait);
4897 blk_flush_plug(current);
4898 current->in_iowait = 1;
4899 schedule();
4900 current->in_iowait = 0;
4901 atomic_dec(&rq->nr_iowait);
4902 delayacct_blkio_end();
4903}
4904EXPORT_SYMBOL(io_schedule);
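/*
 * Illustrative sketch (not part of this file): callers waiting on block I/O
 * mark themselves uninterruptible and then call io_schedule(), so the sleep
 * is accounted as iowait rather than plain idle time:
 *
 *	set_current_state(TASK_UNINTERRUPTIBLE);
 *	io_schedule();
 */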
4905
4906long __sched io_schedule_timeout(long timeout)
4907{
4908 struct rq *rq = raw_rq();
4909 long ret;
4910
4911 delayacct_blkio_start();
4912 atomic_inc(&rq->nr_iowait);
4913 blk_flush_plug(current);
4914 current->in_iowait = 1;
4915 ret = schedule_timeout(timeout);
4916 current->in_iowait = 0;
4917 atomic_dec(&rq->nr_iowait);
4918 delayacct_blkio_end();
4919 return ret;
4920}
4921
4922
4923
4924
4925
4926
4927
4928
4929SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4930{
4931 int ret = -EINVAL;
4932
4933 switch (policy) {
4934 case SCHED_FIFO:
4935 case SCHED_RR:
4936 ret = MAX_USER_RT_PRIO-1;
4937 break;
4938 case SCHED_NORMAL:
4939 case SCHED_BATCH:
4940 case SCHED_IDLE:
4941 ret = 0;
4942 break;
4943 }
4944 return ret;
4945}
4946
4947
4948
4949
4950
4951
4952
4953
4954SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4955{
4956 int ret = -EINVAL;
4957
4958 switch (policy) {
4959 case SCHED_FIFO:
4960 case SCHED_RR:
4961 ret = 1;
4962 break;
4963 case SCHED_NORMAL:
4964 case SCHED_BATCH:
4965 case SCHED_IDLE:
4966 ret = 0;
4967 }
4968 return ret;
4969}
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4980 struct timespec __user *, interval)
4981{
4982 struct task_struct *p;
4983 unsigned int time_slice;
4984 unsigned long flags;
4985 struct rq *rq;
4986 int retval;
4987 struct timespec t;
4988
4989 if (pid < 0)
4990 return -EINVAL;
4991
4992 retval = -ESRCH;
4993 rcu_read_lock();
4994 p = find_process_by_pid(pid);
4995 if (!p)
4996 goto out_unlock;
4997
4998 retval = security_task_getscheduler(p);
4999 if (retval)
5000 goto out_unlock;
5001
5002 rq = task_rq_lock(p, &flags);
5003 time_slice = p->sched_class->get_rr_interval(rq, p);
5004 task_rq_unlock(rq, p, &flags);
5005
5006 rcu_read_unlock();
5007 jiffies_to_timespec(time_slice, &t);
5008 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5009 return retval;
5010
5011out_unlock:
5012 rcu_read_unlock();
5013 return retval;
5014}
5015
5016static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5017
5018void sched_show_task(struct task_struct *p)
5019{
5020 unsigned long free = 0;
5021 unsigned state;
5022
5023 state = p->state ? __ffs(p->state) + 1 : 0;
5024 printk(KERN_INFO "%-15.15s %c", p->comm,
5025 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5026#if BITS_PER_LONG == 32
5027 if (state == TASK_RUNNING)
5028 printk(KERN_CONT " running ");
5029 else
5030 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5031#else
5032 if (state == TASK_RUNNING)
5033 printk(KERN_CONT " running task ");
5034 else
5035 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5036#endif
5037#ifdef CONFIG_DEBUG_STACK_USAGE
5038 free = stack_not_used(p);
5039#endif
5040 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5041 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
5042 (unsigned long)task_thread_info(p)->flags);
5043
5044 show_stack(p, NULL);
5045}
5046
5047void show_state_filter(unsigned long state_filter)
5048{
5049 struct task_struct *g, *p;
5050
5051#if BITS_PER_LONG == 32
5052 printk(KERN_INFO
5053 " task PC stack pid father\n");
5054#else
5055 printk(KERN_INFO
5056 " task PC stack pid father\n");
5057#endif
5058 rcu_read_lock();
5059 do_each_thread(g, p) {
5060
5061
5062
5063
5064 touch_nmi_watchdog();
5065 if (!state_filter || (p->state & state_filter))
5066 sched_show_task(p);
5067 } while_each_thread(g, p);
5068
5069 touch_all_softlockup_watchdogs();
5070
5071#ifdef CONFIG_SCHED_DEBUG
5072 sysrq_sched_debug_show();
5073#endif
5074 rcu_read_unlock();
5075
5076
5077
5078 if (!state_filter)
5079 debug_show_all_locks();
5080}
5081
5082void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5083{
5084 idle->sched_class = &idle_sched_class;
5085}
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095void __cpuinit init_idle(struct task_struct *idle, int cpu)
5096{
5097 struct rq *rq = cpu_rq(cpu);
5098 unsigned long flags;
5099
5100 raw_spin_lock_irqsave(&rq->lock, flags);
5101
5102 __sched_fork(idle);
5103 idle->state = TASK_RUNNING;
5104 idle->se.exec_start = sched_clock();
5105
5106 do_set_cpus_allowed(idle, cpumask_of(cpu));
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117 rcu_read_lock();
5118 __set_task_cpu(idle, cpu);
5119 rcu_read_unlock();
5120
5121 rq->curr = rq->idle = idle;
5122#if defined(CONFIG_SMP)
5123 idle->on_cpu = 1;
5124#endif
5125 raw_spin_unlock_irqrestore(&rq->lock, flags);
5126
5127
5128 task_thread_info(idle)->preempt_count = 0;
5129
5130
5131
5132
5133 idle->sched_class = &idle_sched_class;
5134 ftrace_graph_init_idle_task(idle, cpu);
5135#if defined(CONFIG_SMP)
5136 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5137#endif
5138}
5139
5140#ifdef CONFIG_SMP
5141void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
5142{
5143 if (p->sched_class && p->sched_class->set_cpus_allowed)
5144 p->sched_class->set_cpus_allowed(p, new_mask);
5145
5146 cpumask_copy(&p->cpus_allowed, new_mask);
5147 p->nr_cpus_allowed = cpumask_weight(new_mask);
5148}
5149
/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) the stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) the stopper completes, stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
5173int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5174{
5175 unsigned long flags;
5176 struct rq *rq;
5177 unsigned int dest_cpu;
5178 int ret = 0;
5179
5180 rq = task_rq_lock(p, &flags);
5181
5182 if (cpumask_equal(&p->cpus_allowed, new_mask))
5183 goto out;
5184
5185 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5186 ret = -EINVAL;
5187 goto out;
5188 }
5189
5190 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5191 ret = -EINVAL;
5192 goto out;
5193 }
5194
5195 do_set_cpus_allowed(p, new_mask);
5196
5197
5198 if (cpumask_test_cpu(task_cpu(p), new_mask))
5199 goto out;
5200
5201 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5202 if (p->on_rq) {
5203 struct migration_arg arg = { p, dest_cpu };
5204
5205 task_rq_unlock(rq, p, &flags);
5206 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5207 tlb_migrate_finish(p->mm);
5208 return 0;
5209 }
5210out:
5211 task_rq_unlock(rq, p, &flags);
5212
5213 return ret;
5214}
5215EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
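/*
 * Illustrative sketch (not part of this file): pinning a kernel thread to a
 * single CPU is commonly done with a per-cpu mask:
 *
 *	set_cpus_allowed_ptr(task, cpumask_of(cpu));
 */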
5216
/*
 * Move (not current) task off this cpu, onto the destination cpu. We're
 * doing this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're attempting to
 * rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 *
 * Returns non-zero if the task was successfully migrated.
 */
5228static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5229{
5230 struct rq *rq_dest, *rq_src;
5231 int ret = 0;
5232
5233 if (unlikely(!cpu_active(dest_cpu)))
5234 return ret;
5235
5236 rq_src = cpu_rq(src_cpu);
5237 rq_dest = cpu_rq(dest_cpu);
5238
5239 raw_spin_lock(&p->pi_lock);
5240 double_rq_lock(rq_src, rq_dest);
5241
5242 if (task_cpu(p) != src_cpu)
5243 goto done;
5244
5245 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
5246 goto fail;
5247
5248
5249
5250
5251
5252 if (p->on_rq) {
5253 dequeue_task(rq_src, p, 0);
5254 set_task_cpu(p, dest_cpu);
5255 enqueue_task(rq_dest, p, 0);
5256 check_preempt_curr(rq_dest, p, 0);
5257 }
5258done:
5259 ret = 1;
5260fail:
5261 double_rq_unlock(rq_src, rq_dest);
5262 raw_spin_unlock(&p->pi_lock);
5263 return ret;
5264}
5265
/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping the thread off the CPU then
 * 'pushing' it onto another runqueue.
 */
5271static int migration_cpu_stop(void *data)
5272{
5273 struct migration_arg *arg = data;
5274
5275
5276
5277
5278
5279 local_irq_disable();
5280 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5281 local_irq_enable();
5282 return 0;
5283}
5284
5285#ifdef CONFIG_HOTPLUG_CPU
5286
/*
 * Ensure that the idle task is using init_mm right before its cpu goes
 * offline.
 */
5291void idle_task_exit(void)
5292{
5293 struct mm_struct *mm = current->active_mm;
5294
5295 BUG_ON(cpu_online(smp_processor_id()));
5296
5297 if (mm != &init_mm)
5298 switch_mm(mm, &init_mm, current);
5299 mmdrop(mm);
5300}
5301
5302
5303
5304
5305
5306
5307
5308
5309static void migrate_nr_uninterruptible(struct rq *rq_src)
5310{
5311 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5312
5313 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5314 rq_src->nr_uninterruptible = 0;
5315}
5316
5317
5318
5319
5320static void calc_global_load_remove(struct rq *rq)
5321{
5322 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5323 rq->calc_load_active = 0;
5324}
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334static void migrate_tasks(unsigned int dead_cpu)
5335{
5336 struct rq *rq = cpu_rq(dead_cpu);
5337 struct task_struct *next, *stop = rq->stop;
5338 int dest_cpu;
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349 rq->stop = NULL;
5350
5351
5352 unthrottle_offline_cfs_rqs(rq);
5353
5354 for ( ; ; ) {
5355
5356
5357
5358
5359 if (rq->nr_running == 1)
5360 break;
5361
5362 next = pick_next_task(rq);
5363 BUG_ON(!next);
5364 next->sched_class->put_prev_task(rq, next);
5365
5366
5367 dest_cpu = select_fallback_rq(dead_cpu, next);
5368 raw_spin_unlock(&rq->lock);
5369
5370 __migrate_task(next, dead_cpu, dest_cpu);
5371
5372 raw_spin_lock(&rq->lock);
5373 }
5374
5375 rq->stop = stop;
5376}
5377
5378#endif
5379
5380#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5381
5382static struct ctl_table sd_ctl_dir[] = {
5383 {
5384 .procname = "sched_domain",
5385 .mode = 0555,
5386 },
5387 {}
5388};
5389
5390static struct ctl_table sd_ctl_root[] = {
5391 {
5392 .procname = "kernel",
5393 .mode = 0555,
5394 .child = sd_ctl_dir,
5395 },
5396 {}
5397};
5398
5399static struct ctl_table *sd_alloc_ctl_entry(int n)
5400{
5401 struct ctl_table *entry =
5402 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5403
5404 return entry;
5405}
5406
5407static void sd_free_ctl_entry(struct ctl_table **tablep)
5408{
5409 struct ctl_table *entry;
5410
5411
5412
5413
5414
5415
5416
5417 for (entry = *tablep; entry->mode; entry++) {
5418 if (entry->child)
5419 sd_free_ctl_entry(&entry->child);
5420 if (entry->proc_handler == NULL)
5421 kfree(entry->procname);
5422 }
5423
5424 kfree(*tablep);
5425 *tablep = NULL;
5426}
5427
5428static void
5429set_table_entry(struct ctl_table *entry,
5430 const char *procname, void *data, int maxlen,
5431 umode_t mode, proc_handler *proc_handler)
5432{
5433 entry->procname = procname;
5434 entry->data = data;
5435 entry->maxlen = maxlen;
5436 entry->mode = mode;
5437 entry->proc_handler = proc_handler;
5438}
5439
5440static struct ctl_table *
5441sd_alloc_ctl_domain_table(struct sched_domain *sd)
5442{
5443 struct ctl_table *table = sd_alloc_ctl_entry(13);
5444
5445 if (table == NULL)
5446 return NULL;
5447
5448 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5449 sizeof(long), 0644, proc_doulongvec_minmax);
5450 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5451 sizeof(long), 0644, proc_doulongvec_minmax);
5452 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5453 sizeof(int), 0644, proc_dointvec_minmax);
5454 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5455 sizeof(int), 0644, proc_dointvec_minmax);
5456 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5457 sizeof(int), 0644, proc_dointvec_minmax);
5458 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5459 sizeof(int), 0644, proc_dointvec_minmax);
5460 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5461 sizeof(int), 0644, proc_dointvec_minmax);
5462 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5463 sizeof(int), 0644, proc_dointvec_minmax);
5464 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5465 sizeof(int), 0644, proc_dointvec_minmax);
5466 set_table_entry(&table[9], "cache_nice_tries",
5467 &sd->cache_nice_tries,
5468 sizeof(int), 0644, proc_dointvec_minmax);
5469 set_table_entry(&table[10], "flags", &sd->flags,
5470 sizeof(int), 0644, proc_dointvec_minmax);
5471 set_table_entry(&table[11], "name", sd->name,
5472 CORENAME_MAX_SIZE, 0444, proc_dostring);
5473
5474
5475 return table;
5476}
5477
5478static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5479{
5480 struct ctl_table *entry, *table;
5481 struct sched_domain *sd;
5482 int domain_num = 0, i;
5483 char buf[32];
5484
5485 for_each_domain(cpu, sd)
5486 domain_num++;
5487 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5488 if (table == NULL)
5489 return NULL;
5490
5491 i = 0;
5492 for_each_domain(cpu, sd) {
5493 snprintf(buf, 32, "domain%d", i);
5494 entry->procname = kstrdup(buf, GFP_KERNEL);
5495 entry->mode = 0555;
5496 entry->child = sd_alloc_ctl_domain_table(sd);
5497 entry++;
5498 i++;
5499 }
5500 return table;
5501}
5502
5503static struct ctl_table_header *sd_sysctl_header;
5504static void register_sched_domain_sysctl(void)
5505{
5506 int i, cpu_num = num_possible_cpus();
5507 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5508 char buf[32];
5509
5510 WARN_ON(sd_ctl_dir[0].child);
5511 sd_ctl_dir[0].child = entry;
5512
5513 if (entry == NULL)
5514 return;
5515
5516 for_each_possible_cpu(i) {
5517 snprintf(buf, 32, "cpu%d", i);
5518 entry->procname = kstrdup(buf, GFP_KERNEL);
5519 entry->mode = 0555;
5520 entry->child = sd_alloc_ctl_cpu_table(i);
5521 entry++;
5522 }
5523
5524 WARN_ON(sd_sysctl_header);
5525 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5526}
5527
5528
5529static void unregister_sched_domain_sysctl(void)
5530{
5531 if (sd_sysctl_header)
5532 unregister_sysctl_table(sd_sysctl_header);
5533 sd_sysctl_header = NULL;
5534 if (sd_ctl_dir[0].child)
5535 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5536}
5537#else
5538static void register_sched_domain_sysctl(void)
5539{
5540}
5541static void unregister_sched_domain_sysctl(void)
5542{
5543}
5544#endif
5545
5546static void set_rq_online(struct rq *rq)
5547{
5548 if (!rq->online) {
5549 const struct sched_class *class;
5550
5551 cpumask_set_cpu(rq->cpu, rq->rd->online);
5552 rq->online = 1;
5553
5554 for_each_class(class) {
5555 if (class->rq_online)
5556 class->rq_online(rq);
5557 }
5558 }
5559}
5560
5561static void set_rq_offline(struct rq *rq)
5562{
5563 if (rq->online) {
5564 const struct sched_class *class;
5565
5566 for_each_class(class) {
5567 if (class->rq_offline)
5568 class->rq_offline(rq);
5569 }
5570
5571 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5572 rq->online = 0;
5573 }
5574}
5575
5576
5577
5578
5579
5580static int __cpuinit
5581migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5582{
5583 int cpu = (long)hcpu;
5584 unsigned long flags;
5585 struct rq *rq = cpu_rq(cpu);
5586
5587 switch (action & ~CPU_TASKS_FROZEN) {
5588
5589 case CPU_UP_PREPARE:
5590 rq->calc_load_update = calc_load_update;
5591 break;
5592
5593 case CPU_ONLINE:
5594
5595 raw_spin_lock_irqsave(&rq->lock, flags);
5596 if (rq->rd) {
5597 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5598
5599 set_rq_online(rq);
5600 }
5601 raw_spin_unlock_irqrestore(&rq->lock, flags);
5602 break;
5603
5604#ifdef CONFIG_HOTPLUG_CPU
5605 case CPU_DYING:
5606 sched_ttwu_pending();
5607
5608 raw_spin_lock_irqsave(&rq->lock, flags);
5609 if (rq->rd) {
5610 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5611 set_rq_offline(rq);
5612 }
5613 migrate_tasks(cpu);
5614 BUG_ON(rq->nr_running != 1);
5615 raw_spin_unlock_irqrestore(&rq->lock, flags);
5616
5617 migrate_nr_uninterruptible(rq);
5618 calc_global_load_remove(rq);
5619 break;
5620#endif
5621 }
5622
5623 update_max_interval();
5624
5625 return NOTIFY_OK;
5626}
5627
5628
5629
5630
5631
5632
5633static struct notifier_block __cpuinitdata migration_notifier = {
5634 .notifier_call = migration_call,
5635 .priority = CPU_PRI_MIGRATION,
5636};
5637
5638static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5639 unsigned long action, void *hcpu)
5640{
5641 switch (action & ~CPU_TASKS_FROZEN) {
5642 case CPU_STARTING:
5643 case CPU_DOWN_FAILED:
5644 set_cpu_active((long)hcpu, true);
5645 return NOTIFY_OK;
5646 default:
5647 return NOTIFY_DONE;
5648 }
5649}
5650
5651static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5652 unsigned long action, void *hcpu)
5653{
5654 switch (action & ~CPU_TASKS_FROZEN) {
5655 case CPU_DOWN_PREPARE:
5656 set_cpu_active((long)hcpu, false);
5657 return NOTIFY_OK;
5658 default:
5659 return NOTIFY_DONE;
5660 }
5661}
5662
5663static int __init migration_init(void)
5664{
5665 void *cpu = (void *)(long)smp_processor_id();
5666 int err;
5667
5668
5669 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5670 BUG_ON(err == NOTIFY_BAD);
5671 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5672 register_cpu_notifier(&migration_notifier);
5673
5674
5675 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5676 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5677
5678 return 0;
5679}
5680early_initcall(migration_init);
5681#endif
5682
5683#ifdef CONFIG_SMP
5684
5685static cpumask_var_t sched_domains_tmpmask;
5686
5687#ifdef CONFIG_SCHED_DEBUG
5688
5689static __read_mostly int sched_debug_enabled;
5690
5691static int __init sched_debug_setup(char *str)
5692{
5693 sched_debug_enabled = 1;
5694
5695 return 0;
5696}
5697early_param("sched_debug", sched_debug_setup);
5698
5699static inline bool sched_debug(void)
5700{
5701 return sched_debug_enabled;
5702}
5703
5704static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5705 struct cpumask *groupmask)
5706{
5707 struct sched_group *group = sd->groups;
5708 char str[256];
5709
5710 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5711 cpumask_clear(groupmask);
5712
5713 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5714
5715 if (!(sd->flags & SD_LOAD_BALANCE)) {
5716 printk("does not load-balance\n");
5717 if (sd->parent)
5718 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5719 " has parent");
5720 return -1;
5721 }
5722
5723 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5724
5725 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5726 printk(KERN_ERR "ERROR: domain->span does not contain "
5727 "CPU%d\n", cpu);
5728 }
5729 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5730 printk(KERN_ERR "ERROR: domain->groups does not contain"
5731 " CPU%d\n", cpu);
5732 }
5733
5734 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5735 do {
5736 if (!group) {
5737 printk("\n");
5738 printk(KERN_ERR "ERROR: group is NULL\n");
5739 break;
5740 }
5741
5742
5743
5744
5745
5746
5747 if (!group->sgp->power_orig) {
5748 printk(KERN_CONT "\n");
5749 printk(KERN_ERR "ERROR: domain->cpu_power not "
5750 "set\n");
5751 break;
5752 }
5753
5754 if (!cpumask_weight(sched_group_cpus(group))) {
5755 printk(KERN_CONT "\n");
5756 printk(KERN_ERR "ERROR: empty group\n");
5757 break;
5758 }
5759
5760 if (!(sd->flags & SD_OVERLAP) &&
5761 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5762 printk(KERN_CONT "\n");
5763 printk(KERN_ERR "ERROR: repeated CPUs\n");
5764 break;
5765 }
5766
5767 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5768
5769 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5770
5771 printk(KERN_CONT " %s", str);
5772 if (group->sgp->power != SCHED_POWER_SCALE) {
5773 printk(KERN_CONT " (cpu_power = %d)",
5774 group->sgp->power);
5775 }
5776
5777 group = group->next;
5778 } while (group != sd->groups);
5779 printk(KERN_CONT "\n");
5780
5781 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5782 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5783
5784 if (sd->parent &&
5785 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5786 printk(KERN_ERR "ERROR: parent span is not a superset "
5787 "of domain->span\n");
5788 return 0;
5789}
5790
5791static void sched_domain_debug(struct sched_domain *sd, int cpu)
5792{
5793 int level = 0;
5794
5795 if (!sched_debug_enabled)
5796 return;
5797
5798 if (!sd) {
5799 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5800 return;
5801 }
5802
5803 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5804
5805 for (;;) {
5806 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5807 break;
5808 level++;
5809 sd = sd->parent;
5810 if (!sd)
5811 break;
5812 }
5813}
5814#else
5815# define sched_domain_debug(sd, cpu) do { } while (0)
5816static inline bool sched_debug(void)
5817{
5818 return false;
5819}
5820#endif
5821
5822static int sd_degenerate(struct sched_domain *sd)
5823{
5824 if (cpumask_weight(sched_domain_span(sd)) == 1)
5825 return 1;
5826
5827
5828 if (sd->flags & (SD_LOAD_BALANCE |
5829 SD_BALANCE_NEWIDLE |
5830 SD_BALANCE_FORK |
5831 SD_BALANCE_EXEC |
5832 SD_SHARE_CPUPOWER |
5833 SD_SHARE_PKG_RESOURCES)) {
5834 if (sd->groups != sd->groups->next)
5835 return 0;
5836 }
5837
5838
5839 if (sd->flags & (SD_WAKE_AFFINE))
5840 return 0;
5841
5842 return 1;
5843}
5844
5845static int
5846sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5847{
5848 unsigned long cflags = sd->flags, pflags = parent->flags;
5849
5850 if (sd_degenerate(parent))
5851 return 1;
5852
5853 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5854 return 0;
5855
5856
5857 if (parent->groups == parent->groups->next) {
5858 pflags &= ~(SD_LOAD_BALANCE |
5859 SD_BALANCE_NEWIDLE |
5860 SD_BALANCE_FORK |
5861 SD_BALANCE_EXEC |
5862 SD_SHARE_CPUPOWER |
5863 SD_SHARE_PKG_RESOURCES);
5864 if (nr_node_ids == 1)
5865 pflags &= ~SD_SERIALIZE;
5866 }
5867 if (~cflags & pflags)
5868 return 0;
5869
5870 return 1;
5871}
5872
5873static void free_rootdomain(struct rcu_head *rcu)
5874{
5875 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5876
5877 cpupri_cleanup(&rd->cpupri);
5878 free_cpumask_var(rd->rto_mask);
5879 free_cpumask_var(rd->online);
5880 free_cpumask_var(rd->span);
5881 kfree(rd);
5882}
5883
5884static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5885{
5886 struct root_domain *old_rd = NULL;
5887 unsigned long flags;
5888
5889 raw_spin_lock_irqsave(&rq->lock, flags);
5890
5891 if (rq->rd) {
5892 old_rd = rq->rd;
5893
5894 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5895 set_rq_offline(rq);
5896
5897 cpumask_clear_cpu(rq->cpu, old_rd->span);
5898
5899
5900
5901
5902
5903
5904 if (!atomic_dec_and_test(&old_rd->refcount))
5905 old_rd = NULL;
5906 }
5907
5908 atomic_inc(&rd->refcount);
5909 rq->rd = rd;
5910
5911 cpumask_set_cpu(rq->cpu, rd->span);
5912 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5913 set_rq_online(rq);
5914
5915 raw_spin_unlock_irqrestore(&rq->lock, flags);
5916
5917 if (old_rd)
5918 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5919}
5920
5921static int init_rootdomain(struct root_domain *rd)
5922{
5923 memset(rd, 0, sizeof(*rd));
5924
5925 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5926 goto out;
5927 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5928 goto free_span;
5929 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5930 goto free_online;
5931
5932 if (cpupri_init(&rd->cpupri) != 0)
5933 goto free_rto_mask;
5934 return 0;
5935
5936free_rto_mask:
5937 free_cpumask_var(rd->rto_mask);
5938free_online:
5939 free_cpumask_var(rd->online);
5940free_span:
5941 free_cpumask_var(rd->span);
5942out:
5943 return -ENOMEM;
5944}
5945
5946
5947
5948
5949
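/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */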
5950struct root_domain def_root_domain;
5951
5952static void init_defrootdomain(void)
5953{
5954 init_rootdomain(&def_root_domain);
5955
5956 atomic_set(&def_root_domain.refcount, 1);
5957}
5958
5959static struct root_domain *alloc_rootdomain(void)
5960{
5961 struct root_domain *rd;
5962
5963 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5964 if (!rd)
5965 return NULL;
5966
5967 if (init_rootdomain(rd) != 0) {
5968 kfree(rd);
5969 return NULL;
5970 }
5971
5972 return rd;
5973}
5974
5975static void free_sched_groups(struct sched_group *sg, int free_sgp)
5976{
5977 struct sched_group *tmp, *first;
5978
5979 if (!sg)
5980 return;
5981
5982 first = sg;
5983 do {
5984 tmp = sg->next;
5985
5986 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5987 kfree(sg->sgp);
5988
5989 kfree(sg);
5990 sg = tmp;
5991 } while (sg != first);
5992}
5993
5994static void free_sched_domain(struct rcu_head *rcu)
5995{
5996 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5997
5998
5999
6000
6001
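	/*
	 * If it's an overlapping domain it has private groups, iterate and
	 * free them all.
	 */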
6002 if (sd->flags & SD_OVERLAP) {
6003 free_sched_groups(sd->groups, 1);
6004 } else if (atomic_dec_and_test(&sd->groups->ref)) {
6005 kfree(sd->groups->sgp);
6006 kfree(sd->groups);
6007 }
6008 kfree(sd);
6009}
6010
6011static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6012{
6013 call_rcu(&sd->rcu, free_sched_domain);
6014}
6015
6016static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6017{
6018 for (; sd; sd = sd->parent)
6019 destroy_sched_domain(sd, cpu);
6020}
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
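/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this avoids some
 * pointer chasing in select_idle_sibling().
 *
 * Also keep a unique ID per domain (the first cpu number in the cpumask of
 * the domain), which allows a quick check whether two cpus share a cache
 * domain, see cpus_share_cache().
 */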
6031DEFINE_PER_CPU(struct sched_domain *, sd_llc);
6032DEFINE_PER_CPU(int, sd_llc_id);
6033
6034static void update_top_cache_domain(int cpu)
6035{
6036 struct sched_domain *sd;
6037 int id = cpu;
6038
6039 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6040 if (sd)
6041 id = cpumask_first(sched_domain_span(sd));
6042
6043 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6044 per_cpu(sd_llc_id, cpu) = id;
6045}
6046
6047
6048
6049
6050
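/*
 * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
 * hold the hotplug lock.
 */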
6051static void
6052cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6053{
6054 struct rq *rq = cpu_rq(cpu);
6055 struct sched_domain *tmp;
6056
6057
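	/* Remove the sched domains which do not contribute to scheduling. */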
6058 for (tmp = sd; tmp; ) {
6059 struct sched_domain *parent = tmp->parent;
6060 if (!parent)
6061 break;
6062
6063 if (sd_parent_degenerate(tmp, parent)) {
6064 tmp->parent = parent->parent;
6065 if (parent->parent)
6066 parent->parent->child = tmp;
6067 destroy_sched_domain(parent, cpu);
6068 } else
6069 tmp = tmp->parent;
6070 }
6071
6072 if (sd && sd_degenerate(sd)) {
6073 tmp = sd;
6074 sd = sd->parent;
6075 destroy_sched_domain(tmp, cpu);
6076 if (sd)
6077 sd->child = NULL;
6078 }
6079
6080 sched_domain_debug(sd, cpu);
6081
6082 rq_attach_root(rq, rd);
6083 tmp = rq->sd;
6084 rcu_assign_pointer(rq->sd, sd);
6085 destroy_sched_domains(tmp, cpu);
6086
6087 update_top_cache_domain(cpu);
6088}
6089
6090
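/* cpus with isolated domains */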
6091static cpumask_var_t cpu_isolated_map;
6092
6093
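/* Setup the mask of cpus configured for isolated domains */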
6094static int __init isolated_cpu_setup(char *str)
6095{
6096 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6097 cpulist_parse(str, cpu_isolated_map);
6098 return 1;
6099}
6100
6101__setup("isolcpus=", isolated_cpu_setup);
6102
6103static const struct cpumask *cpu_cpu_mask(int cpu)
6104{
6105 return cpumask_of_node(cpu_to_node(cpu));
6106}
6107
6108struct sd_data {
6109 struct sched_domain **__percpu sd;
6110 struct sched_group **__percpu sg;
6111 struct sched_group_power **__percpu sgp;
6112};
6113
6114struct s_data {
6115 struct sched_domain ** __percpu sd;
6116 struct root_domain *rd;
6117};
6118
6119enum s_alloc {
6120 sa_rootdomain,
6121 sa_sd,
6122 sa_sd_storage,
6123 sa_none,
6124};
6125
6126struct sched_domain_topology_level;
6127
6128typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6129typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6130
6131#define SDTL_OVERLAP 0x01
6132
6133struct sched_domain_topology_level {
6134 sched_domain_init_f init;
6135 sched_domain_mask_f mask;
6136 int flags;
6137 int numa_level;
6138 struct sd_data data;
6139};
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
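/*
 * Build the iteration mask for a group: the cpus in the domain's span whose
 * own per-cpu domain at this level actually contains them.  Asymmetric
 * (overlapping) setups can leave some cpus out; group_balance_cpu() below
 * relies on this mask.
 */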
6154static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6155{
6156 const struct cpumask *span = sched_domain_span(sd);
6157 struct sd_data *sdd = sd->private;
6158 struct sched_domain *sibling;
6159 int i;
6160
6161 for_each_cpu(i, span) {
6162 sibling = *per_cpu_ptr(sdd->sd, i);
6163 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6164 continue;
6165
6166 cpumask_set_cpu(i, sched_group_mask(sg));
6167 }
6168}
6169
6170
6171
6172
6173
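/*
 * Return the canonical balance cpu for this group, this is the first cpu
 * of this group that's also in the iteration mask.
 */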
6174int group_balance_cpu(struct sched_group *sg)
6175{
6176 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6177}
6178
6179static int
6180build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6181{
6182 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6183 const struct cpumask *span = sched_domain_span(sd);
6184 struct cpumask *covered = sched_domains_tmpmask;
6185 struct sd_data *sdd = sd->private;
6186 struct sched_domain *child;
6187 int i;
6188
6189 cpumask_clear(covered);
6190
6191 for_each_cpu(i, span) {
6192 struct cpumask *sg_span;
6193
6194 if (cpumask_test_cpu(i, covered))
6195 continue;
6196
6197 child = *per_cpu_ptr(sdd->sd, i);
6198
6199
6200 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6201 continue;
6202
6203 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6204 GFP_KERNEL, cpu_to_node(cpu));
6205
6206 if (!sg)
6207 goto fail;
6208
6209 sg_span = sched_group_cpus(sg);
6210 if (child->child) {
6211 child = child->child;
6212 cpumask_copy(sg_span, sched_domain_span(child));
6213 } else
6214 cpumask_set_cpu(i, sg_span);
6215
6216 cpumask_or(covered, covered, sg_span);
6217
6218 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6219 if (atomic_inc_return(&sg->sgp->ref) == 1)
6220 build_group_mask(sd, sg);
6221
6222
6223
6224
6225
6226
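		/*
		 * Initialize sgp->power such that even if we mess up the
		 * domains and no possible iteration will get us here, we won't
		 * trigger a divide by zero.
		 */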
6227 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6228
6229
6230
6231
6232
6233
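		/*
		 * Make sure the first group of this domain contains the
		 * canonical balance cpu. Otherwise the sched_domain iteration
		 * breaks. See update_group_power().
		 */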
6234 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6235 group_balance_cpu(sg) == cpu)
6236 groups = sg;
6237
6238 if (!first)
6239 first = sg;
6240 if (last)
6241 last->next = sg;
6242 last = sg;
6243 last->next = first;
6244 }
6245 sd->groups = groups;
6246
6247 return 0;
6248
6249fail:
6250 free_sched_groups(first, 0);
6251
6252 return -ENOMEM;
6253}
6254
6255static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6256{
6257 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6258 struct sched_domain *child = sd->child;
6259
6260 if (child)
6261 cpu = cpumask_first(sched_domain_span(child));
6262
6263 if (sg) {
6264 *sg = *per_cpu_ptr(sdd->sg, cpu);
6265 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
6266 atomic_set(&(*sg)->sgp->ref, 1);
6267 }
6268
6269 return cpu;
6270}
6271
6272
6273
6274
6275
6276
6277
6278
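/*
 * build_sched_groups will build a circular linked list of the groups
 * covered by the given span, and will set each group's ->cpumask correctly,
 * and ->cpu_power to zero.
 *
 * Assumes the sched_domain tree is fully constructed.
 */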
6279static int
6280build_sched_groups(struct sched_domain *sd, int cpu)
6281{
6282 struct sched_group *first = NULL, *last = NULL;
6283 struct sd_data *sdd = sd->private;
6284 const struct cpumask *span = sched_domain_span(sd);
6285 struct cpumask *covered;
6286 int i;
6287
6288 get_group(cpu, sdd, &sd->groups);
6289 atomic_inc(&sd->groups->ref);
6290
6291 if (cpu != cpumask_first(sched_domain_span(sd)))
6292 return 0;
6293
6294 lockdep_assert_held(&sched_domains_mutex);
6295 covered = sched_domains_tmpmask;
6296
6297 cpumask_clear(covered);
6298
6299 for_each_cpu(i, span) {
6300 struct sched_group *sg;
6301 int group = get_group(i, sdd, &sg);
6302 int j;
6303
6304 if (cpumask_test_cpu(i, covered))
6305 continue;
6306
6307 cpumask_clear(sched_group_cpus(sg));
6308 sg->sgp->power = 0;
6309 cpumask_setall(sched_group_mask(sg));
6310
6311 for_each_cpu(j, span) {
6312 if (get_group(j, sdd, NULL) != group)
6313 continue;
6314
6315 cpumask_set_cpu(j, covered);
6316 cpumask_set_cpu(j, sched_group_cpus(sg));
6317 }
6318
6319 if (!first)
6320 first = sg;
6321 if (last)
6322 last->next = sg;
6323 last = sg;
6324 }
6325 last->next = first;
6326
6327 return 0;
6328}
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
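/*
 * Initialize sched groups cpu_power.
 *
 * cpu_power indicates the capacity of a sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_power for all the groups in a sched domain will be the same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * the group having more cpu_power will pick up more load compared to the
 * group having less cpu_power.
 */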
6340static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6341{
6342 struct sched_group *sg = sd->groups;
6343
6344 WARN_ON(!sd || !sg);
6345
6346 do {
6347 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
6348 sg = sg->next;
6349 } while (sg != sd->groups);
6350
6351 if (cpu != group_balance_cpu(sg))
6352 return;
6353
6354 update_group_power(sd, cpu);
6355 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6356}
6357
6358int __weak arch_sd_sibling_asym_packing(void)
6359{
6360 return 0*SD_ASYM_PACKING;
6361}
6362
6363
6364
6365
6366
6367
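/*
 * Initializers for sched domains.  Non-inlined to reduce accumulated
 * stack pressure in build_sched_domains().
 */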
6368#ifdef CONFIG_SCHED_DEBUG
6369# define SD_INIT_NAME(sd, type) sd->name = #type
6370#else
6371# define SD_INIT_NAME(sd, type) do { } while (0)
6372#endif
6373
6374#define SD_INIT_FUNC(type) \
6375static noinline struct sched_domain * \
6376sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6377{ \
6378 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
6379 *sd = SD_##type##_INIT; \
6380 SD_INIT_NAME(sd, type); \
6381 sd->private = &tl->data; \
6382 return sd; \
6383}
6384
6385SD_INIT_FUNC(CPU)
6386#ifdef CONFIG_SCHED_SMT
6387 SD_INIT_FUNC(SIBLING)
6388#endif
6389#ifdef CONFIG_SCHED_MC
6390 SD_INIT_FUNC(MC)
6391#endif
6392#ifdef CONFIG_SCHED_BOOK
6393 SD_INIT_FUNC(BOOK)
6394#endif
6395
6396static int default_relax_domain_level = -1;
6397int sched_domain_level_max;
6398
6399static int __init setup_relax_domain_level(char *str)
6400{
6401 if (kstrtoint(str, 0, &default_relax_domain_level))
6402 pr_warn("Unable to set relax_domain_level\n");
6403
6404 return 1;
6405}
6406__setup("relax_domain_level=", setup_relax_domain_level);
6407
6408static void set_domain_attribute(struct sched_domain *sd,
6409 struct sched_domain_attr *attr)
6410{
6411 int request;
6412
6413 if (!attr || attr->relax_domain_level < 0) {
6414 if (default_relax_domain_level < 0)
6415 return;
6416 else
6417 request = default_relax_domain_level;
6418 } else
6419 request = attr->relax_domain_level;
6420 if (request < sd->level) {
6421
6422 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6423 } else {
6424
6425 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6426 }
6427}
6428
6429static void __sdt_free(const struct cpumask *cpu_map);
6430static int __sdt_alloc(const struct cpumask *cpu_map);
6431
6432static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6433 const struct cpumask *cpu_map)
6434{
6435 switch (what) {
6436 case sa_rootdomain:
6437 if (!atomic_read(&d->rd->refcount))
6438 free_rootdomain(&d->rd->rcu);
6439 case sa_sd:
6440 free_percpu(d->sd);
6441 case sa_sd_storage:
6442 __sdt_free(cpu_map);
6443 case sa_none:
6444 break;
6445 }
6446}
6447
6448static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6449 const struct cpumask *cpu_map)
6450{
6451 memset(d, 0, sizeof(*d));
6452
6453 if (__sdt_alloc(cpu_map))
6454 return sa_sd_storage;
6455 d->sd = alloc_percpu(struct sched_domain *);
6456 if (!d->sd)
6457 return sa_sd_storage;
6458 d->rd = alloc_rootdomain();
6459 if (!d->rd)
6460 return sa_sd;
6461 return sa_rootdomain;
6462}
6463
6464
6465
6466
6467
6468
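/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */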
6469static void claim_allocations(int cpu, struct sched_domain *sd)
6470{
6471 struct sd_data *sdd = sd->private;
6472
6473 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6474 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6475
6476 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6477 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6478
6479 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
6480 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
6481}
6482
6483#ifdef CONFIG_SCHED_SMT
6484static const struct cpumask *cpu_smt_mask(int cpu)
6485{
6486 return topology_thread_cpumask(cpu);
6487}
6488#endif
6489
6490
6491
6492
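/* Topology list, bottom-up. */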
6493static struct sched_domain_topology_level default_topology[] = {
6494#ifdef CONFIG_SCHED_SMT
6495 { sd_init_SIBLING, cpu_smt_mask, },
6496#endif
6497#ifdef CONFIG_SCHED_MC
6498 { sd_init_MC, cpu_coregroup_mask, },
6499#endif
6500#ifdef CONFIG_SCHED_BOOK
6501 { sd_init_BOOK, cpu_book_mask, },
6502#endif
6503 { sd_init_CPU, cpu_cpu_mask, },
6504 { NULL, },
6505};
6506
6507static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6508
6509#ifdef CONFIG_NUMA
6510
6511static int sched_domains_numa_levels;
6512static int *sched_domains_numa_distance;
6513static struct cpumask ***sched_domains_numa_masks;
6514static int sched_domains_curr_level;
6515
6516static inline int sd_local_flags(int level)
6517{
6518 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6519 return 0;
6520
6521 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6522}
6523
6524static struct sched_domain *
6525sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6526{
6527 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6528 int level = tl->numa_level;
6529 int sd_weight = cpumask_weight(
6530 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6531
6532 *sd = (struct sched_domain){
6533 .min_interval = sd_weight,
6534 .max_interval = 2*sd_weight,
6535 .busy_factor = 32,
6536 .imbalance_pct = 125,
6537 .cache_nice_tries = 2,
6538 .busy_idx = 3,
6539 .idle_idx = 2,
6540 .newidle_idx = 0,
6541 .wake_idx = 0,
6542 .forkexec_idx = 0,
6543
6544 .flags = 1*SD_LOAD_BALANCE
6545 | 1*SD_BALANCE_NEWIDLE
6546 | 0*SD_BALANCE_EXEC
6547 | 0*SD_BALANCE_FORK
6548 | 0*SD_BALANCE_WAKE
6549 | 0*SD_WAKE_AFFINE
6550 | 0*SD_PREFER_LOCAL
6551 | 0*SD_SHARE_CPUPOWER
6552 | 0*SD_SHARE_PKG_RESOURCES
6553 | 1*SD_SERIALIZE
6554 | 0*SD_PREFER_SIBLING
6555 | sd_local_flags(level)
6556 ,
6557 .last_balance = jiffies,
6558 .balance_interval = sd_weight,
6559 };
6560 SD_INIT_NAME(sd, NUMA);
6561 sd->private = &tl->data;
6562
6563
6564
6565
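	/*
	 * Communicate this level to sd_numa_mask() via the file-scope
	 * sched_domains_curr_level, since the mask callback only takes a cpu.
	 */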
6566 sched_domains_curr_level = tl->numa_level;
6567
6568 return sd;
6569}
6570
6571static const struct cpumask *sd_numa_mask(int cpu)
6572{
6573 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6574}
6575
6576static void sched_numa_warn(const char *str)
6577{
6578 static int done = false;
6579	int i, j;
6580
6581 if (done)
6582 return;
6583
6584 done = true;
6585
6586 printk(KERN_WARNING "ERROR: %s\n\n", str);
6587
6588 for (i = 0; i < nr_node_ids; i++) {
6589 printk(KERN_WARNING " ");
6590 for (j = 0; j < nr_node_ids; j++)
6591 printk(KERN_CONT "%02d ", node_distance(i,j));
6592 printk(KERN_CONT "\n");
6593 }
6594 printk(KERN_WARNING "\n");
6595}
6596
6597static bool find_numa_distance(int distance)
6598{
6599 int i;
6600
6601 if (distance == node_distance(0, 0))
6602 return true;
6603
6604 for (i = 0; i < sched_domains_numa_levels; i++) {
6605 if (sched_domains_numa_distance[i] == distance)
6606 return true;
6607 }
6608
6609 return false;
6610}
6611
6612static void sched_init_numa(void)
6613{
6614 int next_distance, curr_distance = node_distance(0, 0);
6615 struct sched_domain_topology_level *tl;
6616 int level = 0;
6617 int i, j, k;
6618
6619 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6620 if (!sched_domains_numa_distance)
6621 return;
6622
6623
6624
6625
6626
6627
6628
6629
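	/*
	 * Walk the node_distance() table and collect the unique distances,
	 * in increasing order, into sched_domains_numa_distance[]; each
	 * unique distance becomes one NUMA topology level.
	 */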
6630 next_distance = curr_distance;
6631 for (i = 0; i < nr_node_ids; i++) {
6632 for (j = 0; j < nr_node_ids; j++) {
6633 for (k = 0; k < nr_node_ids; k++) {
6634 int distance = node_distance(i, k);
6635
6636 if (distance > curr_distance &&
6637 (distance < next_distance ||
6638 next_distance == curr_distance))
6639 next_distance = distance;
6640
6641
6642
6643
6644
6645
6646 if (sched_debug() && node_distance(k, i) != distance)
6647 sched_numa_warn("Node-distance not symmetric");
6648
6649 if (sched_debug() && i && !find_numa_distance(distance))
6650 sched_numa_warn("Node-0 not representative");
6651 }
6652 if (next_distance != curr_distance) {
6653 sched_domains_numa_distance[level++] = next_distance;
6654 sched_domains_numa_levels = level;
6655 curr_distance = next_distance;
6656 } else break;
6657 }
6658
6659
6660
6661
6662 if (!sched_debug())
6663 break;
6664 }
6665
6666
6667
6668
6669
6670
6671
6672
6673 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6674 if (!sched_domains_numa_masks)
6675 return;
6676
6677
6678
6679
6680
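	/*
	 * For each level, build one mask per node containing all cpus of
	 * nodes that are no further away than that level's distance.
	 */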
6681 for (i = 0; i < level; i++) {
6682 sched_domains_numa_masks[i] =
6683 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6684 if (!sched_domains_numa_masks[i])
6685 return;
6686
6687 for (j = 0; j < nr_node_ids; j++) {
6688 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6689 if (!mask)
6690 return;
6691
6692 sched_domains_numa_masks[i][j] = mask;
6693
6694 for (k = 0; k < nr_node_ids; k++) {
6695 if (node_distance(j, k) > sched_domains_numa_distance[i])
6696 continue;
6697
6698 cpumask_or(mask, mask, cpumask_of_node(k));
6699 }
6700 }
6701 }
6702
6703 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6704 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6705 if (!tl)
6706 return;
6707
6708
6709
6710
6711 for (i = 0; default_topology[i].init; i++)
6712 tl[i] = default_topology[i];
6713
6714
6715
6716
6717 for (j = 0; j < level; i++, j++) {
6718 tl[i] = (struct sched_domain_topology_level){
6719 .init = sd_numa_init,
6720 .mask = sd_numa_mask,
6721 .flags = SDTL_OVERLAP,
6722 .numa_level = j,
6723 };
6724 }
6725
6726 sched_domain_topology = tl;
6727}
6728#else
6729static inline void sched_init_numa(void)
6730{
6731}
6732#endif
6733
6734static int __sdt_alloc(const struct cpumask *cpu_map)
6735{
6736 struct sched_domain_topology_level *tl;
6737 int j;
6738
6739 for (tl = sched_domain_topology; tl->init; tl++) {
6740 struct sd_data *sdd = &tl->data;
6741
6742 sdd->sd = alloc_percpu(struct sched_domain *);
6743 if (!sdd->sd)
6744 return -ENOMEM;
6745
6746 sdd->sg = alloc_percpu(struct sched_group *);
6747 if (!sdd->sg)
6748 return -ENOMEM;
6749
6750 sdd->sgp = alloc_percpu(struct sched_group_power *);
6751 if (!sdd->sgp)
6752 return -ENOMEM;
6753
6754 for_each_cpu(j, cpu_map) {
6755 struct sched_domain *sd;
6756 struct sched_group *sg;
6757 struct sched_group_power *sgp;
6758
6759 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6760 GFP_KERNEL, cpu_to_node(j));
6761 if (!sd)
6762 return -ENOMEM;
6763
6764 *per_cpu_ptr(sdd->sd, j) = sd;
6765
6766 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6767 GFP_KERNEL, cpu_to_node(j));
6768 if (!sg)
6769 return -ENOMEM;
6770
6771 sg->next = sg;
6772
6773 *per_cpu_ptr(sdd->sg, j) = sg;
6774
6775 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6776 GFP_KERNEL, cpu_to_node(j));
6777 if (!sgp)
6778 return -ENOMEM;
6779
6780 *per_cpu_ptr(sdd->sgp, j) = sgp;
6781 }
6782 }
6783
6784 return 0;
6785}
6786
6787static void __sdt_free(const struct cpumask *cpu_map)
6788{
6789 struct sched_domain_topology_level *tl;
6790 int j;
6791
6792 for (tl = sched_domain_topology; tl->init; tl++) {
6793 struct sd_data *sdd = &tl->data;
6794
6795 for_each_cpu(j, cpu_map) {
6796 struct sched_domain *sd;
6797
6798 if (sdd->sd) {
6799 sd = *per_cpu_ptr(sdd->sd, j);
6800 if (sd && (sd->flags & SD_OVERLAP))
6801 free_sched_groups(sd->groups, 0);
6802 kfree(*per_cpu_ptr(sdd->sd, j));
6803 }
6804
6805 if (sdd->sg)
6806 kfree(*per_cpu_ptr(sdd->sg, j));
6807 if (sdd->sgp)
6808 kfree(*per_cpu_ptr(sdd->sgp, j));
6809 }
6810 free_percpu(sdd->sd);
6811 sdd->sd = NULL;
6812 free_percpu(sdd->sg);
6813 sdd->sg = NULL;
6814 free_percpu(sdd->sgp);
6815 sdd->sgp = NULL;
6816 }
6817}
6818
6819struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6820 struct s_data *d, const struct cpumask *cpu_map,
6821 struct sched_domain_attr *attr, struct sched_domain *child,
6822 int cpu)
6823{
6824 struct sched_domain *sd = tl->init(tl, cpu);
6825 if (!sd)
6826 return child;
6827
6828 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6829 if (child) {
6830 sd->level = child->level + 1;
6831 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6832 child->parent = sd;
6833 }
6834 sd->child = child;
6835 set_domain_attribute(sd, attr);
6836
6837 return sd;
6838}
6839
6840
6841
6842
6843
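/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus.
 */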
6844static int build_sched_domains(const struct cpumask *cpu_map,
6845 struct sched_domain_attr *attr)
6846{
6847 enum s_alloc alloc_state = sa_none;
6848 struct sched_domain *sd;
6849 struct s_data d;
6850 int i, ret = -ENOMEM;
6851
6852 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6853 if (alloc_state != sa_rootdomain)
6854 goto error;
6855
6856
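	/* Set up domains for cpus specified by the cpu_map. */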
6857 for_each_cpu(i, cpu_map) {
6858 struct sched_domain_topology_level *tl;
6859
6860 sd = NULL;
6861 for (tl = sched_domain_topology; tl->init; tl++) {
6862 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
6863 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6864 sd->flags |= SD_OVERLAP;
6865 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6866 break;
6867 }
6868
6869 while (sd->child)
6870 sd = sd->child;
6871
6872 *per_cpu_ptr(d.sd, i) = sd;
6873 }
6874
6875
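	/* Build the groups for the domains */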
6876 for_each_cpu(i, cpu_map) {
6877 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6878 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6879 if (sd->flags & SD_OVERLAP) {
6880 if (build_overlap_sched_groups(sd, i))
6881 goto error;
6882 } else {
6883 if (build_sched_groups(sd, i))
6884 goto error;
6885 }
6886 }
6887 }
6888
6889
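	/* Calculate CPU power for physical packages and nodes */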
6890 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6891 if (!cpumask_test_cpu(i, cpu_map))
6892 continue;
6893
6894 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6895 claim_allocations(i, sd);
6896 init_sched_groups_power(i, sd);
6897 }
6898 }
6899
6900
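	/* Attach the domains */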
6901 rcu_read_lock();
6902 for_each_cpu(i, cpu_map) {
6903 sd = *per_cpu_ptr(d.sd, i);
6904 cpu_attach_domain(sd, d.rd, i);
6905 }
6906 rcu_read_unlock();
6907
6908 ret = 0;
6909error:
6910 __free_domain_allocs(&d, alloc_state, cpu_map);
6911 return ret;
6912}
6913
6914static cpumask_var_t *doms_cur;
6915static int ndoms_cur;
6916static struct sched_domain_attr *dattr_cur;
6917
6918
6919
6920
6921
6922
6923
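/*
 * Special case: If a kmalloc() of a doms_cur partition (array of
 * cpumask_var_t) failed, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */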
6924static cpumask_var_t fallback_doms;
6925
6926
6927
6928
6929
6930
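/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * cpu core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */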
6931int __attribute__((weak)) arch_update_cpu_topology(void)
6932{
6933 return 0;
6934}
6935
6936cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6937{
6938 int i;
6939 cpumask_var_t *doms;
6940
6941 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6942 if (!doms)
6943 return NULL;
6944 for (i = 0; i < ndoms; i++) {
6945 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6946 free_sched_domains(doms, i);
6947 return NULL;
6948 }
6949 }
6950 return doms;
6951}
6952
6953void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6954{
6955 unsigned int i;
6956 for (i = 0; i < ndoms; i++)
6957 free_cpumask_var(doms[i]);
6958 kfree(doms);
6959}
6960
6961
6962
6963
6964
6965
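/*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
 * For now this just excludes isolated cpus, but could be used to
 * exclude other special cases in the future.
 */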
6966static int init_sched_domains(const struct cpumask *cpu_map)
6967{
6968 int err;
6969
6970 arch_update_cpu_topology();
6971 ndoms_cur = 1;
6972 doms_cur = alloc_sched_domains(ndoms_cur);
6973 if (!doms_cur)
6974 doms_cur = &fallback_doms;
6975 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6976 err = build_sched_domains(doms_cur[0], NULL);
6977 register_sched_domain_sysctl();
6978
6979 return err;
6980}
6981
6982
6983
6984
6985
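/*
 * Detach sched domains from a group of cpus specified in cpu_map.
 * These cpus will now be attached to the NULL domain.
 */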
6986static void detach_destroy_domains(const struct cpumask *cpu_map)
6987{
6988 int i;
6989
6990 rcu_read_lock();
6991 for_each_cpu(i, cpu_map)
6992 cpu_attach_domain(NULL, &def_root_domain, i);
6993 rcu_read_unlock();
6994}
6995
6996
6997static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6998 struct sched_domain_attr *new, int idx_new)
6999{
7000 struct sched_domain_attr tmp;
7001
7002
7003 if (!new && !cur)
7004 return 1;
7005
7006 tmp = SD_ATTR_INIT;
7007 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7008 new ? (new + idx_new) : &tmp,
7009 sizeof(struct sched_domain_attr));
7010}
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
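/*
 * Partition sched domains as specified by the 'ndoms_new' cpumasks in the
 * array doms_new[].  This compares doms_new[] to the current partitioning
 * doms_cur[], destroys the domains that went away and builds the new ones,
 * leaving untouched any partition present in both (unless the topology
 * changed).
 *
 * Passing doms_new == NULL reverts to a single default domain built from
 * cpu_active_mask minus cpu_isolated_map.
 *
 * Call with the hotplug lock held.
 */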
7038void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7039 struct sched_domain_attr *dattr_new)
7040{
7041 int i, j, n;
7042 int new_topology;
7043
7044 mutex_lock(&sched_domains_mutex);
7045
7046
7047 unregister_sched_domain_sysctl();
7048
7049
7050 new_topology = arch_update_cpu_topology();
7051
7052 n = doms_new ? ndoms_new : 0;
7053
7054
7055 for (i = 0; i < ndoms_cur; i++) {
7056 for (j = 0; j < n && !new_topology; j++) {
7057 if (cpumask_equal(doms_cur[i], doms_new[j])
7058 && dattrs_equal(dattr_cur, i, dattr_new, j))
7059 goto match1;
7060 }
7061
7062 detach_destroy_domains(doms_cur[i]);
7063match1:
7064 ;
7065 }
7066
7067 if (doms_new == NULL) {
7068 ndoms_cur = 0;
7069 doms_new = &fallback_doms;
7070 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7071 WARN_ON_ONCE(dattr_new);
7072 }
7073
7074
7075 for (i = 0; i < ndoms_new; i++) {
7076 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7077 if (cpumask_equal(doms_new[i], doms_cur[j])
7078 && dattrs_equal(dattr_new, i, dattr_cur, j))
7079 goto match2;
7080 }
7081
7082 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7083match2:
7084 ;
7085 }
7086
7087
7088 if (doms_cur != &fallback_doms)
7089 free_sched_domains(doms_cur, ndoms_cur);
7090 kfree(dattr_cur);
7091 doms_cur = doms_new;
7092 dattr_cur = dattr_new;
7093 ndoms_cur = ndoms_new;
7094
7095 register_sched_domain_sysctl();
7096
7097 mutex_unlock(&sched_domains_mutex);
7098}
7099
7100
7101
7102
7103
7104
7105static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7106 void *hcpu)
7107{
7108 switch (action & ~CPU_TASKS_FROZEN) {
7109 case CPU_ONLINE:
7110 case CPU_DOWN_FAILED:
7111 cpuset_update_active_cpus();
7112 return NOTIFY_OK;
7113 default:
7114 return NOTIFY_DONE;
7115 }
7116}
7117
7118static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7119 void *hcpu)
7120{
7121 switch (action & ~CPU_TASKS_FROZEN) {
7122 case CPU_DOWN_PREPARE:
7123 cpuset_update_active_cpus();
7124 return NOTIFY_OK;
7125 default:
7126 return NOTIFY_DONE;
7127 }
7128}
7129
7130void __init sched_init_smp(void)
7131{
7132 cpumask_var_t non_isolated_cpus;
7133
7134 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7135 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7136
7137 sched_init_numa();
7138
7139 get_online_cpus();
7140 mutex_lock(&sched_domains_mutex);
7141 init_sched_domains(cpu_active_mask);
7142 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7143 if (cpumask_empty(non_isolated_cpus))
7144 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7145 mutex_unlock(&sched_domains_mutex);
7146 put_online_cpus();
7147
7148 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7149 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7150
7151
7152 hotcpu_notifier(update_runtime, 0);
7153
7154 init_hrtick();
7155
7156
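	/* Move init over to a non-isolated CPU */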
7157 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
7158 BUG();
7159 sched_init_granularity();
7160 free_cpumask_var(non_isolated_cpus);
7161
7162 init_sched_rt_class();
7163}
7164#else
7165void __init sched_init_smp(void)
7166{
7167 sched_init_granularity();
7168}
7169#endif
7170
7171const_debug unsigned int sysctl_timer_migration = 1;
7172
7173int in_sched_functions(unsigned long addr)
7174{
7175 return in_lock_functions(addr) ||
7176 (addr >= (unsigned long)__sched_text_start
7177 && addr < (unsigned long)__sched_text_end);
7178}
7179
7180#ifdef CONFIG_CGROUP_SCHED
7181struct task_group root_task_group;
7182#endif
7183
7184DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
7185
7186void __init sched_init(void)
7187{
7188 int i, j;
7189 unsigned long alloc_size = 0, ptr;
7190
7191#ifdef CONFIG_FAIR_GROUP_SCHED
7192 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7193#endif
7194#ifdef CONFIG_RT_GROUP_SCHED
7195 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7196#endif
7197#ifdef CONFIG_CPUMASK_OFFSTACK
7198 alloc_size += num_possible_cpus() * cpumask_size();
7199#endif
7200 if (alloc_size) {
7201 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7202
7203#ifdef CONFIG_FAIR_GROUP_SCHED
7204 root_task_group.se = (struct sched_entity **)ptr;
7205 ptr += nr_cpu_ids * sizeof(void **);
7206
7207 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7208 ptr += nr_cpu_ids * sizeof(void **);
7209
7210#endif
7211#ifdef CONFIG_RT_GROUP_SCHED
7212 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7213 ptr += nr_cpu_ids * sizeof(void **);
7214
7215 root_task_group.rt_rq = (struct rt_rq **)ptr;
7216 ptr += nr_cpu_ids * sizeof(void **);
7217
7218#endif
7219#ifdef CONFIG_CPUMASK_OFFSTACK
7220 for_each_possible_cpu(i) {
7221 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
7222 ptr += cpumask_size();
7223 }
7224#endif
7225 }
7226
7227#ifdef CONFIG_SMP
7228 init_defrootdomain();
7229#endif
7230
7231 init_rt_bandwidth(&def_rt_bandwidth,
7232 global_rt_period(), global_rt_runtime());
7233
7234#ifdef CONFIG_RT_GROUP_SCHED
7235 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7236 global_rt_period(), global_rt_runtime());
7237#endif
7238
7239#ifdef CONFIG_CGROUP_SCHED
7240 list_add(&root_task_group.list, &task_groups);
7241 INIT_LIST_HEAD(&root_task_group.children);
7242 INIT_LIST_HEAD(&root_task_group.siblings);
7243 autogroup_init(&init_task);
7244
7245#endif
7246
7247#ifdef CONFIG_CGROUP_CPUACCT
7248 root_cpuacct.cpustat = &kernel_cpustat;
7249 root_cpuacct.cpuusage = alloc_percpu(u64);
7250
7251 BUG_ON(!root_cpuacct.cpuusage);
7252#endif
7253 for_each_possible_cpu(i) {
7254 struct rq *rq;
7255
7256 rq = cpu_rq(i);
7257 raw_spin_lock_init(&rq->lock);
7258 rq->nr_running = 0;
7259 rq->calc_load_active = 0;
7260 rq->calc_load_update = jiffies + LOAD_FREQ;
7261 init_cfs_rq(&rq->cfs);
7262 init_rt_rq(&rq->rt, rq);
7263#ifdef CONFIG_FAIR_GROUP_SCHED
7264 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7265 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
7286 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7287#endif
7288
7289 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7290#ifdef CONFIG_RT_GROUP_SCHED
7291 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7292 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7293#endif
7294
7295 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7296 rq->cpu_load[j] = 0;
7297
7298 rq->last_load_update_tick = jiffies;
7299
7300#ifdef CONFIG_SMP
7301 rq->sd = NULL;
7302 rq->rd = NULL;
7303 rq->cpu_power = SCHED_POWER_SCALE;
7304 rq->post_schedule = 0;
7305 rq->active_balance = 0;
7306 rq->next_balance = jiffies;
7307 rq->push_cpu = 0;
7308 rq->cpu = i;
7309 rq->online = 0;
7310 rq->idle_stamp = 0;
7311 rq->avg_idle = 2*sysctl_sched_migration_cost;
7312
7313 INIT_LIST_HEAD(&rq->cfs_tasks);
7314
7315 rq_attach_root(rq, &def_root_domain);
7316#ifdef CONFIG_NO_HZ
7317 rq->nohz_flags = 0;
7318#endif
7319#endif
7320 init_rq_hrtick(rq);
7321 atomic_set(&rq->nr_iowait, 0);
7322 }
7323
7324 set_load_weight(&init_task);
7325
7326#ifdef CONFIG_PREEMPT_NOTIFIERS
7327 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7328#endif
7329
7330#ifdef CONFIG_RT_MUTEXES
7331 plist_head_init(&init_task.pi_waiters);
7332#endif
7333
7334
7335
7336
7337 atomic_inc(&init_mm.mm_count);
7338 enter_lazy_tlb(&init_mm, current);
7339
7340
7341
7342
7343
7344
7345
7346 init_idle(current, smp_processor_id());
7347
7348 calc_load_update = jiffies + LOAD_FREQ;
7349
7350
7351
7352
7353 current->sched_class = &fair_sched_class;
7354
7355#ifdef CONFIG_SMP
7356 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7357
7358 if (cpu_isolated_map == NULL)
7359 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7360 idle_thread_set_boot_cpu();
7361#endif
7362 init_sched_fair_class();
7363
7364 scheduler_running = 1;
7365}
7366
7367#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7368static inline int preempt_count_equals(int preempt_offset)
7369{
7370 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7371
7372 return (nested == preempt_offset);
7373}
7374
7375void __might_sleep(const char *file, int line, int preempt_offset)
7376{
7377 static unsigned long prev_jiffy;
7378
7379 rcu_sleep_check();
7380 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7381 system_state != SYSTEM_RUNNING || oops_in_progress)
7382 return;
7383 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7384 return;
7385 prev_jiffy = jiffies;
7386
7387 printk(KERN_ERR
7388 "BUG: sleeping function called from invalid context at %s:%d\n",
7389 file, line);
7390 printk(KERN_ERR
7391 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7392 in_atomic(), irqs_disabled(),
7393 current->pid, current->comm);
7394
7395 debug_show_held_locks(current);
7396 if (irqs_disabled())
7397 print_irqtrace_events(current);
7398 dump_stack();
7399}
7400EXPORT_SYMBOL(__might_sleep);
7401#endif
7402
7403#ifdef CONFIG_MAGIC_SYSRQ
7404static void normalize_task(struct rq *rq, struct task_struct *p)
7405{
7406 const struct sched_class *prev_class = p->sched_class;
7407 int old_prio = p->prio;
7408 int on_rq;
7409
7410 on_rq = p->on_rq;
7411 if (on_rq)
7412 dequeue_task(rq, p, 0);
7413 __setscheduler(rq, p, SCHED_NORMAL, 0);
7414 if (on_rq) {
7415 enqueue_task(rq, p, 0);
7416 resched_task(rq->curr);
7417 }
7418
7419 check_class_changed(rq, p, prev_class, old_prio);
7420}
7421
7422void normalize_rt_tasks(void)
7423{
7424 struct task_struct *g, *p;
7425 unsigned long flags;
7426 struct rq *rq;
7427
7428 read_lock_irqsave(&tasklist_lock, flags);
7429 do_each_thread(g, p) {
7430
7431
7432
7433 if (!p->mm)
7434 continue;
7435
7436 p->se.exec_start = 0;
7437#ifdef CONFIG_SCHEDSTATS
7438 p->se.statistics.wait_start = 0;
7439 p->se.statistics.sleep_start = 0;
7440 p->se.statistics.block_start = 0;
7441#endif
7442
7443 if (!rt_task(p)) {
7444
7445
7446
7447
7448 if (TASK_NICE(p) < 0 && p->mm)
7449 set_user_nice(p, 0);
7450 continue;
7451 }
7452
7453 raw_spin_lock(&p->pi_lock);
7454 rq = __task_rq_lock(p);
7455
7456 normalize_task(rq, p);
7457
7458 __task_rq_unlock(rq);
7459 raw_spin_unlock(&p->pi_lock);
7460 } while_each_thread(g, p);
7461
7462 read_unlock_irqrestore(&tasklist_lock, flags);
7463}
7464
7465#endif
7466
7467#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484struct task_struct *curr_task(int cpu)
7485{
7486 return cpu_curr(cpu);
7487}
7488
7489#endif
7490
7491#ifdef CONFIG_IA64
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507void set_curr_task(int cpu, struct task_struct *p)
7508{
7509 cpu_curr(cpu) = p;
7510}
7511
7512#endif
7513
7514#ifdef CONFIG_CGROUP_SCHED
7515
7516static DEFINE_SPINLOCK(task_group_lock);
7517
7518static void free_sched_group(struct task_group *tg)
7519{
7520 free_fair_sched_group(tg);
7521 free_rt_sched_group(tg);
7522 autogroup_free(tg);
7523 kfree(tg);
7524}
7525
7526
7527struct task_group *sched_create_group(struct task_group *parent)
7528{
7529 struct task_group *tg;
7530 unsigned long flags;
7531
7532 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7533 if (!tg)
7534 return ERR_PTR(-ENOMEM);
7535
7536 if (!alloc_fair_sched_group(tg, parent))
7537 goto err;
7538
7539 if (!alloc_rt_sched_group(tg, parent))
7540 goto err;
7541
7542 spin_lock_irqsave(&task_group_lock, flags);
7543 list_add_rcu(&tg->list, &task_groups);
7544
7545 WARN_ON(!parent);
7546
7547 tg->parent = parent;
7548 INIT_LIST_HEAD(&tg->children);
7549 list_add_rcu(&tg->siblings, &parent->children);
7550 spin_unlock_irqrestore(&task_group_lock, flags);
7551
7552 return tg;
7553
7554err:
7555 free_sched_group(tg);
7556 return ERR_PTR(-ENOMEM);
7557}
7558
7559
7560static void free_sched_group_rcu(struct rcu_head *rhp)
7561{
7562
7563 free_sched_group(container_of(rhp, struct task_group, rcu));
7564}
7565
7566
7567void sched_destroy_group(struct task_group *tg)
7568{
7569 unsigned long flags;
7570 int i;
7571
7572
7573 for_each_possible_cpu(i)
7574 unregister_fair_sched_group(tg, i);
7575
7576 spin_lock_irqsave(&task_group_lock, flags);
7577 list_del_rcu(&tg->list);
7578 list_del_rcu(&tg->siblings);
7579 spin_unlock_irqrestore(&task_group_lock, flags);
7580
7581
7582 call_rcu(&tg->rcu, free_sched_group_rcu);
7583}
7584
7585
7586
7587
7588
7589
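/*
 * Change task's runqueue when it moves between groups.
 * The caller of this function should have put the task in its new group
 * by setscheduler() before calling this function.
 */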
7590void sched_move_task(struct task_struct *tsk)
7591{
7592 int on_rq, running;
7593 unsigned long flags;
7594 struct rq *rq;
7595
7596 rq = task_rq_lock(tsk, &flags);
7597
7598 running = task_current(rq, tsk);
7599 on_rq = tsk->on_rq;
7600
7601 if (on_rq)
7602 dequeue_task(rq, tsk, 0);
7603 if (unlikely(running))
7604 tsk->sched_class->put_prev_task(rq, tsk);
7605
7606#ifdef CONFIG_FAIR_GROUP_SCHED
7607 if (tsk->sched_class->task_move_group)
7608 tsk->sched_class->task_move_group(tsk, on_rq);
7609 else
7610#endif
7611 set_task_rq(tsk, task_cpu(tsk));
7612
7613 if (unlikely(running))
7614 tsk->sched_class->set_curr_task(rq);
7615 if (on_rq)
7616 enqueue_task(rq, tsk, 0);
7617
7618 task_rq_unlock(rq, tsk, &flags);
7619}
7620#endif
7621
7622#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
7623static unsigned long to_ratio(u64 period, u64 runtime)
7624{
7625 if (runtime == RUNTIME_INF)
7626 return 1ULL << 20;
7627
7628 return div64_u64(runtime << 20, period);
7629}
7630#endif
7631
7632#ifdef CONFIG_RT_GROUP_SCHED
7633
7634
7635
7636static DEFINE_MUTEX(rt_constraints_mutex);
7637
7638
7639static inline int tg_has_rt_tasks(struct task_group *tg)
7640{
7641 struct task_struct *g, *p;
7642
7643 do_each_thread(g, p) {
7644 if (rt_task(p) && task_rq(p)->rt.tg == tg)
7645 return 1;
7646 } while_each_thread(g, p);
7647
7648 return 0;
7649}
7650
7651struct rt_schedulable_data {
7652 struct task_group *tg;
7653 u64 rt_period;
7654 u64 rt_runtime;
7655};
7656
7657static int tg_rt_schedulable(struct task_group *tg, void *data)
7658{
7659 struct rt_schedulable_data *d = data;
7660 struct task_group *child;
7661 unsigned long total, sum = 0;
7662 u64 period, runtime;
7663
7664 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7665 runtime = tg->rt_bandwidth.rt_runtime;
7666
7667 if (tg == d->tg) {
7668 period = d->rt_period;
7669 runtime = d->rt_runtime;
7670 }
7671
7672
7673
7674
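	/* Cannot have more runtime than the period. */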
7675 if (runtime > period && runtime != RUNTIME_INF)
7676 return -EINVAL;
7677
7678
7679
7680
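	/* Ensure we don't starve existing RT tasks. */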
7681 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7682 return -EBUSY;
7683
7684 total = to_ratio(period, runtime);
7685
7686
7687
7688
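	/* Nobody can have more than the global setting allows. */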
7689 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7690 return -EINVAL;
7691
7692
7693
7694
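	/* The sum of our children's runtime should not exceed our own. */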
7695 list_for_each_entry_rcu(child, &tg->children, siblings) {
7696 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7697 runtime = child->rt_bandwidth.rt_runtime;
7698
7699 if (child == d->tg) {
7700 period = d->rt_period;
7701 runtime = d->rt_runtime;
7702 }
7703
7704 sum += to_ratio(period, runtime);
7705 }
7706
7707 if (sum > total)
7708 return -EINVAL;
7709
7710 return 0;
7711}
7712
7713static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7714{
7715 int ret;
7716
7717 struct rt_schedulable_data data = {
7718 .tg = tg,
7719 .rt_period = period,
7720 .rt_runtime = runtime,
7721 };
7722
7723 rcu_read_lock();
7724 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7725 rcu_read_unlock();
7726
7727 return ret;
7728}
7729
7730static int tg_set_rt_bandwidth(struct task_group *tg,
7731 u64 rt_period, u64 rt_runtime)
7732{
7733 int i, err = 0;
7734
7735 mutex_lock(&rt_constraints_mutex);
7736 read_lock(&tasklist_lock);
7737 err = __rt_schedulable(tg, rt_period, rt_runtime);
7738 if (err)
7739 goto unlock;
7740
7741 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7742 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7743 tg->rt_bandwidth.rt_runtime = rt_runtime;
7744
7745 for_each_possible_cpu(i) {
7746 struct rt_rq *rt_rq = tg->rt_rq[i];
7747
7748 raw_spin_lock(&rt_rq->rt_runtime_lock);
7749 rt_rq->rt_runtime = rt_runtime;
7750 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7751 }
7752 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7753unlock:
7754 read_unlock(&tasklist_lock);
7755 mutex_unlock(&rt_constraints_mutex);
7756
7757 return err;
7758}
7759
7760int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7761{
7762 u64 rt_runtime, rt_period;
7763
7764 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7765 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7766 if (rt_runtime_us < 0)
7767 rt_runtime = RUNTIME_INF;
7768
7769 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7770}
7771
7772long sched_group_rt_runtime(struct task_group *tg)
7773{
7774 u64 rt_runtime_us;
7775
7776 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7777 return -1;
7778
7779 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7780 do_div(rt_runtime_us, NSEC_PER_USEC);
7781 return rt_runtime_us;
7782}
7783
7784int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7785{
7786 u64 rt_runtime, rt_period;
7787
7788 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7789 rt_runtime = tg->rt_bandwidth.rt_runtime;
7790
7791 if (rt_period == 0)
7792 return -EINVAL;
7793
7794 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7795}
7796
7797long sched_group_rt_period(struct task_group *tg)
7798{
7799 u64 rt_period_us;
7800
7801 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7802 do_div(rt_period_us, NSEC_PER_USEC);
7803 return rt_period_us;
7804}
7805
7806static int sched_rt_global_constraints(void)
7807{
7808 u64 runtime, period;
7809 int ret = 0;
7810
7811 if (sysctl_sched_rt_period <= 0)
7812 return -EINVAL;
7813
7814 runtime = global_rt_runtime();
7815 period = global_rt_period();
7816
7817
7818
7819
7820 if (runtime > period && runtime != RUNTIME_INF)
7821 return -EINVAL;
7822
7823 mutex_lock(&rt_constraints_mutex);
7824 read_lock(&tasklist_lock);
7825 ret = __rt_schedulable(NULL, 0, 0);
7826 read_unlock(&tasklist_lock);
7827 mutex_unlock(&rt_constraints_mutex);
7828
7829 return ret;
7830}
7831
7832int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7833{
7834
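	/* Don't accept realtime tasks when there is no way for them to run */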
7835 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7836 return 0;
7837
7838 return 1;
7839}
7840
7841#else
7842static int sched_rt_global_constraints(void)
7843{
7844 unsigned long flags;
7845 int i;
7846
7847 if (sysctl_sched_rt_period <= 0)
7848 return -EINVAL;
7849
7850
7851
7852
7853
7854 if (sysctl_sched_rt_runtime == 0)
7855 return -EBUSY;
7856
7857 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7858 for_each_possible_cpu(i) {
7859 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7860
7861 raw_spin_lock(&rt_rq->rt_runtime_lock);
7862 rt_rq->rt_runtime = global_rt_runtime();
7863 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7864 }
7865 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7866
7867 return 0;
7868}
7869#endif
7870
7871int sched_rt_handler(struct ctl_table *table, int write,
7872 void __user *buffer, size_t *lenp,
7873 loff_t *ppos)
7874{
7875 int ret;
7876 int old_period, old_runtime;
7877 static DEFINE_MUTEX(mutex);
7878
7879 mutex_lock(&mutex);
7880 old_period = sysctl_sched_rt_period;
7881 old_runtime = sysctl_sched_rt_runtime;
7882
7883 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7884
7885 if (!ret && write) {
7886 ret = sched_rt_global_constraints();
7887 if (ret) {
7888 sysctl_sched_rt_period = old_period;
7889 sysctl_sched_rt_runtime = old_runtime;
7890 } else {
7891 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7892 def_rt_bandwidth.rt_period =
7893 ns_to_ktime(global_rt_period());
7894 }
7895 }
7896 mutex_unlock(&mutex);
7897
7898 return ret;
7899}
7900
7901#ifdef CONFIG_CGROUP_SCHED
7902
7903
7904static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7905{
7906 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7907 struct task_group, css);
7908}
7909
7910static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7911{
7912 struct task_group *tg, *parent;
7913
7914 if (!cgrp->parent) {
7915
7916 return &root_task_group.css;
7917 }
7918
7919 parent = cgroup_tg(cgrp->parent);
7920 tg = sched_create_group(parent);
7921 if (IS_ERR(tg))
7922 return ERR_PTR(-ENOMEM);
7923
7924 return &tg->css;
7925}
7926
7927static void cpu_cgroup_destroy(struct cgroup *cgrp)
7928{
7929 struct task_group *tg = cgroup_tg(cgrp);
7930
7931 sched_destroy_group(tg);
7932}
7933
7934static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7935 struct cgroup_taskset *tset)
7936{
7937 struct task_struct *task;
7938
7939 cgroup_taskset_for_each(task, cgrp, tset) {
7940#ifdef CONFIG_RT_GROUP_SCHED
7941 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7942 return -EINVAL;
7943#else
7944
7945 if (task->sched_class != &fair_sched_class)
7946 return -EINVAL;
7947#endif
7948 }
7949 return 0;
7950}
7951
7952static void cpu_cgroup_attach(struct cgroup *cgrp,
7953 struct cgroup_taskset *tset)
7954{
7955 struct task_struct *task;
7956
7957 cgroup_taskset_for_each(task, cgrp, tset)
7958 sched_move_task(task);
7959}
7960
7961static void
7962cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7963 struct task_struct *task)
7964{
7965
7966
7967
7968
7969
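	/*
	 * cgroup_exit() is called in the copy_process() failure path.
	 * Ignore this case since the task hasn't run yet; this avoids
	 * trying to poke a half-freed task state from generic code.
	 */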
7970 if (!(task->flags & PF_EXITING))
7971 return;
7972
7973 sched_move_task(task);
7974}
7975
7976#ifdef CONFIG_FAIR_GROUP_SCHED
7977static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7978 u64 shareval)
7979{
7980 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
7981}
7982
7983static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
7984{
7985 struct task_group *tg = cgroup_tg(cgrp);
7986
7987 return (u64) scale_load_down(tg->shares);
7988}
7989
7990#ifdef CONFIG_CFS_BANDWIDTH
7991static DEFINE_MUTEX(cfs_constraints_mutex);
7992
7993const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
7994const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
7995
7996static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7997
7998static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7999{
8000 int i, ret = 0, runtime_enabled, runtime_was_enabled;
8001 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8002
8003 if (tg == &root_task_group)
8004 return -EINVAL;
8005
8006
8007
8008
8009
8010
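	/*
	 * Ensure we have at least some bandwidth every period: a zero or
	 * tiny quota/period could leave throttled entities starved for
	 * arbitrarily long.
	 */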
8011 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
8012 return -EINVAL;
8013
8014
8015
8016
8017
8018
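	/*
	 * Likewise, bound the period from above; this also keeps the
	 * quota/period ratios sane for the feasibility computation below.
	 */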
8019 if (period > max_cfs_quota_period)
8020 return -EINVAL;
8021
8022 mutex_lock(&cfs_constraints_mutex);
8023 ret = __cfs_schedulable(tg, period, quota);
8024 if (ret)
8025 goto out_unlock;
8026
8027 runtime_enabled = quota != RUNTIME_INF;
8028 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
8029 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
8030 raw_spin_lock_irq(&cfs_b->lock);
8031 cfs_b->period = ns_to_ktime(period);
8032 cfs_b->quota = quota;
8033
8034 __refill_cfs_bandwidth_runtime(cfs_b);
8035
8036 if (runtime_enabled && cfs_b->timer_active) {
8037
8038 cfs_b->timer_active = 0;
8039 __start_cfs_bandwidth(cfs_b);
8040 }
8041 raw_spin_unlock_irq(&cfs_b->lock);
8042
8043 for_each_possible_cpu(i) {
8044 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
8045 struct rq *rq = cfs_rq->rq;
8046
8047 raw_spin_lock_irq(&rq->lock);
8048 cfs_rq->runtime_enabled = runtime_enabled;
8049 cfs_rq->runtime_remaining = 0;
8050
8051 if (cfs_rq->throttled)
8052 unthrottle_cfs_rq(cfs_rq);
8053 raw_spin_unlock_irq(&rq->lock);
8054 }
8055out_unlock:
8056 mutex_unlock(&cfs_constraints_mutex);
8057
8058 return ret;
8059}
8060
8061int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
8062{
8063 u64 quota, period;
8064
8065 period = ktime_to_ns(tg->cfs_bandwidth.period);
8066 if (cfs_quota_us < 0)
8067 quota = RUNTIME_INF;
8068 else
8069 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
8070
8071 return tg_set_cfs_bandwidth(tg, period, quota);
8072}
8073
8074long tg_get_cfs_quota(struct task_group *tg)
8075{
8076 u64 quota_us;
8077
8078 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
8079 return -1;
8080
8081 quota_us = tg->cfs_bandwidth.quota;
8082 do_div(quota_us, NSEC_PER_USEC);
8083
8084 return quota_us;
8085}
8086
8087int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8088{
8089 u64 quota, period;
8090
8091 period = (u64)cfs_period_us * NSEC_PER_USEC;
8092 quota = tg->cfs_bandwidth.quota;
8093
8094 return tg_set_cfs_bandwidth(tg, period, quota);
8095}
8096
8097long tg_get_cfs_period(struct task_group *tg)
8098{
8099 u64 cfs_period_us;
8100
8101 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
8102 do_div(cfs_period_us, NSEC_PER_USEC);
8103
8104 return cfs_period_us;
8105}
8106
8107static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
8108{
8109 return tg_get_cfs_quota(cgroup_tg(cgrp));
8110}
8111
8112static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
8113 s64 cfs_quota_us)
8114{
8115 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
8116}
8117
8118static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
8119{
8120 return tg_get_cfs_period(cgroup_tg(cgrp));
8121}
8122
8123static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8124 u64 cfs_period_us)
8125{
8126 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
8127}
8128
8129struct cfs_schedulable_data {
8130 struct task_group *tg;
8131 u64 period, quota;
8132};
8133
8134
8135
8136
8137
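/*
 * normalize group quota/period to be quota/max_period
 * note: units are usecs
 */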
8138static u64 normalize_cfs_quota(struct task_group *tg,
8139 struct cfs_schedulable_data *d)
8140{
8141 u64 quota, period;
8142
8143 if (tg == d->tg) {
8144 period = d->period;
8145 quota = d->quota;
8146 } else {
8147 period = tg_get_cfs_period(tg);
8148 quota = tg_get_cfs_quota(tg);
8149 }
8150
8151
8152 if (quota == RUNTIME_INF || quota == -1)
8153 return RUNTIME_INF;
8154
8155 return to_ratio(period, quota);
8156}
8157
8158static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8159{
8160 struct cfs_schedulable_data *d = data;
8161 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8162 s64 quota = 0, parent_quota = -1;
8163
8164 if (!tg->parent) {
8165 quota = RUNTIME_INF;
8166 } else {
8167 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
8168
8169 quota = normalize_cfs_quota(tg, d);
8170 parent_quota = parent_b->hierarchal_quota;
8171
8172
8173
8174
8175
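		/*
		 * Ensure max(child_quota) <= parent_quota; inherit when no
		 * limit is set:
		 */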
8176 if (quota == RUNTIME_INF)
8177 quota = parent_quota;
8178 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8179 return -EINVAL;
8180 }
8181 cfs_b->hierarchal_quota = quota;
8182
8183 return 0;
8184}
8185
8186static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
8187{
8188 int ret;
8189 struct cfs_schedulable_data data = {
8190 .tg = tg,
8191 .period = period,
8192 .quota = quota,
8193 };
8194
8195 if (quota != RUNTIME_INF) {
8196 do_div(data.period, NSEC_PER_USEC);
8197 do_div(data.quota, NSEC_PER_USEC);
8198 }
8199
8200 rcu_read_lock();
8201 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
8202 rcu_read_unlock();
8203
8204 return ret;
8205}
8206
8207static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
8208 struct cgroup_map_cb *cb)
8209{
8210 struct task_group *tg = cgroup_tg(cgrp);
8211 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8212
8213 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
8214 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
8215 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
8216
8217 return 0;
8218}
8219#endif
8220#endif
8221
8222#ifdef CONFIG_RT_GROUP_SCHED
8223static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8224 s64 val)
8225{
8226 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8227}
8228
8229static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
8230{
8231 return sched_group_rt_runtime(cgroup_tg(cgrp));
8232}
8233
8234static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
8235 u64 rt_period_us)
8236{
8237 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
8238}
8239
8240static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8241{
8242 return sched_group_rt_period(cgroup_tg(cgrp));
8243}
8244#endif
8245
8246static struct cftype cpu_files[] = {
8247#ifdef CONFIG_FAIR_GROUP_SCHED
8248 {
8249 .name = "shares",
8250 .read_u64 = cpu_shares_read_u64,
8251 .write_u64 = cpu_shares_write_u64,
8252 },
8253#endif
8254#ifdef CONFIG_CFS_BANDWIDTH
8255 {
8256 .name = "cfs_quota_us",
8257 .read_s64 = cpu_cfs_quota_read_s64,
8258 .write_s64 = cpu_cfs_quota_write_s64,
8259 },
8260 {
8261 .name = "cfs_period_us",
8262 .read_u64 = cpu_cfs_period_read_u64,
8263 .write_u64 = cpu_cfs_period_write_u64,
8264 },
8265 {
8266 .name = "stat",
8267 .read_map = cpu_stats_show,
8268 },
8269#endif
8270#ifdef CONFIG_RT_GROUP_SCHED
8271 {
8272 .name = "rt_runtime_us",
8273 .read_s64 = cpu_rt_runtime_read,
8274 .write_s64 = cpu_rt_runtime_write,
8275 },
8276 {
8277 .name = "rt_period_us",
8278 .read_u64 = cpu_rt_period_read_uint,
8279 .write_u64 = cpu_rt_period_write_uint,
8280 },
8281#endif
8282 { }
8283};
8284
8285struct cgroup_subsys cpu_cgroup_subsys = {
8286 .name = "cpu",
8287 .create = cpu_cgroup_create,
8288 .destroy = cpu_cgroup_destroy,
8289 .can_attach = cpu_cgroup_can_attach,
8290 .attach = cpu_cgroup_attach,
8291 .exit = cpu_cgroup_exit,
8292 .subsys_id = cpu_cgroup_subsys_id,
8293 .base_cftypes = cpu_files,
8294 .early_init = 1,
8295};
8296
8297#endif
8298
8299#ifdef CONFIG_CGROUP_CPUACCT
8300
8301
8302
8303
8304
8305
8306
8307
8308
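/*
 * CPU accounting code for task groups: exposes per-cgroup cpu usage
 * (cpuacct.usage, cpuacct.usage_percpu) and user/system time
 * (cpuacct.stat).
 */

/* create a new cpu accounting group */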
8309static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
8310{
8311 struct cpuacct *ca;
8312
8313 if (!cgrp->parent)
8314 return &root_cpuacct.css;
8315
8316 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8317 if (!ca)
8318 goto out;
8319
8320 ca->cpuusage = alloc_percpu(u64);
8321 if (!ca->cpuusage)
8322 goto out_free_ca;
8323
8324 ca->cpustat = alloc_percpu(struct kernel_cpustat);
8325 if (!ca->cpustat)
8326 goto out_free_cpuusage;
8327
8328 return &ca->css;
8329
8330out_free_cpuusage:
8331 free_percpu(ca->cpuusage);
8332out_free_ca:
8333 kfree(ca);
8334out:
8335 return ERR_PTR(-ENOMEM);
8336}
8337
8338
8339static void cpuacct_destroy(struct cgroup *cgrp)
8340{
8341 struct cpuacct *ca = cgroup_ca(cgrp);
8342
8343 free_percpu(ca->cpustat);
8344 free_percpu(ca->cpuusage);
8345 kfree(ca);
8346}
8347
8348static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8349{
8350 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8351 u64 data;
8352
8353#ifndef CONFIG_64BIT
8354
8355
8356
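	/*
	 * Take rq->lock to make the 64-bit read coherent on 32-bit
	 * platforms.
	 */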
8357 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8358 data = *cpuusage;
8359 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8360#else
8361 data = *cpuusage;
8362#endif
8363
8364 return data;
8365}
8366
8367static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8368{
8369 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8370
8371#ifndef CONFIG_64BIT
8372
8373
8374
8375 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8376 *cpuusage = val;
8377 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8378#else
8379 *cpuusage = val;
8380#endif
8381}
8382
8383
8384static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8385{
8386 struct cpuacct *ca = cgroup_ca(cgrp);
8387 u64 totalcpuusage = 0;
8388 int i;
8389
8390 for_each_present_cpu(i)
8391 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8392
8393 return totalcpuusage;
8394}
8395
8396static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8397 u64 reset)
8398{
8399 struct cpuacct *ca = cgroup_ca(cgrp);
8400 int err = 0;
8401 int i;
8402
8403 if (reset) {
8404 err = -EINVAL;
8405 goto out;
8406 }
8407
8408 for_each_present_cpu(i)
8409 cpuacct_cpuusage_write(ca, i, 0);
8410
8411out:
8412 return err;
8413}
8414
8415static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8416 struct seq_file *m)
8417{
8418 struct cpuacct *ca = cgroup_ca(cgroup);
8419 u64 percpu;
8420 int i;
8421
8422 for_each_present_cpu(i) {
8423 percpu = cpuacct_cpuusage_read(ca, i);
8424 seq_printf(m, "%llu ", (unsigned long long) percpu);
8425 }
8426 seq_printf(m, "\n");
8427 return 0;
8428}
8429
8430static const char *cpuacct_stat_desc[] = {
8431 [CPUACCT_STAT_USER] = "user",
8432 [CPUACCT_STAT_SYSTEM] = "system",
8433};
8434
8435static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8436 struct cgroup_map_cb *cb)
8437{
8438 struct cpuacct *ca = cgroup_ca(cgrp);
8439 int cpu;
8440 s64 val = 0;
8441
8442 for_each_online_cpu(cpu) {
8443 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8444 val += kcpustat->cpustat[CPUTIME_USER];
8445 val += kcpustat->cpustat[CPUTIME_NICE];
8446 }
8447 val = cputime64_to_clock_t(val);
8448 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8449
8450 val = 0;
8451 for_each_online_cpu(cpu) {
8452 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8453 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8454 val += kcpustat->cpustat[CPUTIME_IRQ];
8455 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8456 }
8457
8458 val = cputime64_to_clock_t(val);
8459 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8460
8461 return 0;
8462}
8463
8464static struct cftype files[] = {
8465 {
8466 .name = "usage",
8467 .read_u64 = cpuusage_read,
8468 .write_u64 = cpuusage_write,
8469 },
8470 {
8471 .name = "usage_percpu",
8472 .read_seq_string = cpuacct_percpu_seq_read,
8473 },
8474 {
8475 .name = "stat",
8476 .read_map = cpuacct_stats_show,
8477 },
8478 { }
8479};
8480
8481
8482
8483
8484
8485
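/*
 * charge this task's execution time to its accounting group.
 *
 * called with rq->lock held.
 */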
8486void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8487{
8488 struct cpuacct *ca;
8489 int cpu;
8490
8491 if (unlikely(!cpuacct_subsys.active))
8492 return;
8493
8494 cpu = task_cpu(tsk);
8495
8496 rcu_read_lock();
8497
8498 ca = task_ca(tsk);
8499
8500 for (; ca; ca = parent_ca(ca)) {
8501 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8502 *cpuusage += cputime;
8503 }
8504
8505 rcu_read_unlock();
8506}
8507
8508struct cgroup_subsys cpuacct_subsys = {
8509 .name = "cpuacct",
8510 .create = cpuacct_create,
8511 .destroy = cpuacct_destroy,
8512 .subsys_id = cpuacct_subsys_id,
8513 .base_cftypes = files,
8514};
8515#endif
8516