/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/perf_event.h>
43#include <linux/security.h>
44#include <linux/notifier.h>
45#include <linux/profile.h>
46#include <linux/freezer.h>
47#include <linux/vmalloc.h>
48#include <linux/blkdev.h>
49#include <linux/delay.h>
50#include <linux/pid_namespace.h>
51#include <linux/smp.h>
52#include <linux/threads.h>
53#include <linux/timer.h>
54#include <linux/rcupdate.h>
55#include <linux/cpu.h>
56#include <linux/cpuset.h>
57#include <linux/percpu.h>
58#include <linux/proc_fs.h>
59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h>
62#include <linux/syscalls.h>
63#include <linux/times.h>
64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h>
66#include <linux/delayacct.h>
67#include <linux/unistd.h>
68#include <linux/pagemap.h>
69#include <linux/hrtimer.h>
70#include <linux/tick.h>
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
73#include <linux/ftrace.h>
74#include <linux/slab.h>
75
76#include <asm/tlb.h>
77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
79
80#include "sched_cpupri.h"
81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
83
84#define CREATE_TRACE_POINTS
85#include <trace/events/sched.h>
86
/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
92#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
93#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
94#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
95
/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
101#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
102#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
103#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
104
/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
108#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
109
110#define NICE_0_LOAD SCHED_LOAD_SCALE
111#define NICE_0_SHIFT SCHED_LOAD_SHIFT
112
/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
119#define DEF_TIMESLICE (100 * HZ / 1000)
120
/*
 * Single value that denotes runtime == period, ie unlimited time.
 */
124#define RUNTIME_INF ((u64)~0ULL)
125
126static inline int rt_policy(int policy)
127{
128 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
129 return 1;
130 return 0;
131}
132
133static inline int task_has_rt_policy(struct task_struct *p)
134{
135 return rt_policy(p->policy);
136}
137
138
139
140
141struct rt_prio_array {
142 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
143 struct list_head queue[MAX_RT_PRIO];
144};
145
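/*
 * RT bandwidth control: each rt_bandwidth pairs a replenishment period
 * (rt_period) with a runtime budget (rt_runtime).  Real-time tasks
 * governed by it may consume at most rt_runtime nanoseconds of CPU per
 * rt_period; sched_rt_period_timer() below re-arms itself every period
 * and calls do_sched_rt_period_timer() to replenish the budget.  The
 * global defaults come from sysctl_sched_rt_period/sysctl_sched_rt_runtime
 * further down (950000us of runtime per 1000000us period).
 */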
146struct rt_bandwidth {
147
148 raw_spinlock_t rt_runtime_lock;
149 ktime_t rt_period;
150 u64 rt_runtime;
151 struct hrtimer rt_period_timer;
152};
153
154static struct rt_bandwidth def_rt_bandwidth;
155
156static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
157
158static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
159{
160 struct rt_bandwidth *rt_b =
161 container_of(timer, struct rt_bandwidth, rt_period_timer);
162 ktime_t now;
163 int overrun;
164 int idle = 0;
165
166 for (;;) {
167 now = hrtimer_cb_get_time(timer);
168 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
169
170 if (!overrun)
171 break;
172
173 idle = do_sched_rt_period_timer(rt_b, overrun);
174 }
175
176 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
177}
178
179static
180void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
181{
182 rt_b->rt_period = ns_to_ktime(period);
183 rt_b->rt_runtime = runtime;
184
185 raw_spin_lock_init(&rt_b->rt_runtime_lock);
186
187 hrtimer_init(&rt_b->rt_period_timer,
188 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
189 rt_b->rt_period_timer.function = sched_rt_period_timer;
190}
191
192static inline int rt_bandwidth_enabled(void)
193{
194 return sysctl_sched_rt_runtime >= 0;
195}
196
197static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
198{
199 ktime_t now;
200
201 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
202 return;
203
204 if (hrtimer_active(&rt_b->rt_period_timer))
205 return;
206
207 raw_spin_lock(&rt_b->rt_runtime_lock);
208 for (;;) {
209 unsigned long delta;
210 ktime_t soft, hard;
211
212 if (hrtimer_active(&rt_b->rt_period_timer))
213 break;
214
215 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
216 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
217
218 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
219 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
220 delta = ktime_to_ns(ktime_sub(hard, soft));
221 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
222 HRTIMER_MODE_ABS_PINNED, 0);
223 }
224 raw_spin_unlock(&rt_b->rt_runtime_lock);
225}
226
227#ifdef CONFIG_RT_GROUP_SCHED
228static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
229{
230 hrtimer_cancel(&rt_b->rt_period_timer);
231}
232#endif
233
234
235
236
237
238static DEFINE_MUTEX(sched_domains_mutex);
239
240#ifdef CONFIG_CGROUP_SCHED
241
242#include <linux/cgroup.h>
243
244struct cfs_rq;
245
246static LIST_HEAD(task_groups);
247
248
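/*
 * Task group ("cpu" cgroup) state.  With CONFIG_FAIR_GROUP_SCHED each
 * group owns one sched_entity and one cfs_rq per CPU (the se[] and
 * cfs_rq[] arrays), so the whole group is scheduled as a single entity
 * on every runqueue and 'shares' sets its weight relative to sibling
 * groups.  The RT counterpart (rt_se[]/rt_rq[]) carries its own
 * rt_bandwidth.  Groups form a tree through parent/siblings/children,
 * rooted at root_task_group.
 */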
249struct task_group {
250 struct cgroup_subsys_state css;
251
252#ifdef CONFIG_FAIR_GROUP_SCHED
253
254 struct sched_entity **se;
255
256 struct cfs_rq **cfs_rq;
257 unsigned long shares;
258
259 atomic_t load_weight;
260#endif
261
262#ifdef CONFIG_RT_GROUP_SCHED
263 struct sched_rt_entity **rt_se;
264 struct rt_rq **rt_rq;
265
266 struct rt_bandwidth rt_bandwidth;
267#endif
268
269 struct rcu_head rcu;
270 struct list_head list;
271
272 struct task_group *parent;
273 struct list_head siblings;
274 struct list_head children;
275
276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
279};
280
281
282static DEFINE_SPINLOCK(task_group_lock);
283
284#ifdef CONFIG_FAIR_GROUP_SCHED
285
286# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
287
288
289
290
291
292
293
294
295
296#define MIN_SHARES 2
297#define MAX_SHARES (1UL << 18)
298
299static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
300#endif
301
302
303
304
305struct task_group root_task_group;
306
307#endif
308
309
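/*
 * CFS-related fields of a runqueue.  'load' is the summed weight of all
 * queued entities.  'tasks_timeline' is a red-black tree of entities
 * keyed by vruntime, with 'rb_leftmost' caching the leftmost (smallest
 * vruntime, i.e. next to run) node so it can be picked in O(1).
 * 'min_vruntime' is a monotonically increasing floor used to normalize
 * the vruntime of sleeping and migrating entities.
 */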
310struct cfs_rq {
311 struct load_weight load;
312 unsigned long nr_running;
313
314 u64 exec_clock;
315 u64 min_vruntime;
316
317 struct rb_root tasks_timeline;
318 struct rb_node *rb_leftmost;
319
320 struct list_head tasks;
321 struct list_head *balance_iterator;
322
323
324
325
326
327 struct sched_entity *curr, *next, *last;
328
329 unsigned int nr_spread_over;
330
331#ifdef CONFIG_FAIR_GROUP_SCHED
332 struct rq *rq;
333
334
335
336
337
338
339
340
341
342 int on_list;
343 struct list_head leaf_cfs_rq_list;
344 struct task_group *tg;
345
346#ifdef CONFIG_SMP
347
348
349
350 unsigned long task_weight;
351
352
353
354
355
356
357
358 unsigned long h_load;
359
360
361
362
363
364
365
366
367 u64 load_avg;
368 u64 load_period;
369 u64 load_stamp, load_last, load_unacc_exec_time;
370
371 unsigned long load_contribution;
372#endif
373#endif
374};
375
376
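/*
 * Real-time scheduling fields of a runqueue.  'active' holds one FIFO
 * list per RT priority level plus a bitmap of non-empty levels, so the
 * next RT task is found with a find-first-bit and a list-head lookup.
 * rt_time accumulates the RT runtime consumed in the current period and
 * is compared against rt_runtime (under rt_runtime_lock) to set
 * rt_throttled once the bandwidth budget is exhausted.
 */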
377struct rt_rq {
378 struct rt_prio_array active;
379 unsigned long rt_nr_running;
380#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
381 struct {
382 int curr;
383#ifdef CONFIG_SMP
384 int next;
385#endif
386 } highest_prio;
387#endif
388#ifdef CONFIG_SMP
389 unsigned long rt_nr_migratory;
390 unsigned long rt_nr_total;
391 int overloaded;
392 struct plist_head pushable_tasks;
393#endif
394 int rt_throttled;
395 u64 rt_time;
396 u64 rt_runtime;
397
398 raw_spinlock_t rt_runtime_lock;
399
400#ifdef CONFIG_RT_GROUP_SCHED
401 unsigned long rt_nr_boosted;
402
403 struct rq *rq;
404 struct list_head leaf_rt_rq_list;
405 struct task_group *tg;
406#endif
407};
408
409#ifdef CONFIG_SMP
410
411
412
413
414
415
416
417
418
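/*
 * A "root domain" spans the CPUs of one exclusive cpuset partition:
 * 'span' is the full member mask and 'online' its currently online
 * subset.  'rto_mask'/'rto_count' track members whose runqueues are
 * overloaded with runnable RT tasks, and 'cpupri' indexes CPUs by the
 * priority they are currently running so RT push/pull can quickly find
 * a suitable target CPU.
 */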
419struct root_domain {
420 atomic_t refcount;
421 cpumask_var_t span;
422 cpumask_var_t online;
423
424
425
426
427
428 cpumask_var_t rto_mask;
429 atomic_t rto_count;
430 struct cpupri cpupri;
431};
432
433
434
435
436
437static struct root_domain def_root_domain;
438
439#endif
440
441
442
443
444
445
446
447
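/*
 * The main per-CPU runqueue data structure; one instance per CPU lives
 * in the 'runqueues' per-CPU variable defined below and is reached via
 * cpu_rq()/this_rq().  Everything here is protected by ->lock.  The
 * fair and real-time sub-runqueues are embedded as 'cfs' and 'rt', and
 * cpu_load[] keeps CPU_LOAD_IDX_MAX decaying snapshots of the load for
 * the load balancer's source/target estimates.
 */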
448struct rq {
449
450 raw_spinlock_t lock;
451
452
453
454
455
456 unsigned long nr_running;
457 #define CPU_LOAD_IDX_MAX 5
458 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
459 unsigned long last_load_update_tick;
460#ifdef CONFIG_NO_HZ
461 u64 nohz_stamp;
462 unsigned char nohz_balance_kick;
463#endif
464 unsigned int skip_clock_update;
465
466
467 struct load_weight load;
468 unsigned long nr_load_updates;
469 u64 nr_switches;
470
471 struct cfs_rq cfs;
472 struct rt_rq rt;
473
474#ifdef CONFIG_FAIR_GROUP_SCHED
475
476 struct list_head leaf_cfs_rq_list;
477#endif
478#ifdef CONFIG_RT_GROUP_SCHED
479 struct list_head leaf_rt_rq_list;
480#endif
481
482
483
484
485
486
487
488 unsigned long nr_uninterruptible;
489
490 struct task_struct *curr, *idle, *stop;
491 unsigned long next_balance;
492 struct mm_struct *prev_mm;
493
494 u64 clock;
495 u64 clock_task;
496
497 atomic_t nr_iowait;
498
499#ifdef CONFIG_SMP
500 struct root_domain *rd;
501 struct sched_domain *sd;
502
503 unsigned long cpu_power;
504
505 unsigned char idle_at_tick;
506
507 int post_schedule;
508 int active_balance;
509 int push_cpu;
510 struct cpu_stop_work active_balance_work;
511
512 int cpu;
513 int online;
514
515 unsigned long avg_load_per_task;
516
517 u64 rt_avg;
518 u64 age_stamp;
519 u64 idle_stamp;
520 u64 avg_idle;
521#endif
522
523#ifdef CONFIG_IRQ_TIME_ACCOUNTING
524 u64 prev_irq_time;
525#endif
526
527
528 unsigned long calc_load_update;
529 long calc_load_active;
530
531#ifdef CONFIG_SCHED_HRTICK
532#ifdef CONFIG_SMP
533 int hrtick_csd_pending;
534 struct call_single_data hrtick_csd;
535#endif
536 struct hrtimer hrtick_timer;
537#endif
538
539#ifdef CONFIG_SCHEDSTATS
540
541 struct sched_info rq_sched_info;
542 unsigned long long rq_cpu_time;
543
544
545
546 unsigned int yld_count;
547
548
549 unsigned int sched_switch;
550 unsigned int sched_count;
551 unsigned int sched_goidle;
552
553
554 unsigned int ttwu_count;
555 unsigned int ttwu_local;
556#endif
557};
558
559static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
560
561
562static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
563
564static inline int cpu_of(struct rq *rq)
565{
566#ifdef CONFIG_SMP
567 return rq->cpu;
568#else
569 return 0;
570#endif
571}
572
573#define rcu_dereference_check_sched_domain(p) \
574 rcu_dereference_check((p), \
575 rcu_read_lock_sched_held() || \
576 lockdep_is_held(&sched_domains_mutex))
577
578
579
580
581
582
583
584
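/*
 * The domain tree (rq->sd) is protected by RCU: walking it with
 * for_each_domain() below is only safe under rcu_read_lock_sched() or
 * while holding sched_domains_mutex, which is exactly what
 * rcu_dereference_check_sched_domain() verifies when lockdep is on.
 */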
585#define for_each_domain(cpu, __sd) \
586 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
587
588#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
589#define this_rq() (&__get_cpu_var(runqueues))
590#define task_rq(p) cpu_rq(task_cpu(p))
591#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
592#define raw_rq() (&__raw_get_cpu_var(runqueues))
593
594#ifdef CONFIG_CGROUP_SCHED
595
596
597
598
599
600
601
602
603
604static inline struct task_group *task_group(struct task_struct *p)
605{
606 struct task_group *tg;
607 struct cgroup_subsys_state *css;
608
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
613 lockdep_is_held(&task_rq(p)->lock));
614 tg = container_of(css, struct task_group, css);
615
616 return autogroup_task_group(p, tg);
617}
618
619
620static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
621{
622#ifdef CONFIG_FAIR_GROUP_SCHED
623 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
624 p->se.parent = task_group(p)->se[cpu];
625#endif
626
627#ifdef CONFIG_RT_GROUP_SCHED
628 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
629 p->rt.parent = task_group(p)->rt_se[cpu];
630#endif
631}
632
633#else
634
635static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
636static inline struct task_group *task_group(struct task_struct *p)
637{
638 return NULL;
639}
640
641#endif
642
643static void update_rq_clock_task(struct rq *rq, s64 delta);
644
645static void update_rq_clock(struct rq *rq)
646{
647 s64 delta;
648
649 if (rq->skip_clock_update)
650 return;
651
652 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
653 rq->clock += delta;
654 update_rq_clock_task(rq, delta);
655}
656
657
658
659
660#ifdef CONFIG_SCHED_DEBUG
661# define const_debug __read_mostly
662#else
663# define const_debug static const
664#endif
665
666
667
668
669
670
671
672
673
674int runqueue_is_locked(int cpu)
675{
676 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
677}
678
679
680
681
682
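/*
 * Scheduler feature bits.  sched_features.h is included three times with
 * different SCHED_FEAT() definitions: once to build the __SCHED_FEAT_*
 * enum, once to OR the default-enabled bits into sysctl_sched_features,
 * and (under CONFIG_SCHED_DEBUG) once more to build the
 * sched_feat_names[] strings exposed through the "sched_features"
 * debugfs file.  E.g. the HRTICK entry yields __SCHED_FEAT_HRTICK, its
 * default bit and the "HRTICK" name string; sched_feat(HRTICK) then
 * tests that bit at runtime.
 */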
683#define SCHED_FEAT(name, enabled) \
684 __SCHED_FEAT_##name ,
685
686enum {
687#include "sched_features.h"
688};
689
690#undef SCHED_FEAT
691
692#define SCHED_FEAT(name, enabled) \
693 (1UL << __SCHED_FEAT_##name) * enabled |
694
695const_debug unsigned int sysctl_sched_features =
696#include "sched_features.h"
697 0;
698
699#undef SCHED_FEAT
700
701#ifdef CONFIG_SCHED_DEBUG
702#define SCHED_FEAT(name, enabled) \
703 #name ,
704
705static __read_mostly char *sched_feat_names[] = {
706#include "sched_features.h"
707 NULL
708};
709
710#undef SCHED_FEAT
711
712static int sched_feat_show(struct seq_file *m, void *v)
713{
714 int i;
715
716 for (i = 0; sched_feat_names[i]; i++) {
717 if (!(sysctl_sched_features & (1UL << i)))
718 seq_puts(m, "NO_");
719 seq_printf(m, "%s ", sched_feat_names[i]);
720 }
721 seq_puts(m, "\n");
722
723 return 0;
724}
725
726static ssize_t
727sched_feat_write(struct file *filp, const char __user *ubuf,
728 size_t cnt, loff_t *ppos)
729{
730 char buf[64];
731 char *cmp;
732 int neg = 0;
733 int i;
734
735 if (cnt > 63)
736 cnt = 63;
737
738 if (copy_from_user(&buf, ubuf, cnt))
739 return -EFAULT;
740
741 buf[cnt] = 0;
742 cmp = strstrip(buf);
743
744 if (strncmp(cmp, "NO_", 3) == 0) {
745 neg = 1;
746 cmp += 3;
747 }
748
749 for (i = 0; sched_feat_names[i]; i++) {
750 if (strcmp(cmp, sched_feat_names[i]) == 0) {
751 if (neg)
752 sysctl_sched_features &= ~(1UL << i);
753 else
754 sysctl_sched_features |= (1UL << i);
755 break;
756 }
757 }
758
759 if (!sched_feat_names[i])
760 return -EINVAL;
761
762 *ppos += cnt;
763
764 return cnt;
765}
766
767static int sched_feat_open(struct inode *inode, struct file *filp)
768{
769 return single_open(filp, sched_feat_show, NULL);
770}
771
772static const struct file_operations sched_feat_fops = {
773 .open = sched_feat_open,
774 .write = sched_feat_write,
775 .read = seq_read,
776 .llseek = seq_lseek,
777 .release = single_release,
778};
779
780static __init int sched_init_debug(void)
781{
782 debugfs_create_file("sched_features", 0644, NULL, NULL,
783 &sched_feat_fops);
784
785 return 0;
786}
787late_initcall(sched_init_debug);
788
789#endif
790
791#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
792
793
794
795
796
797const_debug unsigned int sysctl_sched_nr_migrate = 32;
798
799
800
801
802
803
804
805const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
806
807
808
809
810
811unsigned int sysctl_sched_rt_period = 1000000;
812
813static __read_mostly int scheduler_running;
814
815
816
817
818
819int sysctl_sched_rt_runtime = 950000;
820
821static inline u64 global_rt_period(void)
822{
823 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
824}
825
826static inline u64 global_rt_runtime(void)
827{
828 if (sysctl_sched_rt_runtime < 0)
829 return RUNTIME_INF;
830
831 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
832}
833
834#ifndef prepare_arch_switch
835# define prepare_arch_switch(next) do { } while (0)
836#endif
837#ifndef finish_arch_switch
838# define finish_arch_switch(prev) do { } while (0)
839#endif
840
841static inline int task_current(struct rq *rq, struct task_struct *p)
842{
843 return rq->curr == p;
844}
845
846#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline int task_running(struct rq *rq, struct task_struct *p)
848{
849 return task_current(rq, p);
850}
851
852static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
853{
854}
855
856static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
857{
858#ifdef CONFIG_DEBUG_SPINLOCK
859
860 rq->lock.owner = current;
861#endif
862
863
864
865
866
867 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
868
869 raw_spin_unlock_irq(&rq->lock);
870}
871
872#else
873static inline int task_running(struct rq *rq, struct task_struct *p)
874{
875#ifdef CONFIG_SMP
876 return p->oncpu;
877#else
878 return task_current(rq, p);
879#endif
880}
881
882static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
883{
884#ifdef CONFIG_SMP
885
886
887
888
889
890 next->oncpu = 1;
891#endif
892#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
893 raw_spin_unlock_irq(&rq->lock);
894#else
895 raw_spin_unlock(&rq->lock);
896#endif
897}
898
899static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
900{
901#ifdef CONFIG_SMP
902
903
904
905
906
907 smp_wmb();
908 prev->oncpu = 0;
909#endif
910#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
911 local_irq_enable();
912#endif
913}
914#endif
915
916
917
918
919
920static inline int task_is_waking(struct task_struct *p)
921{
922 return unlikely(p->state == TASK_WAKING);
923}
924
925
926
927
928
929static inline struct rq *__task_rq_lock(struct task_struct *p)
930 __acquires(rq->lock)
931{
932 struct rq *rq;
933
934 for (;;) {
935 rq = task_rq(p);
936 raw_spin_lock(&rq->lock);
937 if (likely(rq == task_rq(p)))
938 return rq;
939 raw_spin_unlock(&rq->lock);
940 }
941}
942
943
944
945
946
947
948static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
949 __acquires(rq->lock)
950{
951 struct rq *rq;
952
953 for (;;) {
954 local_irq_save(*flags);
955 rq = task_rq(p);
956 raw_spin_lock(&rq->lock);
957 if (likely(rq == task_rq(p)))
958 return rq;
959 raw_spin_unlock_irqrestore(&rq->lock, *flags);
960 }
961}
962
963static void __task_rq_unlock(struct rq *rq)
964 __releases(rq->lock)
965{
966 raw_spin_unlock(&rq->lock);
967}
968
969static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
970 __releases(rq->lock)
971{
972 raw_spin_unlock_irqrestore(&rq->lock, *flags);
973}
974
975
976
977
978static struct rq *this_rq_lock(void)
979 __acquires(rq->lock)
980{
981 struct rq *rq;
982
983 local_irq_disable();
984 rq = this_rq();
985 raw_spin_lock(&rq->lock);
986
987 return rq;
988}
989
990#ifdef CONFIG_SCHED_HRTICK
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007static inline int hrtick_enabled(struct rq *rq)
1008{
1009 if (!sched_feat(HRTICK))
1010 return 0;
1011 if (!cpu_active(cpu_of(rq)))
1012 return 0;
1013 return hrtimer_is_hres_active(&rq->hrtick_timer);
1014}
1015
1016static void hrtick_clear(struct rq *rq)
1017{
1018 if (hrtimer_active(&rq->hrtick_timer))
1019 hrtimer_cancel(&rq->hrtick_timer);
1020}
1021
1022
1023
1024
1025
1026static enum hrtimer_restart hrtick(struct hrtimer *timer)
1027{
1028 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1029
1030 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1031
1032 raw_spin_lock(&rq->lock);
1033 update_rq_clock(rq);
1034 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1035 raw_spin_unlock(&rq->lock);
1036
1037 return HRTIMER_NORESTART;
1038}
1039
1040#ifdef CONFIG_SMP
1041
1042
1043
1044static void __hrtick_start(void *arg)
1045{
1046 struct rq *rq = arg;
1047
1048 raw_spin_lock(&rq->lock);
1049 hrtimer_restart(&rq->hrtick_timer);
1050 rq->hrtick_csd_pending = 0;
1051 raw_spin_unlock(&rq->lock);
1052}
1053
1054
1055
1056
1057
1058
1059static void hrtick_start(struct rq *rq, u64 delay)
1060{
1061 struct hrtimer *timer = &rq->hrtick_timer;
1062 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1063
1064 hrtimer_set_expires(timer, time);
1065
1066 if (rq == this_rq()) {
1067 hrtimer_restart(timer);
1068 } else if (!rq->hrtick_csd_pending) {
1069 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1070 rq->hrtick_csd_pending = 1;
1071 }
1072}
1073
1074static int
1075hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1076{
1077 int cpu = (int)(long)hcpu;
1078
1079 switch (action) {
1080 case CPU_UP_CANCELED:
1081 case CPU_UP_CANCELED_FROZEN:
1082 case CPU_DOWN_PREPARE:
1083 case CPU_DOWN_PREPARE_FROZEN:
1084 case CPU_DEAD:
1085 case CPU_DEAD_FROZEN:
1086 hrtick_clear(cpu_rq(cpu));
1087 return NOTIFY_OK;
1088 }
1089
1090 return NOTIFY_DONE;
1091}
1092
1093static __init void init_hrtick(void)
1094{
1095 hotcpu_notifier(hotplug_hrtick, 0);
1096}
1097#else
1098
1099
1100
1101
1102
1103static void hrtick_start(struct rq *rq, u64 delay)
1104{
1105 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1106 HRTIMER_MODE_REL_PINNED, 0);
1107}
1108
1109static inline void init_hrtick(void)
1110{
1111}
1112#endif
1113
1114static void init_rq_hrtick(struct rq *rq)
1115{
1116#ifdef CONFIG_SMP
1117 rq->hrtick_csd_pending = 0;
1118
1119 rq->hrtick_csd.flags = 0;
1120 rq->hrtick_csd.func = __hrtick_start;
1121 rq->hrtick_csd.info = rq;
1122#endif
1123
1124 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1125 rq->hrtick_timer.function = hrtick;
1126}
1127#else
1128static inline void hrtick_clear(struct rq *rq)
1129{
1130}
1131
1132static inline void init_rq_hrtick(struct rq *rq)
1133{
1134}
1135
1136static inline void init_hrtick(void)
1137{
1138}
1139#endif
1140
1141
1142
1143
1144
1145
1146
1147
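/*
 * resched_task() marks a task as needing to be rescheduled.  On SMP, if
 * the task runs on another CPU and that CPU is not already polling
 * need_resched (tsk_is_polling()), an IPI is sent via
 * smp_send_reschedule() so the remote CPU reschedules promptly; the UP
 * variant below just sets TIF_NEED_RESCHED.
 */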
1148#ifdef CONFIG_SMP
1149
1150#ifndef tsk_is_polling
1151#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1152#endif
1153
1154static void resched_task(struct task_struct *p)
1155{
1156 int cpu;
1157
1158 assert_raw_spin_locked(&task_rq(p)->lock);
1159
1160 if (test_tsk_need_resched(p))
1161 return;
1162
1163 set_tsk_need_resched(p);
1164
1165 cpu = task_cpu(p);
1166 if (cpu == smp_processor_id())
1167 return;
1168
1169
1170 smp_mb();
1171 if (!tsk_is_polling(p))
1172 smp_send_reschedule(cpu);
1173}
1174
1175static void resched_cpu(int cpu)
1176{
1177 struct rq *rq = cpu_rq(cpu);
1178 unsigned long flags;
1179
1180 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1181 return;
1182 resched_task(cpu_curr(cpu));
1183 raw_spin_unlock_irqrestore(&rq->lock, flags);
1184}
1185
1186#ifdef CONFIG_NO_HZ
1187
1188
1189
1190
1191
1192
1193
1194
1195int get_nohz_timer_target(void)
1196{
1197 int cpu = smp_processor_id();
1198 int i;
1199 struct sched_domain *sd;
1200
1201 for_each_domain(cpu, sd) {
1202 for_each_cpu(i, sched_domain_span(sd))
1203 if (!idle_cpu(i))
1204 return i;
1205 }
1206 return cpu;
1207}
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218void wake_up_idle_cpu(int cpu)
1219{
1220 struct rq *rq = cpu_rq(cpu);
1221
1222 if (cpu == smp_processor_id())
1223 return;
1224
1225
1226
1227
1228
1229
1230
1231
1232 if (rq->curr != rq->idle)
1233 return;
1234
1235
1236
1237
1238
1239
1240 set_tsk_need_resched(rq->idle);
1241
1242
1243 smp_mb();
1244 if (!tsk_is_polling(rq->idle))
1245 smp_send_reschedule(cpu);
1246}
1247
1248#endif
1249
1250static u64 sched_avg_period(void)
1251{
1252 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1253}
1254
1255static void sched_avg_update(struct rq *rq)
1256{
1257 s64 period = sched_avg_period();
1258
1259 while ((s64)(rq->clock - rq->age_stamp) > period) {
1260
1261
1262
1263
1264
1265 asm("" : "+rm" (rq->age_stamp));
1266 rq->age_stamp += period;
1267 rq->rt_avg /= 2;
1268 }
1269}
1270
1271static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1272{
1273 rq->rt_avg += rt_delta;
1274 sched_avg_update(rq);
1275}
1276
1277#else
1278static void resched_task(struct task_struct *p)
1279{
1280 assert_raw_spin_locked(&task_rq(p)->lock);
1281 set_tsk_need_resched(p);
1282}
1283
1284static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1285{
1286}
1287
1288static void sched_avg_update(struct rq *rq)
1289{
1290}
1291#endif
1292
1293#if BITS_PER_LONG == 32
1294# define WMULT_CONST (~0UL)
1295#else
1296# define WMULT_CONST (1UL << 32)
1297#endif
1298
1299#define WMULT_SHIFT 32
1300
1301
1302
1303
1304#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1305
1306
1307
1308
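/*
 * calc_delta_mine() computes delta_exec * weight / lw->weight without a
 * 64-bit division: lw->inv_weight caches roughly WMULT_CONST/lw->weight
 * (computed lazily on first use), so the quotient becomes a multiply by
 * inv_weight followed by a WMULT_SHIFT right shift, with SRR() providing
 * round-to-nearest.  The overflow branch applies the shift in two halves
 * so the intermediate product of a large delta and weight stays within
 * 64 bits.
 */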
1309static unsigned long
1310calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1311 struct load_weight *lw)
1312{
1313 u64 tmp;
1314
1315 if (!lw->inv_weight) {
1316 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1317 lw->inv_weight = 1;
1318 else
1319 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1320 / (lw->weight+1);
1321 }
1322
1323 tmp = (u64)delta_exec * weight;
1324
1325
1326
1327 if (unlikely(tmp > WMULT_CONST))
1328 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1329 WMULT_SHIFT/2);
1330 else
1331 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1332
1333 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1334}
1335
1336static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1337{
1338 lw->weight += inc;
1339 lw->inv_weight = 0;
1340}
1341
1342static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1343{
1344 lw->weight -= dec;
1345 lw->inv_weight = 0;
1346}
1347
1348static inline void update_load_set(struct load_weight *lw, unsigned long w)
1349{
1350 lw->weight = w;
1351 lw->inv_weight = 0;
1352}
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363#define WEIGHT_IDLEPRIO 3
1364#define WMULT_IDLEPRIO 1431655765
1365
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
1378static const int prio_to_weight[40] = {
1379 88761, 71755, 56483, 46273, 36291,
1380 29154, 23254, 18705, 14949, 11916,
1381 9548, 7620, 6100, 4904, 3906,
1382 3121, 2501, 1991, 1586, 1277,
1383 1024, 820, 655, 526, 423,
1384 335, 272, 215, 172, 137,
1385 110, 87, 70, 56, 45,
1386 36, 29, 23, 18, 15,
1387};
1388
/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications.
 */
1396static const u32 prio_to_wmult[40] = {
1397 48388, 59856, 76040, 92818, 118348,
1398 147320, 184698, 229616, 287308, 360437,
1399 449829, 563644, 704093, 875809, 1099582,
1400 1376151, 1717300, 2157191, 2708050, 3363326,
1401 4194304, 5237765, 6557202, 8165337, 10153587,
1402 12820798, 15790321, 19976592, 24970740, 31350126,
1403 39045157, 49367440, 61356676, 76695844, 95443717,
1404 119304647, 148102320, 186737708, 238609294, 286331153,
1405};
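
/*
 * How the two tables combine: with both runnable on one CPU, a nice-0
 * task (weight 1024, the usual NICE_0_LOAD with SCHED_LOAD_SHIFT == 10)
 * and a nice-5 task (weight 335) split the CPU roughly
 * 1024/(1024+335) ~= 75% / 25%, matching the ~1.25x step per nice level
 * (1.25^5 ~= 3.05 ~= 1024/335).  prio_to_wmult[] stores 2^32/weight,
 * e.g. 2^32/1024 = 4194304 for nice 0, which calc_delta_mine() uses to
 * turn the division by weight into a multiply and shift.
 */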
1406
1407
1408enum cpuacct_stat_index {
1409 CPUACCT_STAT_USER,
1410 CPUACCT_STAT_SYSTEM,
1411
1412 CPUACCT_STAT_NSTATS,
1413};
1414
1415#ifdef CONFIG_CGROUP_CPUACCT
1416static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1417static void cpuacct_update_stats(struct task_struct *tsk,
1418 enum cpuacct_stat_index idx, cputime_t val);
1419#else
1420static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1421static inline void cpuacct_update_stats(struct task_struct *tsk,
1422 enum cpuacct_stat_index idx, cputime_t val) {}
1423#endif
1424
1425static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1426{
1427 update_load_add(&rq->load, load);
1428}
1429
1430static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1431{
1432 update_load_sub(&rq->load, load);
1433}
1434
1435#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1436typedef int (*tg_visitor)(struct task_group *, void *);
1437
/*
 * Iterate the full task-group tree, calling @down when first entering a
 * node and @up when leaving it for the final time.
 */
1442static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1443{
1444 struct task_group *parent, *child;
1445 int ret;
1446
1447 rcu_read_lock();
1448 parent = &root_task_group;
1449down:
1450 ret = (*down)(parent, data);
1451 if (ret)
1452 goto out_unlock;
1453 list_for_each_entry_rcu(child, &parent->children, siblings) {
1454 parent = child;
1455 goto down;
1456
1457up:
1458 continue;
1459 }
1460 ret = (*up)(parent, data);
1461 if (ret)
1462 goto out_unlock;
1463
1464 child = parent;
1465 parent = parent->parent;
1466 if (parent)
1467 goto up;
1468out_unlock:
1469 rcu_read_unlock();
1470
1471 return ret;
1472}
1473
1474static int tg_nop(struct task_group *tg, void *data)
1475{
1476 return 0;
1477}
1478#endif
1479
1480#ifdef CONFIG_SMP
1481
1482static unsigned long weighted_cpuload(const int cpu)
1483{
1484 return cpu_rq(cpu)->load.weight;
1485}
1486
1487
1488
1489
1490
1491
1492
1493
1494static unsigned long source_load(int cpu, int type)
1495{
1496 struct rq *rq = cpu_rq(cpu);
1497 unsigned long total = weighted_cpuload(cpu);
1498
1499 if (type == 0 || !sched_feat(LB_BIAS))
1500 return total;
1501
1502 return min(rq->cpu_load[type-1], total);
1503}
1504
1505
1506
1507
1508
1509static unsigned long target_load(int cpu, int type)
1510{
1511 struct rq *rq = cpu_rq(cpu);
1512 unsigned long total = weighted_cpuload(cpu);
1513
1514 if (type == 0 || !sched_feat(LB_BIAS))
1515 return total;
1516
1517 return max(rq->cpu_load[type-1], total);
1518}
1519
1520static unsigned long power_of(int cpu)
1521{
1522 return cpu_rq(cpu)->cpu_power;
1523}
1524
1525static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1526
1527static unsigned long cpu_avg_load_per_task(int cpu)
1528{
1529 struct rq *rq = cpu_rq(cpu);
1530 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1531
1532 if (nr_running)
1533 rq->avg_load_per_task = rq->load.weight / nr_running;
1534 else
1535 rq->avg_load_per_task = 0;
1536
1537 return rq->avg_load_per_task;
1538}
1539
1540#ifdef CONFIG_FAIR_GROUP_SCHED
1541
1542
1543
1544
1545
1546
1547static int tg_load_down(struct task_group *tg, void *data)
1548{
1549 unsigned long load;
1550 long cpu = (long)data;
1551
1552 if (!tg->parent) {
1553 load = cpu_rq(cpu)->load.weight;
1554 } else {
1555 load = tg->parent->cfs_rq[cpu]->h_load;
1556 load *= tg->se[cpu]->load.weight;
1557 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1558 }
1559
1560 tg->cfs_rq[cpu]->h_load = load;
1561
1562 return 0;
1563}
1564
1565static void update_h_load(long cpu)
1566{
1567 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1568}
1569
1570#endif
1571
1572#ifdef CONFIG_PREEMPT
1573
1574static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1585 __releases(this_rq->lock)
1586 __acquires(busiest->lock)
1587 __acquires(this_rq->lock)
1588{
1589 raw_spin_unlock(&this_rq->lock);
1590 double_rq_lock(this_rq, busiest);
1591
1592 return 1;
1593}
1594
1595#else
1596
1597
1598
1599
1600
1601
1602
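/*
 * Unfair double_lock_balance: optimizes for throughput by only dropping
 * this_rq->lock when the trylock on busiest fails and busiest has the
 * lower address; in that case both locks are re-taken in address order
 * (the same order double_rq_lock() uses), otherwise busiest is simply
 * nested under this_rq.  The address ordering is what prevents an AB-BA
 * deadlock between two CPUs balancing against each other.
 */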
1603static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1604 __releases(this_rq->lock)
1605 __acquires(busiest->lock)
1606 __acquires(this_rq->lock)
1607{
1608 int ret = 0;
1609
1610 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1611 if (busiest < this_rq) {
1612 raw_spin_unlock(&this_rq->lock);
1613 raw_spin_lock(&busiest->lock);
1614 raw_spin_lock_nested(&this_rq->lock,
1615 SINGLE_DEPTH_NESTING);
1616 ret = 1;
1617 } else
1618 raw_spin_lock_nested(&busiest->lock,
1619 SINGLE_DEPTH_NESTING);
1620 }
1621 return ret;
1622}
1623
1624#endif
1625
1626
1627
1628
1629static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1630{
1631 if (unlikely(!irqs_disabled())) {
1632
1633 raw_spin_unlock(&this_rq->lock);
1634 BUG_ON(1);
1635 }
1636
1637 return _double_lock_balance(this_rq, busiest);
1638}
1639
1640static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1641 __releases(busiest->lock)
1642{
1643 raw_spin_unlock(&busiest->lock);
1644 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1645}
1646
1647
1648
1649
1650
1651
1652
1653static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1654 __acquires(rq1->lock)
1655 __acquires(rq2->lock)
1656{
1657 BUG_ON(!irqs_disabled());
1658 if (rq1 == rq2) {
1659 raw_spin_lock(&rq1->lock);
1660 __acquire(rq2->lock);
1661 } else {
1662 if (rq1 < rq2) {
1663 raw_spin_lock(&rq1->lock);
1664 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1665 } else {
1666 raw_spin_lock(&rq2->lock);
1667 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1668 }
1669 }
1670}
1671
1672
1673
1674
1675
1676
1677
1678static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1679 __releases(rq1->lock)
1680 __releases(rq2->lock)
1681{
1682 raw_spin_unlock(&rq1->lock);
1683 if (rq1 != rq2)
1684 raw_spin_unlock(&rq2->lock);
1685 else
1686 __release(rq2->lock);
1687}
1688
1689#endif
1690
1691static void calc_load_account_idle(struct rq *this_rq);
1692static void update_sysctl(void);
1693static int get_update_sysctl_factor(void);
1694static void update_cpu_load(struct rq *this_rq);
1695
1696static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1697{
1698 set_task_rq(p, cpu);
1699#ifdef CONFIG_SMP
1700
1701
1702
1703
1704
1705 smp_wmb();
1706 task_thread_info(p)->cpu = cpu;
1707#endif
1708}
1709
1710static const struct sched_class rt_sched_class;
1711
1712#define sched_class_highest (&stop_sched_class)
1713#define for_each_class(class) \
1714 for (class = sched_class_highest; class; class = class->next)
1715
1716#include "sched_stats.h"
1717
1718static void inc_nr_running(struct rq *rq)
1719{
1720 rq->nr_running++;
1721}
1722
1723static void dec_nr_running(struct rq *rq)
1724{
1725 rq->nr_running--;
1726}
1727
1728static void set_load_weight(struct task_struct *p)
1729{
1730
1731
1732
1733 if (p->policy == SCHED_IDLE) {
1734 p->se.load.weight = WEIGHT_IDLEPRIO;
1735 p->se.load.inv_weight = WMULT_IDLEPRIO;
1736 return;
1737 }
1738
1739 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1740 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1741}
1742
1743static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1744{
1745 update_rq_clock(rq);
1746 sched_info_queued(p);
1747 p->sched_class->enqueue_task(rq, p, flags);
1748 p->se.on_rq = 1;
1749}
1750
1751static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1752{
1753 update_rq_clock(rq);
1754 sched_info_dequeued(p);
1755 p->sched_class->dequeue_task(rq, p, flags);
1756 p->se.on_rq = 0;
1757}
1758
1759
1760
1761
1762static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1763{
1764 if (task_contributes_to_load(p))
1765 rq->nr_uninterruptible--;
1766
1767 enqueue_task(rq, p, flags);
1768 inc_nr_running(rq);
1769}
1770
1771
1772
1773
1774static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1775{
1776 if (task_contributes_to_load(p))
1777 rq->nr_uninterruptible++;
1778
1779 dequeue_task(rq, p, flags);
1780 dec_nr_running(rq);
1781}
1782
1783#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
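/*
 * Per-CPU IRQ time accounting: cpu_hardirq_time/cpu_softirq_time count
 * nanoseconds spent in hard and soft interrupt context, sampled from
 * sched_clock_cpu() in account_system_vtime() and later subtracted from
 * the task clock in update_rq_clock_task().  On 32-bit kernels the
 * 64-bit counters are guarded against torn reads by the irq_time_seq
 * seqcount; on 64-bit kernels reads are naturally atomic, hence the two
 * irq_time_read() variants below.
 */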
1796static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1797static DEFINE_PER_CPU(u64, cpu_softirq_time);
1798
1799static DEFINE_PER_CPU(u64, irq_start_time);
1800static int sched_clock_irqtime;
1801
1802void enable_sched_clock_irqtime(void)
1803{
1804 sched_clock_irqtime = 1;
1805}
1806
1807void disable_sched_clock_irqtime(void)
1808{
1809 sched_clock_irqtime = 0;
1810}
1811
1812#ifndef CONFIG_64BIT
1813static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1814
1815static inline void irq_time_write_begin(void)
1816{
1817 __this_cpu_inc(irq_time_seq.sequence);
1818 smp_wmb();
1819}
1820
1821static inline void irq_time_write_end(void)
1822{
1823 smp_wmb();
1824 __this_cpu_inc(irq_time_seq.sequence);
1825}
1826
1827static inline u64 irq_time_read(int cpu)
1828{
1829 u64 irq_time;
1830 unsigned seq;
1831
1832 do {
1833 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1834 irq_time = per_cpu(cpu_softirq_time, cpu) +
1835 per_cpu(cpu_hardirq_time, cpu);
1836 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1837
1838 return irq_time;
1839}
1840#else
1841static inline void irq_time_write_begin(void)
1842{
1843}
1844
1845static inline void irq_time_write_end(void)
1846{
1847}
1848
1849static inline u64 irq_time_read(int cpu)
1850{
1851 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1852}
1853#endif
1854
1855
1856
1857
1858
1859void account_system_vtime(struct task_struct *curr)
1860{
1861 unsigned long flags;
1862 s64 delta;
1863 int cpu;
1864
1865 if (!sched_clock_irqtime)
1866 return;
1867
1868 local_irq_save(flags);
1869
1870 cpu = smp_processor_id();
1871 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1872 __this_cpu_add(irq_start_time, delta);
1873
1874 irq_time_write_begin();
1875
1876
1877
1878
1879
1880
1881 if (hardirq_count())
1882 __this_cpu_add(cpu_hardirq_time, delta);
1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1884 __this_cpu_add(cpu_softirq_time, delta);
1885
1886 irq_time_write_end();
1887 local_irq_restore(flags);
1888}
1889EXPORT_SYMBOL_GPL(account_system_vtime);
1890
1891static void update_rq_clock_task(struct rq *rq, s64 delta)
1892{
1893 s64 irq_delta;
1894
1895 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912 if (irq_delta > delta)
1913 irq_delta = delta;
1914
1915 rq->prev_irq_time += irq_delta;
1916 delta -= irq_delta;
1917 rq->clock_task += delta;
1918
1919 if (irq_delta && sched_feat(NONIRQ_POWER))
1920 sched_rt_avg_update(rq, irq_delta);
1921}
1922
1923#else
1924
1925static void update_rq_clock_task(struct rq *rq, s64 delta)
1926{
1927 rq->clock_task += delta;
1928}
1929
1930#endif
1931
1932#include "sched_idletask.c"
1933#include "sched_fair.c"
1934#include "sched_rt.c"
1935#include "sched_autogroup.c"
1936#include "sched_stoptask.c"
1937#ifdef CONFIG_SCHED_DEBUG
1938# include "sched_debug.c"
1939#endif
1940
1941void sched_set_stop_task(int cpu, struct task_struct *stop)
1942{
1943 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
1944 struct task_struct *old_stop = cpu_rq(cpu)->stop;
1945
1946 if (stop) {
1947
1948
1949
1950
1951
1952
1953
1954
 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
1956
1957 stop->sched_class = &stop_sched_class;
1958 }
1959
1960 cpu_rq(cpu)->stop = stop;
1961
1962 if (old_stop) {
1963
1964
1965
1966
1967 old_stop->sched_class = &rt_sched_class;
1968 }
1969}
1970
1971
1972
1973
1974static inline int __normal_prio(struct task_struct *p)
1975{
1976 return p->static_prio;
1977}
1978
1979
1980
1981
1982
1983
1984
1985
1986static inline int normal_prio(struct task_struct *p)
1987{
1988 int prio;
1989
1990 if (task_has_rt_policy(p))
1991 prio = MAX_RT_PRIO-1 - p->rt_priority;
1992 else
1993 prio = __normal_prio(p);
1994 return prio;
1995}
1996
1997
1998
1999
2000
2001
2002
2003
2004static int effective_prio(struct task_struct *p)
2005{
2006 p->normal_prio = normal_prio(p);
2007
2008
2009
2010
2011
2012 if (!rt_prio(p->prio))
2013 return p->normal_prio;
2014 return p->prio;
2015}
2016
2017
2018
2019
2020
2021inline int task_curr(const struct task_struct *p)
2022{
2023 return cpu_curr(task_cpu(p)) == p;
2024}
2025
2026static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2027 const struct sched_class *prev_class,
2028 int oldprio, int running)
2029{
2030 if (prev_class != p->sched_class) {
2031 if (prev_class->switched_from)
2032 prev_class->switched_from(rq, p, running);
2033 p->sched_class->switched_to(rq, p, running);
2034 } else
2035 p->sched_class->prio_changed(rq, p, oldprio, running);
2036}
2037
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2039{
2040 const struct sched_class *class;
2041
2042 if (p->sched_class == rq->curr->sched_class) {
2043 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2044 } else {
2045 for_each_class(class) {
2046 if (class == rq->curr->sched_class)
2047 break;
2048 if (class == p->sched_class) {
2049 resched_task(rq->curr);
2050 break;
2051 }
2052 }
2053 }
2054
2055
2056
2057
2058
2059 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2060 rq->skip_clock_update = 1;
2061}
2062
2063#ifdef CONFIG_SMP
2064
2065
2066
2067static int
2068task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2069{
2070 s64 delta;
2071
2072 if (p->sched_class != &fair_sched_class)
2073 return 0;
2074
2075 if (unlikely(p->policy == SCHED_IDLE))
2076 return 0;
2077
2078
2079
2080
2081 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2082 (&p->se == cfs_rq_of(&p->se)->next ||
2083 &p->se == cfs_rq_of(&p->se)->last))
2084 return 1;
2085
2086 if (sysctl_sched_migration_cost == -1)
2087 return 1;
2088 if (sysctl_sched_migration_cost == 0)
2089 return 0;
2090
2091 delta = now - p->se.exec_start;
2092
2093 return delta < (s64)sysctl_sched_migration_cost;
2094}
2095
2096void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2097{
2098#ifdef CONFIG_SCHED_DEBUG
2099
2100
2101
2102
2103 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2104 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2105#endif
2106
2107 trace_sched_migrate_task(p, new_cpu);
2108
2109 if (task_cpu(p) != new_cpu) {
2110 p->se.nr_migrations++;
2111 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2112 }
2113
2114 __set_task_cpu(p, new_cpu);
2115}
2116
2117struct migration_arg {
2118 struct task_struct *task;
2119 int dest_cpu;
2120};
2121
2122static int migration_cpu_stop(void *data);
2123
2124
2125
2126
2127
2128static bool migrate_task(struct task_struct *p, struct rq *rq)
2129{
2130
2131
2132
2133
2134 return p->se.on_rq || task_running(rq, p);
2135}
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
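/*
 * wait_task_inactive: wait until @p is no longer running on any CPU.
 * Busy-waits with cpu_relax() while the task is still on a CPU, sleeps
 * for a jiffy if it is still queued, and retries until it is fully off
 * the runqueue.  Returns a non-zero context-switch-count cookie if the
 * task was observed in @match_state (or if @match_state is 0), and 0 if
 * @match_state was given but the task changed state underneath us.
 */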
2153unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2154{
2155 unsigned long flags;
2156 int running, on_rq;
2157 unsigned long ncsw;
2158 struct rq *rq;
2159
2160 for (;;) {
2161
2162
2163
2164
2165
2166
2167 rq = task_rq(p);
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180 while (task_running(rq, p)) {
2181 if (match_state && unlikely(p->state != match_state))
2182 return 0;
2183 cpu_relax();
2184 }
2185
2186
2187
2188
2189
2190
2191 rq = task_rq_lock(p, &flags);
2192 trace_sched_wait_task(p);
2193 running = task_running(rq, p);
2194 on_rq = p->se.on_rq;
2195 ncsw = 0;
2196 if (!match_state || p->state == match_state)
2197 ncsw = p->nvcsw | LONG_MIN;
2198 task_rq_unlock(rq, &flags);
2199
2200
2201
2202
2203 if (unlikely(!ncsw))
2204 break;
2205
2206
2207
2208
2209
2210
2211
2212 if (unlikely(running)) {
2213 cpu_relax();
2214 continue;
2215 }
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226 if (unlikely(on_rq)) {
2227 schedule_timeout_uninterruptible(1);
2228 continue;
2229 }
2230
2231
2232
2233
2234
2235
2236 break;
2237 }
2238
2239 return ncsw;
2240}
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255void kick_process(struct task_struct *p)
2256{
2257 int cpu;
2258
2259 preempt_disable();
2260 cpu = task_cpu(p);
2261 if ((cpu != smp_processor_id()) && task_curr(p))
2262 smp_send_reschedule(cpu);
2263 preempt_enable();
2264}
2265EXPORT_SYMBOL_GPL(kick_process);
2266#endif
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277void task_oncpu_function_call(struct task_struct *p,
2278 void (*func) (void *info), void *info)
2279{
2280 int cpu;
2281
2282 preempt_disable();
2283 cpu = task_cpu(p);
2284 if (task_curr(p))
2285 smp_call_function_single(cpu, func, info, 1);
2286 preempt_enable();
2287}
2288
2289#ifdef CONFIG_SMP
2290
2291
2292
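/*
 * select_fallback_rq() picks a destination CPU when the preferred one is
 * no longer allowed or online: first any allowed CPU on the same node,
 * then any allowed active CPU, and finally whatever
 * cpuset_cpus_allowed_fallback() decides -- possibly breaking the task's
 * affinity, which is reported with a rate-limited printk.
 */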
2293static int select_fallback_rq(int cpu, struct task_struct *p)
2294{
2295 int dest_cpu;
2296 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2297
2298
2299 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2300 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2301 return dest_cpu;
2302
2303
2304 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2305 if (dest_cpu < nr_cpu_ids)
2306 return dest_cpu;
2307
2308
2309 dest_cpu = cpuset_cpus_allowed_fallback(p);
2310
2311
2312
2313
2314
2315 if (p->mm && printk_ratelimit()) {
2316 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2317 task_pid_nr(p), p->comm, cpu);
2318 }
2319
2320 return dest_cpu;
2321}
2322
2323
2324
2325
2326static inline
2327int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2328{
2329 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2342 !cpu_online(cpu)))
2343 cpu = select_fallback_rq(task_cpu(p), p);
2344
2345 return cpu;
2346}
2347
2348static void update_avg(u64 *avg, u64 sample)
2349{
2350 s64 diff = sample - *avg;
2351 *avg += diff >> 3;
2352}
2353#endif
2354
2355static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2356 bool is_sync, bool is_migrate, bool is_local,
2357 unsigned long en_flags)
2358{
2359 schedstat_inc(p, se.statistics.nr_wakeups);
2360 if (is_sync)
2361 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2362 if (is_migrate)
2363 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2364 if (is_local)
2365 schedstat_inc(p, se.statistics.nr_wakeups_local);
2366 else
2367 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2368
2369 activate_task(rq, p, en_flags);
2370}
2371
2372static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2373 int wake_flags, bool success)
2374{
2375 trace_sched_wakeup(p, success);
2376 check_preempt_curr(rq, p, wake_flags);
2377
2378 p->state = TASK_RUNNING;
2379#ifdef CONFIG_SMP
2380 if (p->sched_class->task_woken)
2381 p->sched_class->task_woken(rq, p);
2382
2383 if (unlikely(rq->idle_stamp)) {
2384 u64 delta = rq->clock - rq->idle_stamp;
2385 u64 max = 2*sysctl_sched_migration_cost;
2386
2387 if (delta > max)
2388 rq->avg_idle = max;
2389 else
2390 update_avg(&rq->avg_idle, delta);
2391 rq->idle_stamp = 0;
2392 }
2393#endif
2394
2395 if ((p->flags & PF_WQ_WORKER) && success)
2396 wq_worker_waking_up(p, cpu_of(rq));
2397}
2398
/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put @p on the run-queue if it's not already there.  The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Returns 1 if @p had to be woken up, 0 if @state did not match or
 * @p was already on the runqueue.
 */
2414static int try_to_wake_up(struct task_struct *p, unsigned int state,
2415 int wake_flags)
2416{
2417 int cpu, orig_cpu, this_cpu, success = 0;
2418 unsigned long flags;
2419 unsigned long en_flags = ENQUEUE_WAKEUP;
2420 struct rq *rq;
2421
2422 this_cpu = get_cpu();
2423
2424 smp_wmb();
2425 rq = task_rq_lock(p, &flags);
2426 if (!(p->state & state))
2427 goto out;
2428
2429 if (p->se.on_rq)
2430 goto out_running;
2431
2432 cpu = task_cpu(p);
2433 orig_cpu = cpu;
2434
2435#ifdef CONFIG_SMP
2436 if (unlikely(task_running(rq, p)))
2437 goto out_activate;
2438
2439
2440
2441
2442
2443
2444
2445 if (task_contributes_to_load(p)) {
2446 if (likely(cpu_online(orig_cpu)))
2447 rq->nr_uninterruptible--;
2448 else
2449 this_rq()->nr_uninterruptible--;
2450 }
2451 p->state = TASK_WAKING;
2452
2453 if (p->sched_class->task_waking) {
2454 p->sched_class->task_waking(rq, p);
2455 en_flags |= ENQUEUE_WAKING;
2456 }
2457
2458 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2459 if (cpu != orig_cpu)
2460 set_task_cpu(p, cpu);
2461 __task_rq_unlock(rq);
2462
2463 rq = cpu_rq(cpu);
2464 raw_spin_lock(&rq->lock);
2465
2466
2467
2468
2469
2470
2471
2472 WARN_ON(task_cpu(p) != cpu);
2473 WARN_ON(p->state != TASK_WAKING);
2474
2475#ifdef CONFIG_SCHEDSTATS
2476 schedstat_inc(rq, ttwu_count);
2477 if (cpu == this_cpu)
2478 schedstat_inc(rq, ttwu_local);
2479 else {
2480 struct sched_domain *sd;
2481 for_each_domain(this_cpu, sd) {
2482 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2483 schedstat_inc(sd, ttwu_wake_remote);
2484 break;
2485 }
2486 }
2487 }
2488#endif
2489
2490out_activate:
2491#endif
2492 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2493 cpu == this_cpu, en_flags);
2494 success = 1;
2495out_running:
2496 ttwu_post_activation(p, rq, wake_flags, success);
2497out:
2498 task_rq_unlock(rq, &flags);
2499 put_cpu();
2500
2501 return success;
2502}
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512static void try_to_wake_up_local(struct task_struct *p)
2513{
2514 struct rq *rq = task_rq(p);
2515 bool success = false;
2516
2517 BUG_ON(rq != this_rq());
2518 BUG_ON(p == current);
2519 lockdep_assert_held(&rq->lock);
2520
2521 if (!(p->state & TASK_NORMAL))
2522 return;
2523
2524 if (!p->se.on_rq) {
2525 if (likely(!task_running(rq, p))) {
2526 schedstat_inc(rq, ttwu_count);
2527 schedstat_inc(rq, ttwu_local);
2528 }
2529 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2530 success = true;
2531 }
2532 ttwu_post_activation(p, rq, 0, success);
2533}
2534
/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of
 * runnable processes.
 *
 * Returns 1 if the process was woken up, 0 if it was already running.
 */
2546int wake_up_process(struct task_struct *p)
2547{
2548 return try_to_wake_up(p, TASK_ALL, 0);
2549}
2550EXPORT_SYMBOL(wake_up_process);
2551
2552int wake_up_state(struct task_struct *p, unsigned int state)
2553{
2554 return try_to_wake_up(p, state, 0);
2555}
2556
2557
2558
2559
2560
2561
2562
2563static void __sched_fork(struct task_struct *p)
2564{
2565 p->se.exec_start = 0;
2566 p->se.sum_exec_runtime = 0;
2567 p->se.prev_sum_exec_runtime = 0;
2568 p->se.nr_migrations = 0;
2569
2570#ifdef CONFIG_SCHEDSTATS
2571 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2572#endif
2573
2574 INIT_LIST_HEAD(&p->rt.run_list);
2575 p->se.on_rq = 0;
2576 INIT_LIST_HEAD(&p->se.group_node);
2577
2578#ifdef CONFIG_PREEMPT_NOTIFIERS
2579 INIT_HLIST_HEAD(&p->preempt_notifiers);
2580#endif
2581}
2582
2583
2584
2585
2586void sched_fork(struct task_struct *p, int clone_flags)
2587{
2588 int cpu = get_cpu();
2589
2590 __sched_fork(p);
2591
2592
2593
2594
2595
2596 p->state = TASK_RUNNING;
2597
2598
2599
2600
2601 if (unlikely(p->sched_reset_on_fork)) {
2602 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2603 p->policy = SCHED_NORMAL;
2604 p->normal_prio = p->static_prio;
2605 }
2606
2607 if (PRIO_TO_NICE(p->static_prio) < 0) {
2608 p->static_prio = NICE_TO_PRIO(0);
2609 p->normal_prio = p->static_prio;
2610 set_load_weight(p);
2611 }
2612
2613
2614
2615
2616
2617 p->sched_reset_on_fork = 0;
2618 }
2619
2620
2621
2622
2623 p->prio = current->normal_prio;
2624
2625 if (!rt_prio(p->prio))
2626 p->sched_class = &fair_sched_class;
2627
2628 if (p->sched_class->task_fork)
2629 p->sched_class->task_fork(p);
2630
2631
2632
2633
2634
2635
2636
2637
2638 rcu_read_lock();
2639 set_task_cpu(p, cpu);
2640 rcu_read_unlock();
2641
2642#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2643 if (likely(sched_info_on()))
2644 memset(&p->sched_info, 0, sizeof(p->sched_info));
2645#endif
2646#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2647 p->oncpu = 0;
2648#endif
2649#ifdef CONFIG_PREEMPT
2650
2651 task_thread_info(p)->preempt_count = 1;
2652#endif
2653#ifdef CONFIG_SMP
2654 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2655#endif
2656
2657 put_cpu();
2658}
2659
2660
2661
2662
2663
2664
2665
2666
2667void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2668{
2669 unsigned long flags;
2670 struct rq *rq;
2671 int cpu __maybe_unused = get_cpu();
2672
2673#ifdef CONFIG_SMP
2674 rq = task_rq_lock(p, &flags);
2675 p->state = TASK_WAKING;
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2686 set_task_cpu(p, cpu);
2687
2688 p->state = TASK_RUNNING;
2689 task_rq_unlock(rq, &flags);
2690#endif
2691
2692 rq = task_rq_lock(p, &flags);
2693 activate_task(rq, p, 0);
2694 trace_sched_wakeup_new(p, 1);
2695 check_preempt_curr(rq, p, WF_FORK);
2696#ifdef CONFIG_SMP
2697 if (p->sched_class->task_woken)
2698 p->sched_class->task_woken(rq, p);
2699#endif
2700 task_rq_unlock(rq, &flags);
2701 put_cpu();
2702}
2703
2704#ifdef CONFIG_PREEMPT_NOTIFIERS
2705
2706
2707
2708
2709
2710void preempt_notifier_register(struct preempt_notifier *notifier)
2711{
 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2713}
2714EXPORT_SYMBOL_GPL(preempt_notifier_register);
2715
2716
2717
2718
2719
2720
2721
2722void preempt_notifier_unregister(struct preempt_notifier *notifier)
2723{
 hlist_del(&notifier->link);
2725}
2726EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2727
2728static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2729{
2730 struct preempt_notifier *notifier;
2731 struct hlist_node *node;
2732
2733 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2734 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2735}
2736
2737static void
2738fire_sched_out_preempt_notifiers(struct task_struct *curr,
2739 struct task_struct *next)
2740{
2741 struct preempt_notifier *notifier;
2742 struct hlist_node *node;
2743
2744 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2745 notifier->ops->sched_out(notifier, next);
2746}
2747
2748#else
2749
2750static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2751{
2752}
2753
2754static void
2755fire_sched_out_preempt_notifiers(struct task_struct *curr,
2756 struct task_struct *next)
2757{
2758}
2759
2760#endif
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775static inline void
2776prepare_task_switch(struct rq *rq, struct task_struct *prev,
2777 struct task_struct *next)
2778{
2779 fire_sched_out_preempt_notifiers(prev, next);
2780 prepare_lock_switch(rq, next);
2781 prepare_arch_switch(next);
2782}
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2800 __releases(rq->lock)
2801{
2802 struct mm_struct *mm = rq->prev_mm;
2803 long prev_state;
2804
2805 rq->prev_mm = NULL;
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818 prev_state = prev->state;
2819 finish_arch_switch(prev);
2820#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2821 local_irq_disable();
2822#endif
2823 perf_event_task_sched_in(current);
2824#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2825 local_irq_enable();
2826#endif
2827 finish_lock_switch(rq, prev);
2828
2829 fire_sched_in_preempt_notifiers(current);
2830 if (mm)
2831 mmdrop(mm);
2832 if (unlikely(prev_state == TASK_DEAD)) {
2833
2834
2835
2836
2837 kprobe_flush_task(prev);
2838 put_task_struct(prev);
2839 }
2840}
2841
2842#ifdef CONFIG_SMP
2843
2844
2845static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2846{
2847 if (prev->sched_class->pre_schedule)
2848 prev->sched_class->pre_schedule(rq, prev);
2849}
2850
2851
2852static inline void post_schedule(struct rq *rq)
2853{
2854 if (rq->post_schedule) {
2855 unsigned long flags;
2856
2857 raw_spin_lock_irqsave(&rq->lock, flags);
2858 if (rq->curr->sched_class->post_schedule)
2859 rq->curr->sched_class->post_schedule(rq);
2860 raw_spin_unlock_irqrestore(&rq->lock, flags);
2861
2862 rq->post_schedule = 0;
2863 }
2864}
2865
2866#else
2867
2868static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2869{
2870}
2871
2872static inline void post_schedule(struct rq *rq)
2873{
2874}
2875
2876#endif
2877
2878
2879
2880
2881
2882asmlinkage void schedule_tail(struct task_struct *prev)
2883 __releases(rq->lock)
2884{
2885 struct rq *rq = this_rq();
2886
2887 finish_task_switch(rq, prev);
2888
2889
2890
2891
2892
2893 post_schedule(rq);
2894
2895#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2896
2897 preempt_enable();
2898#endif
2899 if (current->set_child_tid)
2900 put_user(task_pid_vnr(current), current->set_child_tid);
2901}
2902
2903
2904
2905
2906
2907static inline void
2908context_switch(struct rq *rq, struct task_struct *prev,
2909 struct task_struct *next)
2910{
2911 struct mm_struct *mm, *oldmm;
2912
2913 prepare_task_switch(rq, prev, next);
2914 trace_sched_switch(prev, next);
2915 mm = next->mm;
2916 oldmm = prev->active_mm;
2917
2918
2919
2920
2921
2922 arch_start_context_switch(prev);
2923
2924 if (!mm) {
2925 next->active_mm = oldmm;
2926 atomic_inc(&oldmm->mm_count);
2927 enter_lazy_tlb(oldmm, next);
2928 } else
2929 switch_mm(oldmm, mm, next);
2930
2931 if (!prev->mm) {
2932 prev->active_mm = NULL;
2933 rq->prev_mm = oldmm;
2934 }
2935
2936
2937
2938
2939
2940
2941#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2942 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2943#endif
2944
2945
2946 switch_to(prev, next, prev);
2947
2948 barrier();
2949
2950
2951
2952
2953
2954 finish_task_switch(this_rq(), prev);
2955}
2956
2957
2958
2959
2960
2961
2962
2963
2964unsigned long nr_running(void)
2965{
2966 unsigned long i, sum = 0;
2967
2968 for_each_online_cpu(i)
2969 sum += cpu_rq(i)->nr_running;
2970
2971 return sum;
2972}
2973
2974unsigned long nr_uninterruptible(void)
2975{
2976 unsigned long i, sum = 0;
2977
2978 for_each_possible_cpu(i)
2979 sum += cpu_rq(i)->nr_uninterruptible;
2980
2981
2982
2983
2984
2985 if (unlikely((long)sum < 0))
2986 sum = 0;
2987
2988 return sum;
2989}
2990
2991unsigned long long nr_context_switches(void)
2992{
2993 int i;
2994 unsigned long long sum = 0;
2995
2996 for_each_possible_cpu(i)
2997 sum += cpu_rq(i)->nr_switches;
2998
2999 return sum;
3000}
3001
3002unsigned long nr_iowait(void)
3003{
3004 unsigned long i, sum = 0;
3005
3006 for_each_possible_cpu(i)
3007 sum += atomic_read(&cpu_rq(i)->nr_iowait);
3008
3009 return sum;
3010}
3011
3012unsigned long nr_iowait_cpu(int cpu)
3013{
3014 struct rq *this = cpu_rq(cpu);
3015 return atomic_read(&this->nr_iowait);
3016}
3017
3018unsigned long this_cpu_load(void)
3019{
3020 struct rq *this = this_rq();
3021 return this->cpu_load[0];
3022}
3023
3024
3025
3026static atomic_long_t calc_load_tasks;
3027static unsigned long calc_load_update;
3028unsigned long avenrun[3];
3029EXPORT_SYMBOL(avenrun);
3030
3031static long calc_load_fold_active(struct rq *this_rq)
3032{
3033 long nr_active, delta = 0;
3034
3035 nr_active = this_rq->nr_running;
3036 nr_active += (long) this_rq->nr_uninterruptible;
3037
3038 if (nr_active != this_rq->calc_load_active) {
3039 delta = nr_active - this_rq->calc_load_active;
3040 this_rq->calc_load_active = nr_active;
3041 }
3042
3043 return delta;
3044}
3045
3046static unsigned long
3047calc_load(unsigned long load, unsigned long exp, unsigned long active)
3048{
3049 load *= exp;
3050 load += active * (FIXED_1 - exp);
3051 load += 1UL << (FSHIFT - 1);
3052 return load >> FSHIFT;
3053}
3054
3055#ifdef CONFIG_NO_HZ
3056
3057
3058
3059
3060
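/*
 * For NO_HZ: CPUs going idle fold their nr_running/nr_uninterruptible
 * delta into calc_load_tasks_idle; it is picked up again by
 * calc_load_fold_idle() at the next LOAD_FREQ update.
 */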
3061static atomic_long_t calc_load_tasks_idle;
3062
3063static void calc_load_account_idle(struct rq *this_rq)
3064{
3065 long delta;
3066
3067 delta = calc_load_fold_active(this_rq);
3068 if (delta)
3069 atomic_long_add(delta, &calc_load_tasks_idle);
3070}
3071
3072static long calc_load_fold_idle(void)
3073{
3074 long delta = 0;
3075
3076
3077
3078
3079 if (atomic_long_read(&calc_load_tasks_idle))
3080 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3081
3082 return delta;
3083}
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
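/**
 * fixed_power_int - compute x^n in O(log n) time, where x is a
 * fixed-point value with @frac_bits fractional bits.
 *
 * Classic exponentiation by squaring: write n in binary, n = \Sum n_i*2^i,
 * so that x^n = \Prod_{n_i != 0} x^(2^i). Each multiply adds 1/2 ULP
 * before shifting the fractional bits back out, to round to nearest.
 */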
3100static unsigned long
3101fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3102{
3103 unsigned long result = 1UL << frac_bits;
3104
3105 if (n) for (;;) {
3106 if (n & 1) {
3107 result *= x;
3108 result += 1UL << (frac_bits - 1);
3109 result >>= frac_bits;
3110 }
3111 n >>= 1;
3112 if (!n)
3113 break;
3114 x *= x;
3115 x += 1UL << (frac_bits - 1);
3116 x >>= frac_bits;
3117 }
3118
3119 return result;
3120}
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
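/*
 * calc_load_n() folds n consecutive missed load-average periods in one go.
 *
 * A single period does (see calc_load()):
 *	a1 = a0 * e + a * (1 - e)
 * Iterating n times with a constant 'a' gives:
 *	an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1))
 *	   = a0 * e^n + a * (1 - e^n)
 * which is just calc_load() applied with e^n instead of e, so all we need
 * is fixed_power_int() to compute e^n.
 */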
3145static unsigned long
3146calc_load_n(unsigned long load, unsigned long exp,
3147 unsigned long active, unsigned int n)
3148{
3149
3150 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3151}
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162static void calc_global_nohz(unsigned long ticks)
3163{
3164 long delta, active, n;
3165
3166 if (time_before(jiffies, calc_load_update))
3167 return;
3168
3169
3170
3171
3172
3173
3174
3175 delta = calc_load_fold_idle();
3176 if (delta)
3177 atomic_long_add(delta, &calc_load_tasks);
3178
3179
3180
3181
3182 if (ticks >= LOAD_FREQ) {
3183 n = ticks / LOAD_FREQ;
3184
3185 active = atomic_long_read(&calc_load_tasks);
3186 active = active > 0 ? active * FIXED_1 : 0;
3187
3188 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3189 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3190 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3191
3192 calc_load_update += n * LOAD_FREQ;
3193 }
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205}
3206#else
3207static void calc_load_account_idle(struct rq *this_rq)
3208{
3209}
3210
3211static inline long calc_load_fold_idle(void)
3212{
3213 return 0;
3214}
3215
3216static void calc_global_nohz(unsigned long ticks)
3217{
3218}
3219#endif
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3230{
3231 loads[0] = (avenrun[0] + offset) << shift;
3232 loads[1] = (avenrun[1] + offset) << shift;
3233 loads[2] = (avenrun[2] + offset) << shift;
3234}
3235
3236
3237
3238
3239
3240void calc_global_load(unsigned long ticks)
3241{
3242 long active;
3243
3244 calc_global_nohz(ticks);
3245
3246 if (time_before(jiffies, calc_load_update + 10))
3247 return;
3248
3249 active = atomic_long_read(&calc_load_tasks);
3250 active = active > 0 ? active * FIXED_1 : 0;
3251
3252 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3253 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3254 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3255
3256 calc_load_update += LOAD_FREQ;
3257}
3258
3259
3260
3261
3262
3263static void calc_load_account_active(struct rq *this_rq)
3264{
3265 long delta;
3266
3267 if (time_before(jiffies, this_rq->calc_load_update))
3268 return;
3269
3270 delta = calc_load_fold_active(this_rq);
3271 delta += calc_load_fold_idle();
3272 if (delta)
3273 atomic_long_add(delta, &calc_load_tasks);
3274
3275 this_rq->calc_load_update += LOAD_FREQ;
3276}
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
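/*
 * Per tick, update_cpu_load() does, for each index i:
 *	cpu_load[i] = ((2^i - 1) * cpu_load[i] + cur_load) / 2^i
 *
 * With NO_HZ a CPU can miss many ticks; instead of replaying them one by
 * one, decay_load_missed() degrades the old load by a precomputed factor.
 * degrade_zero_ticks[i] is the number of missed ticks after which the old
 * load at index i is treated as zero.  degrade_factor[i][j] is the decay
 * factor (on a 128 = 2^DEGRADE_SHIFT scale) for 2^j missed ticks, so an
 * arbitrary number of missed ticks is handled by walking its set bits.
 */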
3305#define DEGRADE_SHIFT 7
3306static const unsigned char
3307 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3308static const unsigned char
3309 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3310 {0, 0, 0, 0, 0, 0, 0, 0},
3311 {64, 32, 8, 0, 0, 0, 0, 0},
3312 {96, 72, 40, 12, 1, 0, 0, 0},
3313 {112, 98, 75, 43, 15, 1, 0, 0},
3314 {120, 112, 98, 76, 45, 16, 2, 0} };
3315
3316
3317
3318
3319
3320
3321static unsigned long
3322decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3323{
3324 int j = 0;
3325
3326 if (!missed_updates)
3327 return load;
3328
3329 if (missed_updates >= degrade_zero_ticks[idx])
3330 return 0;
3331
3332 if (idx == 1)
3333 return load >> missed_updates;
3334
3335 while (missed_updates) {
3336 if (missed_updates % 2)
3337 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3338
3339 missed_updates >>= 1;
3340 j++;
3341 }
3342 return load;
3343}
3344
3345
3346
3347
3348
3349
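/*
 * Update rq->cpu_load[] statistics. cpu_load[i] is a decaying average of
 * the sampled load with a 1/2^i weight for the newest sample; ticks missed
 * while NO_HZ-idle are compensated via decay_load_missed() first.
 */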
3350static void update_cpu_load(struct rq *this_rq)
3351{
3352 unsigned long this_load = this_rq->load.weight;
3353 unsigned long curr_jiffies = jiffies;
3354 unsigned long pending_updates;
3355 int i, scale;
3356
3357 this_rq->nr_load_updates++;
3358
3359
3360 if (curr_jiffies == this_rq->last_load_update_tick)
3361 return;
3362
3363 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3364 this_rq->last_load_update_tick = curr_jiffies;
3365
3366
3367 this_rq->cpu_load[0] = this_load;
3368 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3369 unsigned long old_load, new_load;
3370
3371
3372
3373 old_load = this_rq->cpu_load[i];
3374 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3375 new_load = this_load;
3376
3377
3378
3379
3380
3381 if (new_load > old_load)
3382 new_load += scale - 1;
3383
3384 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3385 }
3386
3387 sched_avg_update(this_rq);
3388}
3389
3390static void update_cpu_load_active(struct rq *this_rq)
3391{
3392 update_cpu_load(this_rq);
3393
3394 calc_load_account_active(this_rq);
3395}
3396
3397#ifdef CONFIG_SMP
3398
3399
3400
3401
3402
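/*
 * sched_exec - rebalance the current task at execve() time.
 *
 * exec is a convenient rebalancing point since the task's memory and
 * cache footprint is about as small as it will ever be; ask the task's
 * class for a target CPU (SD_BALANCE_EXEC) and migrate there if allowed.
 */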
3403void sched_exec(void)
3404{
3405 struct task_struct *p = current;
3406 unsigned long flags;
3407 struct rq *rq;
3408 int dest_cpu;
3409
3410 rq = task_rq_lock(p, &flags);
3411 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3412 if (dest_cpu == smp_processor_id())
3413 goto unlock;
3414
3415
3416
3417
3418 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3419 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3420 struct migration_arg arg = { p, dest_cpu };
3421
3422 task_rq_unlock(rq, &flags);
3423 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3424 return;
3425 }
3426unlock:
3427 task_rq_unlock(rq, &flags);
3428}
3429
3430#endif
3431
3432DEFINE_PER_CPU(struct kernel_stat, kstat);
3433
3434EXPORT_PER_CPU_SYMBOL(kstat);
3435
3436
3437
3438
3439
3440
3441
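/*
 * Return any ns on the sched_clock that have not yet been accounted to
 * @p in case that task is currently running.  Called with @rq locked via
 * task_rq_lock().
 */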
3442static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3443{
3444 u64 ns = 0;
3445
3446 if (task_current(rq, p)) {
3447 update_rq_clock(rq);
3448 ns = rq->clock_task - p->se.exec_start;
3449 if ((s64)ns < 0)
3450 ns = 0;
3451 }
3452
3453 return ns;
3454}
3455
3456unsigned long long task_delta_exec(struct task_struct *p)
3457{
3458 unsigned long flags;
3459 struct rq *rq;
3460 u64 ns = 0;
3461
3462 rq = task_rq_lock(p, &flags);
3463 ns = do_task_delta_exec(p, rq);
3464 task_rq_unlock(rq, &flags);
3465
3466 return ns;
3467}
3468
3469
3470
3471
3472
3473
3474unsigned long long task_sched_runtime(struct task_struct *p)
3475{
3476 unsigned long flags;
3477 struct rq *rq;
3478 u64 ns = 0;
3479
3480 rq = task_rq_lock(p, &flags);
3481 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3482 task_rq_unlock(rq, &flags);
3483
3484 return ns;
3485}
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496unsigned long long thread_group_sched_runtime(struct task_struct *p)
3497{
3498 struct task_cputime totals;
3499 unsigned long flags;
3500 struct rq *rq;
3501 u64 ns;
3502
3503 rq = task_rq_lock(p, &flags);
3504 thread_group_cputime(p, &totals);
3505 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3506 task_rq_unlock(rq, &flags);
3507
3508 return ns;
3509}
3510
3511
3512
3513
3514
3515
3516
3517void account_user_time(struct task_struct *p, cputime_t cputime,
3518 cputime_t cputime_scaled)
3519{
3520 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3521 cputime64_t tmp;
3522
3523
3524 p->utime = cputime_add(p->utime, cputime);
3525 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3526 account_group_user_time(p, cputime);
3527
3528
3529 tmp = cputime_to_cputime64(cputime);
3530 if (TASK_NICE(p) > 0)
3531 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3532 else
3533 cpustat->user = cputime64_add(cpustat->user, tmp);
3534
3535 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3536
3537 acct_update_integrals(p);
3538}
3539
3540
3541
3542
3543
3544
3545
3546static void account_guest_time(struct task_struct *p, cputime_t cputime,
3547 cputime_t cputime_scaled)
3548{
3549 cputime64_t tmp;
3550 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3551
3552 tmp = cputime_to_cputime64(cputime);
3553
3554
3555 p->utime = cputime_add(p->utime, cputime);
3556 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3557 account_group_user_time(p, cputime);
3558 p->gtime = cputime_add(p->gtime, cputime);
3559
3560
3561 if (TASK_NICE(p) > 0) {
3562 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3563 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3564 } else {
3565 cpustat->user = cputime64_add(cpustat->user, tmp);
3566 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3567 }
3568}
3569
3570
3571
3572
3573
3574
3575
3576
3577void account_system_time(struct task_struct *p, int hardirq_offset,
3578 cputime_t cputime, cputime_t cputime_scaled)
3579{
3580 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3581 cputime64_t tmp;
3582
3583 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3584 account_guest_time(p, cputime, cputime_scaled);
3585 return;
3586 }
3587
3588
3589 p->stime = cputime_add(p->stime, cputime);
3590 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3591 account_group_system_time(p, cputime);
3592
3593
3594 tmp = cputime_to_cputime64(cputime);
3595 if (hardirq_count() - hardirq_offset)
3596 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3597 else if (in_serving_softirq())
3598 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3599 else
3600 cpustat->system = cputime64_add(cpustat->system, tmp);
3601
3602 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3603
3604
3605 acct_update_integrals(p);
3606}
3607
3608
3609
3610
3611
3612void account_steal_time(cputime_t cputime)
3613{
3614 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3615 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3616
3617 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3618}
3619
3620
3621
3622
3623
3624void account_idle_time(cputime_t cputime)
3625{
3626 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3627 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3628 struct rq *rq = this_rq();
3629
3630 if (atomic_read(&rq->nr_iowait) > 0)
3631 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3632 else
3633 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3634}
3635
3636#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3637
3638
3639
3640
3641
3642
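/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */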
3643void account_process_tick(struct task_struct *p, int user_tick)
3644{
3645 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3646 struct rq *rq = this_rq();
3647
3648 if (user_tick)
3649 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3650 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3651 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3652 one_jiffy_scaled);
3653 else
3654 account_idle_time(cputime_one_jiffy);
3655}
3656
3657
3658
3659
3660
3661
3662void account_steal_ticks(unsigned long ticks)
3663{
3664 account_steal_time(jiffies_to_cputime(ticks));
3665}
3666
3667
3668
3669
3670
3671void account_idle_ticks(unsigned long ticks)
3672{
3673 account_idle_time(jiffies_to_cputime(ticks));
3674}
3675
3676#endif
3677
3678
3679
3680
3681#ifdef CONFIG_VIRT_CPU_ACCOUNTING
3682void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3683{
3684 *ut = p->utime;
3685 *st = p->stime;
3686}
3687
3688void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3689{
3690 struct task_cputime cputime;
3691
3692 thread_group_cputime(p, &cputime);
3693
3694 *ut = cputime.utime;
3695 *st = cputime.stime;
3696}
3697#else
3698
3699#ifndef nsecs_to_cputime
3700# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3701#endif
3702
3703void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3704{
3705 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
3706
3707
3708
3709
3710 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3711
3712 if (total) {
3713 u64 temp = rtime;
3714
3715 temp *= utime;
3716 do_div(temp, total);
3717 utime = (cputime_t)temp;
3718 } else
3719 utime = rtime;
3720
3721
3722
3723
3724 p->prev_utime = max(p->prev_utime, utime);
3725 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
3726
3727 *ut = p->prev_utime;
3728 *st = p->prev_stime;
3729}
3730
3731
3732
3733
3734void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3735{
3736 struct signal_struct *sig = p->signal;
3737 struct task_cputime cputime;
3738 cputime_t rtime, utime, total;
3739
3740 thread_group_cputime(p, &cputime);
3741
3742 total = cputime_add(cputime.utime, cputime.stime);
3743 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3744
3745 if (total) {
3746 u64 temp = rtime;
3747
3748 temp *= cputime.utime;
3749 do_div(temp, total);
3750 utime = (cputime_t)temp;
3751 } else
3752 utime = rtime;
3753
3754 sig->prev_utime = max(sig->prev_utime, utime);
3755 sig->prev_stime = max(sig->prev_stime,
3756 cputime_sub(rtime, sig->prev_utime));
3757
3758 *ut = sig->prev_utime;
3759 *st = sig->prev_stime;
3760}
3761#endif
3762
3763
3764
3765
3766
3767
3768
3769
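/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */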
3770void scheduler_tick(void)
3771{
3772 int cpu = smp_processor_id();
3773 struct rq *rq = cpu_rq(cpu);
3774 struct task_struct *curr = rq->curr;
3775
3776 sched_clock_tick();
3777
3778 raw_spin_lock(&rq->lock);
3779 update_rq_clock(rq);
3780 update_cpu_load_active(rq);
3781 curr->sched_class->task_tick(rq, curr, 0);
3782 raw_spin_unlock(&rq->lock);
3783
3784 perf_event_task_tick();
3785
3786#ifdef CONFIG_SMP
3787 rq->idle_at_tick = idle_cpu(cpu);
3788 trigger_load_balance(rq, cpu);
3789#endif
3790}
3791
3792notrace unsigned long get_parent_ip(unsigned long addr)
3793{
3794 if (in_lock_functions(addr)) {
3795 addr = CALLER_ADDR2;
3796 if (in_lock_functions(addr))
3797 addr = CALLER_ADDR3;
3798 }
3799 return addr;
3800}
3801
3802#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3803 defined(CONFIG_PREEMPT_TRACER))
3804
3805void __kprobes add_preempt_count(int val)
3806{
3807#ifdef CONFIG_DEBUG_PREEMPT
3808
3809
3810
3811 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3812 return;
3813#endif
3814 preempt_count() += val;
3815#ifdef CONFIG_DEBUG_PREEMPT
3816
3817
3818
3819 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3820 PREEMPT_MASK - 10);
3821#endif
3822 if (preempt_count() == val)
3823 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3824}
3825EXPORT_SYMBOL(add_preempt_count);
3826
3827void __kprobes sub_preempt_count(int val)
3828{
3829#ifdef CONFIG_DEBUG_PREEMPT
3830
3831
3832
3833 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3834 return;
3835
3836
3837
3838 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3839 !(preempt_count() & PREEMPT_MASK)))
3840 return;
3841#endif
3842
3843 if (preempt_count() == val)
3844 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3845 preempt_count() -= val;
3846}
3847EXPORT_SYMBOL(sub_preempt_count);
3848
3849#endif
3850
3851
3852
3853
3854static noinline void __schedule_bug(struct task_struct *prev)
3855{
3856 struct pt_regs *regs = get_irq_regs();
3857
3858 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3859 prev->comm, prev->pid, preempt_count());
3860
3861 debug_show_held_locks(prev);
3862 print_modules();
3863 if (irqs_disabled())
3864 print_irqtrace_events(prev);
3865
3866 if (regs)
3867 show_regs(regs);
3868 else
3869 dump_stack();
3870}
3871
3872
3873
3874
3875static inline void schedule_debug(struct task_struct *prev)
3876{
3877
3878
3879
3880
3881
3882 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3883 __schedule_bug(prev);
3884
3885 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3886
3887 schedstat_inc(this_rq(), sched_count);
3888#ifdef CONFIG_SCHEDSTATS
3889 if (unlikely(prev->lock_depth >= 0)) {
3890 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
3891 schedstat_inc(prev, sched_info.bkl_count);
3892 }
3893#endif
3894}
3895
3896static void put_prev_task(struct rq *rq, struct task_struct *prev)
3897{
3898 if (prev->se.on_rq)
3899 update_rq_clock(rq);
3900 prev->sched_class->put_prev_task(rq, prev);
3901}
3902
3903
3904
3905
3906static inline struct task_struct *
3907pick_next_task(struct rq *rq)
3908{
3909 const struct sched_class *class;
3910 struct task_struct *p;
3911
3912
3913
3914
3915
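	/*
	 * Optimization: if all runnable tasks are in the fair class we can
	 * call that class' pick_next_task() directly and skip the class walk.
	 */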
3916 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3917 p = fair_sched_class.pick_next_task(rq);
3918 if (likely(p))
3919 return p;
3920 }
3921
3922 for_each_class(class) {
3923 p = class->pick_next_task(rq);
3924 if (p)
3925 return p;
3926 }
3927
3928 BUG();
3929}
3930
3931
3932
3933
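/*
 * schedule() is the main scheduler function.
 */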
3934asmlinkage void __sched schedule(void)
3935{
3936 struct task_struct *prev, *next;
3937 unsigned long *switch_count;
3938 struct rq *rq;
3939 int cpu;
3940
3941need_resched:
3942 preempt_disable();
3943 cpu = smp_processor_id();
3944 rq = cpu_rq(cpu);
3945 rcu_note_context_switch(cpu);
3946 prev = rq->curr;
3947
3948 release_kernel_lock(prev);
3949need_resched_nonpreemptible:
3950
3951 schedule_debug(prev);
3952
3953 if (sched_feat(HRTICK))
3954 hrtick_clear(rq);
3955
3956 raw_spin_lock_irq(&rq->lock);
3957
3958 switch_count = &prev->nivcsw;
3959 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3960 if (unlikely(signal_pending_state(prev->state, prev))) {
3961 prev->state = TASK_RUNNING;
3962 } else {
3963
3964
3965
3966
3967
3968
3969 if (prev->flags & PF_WQ_WORKER) {
3970 struct task_struct *to_wakeup;
3971
3972 to_wakeup = wq_worker_sleeping(prev, cpu);
3973 if (to_wakeup)
3974 try_to_wake_up_local(to_wakeup);
3975 }
3976 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3977 }
3978 switch_count = &prev->nvcsw;
3979 }
3980
3981 pre_schedule(rq, prev);
3982
3983 if (unlikely(!rq->nr_running))
3984 idle_balance(cpu, rq);
3985
3986 put_prev_task(rq, prev);
3987 next = pick_next_task(rq);
3988 clear_tsk_need_resched(prev);
3989 rq->skip_clock_update = 0;
3990
3991 if (likely(prev != next)) {
3992 sched_info_switch(prev, next);
3993 perf_event_task_sched_out(prev, next);
3994
3995 rq->nr_switches++;
3996 rq->curr = next;
3997 ++*switch_count;
3998
3999 context_switch(rq, prev, next);
4000
4001
4002
4003
4004
4005
4006 cpu = smp_processor_id();
4007 rq = cpu_rq(cpu);
4008 } else
4009 raw_spin_unlock_irq(&rq->lock);
4010
4011 post_schedule(rq);
4012
4013 if (unlikely(reacquire_kernel_lock(prev)))
4014 goto need_resched_nonpreemptible;
4015
4016 preempt_enable_no_resched();
4017 if (need_resched())
4018 goto need_resched;
4019}
4020EXPORT_SYMBOL(schedule);
4021
4022#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4023
4024
4025
4026
4027int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4028{
4029 unsigned int cpu;
4030 struct rq *rq;
4031
4032 if (!sched_feat(OWNER_SPIN))
4033 return 0;
4034
4035#ifdef CONFIG_DEBUG_PAGEALLOC
4036
4037
4038
4039
4040
4041 if (probe_kernel_address(&owner->cpu, cpu))
4042 return 0;
4043#else
4044 cpu = owner->cpu;
4045#endif
4046
4047
4048
4049
4050
4051 if (cpu >= nr_cpumask_bits)
4052 return 0;
4053
4054
4055
4056
4057
4058 if (!cpu_online(cpu))
4059 return 0;
4060
4061 rq = cpu_rq(cpu);
4062
4063 for (;;) {
4064
4065
4066
4067 if (lock->owner != owner) {
4068
4069
4070
4071
4072
4073 if (lock->owner)
4074 return 0;
4075 break;
4076 }
4077
4078
4079
4080
4081 if (task_thread_info(rq->curr) != owner || need_resched())
4082 return 0;
4083
4084 arch_mutex_cpu_relax();
4085 }
4086
4087 return 1;
4088}
4089#endif
4090
4091#ifdef CONFIG_PREEMPT
4092
4093
4094
4095
4096
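/*
 * This is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable(); it is not entered while preemption or
 * interrupts are disabled (see the check below).
 */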
4097asmlinkage void __sched notrace preempt_schedule(void)
4098{
4099 struct thread_info *ti = current_thread_info();
4100
4101
4102
4103
4104
4105 if (likely(ti->preempt_count || irqs_disabled()))
4106 return;
4107
4108 do {
4109 add_preempt_count_notrace(PREEMPT_ACTIVE);
4110 schedule();
4111 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4112
4113
4114
4115
4116
4117 barrier();
4118 } while (need_resched());
4119}
4120EXPORT_SYMBOL(preempt_schedule);
4121
4122
4123
4124
4125
4126
4127
4128asmlinkage void __sched preempt_schedule_irq(void)
4129{
4130 struct thread_info *ti = current_thread_info();
4131
4132
4133 BUG_ON(ti->preempt_count || !irqs_disabled());
4134
4135 do {
4136 add_preempt_count(PREEMPT_ACTIVE);
4137 local_irq_enable();
4138 schedule();
4139 local_irq_disable();
4140 sub_preempt_count(PREEMPT_ACTIVE);
4141
4142
4143
4144
4145
4146 barrier();
4147 } while (need_resched());
4148}
4149
4150#endif
4151
4152int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
4153 void *key)
4154{
4155 return try_to_wake_up(curr->private, mode, wake_flags);
4156}
4157EXPORT_SYMBOL(default_wake_function);
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
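/*
 * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0)
 * just wake everything up.  If it's an exclusive wakeup (nr_exclusive is a
 * small positive number) then we wake all the non-exclusive tasks and one
 * exclusive task.
 */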
4168static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4169 int nr_exclusive, int wake_flags, void *key)
4170{
4171 wait_queue_t *curr, *next;
4172
4173 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4174 unsigned flags = curr->flags;
4175
4176 if (curr->func(curr, mode, wake_flags, key) &&
4177 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4178 break;
4179 }
4180}
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192void __wake_up(wait_queue_head_t *q, unsigned int mode,
4193 int nr_exclusive, void *key)
4194{
4195 unsigned long flags;
4196
4197 spin_lock_irqsave(&q->lock, flags);
4198 __wake_up_common(q, mode, nr_exclusive, 0, key);
4199 spin_unlock_irqrestore(&q->lock, flags);
4200}
4201EXPORT_SYMBOL(__wake_up);
4202
4203
4204
4205
4206void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4207{
4208 __wake_up_common(q, mode, 1, 0, NULL);
4209}
4210EXPORT_SYMBOL_GPL(__wake_up_locked);
4211
4212void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4213{
4214 __wake_up_common(q, mode, 1, 0, key);
4215}
4216EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4236 int nr_exclusive, void *key)
4237{
4238 unsigned long flags;
4239 int wake_flags = WF_SYNC;
4240
4241 if (unlikely(!q))
4242 return;
4243
4244 if (unlikely(!nr_exclusive))
4245 wake_flags = 0;
4246
4247 spin_lock_irqsave(&q->lock, flags);
4248 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
4249 spin_unlock_irqrestore(&q->lock, flags);
4250}
4251EXPORT_SYMBOL_GPL(__wake_up_sync_key);
4252
4253
4254
4255
4256void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4257{
4258 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
4259}
4260EXPORT_SYMBOL_GPL(__wake_up_sync);
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
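/**
 * complete: - signals a single thread waiting on this completion
 * @x:  holds the state of this particular completion
 *
 * This will wake up a single thread waiting on this completion. Threads
 * will be awakened in the same order in which they were queued.
 */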
4274void complete(struct completion *x)
4275{
4276 unsigned long flags;
4277
4278 spin_lock_irqsave(&x->wait.lock, flags);
4279 x->done++;
4280 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4281 spin_unlock_irqrestore(&x->wait.lock, flags);
4282}
4283EXPORT_SYMBOL(complete);
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294void complete_all(struct completion *x)
4295{
4296 unsigned long flags;
4297
4298 spin_lock_irqsave(&x->wait.lock, flags);
4299 x->done += UINT_MAX/2;
4300 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4301 spin_unlock_irqrestore(&x->wait.lock, flags);
4302}
4303EXPORT_SYMBOL(complete_all);
4304
4305static inline long __sched
4306do_wait_for_common(struct completion *x, long timeout, int state)
4307{
4308 if (!x->done) {
4309 DECLARE_WAITQUEUE(wait, current);
4310
4311 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4312 do {
4313 if (signal_pending_state(state, current)) {
4314 timeout = -ERESTARTSYS;
4315 break;
4316 }
4317 __set_current_state(state);
4318 spin_unlock_irq(&x->wait.lock);
4319 timeout = schedule_timeout(timeout);
4320 spin_lock_irq(&x->wait.lock);
4321 } while (!x->done && timeout);
4322 __remove_wait_queue(&x->wait, &wait);
4323 if (!x->done)
4324 return timeout;
4325 }
4326 x->done--;
4327 return timeout ?: 1;
4328}
4329
4330static long __sched
4331wait_for_common(struct completion *x, long timeout, int state)
4332{
4333 might_sleep();
4334
4335 spin_lock_irq(&x->wait.lock);
4336 timeout = do_wait_for_common(x, timeout, state);
4337 spin_unlock_irq(&x->wait.lock);
4338 return timeout;
4339}
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351void __sched wait_for_completion(struct completion *x)
4352{
4353 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4354}
4355EXPORT_SYMBOL(wait_for_completion);
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366unsigned long __sched
4367wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4368{
4369 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4370}
4371EXPORT_SYMBOL(wait_for_completion_timeout);
4372
4373
4374
4375
4376
4377
4378
4379
4380int __sched wait_for_completion_interruptible(struct completion *x)
4381{
4382 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4383 if (t == -ERESTARTSYS)
4384 return t;
4385 return 0;
4386}
4387EXPORT_SYMBOL(wait_for_completion_interruptible);
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397long __sched
4398wait_for_completion_interruptible_timeout(struct completion *x,
4399 unsigned long timeout)
4400{
4401 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4402}
4403EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4404
4405
4406
4407
4408
4409
4410
4411
4412int __sched wait_for_completion_killable(struct completion *x)
4413{
4414 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4415 if (t == -ERESTARTSYS)
4416 return t;
4417 return 0;
4418}
4419EXPORT_SYMBOL(wait_for_completion_killable);
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430long __sched
4431wait_for_completion_killable_timeout(struct completion *x,
4432 unsigned long timeout)
4433{
4434 return wait_for_common(x, timeout, TASK_KILLABLE);
4435}
4436EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450bool try_wait_for_completion(struct completion *x)
4451{
4452 unsigned long flags;
4453 int ret = 1;
4454
4455 spin_lock_irqsave(&x->wait.lock, flags);
4456 if (!x->done)
4457 ret = 0;
4458 else
4459 x->done--;
4460 spin_unlock_irqrestore(&x->wait.lock, flags);
4461 return ret;
4462}
4463EXPORT_SYMBOL(try_wait_for_completion);
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473bool completion_done(struct completion *x)
4474{
4475 unsigned long flags;
4476 int ret = 1;
4477
4478 spin_lock_irqsave(&x->wait.lock, flags);
4479 if (!x->done)
4480 ret = 0;
4481 spin_unlock_irqrestore(&x->wait.lock, flags);
4482 return ret;
4483}
4484EXPORT_SYMBOL(completion_done);
4485
4486static long __sched
4487sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4488{
4489 unsigned long flags;
4490 wait_queue_t wait;
4491
4492 init_waitqueue_entry(&wait, current);
4493
4494 __set_current_state(state);
4495
4496 spin_lock_irqsave(&q->lock, flags);
4497 __add_wait_queue(q, &wait);
4498 spin_unlock(&q->lock);
4499 timeout = schedule_timeout(timeout);
4500 spin_lock_irq(&q->lock);
4501 __remove_wait_queue(q, &wait);
4502 spin_unlock_irqrestore(&q->lock, flags);
4503
4504 return timeout;
4505}
4506
4507void __sched interruptible_sleep_on(wait_queue_head_t *q)
4508{
4509 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4510}
4511EXPORT_SYMBOL(interruptible_sleep_on);
4512
4513long __sched
4514interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4515{
4516 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4517}
4518EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4519
4520void __sched sleep_on(wait_queue_head_t *q)
4521{
4522 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4523}
4524EXPORT_SYMBOL(sleep_on);
4525
4526long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4527{
4528 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4529}
4530EXPORT_SYMBOL(sleep_on_timeout);
4531
4532#ifdef CONFIG_RT_MUTEXES
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
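/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */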
4544void rt_mutex_setprio(struct task_struct *p, int prio)
4545{
4546 unsigned long flags;
4547 int oldprio, on_rq, running;
4548 struct rq *rq;
4549 const struct sched_class *prev_class;
4550
4551 BUG_ON(prio < 0 || prio > MAX_PRIO);
4552
4553 rq = task_rq_lock(p, &flags);
4554
4555 trace_sched_pi_setprio(p, prio);
4556 oldprio = p->prio;
4557 prev_class = p->sched_class;
4558 on_rq = p->se.on_rq;
4559 running = task_current(rq, p);
4560 if (on_rq)
4561 dequeue_task(rq, p, 0);
4562 if (running)
4563 p->sched_class->put_prev_task(rq, p);
4564
4565 if (rt_prio(prio))
4566 p->sched_class = &rt_sched_class;
4567 else
4568 p->sched_class = &fair_sched_class;
4569
4570 p->prio = prio;
4571
4572 if (running)
4573 p->sched_class->set_curr_task(rq);
4574 if (on_rq) {
4575 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4576
4577 check_class_changed(rq, p, prev_class, oldprio, running);
4578 }
4579 task_rq_unlock(rq, &flags);
4580}
4581
4582#endif
4583
4584void set_user_nice(struct task_struct *p, long nice)
4585{
4586 int old_prio, delta, on_rq;
4587 unsigned long flags;
4588 struct rq *rq;
4589
4590 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4591 return;
4592
4593
4594
4595
4596 rq = task_rq_lock(p, &flags);
4597
4598
4599
4600
4601
4602
4603 if (task_has_rt_policy(p)) {
4604 p->static_prio = NICE_TO_PRIO(nice);
4605 goto out_unlock;
4606 }
4607 on_rq = p->se.on_rq;
4608 if (on_rq)
4609 dequeue_task(rq, p, 0);
4610
4611 p->static_prio = NICE_TO_PRIO(nice);
4612 set_load_weight(p);
4613 old_prio = p->prio;
4614 p->prio = effective_prio(p);
4615 delta = p->prio - old_prio;
4616
4617 if (on_rq) {
4618 enqueue_task(rq, p, 0);
4619
4620
4621
4622
4623 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4624 resched_task(rq->curr);
4625 }
4626out_unlock:
4627 task_rq_unlock(rq, &flags);
4628}
4629EXPORT_SYMBOL(set_user_nice);
4630
4631
4632
4633
4634
4635
4636int can_nice(const struct task_struct *p, const int nice)
4637{
4638
4639 int nice_rlim = 20 - nice;
4640
4641 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4642 capable(CAP_SYS_NICE));
4643}
4644
4645#ifdef __ARCH_WANT_SYS_NICE
4646
4647
4648
4649
4650
4651
4652
4653
4654SYSCALL_DEFINE1(nice, int, increment)
4655{
4656 long nice, retval;
4657
4658
4659
4660
4661
4662
4663 if (increment < -40)
4664 increment = -40;
4665 if (increment > 40)
4666 increment = 40;
4667
4668 nice = TASK_NICE(current) + increment;
4669 if (nice < -20)
4670 nice = -20;
4671 if (nice > 19)
4672 nice = 19;
4673
4674 if (increment < 0 && !can_nice(current, nice))
4675 return -EPERM;
4676
4677 retval = security_task_setnice(current, nice);
4678 if (retval)
4679 return retval;
4680
4681 set_user_nice(current, nice);
4682 return 0;
4683}
4684
4685#endif
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695int task_prio(const struct task_struct *p)
4696{
4697 return p->prio - MAX_RT_PRIO;
4698}
4699
4700
4701
4702
4703
4704int task_nice(const struct task_struct *p)
4705{
4706 return TASK_NICE(p);
4707}
4708EXPORT_SYMBOL(task_nice);
4709
4710
4711
4712
4713
4714int idle_cpu(int cpu)
4715{
4716 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4717}
4718
4719
4720
4721
4722
4723struct task_struct *idle_task(int cpu)
4724{
4725 return cpu_rq(cpu)->idle;
4726}
4727
4728
4729
4730
4731
4732static struct task_struct *find_process_by_pid(pid_t pid)
4733{
4734 return pid ? find_task_by_vpid(pid) : current;
4735}
4736
4737
4738static void
4739__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4740{
4741 BUG_ON(p->se.on_rq);
4742
4743 p->policy = policy;
4744 p->rt_priority = prio;
4745 p->normal_prio = normal_prio(p);
4746
4747 p->prio = rt_mutex_getprio(p);
4748 if (rt_prio(p->prio))
4749 p->sched_class = &rt_sched_class;
4750 else
4751 p->sched_class = &fair_sched_class;
4752 set_load_weight(p);
4753}
4754
4755
4756
4757
4758static bool check_same_owner(struct task_struct *p)
4759{
4760 const struct cred *cred = current_cred(), *pcred;
4761 bool match;
4762
4763 rcu_read_lock();
4764 pcred = __task_cred(p);
4765 match = (cred->euid == pcred->euid ||
4766 cred->euid == pcred->uid);
4767 rcu_read_unlock();
4768 return match;
4769}
4770
4771static int __sched_setscheduler(struct task_struct *p, int policy,
4772 const struct sched_param *param, bool user)
4773{
4774 int retval, oldprio, oldpolicy = -1, on_rq, running;
4775 unsigned long flags;
4776 const struct sched_class *prev_class;
4777 struct rq *rq;
4778 int reset_on_fork;
4779
4780
4781 BUG_ON(in_interrupt());
4782recheck:
4783
4784 if (policy < 0) {
4785 reset_on_fork = p->sched_reset_on_fork;
4786 policy = oldpolicy = p->policy;
4787 } else {
4788 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4789 policy &= ~SCHED_RESET_ON_FORK;
4790
4791 if (policy != SCHED_FIFO && policy != SCHED_RR &&
4792 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4793 policy != SCHED_IDLE)
4794 return -EINVAL;
4795 }
4796
4797
4798
4799
4800
4801
4802 if (param->sched_priority < 0 ||
4803 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4804 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4805 return -EINVAL;
4806 if (rt_policy(policy) != (param->sched_priority != 0))
4807 return -EINVAL;
4808
4809
4810
4811
4812 if (user && !capable(CAP_SYS_NICE)) {
4813 if (rt_policy(policy)) {
4814 unsigned long rlim_rtprio =
4815 task_rlimit(p, RLIMIT_RTPRIO);
4816
4817
4818 if (policy != p->policy && !rlim_rtprio)
4819 return -EPERM;
4820
4821
4822 if (param->sched_priority > p->rt_priority &&
4823 param->sched_priority > rlim_rtprio)
4824 return -EPERM;
4825 }
4826
4827
4828
4829
4830 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4831 return -EPERM;
4832
4833
4834 if (!check_same_owner(p))
4835 return -EPERM;
4836
4837
4838 if (p->sched_reset_on_fork && !reset_on_fork)
4839 return -EPERM;
4840 }
4841
4842 if (user) {
4843 retval = security_task_setscheduler(p);
4844 if (retval)
4845 return retval;
4846 }
4847
4848
4849
4850
4851
4852 raw_spin_lock_irqsave(&p->pi_lock, flags);
4853
4854
4855
4856
4857 rq = __task_rq_lock(p);
4858
4859
4860
4861
4862 if (p == rq->stop) {
4863 __task_rq_unlock(rq);
4864 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4865 return -EINVAL;
4866 }
4867
4868#ifdef CONFIG_RT_GROUP_SCHED
4869 if (user) {
4870
4871
4872
4873
4874 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4875 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4876 !task_group_is_autogroup(task_group(p))) {
4877 __task_rq_unlock(rq);
4878 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4879 return -EPERM;
4880 }
4881 }
4882#endif
4883
4884
4885 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4886 policy = oldpolicy = -1;
4887 __task_rq_unlock(rq);
4888 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4889 goto recheck;
4890 }
4891 on_rq = p->se.on_rq;
4892 running = task_current(rq, p);
4893 if (on_rq)
4894 deactivate_task(rq, p, 0);
4895 if (running)
4896 p->sched_class->put_prev_task(rq, p);
4897
4898 p->sched_reset_on_fork = reset_on_fork;
4899
4900 oldprio = p->prio;
4901 prev_class = p->sched_class;
4902 __setscheduler(rq, p, policy, param->sched_priority);
4903
4904 if (running)
4905 p->sched_class->set_curr_task(rq);
4906 if (on_rq) {
4907 activate_task(rq, p, 0);
4908
4909 check_class_changed(rq, p, prev_class, oldprio, running);
4910 }
4911 __task_rq_unlock(rq);
4912 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4913
4914 rt_mutex_adjust_pi(p);
4915
4916 return 0;
4917}
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927int sched_setscheduler(struct task_struct *p, int policy,
4928 const struct sched_param *param)
4929{
4930 return __sched_setscheduler(p, policy, param, true);
4931}
4932EXPORT_SYMBOL_GPL(sched_setscheduler);
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4946 const struct sched_param *param)
4947{
4948 return __sched_setscheduler(p, policy, param, false);
4949}
4950
4951static int
4952do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4953{
4954 struct sched_param lparam;
4955 struct task_struct *p;
4956 int retval;
4957
4958 if (!param || pid < 0)
4959 return -EINVAL;
4960 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4961 return -EFAULT;
4962
4963 rcu_read_lock();
4964 retval = -ESRCH;
4965 p = find_process_by_pid(pid);
4966 if (p != NULL)
4967 retval = sched_setscheduler(p, policy, &lparam);
4968 rcu_read_unlock();
4969
4970 return retval;
4971}
4972
4973
4974
4975
4976
4977
4978
4979SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4980 struct sched_param __user *, param)
4981{
4982
4983 if (policy < 0)
4984 return -EINVAL;
4985
4986 return do_sched_setscheduler(pid, policy, param);
4987}
4988
4989
4990
4991
4992
4993
4994SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4995{
4996 return do_sched_setscheduler(pid, -1, param);
4997}
4998
4999
5000
5001
5002
5003SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5004{
5005 struct task_struct *p;
5006 int retval;
5007
5008 if (pid < 0)
5009 return -EINVAL;
5010
5011 retval = -ESRCH;
5012 rcu_read_lock();
5013 p = find_process_by_pid(pid);
5014 if (p) {
5015 retval = security_task_getscheduler(p);
5016 if (!retval)
5017 retval = p->policy
5018 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5019 }
5020 rcu_read_unlock();
5021 return retval;
5022}
5023
5024
5025
5026
5027
5028
5029SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5030{
5031 struct sched_param lp;
5032 struct task_struct *p;
5033 int retval;
5034
5035 if (!param || pid < 0)
5036 return -EINVAL;
5037
5038 rcu_read_lock();
5039 p = find_process_by_pid(pid);
5040 retval = -ESRCH;
5041 if (!p)
5042 goto out_unlock;
5043
5044 retval = security_task_getscheduler(p);
5045 if (retval)
5046 goto out_unlock;
5047
5048 lp.sched_priority = p->rt_priority;
5049 rcu_read_unlock();
5050
5051
5052
5053
5054 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5055
5056 return retval;
5057
5058out_unlock:
5059 rcu_read_unlock();
5060 return retval;
5061}
5062
5063long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5064{
5065 cpumask_var_t cpus_allowed, new_mask;
5066 struct task_struct *p;
5067 int retval;
5068
5069 get_online_cpus();
5070 rcu_read_lock();
5071
5072 p = find_process_by_pid(pid);
5073 if (!p) {
5074 rcu_read_unlock();
5075 put_online_cpus();
5076 return -ESRCH;
5077 }
5078
5079
5080 get_task_struct(p);
5081 rcu_read_unlock();
5082
5083 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5084 retval = -ENOMEM;
5085 goto out_put_task;
5086 }
5087 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5088 retval = -ENOMEM;
5089 goto out_free_cpus_allowed;
5090 }
5091 retval = -EPERM;
5092 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5093 goto out_unlock;
5094
5095 retval = security_task_setscheduler(p);
5096 if (retval)
5097 goto out_unlock;
5098
5099 cpuset_cpus_allowed(p, cpus_allowed);
5100 cpumask_and(new_mask, in_mask, cpus_allowed);
5101again:
5102 retval = set_cpus_allowed_ptr(p, new_mask);
5103
5104 if (!retval) {
5105 cpuset_cpus_allowed(p, cpus_allowed);
5106 if (!cpumask_subset(new_mask, cpus_allowed)) {
5107
5108
5109
5110
5111
5112 cpumask_copy(new_mask, cpus_allowed);
5113 goto again;
5114 }
5115 }
5116out_unlock:
5117 free_cpumask_var(new_mask);
5118out_free_cpus_allowed:
5119 free_cpumask_var(cpus_allowed);
5120out_put_task:
5121 put_task_struct(p);
5122 put_online_cpus();
5123 return retval;
5124}
5125
5126static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5127 struct cpumask *new_mask)
5128{
5129 if (len < cpumask_size())
5130 cpumask_clear(new_mask);
5131 else if (len > cpumask_size())
5132 len = cpumask_size();
5133
5134 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5135}
5136
5137
5138
5139
5140
5141
5142
5143SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5144 unsigned long __user *, user_mask_ptr)
5145{
5146 cpumask_var_t new_mask;
5147 int retval;
5148
5149 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5150 return -ENOMEM;
5151
5152 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5153 if (retval == 0)
5154 retval = sched_setaffinity(pid, new_mask);
5155 free_cpumask_var(new_mask);
5156 return retval;
5157}
5158
5159long sched_getaffinity(pid_t pid, struct cpumask *mask)
5160{
5161 struct task_struct *p;
5162 unsigned long flags;
5163 struct rq *rq;
5164 int retval;
5165
5166 get_online_cpus();
5167 rcu_read_lock();
5168
5169 retval = -ESRCH;
5170 p = find_process_by_pid(pid);
5171 if (!p)
5172 goto out_unlock;
5173
5174 retval = security_task_getscheduler(p);
5175 if (retval)
5176 goto out_unlock;
5177
5178 rq = task_rq_lock(p, &flags);
5179 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5180 task_rq_unlock(rq, &flags);
5181
5182out_unlock:
5183 rcu_read_unlock();
5184 put_online_cpus();
5185
5186 return retval;
5187}
5188
5189
5190
5191
5192
5193
5194
5195SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5196 unsigned long __user *, user_mask_ptr)
5197{
5198 int ret;
5199 cpumask_var_t mask;
5200
5201 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
5202 return -EINVAL;
5203 if (len & (sizeof(unsigned long)-1))
5204 return -EINVAL;
5205
5206 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5207 return -ENOMEM;
5208
5209 ret = sched_getaffinity(pid, mask);
5210 if (ret == 0) {
5211 size_t retlen = min_t(size_t, len, cpumask_size());
5212
5213 if (copy_to_user(user_mask_ptr, mask, retlen))
5214 ret = -EFAULT;
5215 else
5216 ret = retlen;
5217 }
5218 free_cpumask_var(mask);
5219
5220 return ret;
5221}
5222
5223
5224
5225
5226
5227
5228
5229SYSCALL_DEFINE0(sched_yield)
5230{
5231 struct rq *rq = this_rq_lock();
5232
5233 schedstat_inc(rq, yld_count);
5234 current->sched_class->yield_task(rq);
5235
5236
5237
5238
5239
5240 __release(rq->lock);
5241 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5242 do_raw_spin_unlock(&rq->lock);
5243 preempt_enable_no_resched();
5244
5245 schedule();
5246
5247 return 0;
5248}
5249
5250static inline int should_resched(void)
5251{
5252 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
5253}
5254
5255static void __cond_resched(void)
5256{
5257 add_preempt_count(PREEMPT_ACTIVE);
5258 schedule();
5259 sub_preempt_count(PREEMPT_ACTIVE);
5260}
5261
5262int __sched _cond_resched(void)
5263{
5264 if (should_resched()) {
5265 __cond_resched();
5266 return 1;
5267 }
5268 return 0;
5269}
5270EXPORT_SYMBOL(_cond_resched);
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280int __cond_resched_lock(spinlock_t *lock)
5281{
5282 int resched = should_resched();
5283 int ret = 0;
5284
5285 lockdep_assert_held(lock);
5286
5287 if (spin_needbreak(lock) || resched) {
5288 spin_unlock(lock);
5289 if (resched)
5290 __cond_resched();
5291 else
5292 cpu_relax();
5293 ret = 1;
5294 spin_lock(lock);
5295 }
5296 return ret;
5297}
5298EXPORT_SYMBOL(__cond_resched_lock);
5299
5300int __sched __cond_resched_softirq(void)
5301{
5302 BUG_ON(!in_softirq());
5303
5304 if (should_resched()) {
5305 local_bh_enable();
5306 __cond_resched();
5307 local_bh_disable();
5308 return 1;
5309 }
5310 return 0;
5311}
5312EXPORT_SYMBOL(__cond_resched_softirq);
5313
5314
5315
5316
5317
5318
5319
5320void __sched yield(void)
5321{
5322 set_current_state(TASK_RUNNING);
5323 sys_sched_yield();
5324}
5325EXPORT_SYMBOL(yield);
5326
5327
5328
5329
5330
5331void __sched io_schedule(void)
5332{
5333 struct rq *rq = raw_rq();
5334
5335 delayacct_blkio_start();
5336 atomic_inc(&rq->nr_iowait);
5337 current->in_iowait = 1;
5338 schedule();
5339 current->in_iowait = 0;
5340 atomic_dec(&rq->nr_iowait);
5341 delayacct_blkio_end();
5342}
5343EXPORT_SYMBOL(io_schedule);
5344
5345long __sched io_schedule_timeout(long timeout)
5346{
5347 struct rq *rq = raw_rq();
5348 long ret;
5349
5350 delayacct_blkio_start();
5351 atomic_inc(&rq->nr_iowait);
5352 current->in_iowait = 1;
5353 ret = schedule_timeout(timeout);
5354 current->in_iowait = 0;
5355 atomic_dec(&rq->nr_iowait);
5356 delayacct_blkio_end();
5357 return ret;
5358}
5359
5360
5361
5362
5363
5364
5365
5366
5367SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5368{
5369 int ret = -EINVAL;
5370
5371 switch (policy) {
5372 case SCHED_FIFO:
5373 case SCHED_RR:
5374 ret = MAX_USER_RT_PRIO-1;
5375 break;
5376 case SCHED_NORMAL:
5377 case SCHED_BATCH:
5378 case SCHED_IDLE:
5379 ret = 0;
5380 break;
5381 }
5382 return ret;
5383}
5384
5385
5386
5387
5388
5389
5390
5391
5392SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5393{
5394 int ret = -EINVAL;
5395
5396 switch (policy) {
5397 case SCHED_FIFO:
5398 case SCHED_RR:
5399 ret = 1;
5400 break;
5401 case SCHED_NORMAL:
5402 case SCHED_BATCH:
5403 case SCHED_IDLE:
5404 ret = 0;
5405 }
5406 return ret;
5407}
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5418 struct timespec __user *, interval)
5419{
5420 struct task_struct *p;
5421 unsigned int time_slice;
5422 unsigned long flags;
5423 struct rq *rq;
5424 int retval;
5425 struct timespec t;
5426
5427 if (pid < 0)
5428 return -EINVAL;
5429
5430 retval = -ESRCH;
5431 rcu_read_lock();
5432 p = find_process_by_pid(pid);
5433 if (!p)
5434 goto out_unlock;
5435
5436 retval = security_task_getscheduler(p);
5437 if (retval)
5438 goto out_unlock;
5439
5440 rq = task_rq_lock(p, &flags);
5441 time_slice = p->sched_class->get_rr_interval(rq, p);
5442 task_rq_unlock(rq, &flags);
5443
5444 rcu_read_unlock();
5445 jiffies_to_timespec(time_slice, &t);
5446 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5447 return retval;
5448
5449out_unlock:
5450 rcu_read_unlock();
5451 return retval;
5452}
5453
5454static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5455
5456void sched_show_task(struct task_struct *p)
5457{
5458 unsigned long free = 0;
5459 unsigned state;
5460
5461 state = p->state ? __ffs(p->state) + 1 : 0;
5462 printk(KERN_INFO "%-15.15s %c", p->comm,
5463 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5464#if BITS_PER_LONG == 32
5465 if (state == TASK_RUNNING)
5466 printk(KERN_CONT " running ");
5467 else
5468 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5469#else
5470 if (state == TASK_RUNNING)
5471 printk(KERN_CONT " running task ");
5472 else
5473 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5474#endif
5475#ifdef CONFIG_DEBUG_STACK_USAGE
5476 free = stack_not_used(p);
5477#endif
5478 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5479 task_pid_nr(p), task_pid_nr(p->real_parent),
5480 (unsigned long)task_thread_info(p)->flags);
5481
5482 show_stack(p, NULL);
5483}
5484
5485void show_state_filter(unsigned long state_filter)
5486{
5487 struct task_struct *g, *p;
5488
5489#if BITS_PER_LONG == 32
5490 printk(KERN_INFO
5491 " task PC stack pid father\n");
5492#else
5493 printk(KERN_INFO
5494 " task PC stack pid father\n");
5495#endif
5496 read_lock(&tasklist_lock);
5497 do_each_thread(g, p) {
5498
5499
5500
5501
5502 touch_nmi_watchdog();
5503 if (!state_filter || (p->state & state_filter))
5504 sched_show_task(p);
5505 } while_each_thread(g, p);
5506
5507 touch_all_softlockup_watchdogs();
5508
5509#ifdef CONFIG_SCHED_DEBUG
5510 sysrq_sched_debug_show();
5511#endif
5512 read_unlock(&tasklist_lock);
5513
5514
5515
5516 if (!state_filter)
5517 debug_show_all_locks();
5518}
5519
5520void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5521{
5522 idle->sched_class = &idle_sched_class;
5523}
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533void __cpuinit init_idle(struct task_struct *idle, int cpu)
5534{
5535 struct rq *rq = cpu_rq(cpu);
5536 unsigned long flags;
5537
5538 raw_spin_lock_irqsave(&rq->lock, flags);
5539
5540 __sched_fork(idle);
5541 idle->state = TASK_RUNNING;
5542 idle->se.exec_start = sched_clock();
5543
5544 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555 rcu_read_lock();
5556 __set_task_cpu(idle, cpu);
5557 rcu_read_unlock();
5558
5559 rq->curr = rq->idle = idle;
5560#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5561 idle->oncpu = 1;
5562#endif
5563 raw_spin_unlock_irqrestore(&rq->lock, flags);
5564
5565
5566#if defined(CONFIG_PREEMPT)
5567 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5568#else
5569 task_thread_info(idle)->preempt_count = 0;
5570#endif
5571
5572
5573
5574 idle->sched_class = &idle_sched_class;
5575 ftrace_graph_init_task(idle);
5576}
5577
5578
5579
5580
5581
5582
5583
5584
5585cpumask_var_t nohz_cpu_mask;
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596static int get_update_sysctl_factor(void)
5597{
5598 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5599 unsigned int factor;
5600
5601 switch (sysctl_sched_tunable_scaling) {
5602 case SCHED_TUNABLESCALING_NONE:
5603 factor = 1;
5604 break;
5605 case SCHED_TUNABLESCALING_LINEAR:
5606 factor = cpus;
5607 break;
5608 case SCHED_TUNABLESCALING_LOG:
5609 default:
5610 factor = 1 + ilog2(cpus);
5611 break;
5612 }
5613
5614 return factor;
5615}
5616
5617static void update_sysctl(void)
5618{
5619 unsigned int factor = get_update_sysctl_factor();
5620
5621#define SET_SYSCTL(name) \
5622 (sysctl_##name = (factor) * normalized_sysctl_##name)
5623 SET_SYSCTL(sched_min_granularity);
5624 SET_SYSCTL(sched_latency);
5625 SET_SYSCTL(sched_wakeup_granularity);
5626#undef SET_SYSCTL
5627}
5628
5629static inline void sched_init_granularity(void)
5630{
5631 update_sysctl();
5632}
5633
5634#ifdef CONFIG_SMP
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
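/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task; the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */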
5658int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5659{
5660 unsigned long flags;
5661 struct rq *rq;
5662 unsigned int dest_cpu;
5663 int ret = 0;
5664
5665
5666
5667
5668
5669again:
5670 while (task_is_waking(p))
5671 cpu_relax();
5672 rq = task_rq_lock(p, &flags);
5673 if (task_is_waking(p)) {
5674 task_rq_unlock(rq, &flags);
5675 goto again;
5676 }
5677
5678 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5679 ret = -EINVAL;
5680 goto out;
5681 }
5682
5683 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5684 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5685 ret = -EINVAL;
5686 goto out;
5687 }
5688
5689 if (p->sched_class->set_cpus_allowed)
5690 p->sched_class->set_cpus_allowed(p, new_mask);
5691 else {
5692 cpumask_copy(&p->cpus_allowed, new_mask);
5693 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5694 }
5695
5696
5697 if (cpumask_test_cpu(task_cpu(p), new_mask))
5698 goto out;
5699
5700 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5701 if (migrate_task(p, rq)) {
5702 struct migration_arg arg = { p, dest_cpu };
5703
5704 task_rq_unlock(rq, &flags);
5705 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5706 tlb_migrate_finish(p->mm);
5707 return 0;
5708 }
5709out:
5710 task_rq_unlock(rq, &flags);
5711
5712 return ret;
5713}
5714EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5728{
5729 struct rq *rq_dest, *rq_src;
5730 int ret = 0;
5731
5732 if (unlikely(!cpu_active(dest_cpu)))
5733 return ret;
5734
5735 rq_src = cpu_rq(src_cpu);
5736 rq_dest = cpu_rq(dest_cpu);
5737
5738 double_rq_lock(rq_src, rq_dest);
5739
5740 if (task_cpu(p) != src_cpu)
5741 goto done;
5742
5743 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
5744 goto fail;
5745
5746
5747
5748
5749
5750 if (p->se.on_rq) {
5751 deactivate_task(rq_src, p, 0);
5752 set_task_cpu(p, dest_cpu);
5753 activate_task(rq_dest, p, 0);
5754 check_preempt_curr(rq_dest, p, 0);
5755 }
5756done:
5757 ret = 1;
5758fail:
5759 double_rq_unlock(rq_src, rq_dest);
5760 return ret;
5761}
5762
5763
5764
5765
5766
5767
5768static int migration_cpu_stop(void *data)
5769{
5770 struct migration_arg *arg = data;
5771
5772
5773
5774
5775
5776 local_irq_disable();
5777 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5778 local_irq_enable();
5779 return 0;
5780}
5781
5782#ifdef CONFIG_HOTPLUG_CPU
5783
5784
5785
5786
5787
5788void idle_task_exit(void)
5789{
5790 struct mm_struct *mm = current->active_mm;
5791
5792 BUG_ON(cpu_online(smp_processor_id()));
5793
5794 if (mm != &init_mm)
5795 switch_mm(mm, &init_mm, current);
5796 mmdrop(mm);
5797}
5798
5799
5800
5801
5802
5803
5804
5805
5806static void migrate_nr_uninterruptible(struct rq *rq_src)
5807{
5808 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5809
5810 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5811 rq_src->nr_uninterruptible = 0;
5812}
5813
5814
5815
5816
5817static void calc_global_load_remove(struct rq *rq)
5818{
5819 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5820 rq->calc_load_active = 0;
5821}
5822
5823
5824
5825
5826
5827
5828
5829
5830
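/*
 * Migrate all tasks off the dead CPU's runqueue so it can go down.
 * Called with rq->lock held; the lock is dropped and re-taken around
 * each __migrate_task() call.
 */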
5831static void migrate_tasks(unsigned int dead_cpu)
5832{
5833 struct rq *rq = cpu_rq(dead_cpu);
5834 struct task_struct *next, *stop = rq->stop;
5835 int dest_cpu;
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846 rq->stop = NULL;
5847
5848 for ( ; ; ) {
5849
5850
5851
5852
5853 if (rq->nr_running == 1)
5854 break;
5855
5856 next = pick_next_task(rq);
5857 BUG_ON(!next);
5858 next->sched_class->put_prev_task(rq, next);
5859
5860
5861 dest_cpu = select_fallback_rq(dead_cpu, next);
5862 raw_spin_unlock(&rq->lock);
5863
5864 __migrate_task(next, dead_cpu, dest_cpu);
5865
5866 raw_spin_lock(&rq->lock);
5867 }
5868
5869 rq->stop = stop;
5870}
5871
5872#endif
5873
5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5875
5876static struct ctl_table sd_ctl_dir[] = {
5877 {
5878 .procname = "sched_domain",
5879 .mode = 0555,
5880 },
5881 {}
5882};
5883
5884static struct ctl_table sd_ctl_root[] = {
5885 {
5886 .procname = "kernel",
5887 .mode = 0555,
5888 .child = sd_ctl_dir,
5889 },
5890 {}
5891};
5892
5893static struct ctl_table *sd_alloc_ctl_entry(int n)
5894{
5895 struct ctl_table *entry =
5896 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5897
5898 return entry;
5899}
5900
5901static void sd_free_ctl_entry(struct ctl_table **tablep)
5902{
5903 struct ctl_table *entry;
5904
5905
5906
5907
5908
5909
5910
5911 for (entry = *tablep; entry->mode; entry++) {
5912 if (entry->child)
5913 sd_free_ctl_entry(&entry->child);
5914 if (entry->proc_handler == NULL)
5915 kfree(entry->procname);
5916 }
5917
5918 kfree(*tablep);
5919 *tablep = NULL;
5920}
5921
5922static void
5923set_table_entry(struct ctl_table *entry,
5924 const char *procname, void *data, int maxlen,
5925 mode_t mode, proc_handler *proc_handler)
5926{
5927 entry->procname = procname;
5928 entry->data = data;
5929 entry->maxlen = maxlen;
5930 entry->mode = mode;
5931 entry->proc_handler = proc_handler;
5932}
5933
5934static struct ctl_table *
5935sd_alloc_ctl_domain_table(struct sched_domain *sd)
5936{
5937 struct ctl_table *table = sd_alloc_ctl_entry(13);
5938
5939 if (table == NULL)
5940 return NULL;
5941
5942 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5943 sizeof(long), 0644, proc_doulongvec_minmax);
5944 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5945 sizeof(long), 0644, proc_doulongvec_minmax);
5946 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5947 sizeof(int), 0644, proc_dointvec_minmax);
5948 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5949 sizeof(int), 0644, proc_dointvec_minmax);
5950 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5951 sizeof(int), 0644, proc_dointvec_minmax);
5952 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5953 sizeof(int), 0644, proc_dointvec_minmax);
5954 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5955 sizeof(int), 0644, proc_dointvec_minmax);
5956 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5957 sizeof(int), 0644, proc_dointvec_minmax);
5958 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5959 sizeof(int), 0644, proc_dointvec_minmax);
5960 set_table_entry(&table[9], "cache_nice_tries",
5961 &sd->cache_nice_tries,
5962 sizeof(int), 0644, proc_dointvec_minmax);
5963 set_table_entry(&table[10], "flags", &sd->flags,
5964 sizeof(int), 0644, proc_dointvec_minmax);
5965 set_table_entry(&table[11], "name", sd->name,
5966 CORENAME_MAX_SIZE, 0444, proc_dostring);
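 /* &table[12] is left zeroed and terminates the table. */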
5967
5968
5969 return table;
5970}
5971
5972static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5973{
5974 struct ctl_table *entry, *table;
5975 struct sched_domain *sd;
5976 int domain_num = 0, i;
5977 char buf[32];
5978
5979 for_each_domain(cpu, sd)
5980 domain_num++;
5981 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5982 if (table == NULL)
5983 return NULL;
5984
5985 i = 0;
5986 for_each_domain(cpu, sd) {
5987 snprintf(buf, sizeof(buf), "domain%d", i);
5988 entry->procname = kstrdup(buf, GFP_KERNEL);
5989 entry->mode = 0555;
5990 entry->child = sd_alloc_ctl_domain_table(sd);
5991 entry++;
5992 i++;
5993 }
5994 return table;
5995}
5996
5997static struct ctl_table_header *sd_sysctl_header;
5998static void register_sched_domain_sysctl(void)
5999{
6000 int i, cpu_num = num_possible_cpus();
6001 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6002 char buf[32];
6003
6004 WARN_ON(sd_ctl_dir[0].child);
6005 sd_ctl_dir[0].child = entry;
6006
6007 if (entry == NULL)
6008 return;
6009
6010 for_each_possible_cpu(i) {
6011 snprintf(buf, sizeof(buf), "cpu%d", i);
6012 entry->procname = kstrdup(buf, GFP_KERNEL);
6013 entry->mode = 0555;
6014 entry->child = sd_alloc_ctl_cpu_table(i);
6015 entry++;
6016 }
6017
6018 WARN_ON(sd_sysctl_header);
6019 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6020}
6021
6022
6023static void unregister_sched_domain_sysctl(void)
6024{
6025 if (sd_sysctl_header)
6026 unregister_sysctl_table(sd_sysctl_header);
6027 sd_sysctl_header = NULL;
6028 if (sd_ctl_dir[0].child)
6029 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6030}
6031#else
6032static void register_sched_domain_sysctl(void)
6033{
6034}
6035static void unregister_sched_domain_sysctl(void)
6036{
6037}
6038#endif
6039
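/* Mark @rq online in its root domain and notify the scheduling classes. */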
6040static void set_rq_online(struct rq *rq)
6041{
6042 if (!rq->online) {
6043 const struct sched_class *class;
6044
6045 cpumask_set_cpu(rq->cpu, rq->rd->online);
6046 rq->online = 1;
6047
6048 for_each_class(class) {
6049 if (class->rq_online)
6050 class->rq_online(rq);
6051 }
6052 }
6053}
6054
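/* Notify the scheduling classes, then mark @rq offline in its root domain. */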
6055static void set_rq_offline(struct rq *rq)
6056{
6057 if (rq->online) {
6058 const struct sched_class *class;
6059
6060 for_each_class(class) {
6061 if (class->rq_offline)
6062 class->rq_offline(rq);
6063 }
6064
6065 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6066 rq->online = 0;
6067 }
6068}
6069
6070
6071
6072
6073
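/*
 * migration_call - CPU hotplug callback.  Keeps calc_load and root-domain
 * state in sync as CPUs come and go, and drains a dying CPU's runqueue
 * via migrate_tasks().
 */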
6074static int __cpuinit
6075migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6076{
6077 int cpu = (long)hcpu;
6078 unsigned long flags;
6079 struct rq *rq = cpu_rq(cpu);
6080
6081 switch (action & ~CPU_TASKS_FROZEN) {
6082
6083 case CPU_UP_PREPARE:
6084 rq->calc_load_update = calc_load_update;
6085 break;
6086
6087 case CPU_ONLINE:
6088
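 /* Update our root-domain: */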
6089 raw_spin_lock_irqsave(&rq->lock, flags);
6090 if (rq->rd) {
6091 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6092
6093 set_rq_online(rq);
6094 }
6095 raw_spin_unlock_irqrestore(&rq->lock, flags);
6096 break;
6097
6098#ifdef CONFIG_HOTPLUG_CPU
6099 case CPU_DYING:
6100
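 /* Take the runqueue offline in its root domain: */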
6101 raw_spin_lock_irqsave(&rq->lock, flags);
6102 if (rq->rd) {
6103 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6104 set_rq_offline(rq);
6105 }
6106 migrate_tasks(cpu);
6107 BUG_ON(rq->nr_running != 1); /* only the stop/migration thread is left */
6108 raw_spin_unlock_irqrestore(&rq->lock, flags);
6109
6110 migrate_nr_uninterruptible(rq);
6111 calc_global_load_remove(rq);
6112 break;
6113#endif
6114 }
6115 return NOTIFY_OK;
6116}
6117
6118
6119
6120
6121
6122
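/*
 * Registered at CPU_PRI_MIGRATION priority so that migration_call() runs
 * before lower-priority CPU notifiers during hotplug transitions.
 */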
6123static struct notifier_block __cpuinitdata migration_notifier = {
6124 .notifier_call = migration_call,
6125 .priority = CPU_PRI_MIGRATION,
6126};
6127
6128static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6129 unsigned long action, void *hcpu)
6130{
6131 switch (action & ~CPU_TASKS_FROZEN) {
6132 case CPU_ONLINE:
6133 case CPU_DOWN_FAILED:
6134 set_cpu_active((long)hcpu, true);
6135 return NOTIFY_OK;
6136 default:
6137 return NOTIFY_DONE;
6138 }
6139}
6140
6141static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6142 unsigned long action, void *hcpu)
6143{
6144 switch (action & ~CPU_TASKS_FROZEN) {
6145 case CPU_DOWN_PREPARE:
6146 set_cpu_active((long)hcpu, false);
6147 return NOTIFY_OK;
6148 default:
6149 return NOTIFY_DONE;
6150 }
6151}
6152
6153static int __init migration_init(void)
6154{
6155 void *cpu = (void *)(long)smp_processor_id();
6156 int err;
6157
6158
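 /* Initialize migration state for the boot CPU: */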
6159 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6160 BUG_ON(err == NOTIFY_BAD);
6161 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6162 register_cpu_notifier(&migration_notifier);
6163
6164
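 /* Register the cpu-active/inactive notifiers: */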
6165 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6166 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6167
6168 return 0;
6169}
6170early_initcall(migration_init);
6171#endif
6172
6173#ifdef CONFIG_SMP
6174
6175#ifdef CONFIG_SCHED_DEBUG
6176
6177static __read_mostly int sched_domain_debug_enabled;
6178
6179static int __init sched_domain_debug_setup(char *str)
6180{
6181 sched_domain_debug_enabled = 1;
6182
6183 return 0;
6184}
6185early_param("sched_debug", sched_domain_debug_setup);
6186
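/*
 * Print one level of the sched-domain hierarchy for @cpu and sanity-check
 * its groups.  Returns -1 when the domain does not load-balance, which
 * stops the caller from descending further.
 */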
6187static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6188 struct cpumask *groupmask)
6189{
6190 struct sched_group *group = sd->groups;
6191 char str[256];
6192
6193 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6194 cpumask_clear(groupmask);
6195
6196 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6197
6198 if (!(sd->flags & SD_LOAD_BALANCE)) {
6199 printk(KERN_CONT "does not load-balance\n");
6200 if (sd->parent)
6201 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6202 " has parent\n");
6203 return -1;
6204 }
6205
6206 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6207
6208 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6209 printk(KERN_ERR "ERROR: domain->span does not contain "
6210 "CPU%d\n", cpu);
6211 }
6212 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6213 printk(KERN_ERR "ERROR: domain->groups does not contain"
6214 " CPU%d\n", cpu);
6215 }
6216
6217 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6218 do {
6219 if (!group) {
6220 printk(KERN_CONT "\n");
6221 printk(KERN_ERR "ERROR: group is NULL\n");
6222 break;
6223 }
6224
6225 if (!group->cpu_power) {
6226 printk(KERN_CONT "\n");
6227 printk(KERN_ERR "ERROR: domain->cpu_power not "
6228 "set\n");
6229 break;
6230 }
6231
6232 if (!cpumask_weight(sched_group_cpus(group))) {
6233 printk(KERN_CONT "\n");
6234 printk(KERN_ERR "ERROR: empty group\n");
6235 break;
6236 }
6237
6238 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6239 printk(KERN_CONT "\n");
6240 printk(KERN_ERR "ERROR: repeated CPUs\n");
6241 break;
6242 }
6243
6244 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6245
6246 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6247
6248 printk(KERN_CONT " %s", str);
6249 if (group->cpu_power != SCHED_LOAD_SCALE) {
6250 printk(KERN_CONT " (cpu_power = %d)",
6251 group->cpu_power);
6252 }
6253
6254 group = group->next;
6255 } while (group != sd->groups);
6256 printk(KERN_CONT "\n");
6257
6258 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6259 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6260
6261 if (sd->parent &&
6262 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6263 printk(KERN_ERR "ERROR: parent span is not a superset "
6264 "of domain->span\n");
6265 return 0;
6266}
6267
6268static void sched_domain_debug(struct sched_domain *sd, int cpu)
6269{
6270 cpumask_var_t groupmask;
6271 int level = 0;
6272
6273 if (!sched_domain_debug_enabled)
6274 return;
6275
6276 if (!sd) {
6277 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6278 return;
6279 }
6280
6281 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6282
6283 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6284 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6285 return;
6286 }
6287
6288 for (;;) {
6289 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6290 break;
6291 level++;
6292 sd = sd->parent;
6293 if (!sd)
6294 break;
6295 }
6296 free_cpumask_var(groupmask);
6297}
6298#else
6299# define sched_domain_debug(sd, cpu) do { } while (0)
6300#endif
6301
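/*
 * A domain is degenerate (and can be dropped) when it spans a single CPU,
 * or when it neither load-balances across more than one group nor
 * provides wake-affine placement.
 */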
6302static int sd_degenerate(struct sched_domain *sd)
6303{
6304 if (cpumask_weight(sched_domain_span(sd)) == 1)
6305 return 1;
6306
6307
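 /* The following flags only matter with at least two groups: */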
6308 if (sd->flags & (SD_LOAD_BALANCE |
6309 SD_BALANCE_NEWIDLE |
6310 SD_BALANCE_FORK |
6311 SD_BALANCE_EXEC |
6312 SD_SHARE_CPUPOWER |
6313 SD_SHARE_PKG_RESOURCES)) {
6314 if (sd->groups != sd->groups->next)
6315 return 0;
6316 }
6317
6318
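 /* The following flags do not use groups: */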
6319 if (sd->flags & (SD_WAKE_AFFINE))
6320 return 0;
6321
6322 return 1;
6323}
6324
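/*
 * Returns 1 when @parent adds nothing over @sd: it covers the same span
 * and, once the flags that need more than one group are discounted, sets
 * no flags that @sd lacks.  Such a parent can be collapsed.
 */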
6325static int
6326sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6327{
6328 unsigned long cflags = sd->flags, pflags = parent->flags;
6329
6330 if (sd_degenerate(parent))
6331 return 1;
6332
6333 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6334 return 0;
6335
6336
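 /* Flags needing groups do not count if the parent has only one group: */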
6337 if (parent->groups == parent->groups->next) {
6338 pflags &= ~(SD_LOAD_BALANCE |
6339 SD_BALANCE_NEWIDLE |
6340 SD_BALANCE_FORK |
6341 SD_BALANCE_EXEC |
6342 SD_SHARE_CPUPOWER |
6343 SD_SHARE_PKG_RESOURCES);
6344 if (nr_node_ids == 1)
6345 pflags &= ~SD_SERIALIZE;
6346 }
6347 if (~cflags & pflags)
6348 return 0;
6349
6350 return 1;
6351}
6352
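/*
 * Free a root_domain; synchronize_sched() makes sure no CPU is still
 * inside a scheduler section that references it.
 */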
6353static void free_rootdomain(struct root_domain *rd)
6354{
6355 synchronize_sched();
6356
6357 cpupri_cleanup(&rd->cpupri);
6358
6359 free_cpumask_var(rd->rto_mask);
6360 free_cpumask_var(rd->online);
6361 free_cpumask_var(rd->span);
6362 kfree(rd);
6363}
6364
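/*
 * Attach @rq to the root domain @rd, detaching it from (and possibly
 * dropping the last reference on) its previous root domain.
 */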
6365static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6366{
6367 struct root_domain *old_rd = NULL;
6368 unsigned long flags;
6369
6370 raw_spin_lock_irqsave(&rq->lock, flags);
6371
6372 if (rq->rd) {
6373 old_rd = rq->rd;
6374
6375 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6376 set_rq_offline(rq);
6377
6378 cpumask_clear_cpu(rq->cpu, old_rd->span);
6379
6380
6381
6382
6383
6384
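 /*
  * If this was not the last reference, clear old_rd so the old root
  * domain is not freed later in this function.
  */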
6385 if (!atomic_dec_and_test(&old_rd->refcount))
6386 old_rd = NULL;
6387 }
6388