1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/perf_event.h>
43#include <linux/security.h>
44#include <linux/notifier.h>
45#include <linux/profile.h>
46#include <linux/freezer.h>
47#include <linux/vmalloc.h>
48#include <linux/blkdev.h>
49#include <linux/delay.h>
50#include <linux/pid_namespace.h>
51#include <linux/smp.h>
52#include <linux/threads.h>
53#include <linux/timer.h>
54#include <linux/rcupdate.h>
55#include <linux/cpu.h>
56#include <linux/cpuset.h>
57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h>
60#include <linux/seq_file.h>
61#include <linux/sysctl.h>
62#include <linux/syscalls.h>
63#include <linux/times.h>
64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h>
66#include <linux/delayacct.h>
67#include <linux/unistd.h>
68#include <linux/pagemap.h>
69#include <linux/hrtimer.h>
70#include <linux/tick.h>
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
73#include <linux/ftrace.h>
74
75#include <asm/tlb.h>
76#include <asm/irq_regs.h>
77
78#include "sched_cpupri.h"
79
80#define CREATE_TRACE_POINTS
81#include <trace/events/sched.h>
82
83
84
85
86
87
/*
 * Convert between nice values and static priorities.  NICE_TO_PRIO() and
 * PRIO_TO_NICE() are exact inverses; TASK_NICE() applies the conversion
 * to a task's static_prio.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is a priority with MAX_RT_PRIO subtracted.
 * MAX_USER_PRIO is the user priority that corresponds to MAX_PRIO.
 */
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Nanoseconds to jiffies.  Note that the argument is narrowed to
 * unsigned long before the division.
 */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

/* Aliases for the load scale and its shift. */
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/* 100 milliseconds expressed in jiffies. */
#define DEF_TIMESLICE		(100 * HZ / 1000)

/*
 * All-ones u64; global_rt_runtime() returns it when the runtime sysctl
 * is negative and start_rt_bandwidth() does nothing for such a runtime.
 */
#define RUNTIME_INF		((u64)~0ULL)
121
122static inline int rt_policy(int policy)
123{
124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
125 return 1;
126 return 0;
127}
128
129static inline int task_has_rt_policy(struct task_struct *p)
130{
131 return rt_policy(p->policy);
132}
133
134
135
136
137struct rt_prio_array {
138 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
139 struct list_head queue[MAX_RT_PRIO];
140};
141
142struct rt_bandwidth {
143
144 spinlock_t rt_runtime_lock;
145 ktime_t rt_period;
146 u64 rt_runtime;
147 struct hrtimer rt_period_timer;
148};
149
150static struct rt_bandwidth def_rt_bandwidth;
151
152static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
153
154static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
155{
156 struct rt_bandwidth *rt_b =
157 container_of(timer, struct rt_bandwidth, rt_period_timer);
158 ktime_t now;
159 int overrun;
160 int idle = 0;
161
162 for (;;) {
163 now = hrtimer_cb_get_time(timer);
164 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
165
166 if (!overrun)
167 break;
168
169 idle = do_sched_rt_period_timer(rt_b, overrun);
170 }
171
172 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
173}
174
175static
176void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
177{
178 rt_b->rt_period = ns_to_ktime(period);
179 rt_b->rt_runtime = runtime;
180
181 spin_lock_init(&rt_b->rt_runtime_lock);
182
183 hrtimer_init(&rt_b->rt_period_timer,
184 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
185 rt_b->rt_period_timer.function = sched_rt_period_timer;
186}
187
188static inline int rt_bandwidth_enabled(void)
189{
190 return sysctl_sched_rt_runtime >= 0;
191}
192
193static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
194{
195 ktime_t now;
196
197 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
198 return;
199
200 if (hrtimer_active(&rt_b->rt_period_timer))
201 return;
202
203 spin_lock(&rt_b->rt_runtime_lock);
204 for (;;) {
205 unsigned long delta;
206 ktime_t soft, hard;
207
208 if (hrtimer_active(&rt_b->rt_period_timer))
209 break;
210
211 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
212 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
213
214 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
215 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
216 delta = ktime_to_ns(ktime_sub(hard, soft));
217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
218 HRTIMER_MODE_ABS_PINNED, 0);
219 }
220 spin_unlock(&rt_b->rt_runtime_lock);
221}
222
#ifdef CONFIG_RT_GROUP_SCHED
/* Cancel the period timer embedded in @rt_b. */
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	struct hrtimer *timer = &rt_b->rt_period_timer;

	hrtimer_cancel(timer);
}
#endif
229
230
231
232
233
234static DEFINE_MUTEX(sched_domains_mutex);
235
236#ifdef CONFIG_GROUP_SCHED
237
238#include <linux/cgroup.h>
239
240struct cfs_rq;
241
242static LIST_HEAD(task_groups);
243
244
245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253
254#ifdef CONFIG_FAIR_GROUP_SCHED
255
256 struct sched_entity **se;
257
258 struct cfs_rq **cfs_rq;
259 unsigned long shares;
260#endif
261
262#ifdef CONFIG_RT_GROUP_SCHED
263 struct sched_rt_entity **rt_se;
264 struct rt_rq **rt_rq;
265
266 struct rt_bandwidth rt_bandwidth;
267#endif
268
269 struct rcu_head rcu;
270 struct list_head list;
271
272 struct task_group *parent;
273 struct list_head siblings;
274 struct list_head children;
275};
276
277#ifdef CONFIG_USER_SCHED
278
279
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285
286
287
288
289
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
302#endif
303#else
304#define root_task_group init_task_group
305#endif
306
307
308
309
310static DEFINE_SPINLOCK(task_group_lock);
311
312#ifdef CONFIG_FAIR_GROUP_SCHED
313
314#ifdef CONFIG_SMP
315static int root_task_group_empty(void)
316{
317 return list_empty(&root_task_group.children);
318}
319#endif
320
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif
326
327
328
329
330
331
332
333
334
335#define MIN_SHARES 2
336#define MAX_SHARES (1UL << 18)
337
338static int init_task_group_load = INIT_TASK_GROUP_LOAD;
339#endif
340
341
342
343
344struct task_group init_task_group;
345
346
347static inline struct task_group *task_group(struct task_struct *p)
348{
349 struct task_group *tg;
350
351#ifdef CONFIG_USER_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css);
358#else
359 tg = &init_task_group;
360#endif
361 return tg;
362}
363
364
/*
 * Point the scheduling entities of @p at the per-CPU queues and parent
 * entities that its task group owns for @cpu.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.parent = task_group(p)->se[cpu];
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.parent = task_group(p)->rt_se[cpu];
	p->rt.rt_rq = task_group(p)->rt_rq[cpu];
#endif
}
377
378#else
379
380static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
381static inline struct task_group *task_group(struct task_struct *p)
382{
383 return NULL;
384}
385
386#endif
387
388
389struct cfs_rq {
390 struct load_weight load;
391 unsigned long nr_running;
392
393 u64 exec_clock;
394 u64 min_vruntime;
395
396 struct rb_root tasks_timeline;
397 struct rb_node *rb_leftmost;
398
399 struct list_head tasks;
400 struct list_head *balance_iterator;
401
402
403
404
405
406 struct sched_entity *curr, *next, *last;
407
408 unsigned int nr_spread_over;
409
410#ifdef CONFIG_FAIR_GROUP_SCHED
411 struct rq *rq;
412
413
414
415
416
417
418
419
420
421 struct list_head leaf_cfs_rq_list;
422 struct task_group *tg;
423
424#ifdef CONFIG_SMP
425
426
427
428 unsigned long task_weight;
429
430
431
432
433
434
435
436 unsigned long h_load;
437
438
439
440
441 unsigned long shares;
442
443
444
445
446 unsigned long rq_weight;
447#endif
448#endif
449};
450
451
452struct rt_rq {
453 struct rt_prio_array active;
454 unsigned long rt_nr_running;
455#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
456 struct {
457 int curr;
458#ifdef CONFIG_SMP
459 int next;
460#endif
461 } highest_prio;
462#endif
463#ifdef CONFIG_SMP
464 unsigned long rt_nr_migratory;
465 unsigned long rt_nr_total;
466 int overloaded;
467 struct plist_head pushable_tasks;
468#endif
469 int rt_throttled;
470 u64 rt_time;
471 u64 rt_runtime;
472
473 spinlock_t rt_runtime_lock;
474
475#ifdef CONFIG_RT_GROUP_SCHED
476 unsigned long rt_nr_boosted;
477
478 struct rq *rq;
479 struct list_head leaf_rt_rq_list;
480 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif
483};
484
485#ifdef CONFIG_SMP
486
487
488
489
490
491
492
493
494
495struct root_domain {
496 atomic_t refcount;
497 cpumask_var_t span;
498 cpumask_var_t online;
499
500
501
502
503
504 cpumask_var_t rto_mask;
505 atomic_t rto_count;
506#ifdef CONFIG_SMP
507 struct cpupri cpupri;
508#endif
509};
510
511
512
513
514
515static struct root_domain def_root_domain;
516
517#endif
518
519
520
521
522
523
524
525
526struct rq {
527
528 spinlock_t lock;
529
530
531
532
533
534 unsigned long nr_running;
535 #define CPU_LOAD_IDX_MAX 5
536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
537#ifdef CONFIG_NO_HZ
538 unsigned long last_tick_seen;
539 unsigned char in_nohz_recently;
540#endif
541
542 struct load_weight load;
543 unsigned long nr_load_updates;
544 u64 nr_switches;
545 u64 nr_migrations_in;
546
547 struct cfs_rq cfs;
548 struct rt_rq rt;
549
550#ifdef CONFIG_FAIR_GROUP_SCHED
551
552 struct list_head leaf_cfs_rq_list;
553#endif
554#ifdef CONFIG_RT_GROUP_SCHED
555 struct list_head leaf_rt_rq_list;
556#endif
557
558
559
560
561
562
563
564 unsigned long nr_uninterruptible;
565
566 struct task_struct *curr, *idle;
567 unsigned long next_balance;
568 struct mm_struct *prev_mm;
569
570 u64 clock;
571
572 atomic_t nr_iowait;
573
574#ifdef CONFIG_SMP
575 struct root_domain *rd;
576 struct sched_domain *sd;
577
578 unsigned char idle_at_tick;
579
580 int post_schedule;
581 int active_balance;
582 int push_cpu;
583
584 int cpu;
585 int online;
586
587 unsigned long avg_load_per_task;
588
589 struct task_struct *migration_thread;
590 struct list_head migration_queue;
591
592 u64 rt_avg;
593 u64 age_stamp;
594#endif
595
596
597 unsigned long calc_load_update;
598 long calc_load_active;
599
600#ifdef CONFIG_SCHED_HRTICK
601#ifdef CONFIG_SMP
602 int hrtick_csd_pending;
603 struct call_single_data hrtick_csd;
604#endif
605 struct hrtimer hrtick_timer;
606#endif
607
608#ifdef CONFIG_SCHEDSTATS
609
610 struct sched_info rq_sched_info;
611 unsigned long long rq_cpu_time;
612
613
614
615 unsigned int yld_count;
616
617
618 unsigned int sched_switch;
619 unsigned int sched_count;
620 unsigned int sched_goidle;
621
622
623 unsigned int ttwu_count;
624 unsigned int ttwu_local;
625
626
627 unsigned int bkl_count;
628#endif
629};
630
631static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
632
633static inline
634void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
635{
636 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
637}
638
/* CPU number of @rq: rq->cpu on SMP, always 0 otherwise. */
static inline int cpu_of(struct rq *rq)
{
#ifndef CONFIG_SMP
	return 0;
#else
	return rq->cpu;
#endif
}
647
648
649
650
651
652
653
654
/*
 * Walk the sched-domain chain of @cpu from rq->sd upwards through the
 * ->parent links.  The first pointer is fetched with rcu_dereference().
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

/* Accessors for the per-CPU "runqueues" variable. */
#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
#define raw_rq()		(&__raw_get_cpu_var(runqueues))
663
664inline void update_rq_clock(struct rq *rq)
665{
666 rq->clock = sched_clock_cpu(cpu_of(rq));
667}
668
669
670
671
/*
 * Tunables declared const_debug are writable (__read_mostly) with
 * CONFIG_SCHED_DEBUG and "static const" without it.
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug	__read_mostly
#else
# define const_debug	static const
#endif
677
678
679
680
681
682
683
684
685
686int runqueue_is_locked(int cpu)
687{
688 return spin_is_locked(&cpu_rq(cpu)->lock);
689}
690
691
692
693
694
695#define SCHED_FEAT(name, enabled) \
696 __SCHED_FEAT_##name ,
697
698enum {
699#include "sched_features.h"
700};
701
702#undef SCHED_FEAT
703
704#define SCHED_FEAT(name, enabled) \
705 (1UL << __SCHED_FEAT_##name) * enabled |
706
707const_debug unsigned int sysctl_sched_features =
708#include "sched_features.h"
709 0;
710
711#undef SCHED_FEAT
712
713#ifdef CONFIG_SCHED_DEBUG
714#define SCHED_FEAT(name, enabled) \
715 #name ,
716
717static __read_mostly char *sched_feat_names[] = {
718#include "sched_features.h"
719 NULL
720};
721
722#undef SCHED_FEAT
723
724static int sched_feat_show(struct seq_file *m, void *v)
725{
726 int i;
727
728 for (i = 0; sched_feat_names[i]; i++) {
729 if (!(sysctl_sched_features & (1UL << i)))
730 seq_puts(m, "NO_");
731 seq_printf(m, "%s ", sched_feat_names[i]);
732 }
733 seq_puts(m, "\n");
734
735 return 0;
736}
737
738static ssize_t
739sched_feat_write(struct file *filp, const char __user *ubuf,
740 size_t cnt, loff_t *ppos)
741{
742 char buf[64];
743 char *cmp = buf;
744 int neg = 0;
745 int i;
746
747 if (cnt > 63)
748 cnt = 63;
749
750 if (copy_from_user(&buf, ubuf, cnt))
751 return -EFAULT;
752
753 buf[cnt] = 0;
754
755 if (strncmp(buf, "NO_", 3) == 0) {
756 neg = 1;
757 cmp += 3;
758 }
759
760 for (i = 0; sched_feat_names[i]; i++) {
761 int len = strlen(sched_feat_names[i]);
762
763 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
764 if (neg)
765 sysctl_sched_features &= ~(1UL << i);
766 else
767 sysctl_sched_features |= (1UL << i);
768 break;
769 }
770 }
771
772 if (!sched_feat_names[i])
773 return -EINVAL;
774
775 filp->f_pos += cnt;
776
777 return cnt;
778}
779
780static int sched_feat_open(struct inode *inode, struct file *filp)
781{
782 return single_open(filp, sched_feat_show, NULL);
783}
784
785static const struct file_operations sched_feat_fops = {
786 .open = sched_feat_open,
787 .write = sched_feat_write,
788 .read = seq_read,
789 .llseek = seq_lseek,
790 .release = single_release,
791};
792
793static __init int sched_init_debug(void)
794{
795 debugfs_create_file("sched_features", 0644, NULL, NULL,
796 &sched_feat_fops);
797
798 return 0;
799}
800late_initcall(sched_init_debug);
801
802#endif
803
/* Non-zero when feature bit __SCHED_FEAT_<x> is set in sysctl_sched_features. */
#define sched_feat(x)	(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
805
806
807
808
809
810const_debug unsigned int sysctl_sched_nr_migrate = 32;
811
812
813
814
815
816unsigned int sysctl_sched_shares_ratelimit = 250000;
817
818
819
820
821
822
823unsigned int sysctl_sched_shares_thresh = 4;
824
825
826
827
828
829
830
831const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
832
833
834
835
836
837unsigned int sysctl_sched_rt_period = 1000000;
838
839static __read_mostly int scheduler_running;
840
841
842
843
844
845int sysctl_sched_rt_runtime = 950000;
846
847static inline u64 global_rt_period(void)
848{
849 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
850}
851
852static inline u64 global_rt_runtime(void)
853{
854 if (sysctl_sched_rt_runtime < 0)
855 return RUNTIME_INF;
856
857 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
858}
859
/* No-op defaults for architectures that do not provide these hooks. */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif
866
867static inline int task_current(struct rq *rq, struct task_struct *p)
868{
869 return rq->curr == p;
870}
871
872#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/* Locked context switch: a task is running iff it is rq->curr. */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	int running = task_current(rq, p);

	return running;
}
877
/* Locked context switch: nothing to do before switching. */
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
881
882static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
883{
884#ifdef CONFIG_DEBUG_SPINLOCK
885
886 rq->lock.owner = current;
887#endif
888
889
890
891
892
893 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
894
895 spin_unlock_irq(&rq->lock);
896}
897
898#else
/*
 * Unlocked context switch: on SMP the oncpu flag tells whether @p is
 * running, otherwise fall back to comparing against rq->curr.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
	return task_current(rq, p);
#else
	return p->oncpu;
#endif
}
907
908static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
909{
910#ifdef CONFIG_SMP
911
912
913
914
915
916 next->oncpu = 1;
917#endif
918#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
919 spin_unlock_irq(&rq->lock);
920#else
921 spin_unlock(&rq->lock);
922#endif
923}
924
/*
 * Unlocked context switch: clear the oncpu flag of @prev (SMP only, after
 * a write barrier) and re-enable interrupts unless they were already on
 * during the switch.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/* Order earlier stores before the flag is seen cleared. */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
940#endif
941
942
943
944
945
946static inline struct rq *__task_rq_lock(struct task_struct *p)
947 __acquires(rq->lock)
948{
949 for (;;) {
950 struct rq *rq = task_rq(p);
951 spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p)))
953 return rq;
954 spin_unlock(&rq->lock);
955 }
956}
957
958
959
960
961
962
963static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
964 __acquires(rq->lock)
965{
966 struct rq *rq;
967
968 for (;;) {
969 local_irq_save(*flags);
970 rq = task_rq(p);
971 spin_lock(&rq->lock);
972 if (likely(rq == task_rq(p)))
973 return rq;
974 spin_unlock_irqrestore(&rq->lock, *flags);
975 }
976}
977
978void task_rq_unlock_wait(struct task_struct *p)
979{
980 struct rq *rq = task_rq(p);
981
982 smp_mb();
983 spin_unlock_wait(&rq->lock);
984}
985
986static void __task_rq_unlock(struct rq *rq)
987 __releases(rq->lock)
988{
989 spin_unlock(&rq->lock);
990}
991
992static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
993 __releases(rq->lock)
994{
995 spin_unlock_irqrestore(&rq->lock, *flags);
996}
997
998
999
1000
1001static struct rq *this_rq_lock(void)
1002 __acquires(rq->lock)
1003{
1004 struct rq *rq;
1005
1006 local_irq_disable();
1007 rq = this_rq();
1008 spin_lock(&rq->lock);
1009
1010 return rq;
1011}
1012
1013#ifdef CONFIG_SCHED_HRTICK
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030static inline int hrtick_enabled(struct rq *rq)
1031{
1032 if (!sched_feat(HRTICK))
1033 return 0;
1034 if (!cpu_active(cpu_of(rq)))
1035 return 0;
1036 return hrtimer_is_hres_active(&rq->hrtick_timer);
1037}
1038
1039static void hrtick_clear(struct rq *rq)
1040{
1041 if (hrtimer_active(&rq->hrtick_timer))
1042 hrtimer_cancel(&rq->hrtick_timer);
1043}
1044
1045
1046
1047
1048
1049static enum hrtimer_restart hrtick(struct hrtimer *timer)
1050{
1051 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1052
1053 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1054
1055 spin_lock(&rq->lock);
1056 update_rq_clock(rq);
1057 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1058 spin_unlock(&rq->lock);
1059
1060 return HRTIMER_NORESTART;
1061}
1062
1063#ifdef CONFIG_SMP
1064
1065
1066
1067static void __hrtick_start(void *arg)
1068{
1069 struct rq *rq = arg;
1070
1071 spin_lock(&rq->lock);
1072 hrtimer_restart(&rq->hrtick_timer);
1073 rq->hrtick_csd_pending = 0;
1074 spin_unlock(&rq->lock);
1075}
1076
1077
1078
1079
1080
1081
1082static void hrtick_start(struct rq *rq, u64 delay)
1083{
1084 struct hrtimer *timer = &rq->hrtick_timer;
1085 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1086
1087 hrtimer_set_expires(timer, time);
1088
1089 if (rq == this_rq()) {
1090 hrtimer_restart(timer);
1091 } else if (!rq->hrtick_csd_pending) {
1092 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1093 rq->hrtick_csd_pending = 1;
1094 }
1095}
1096
1097static int
1098hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1099{
1100 int cpu = (int)(long)hcpu;
1101
1102 switch (action) {
1103 case CPU_UP_CANCELED:
1104 case CPU_UP_CANCELED_FROZEN:
1105 case CPU_DOWN_PREPARE:
1106 case CPU_DOWN_PREPARE_FROZEN:
1107 case CPU_DEAD:
1108 case CPU_DEAD_FROZEN:
1109 hrtick_clear(cpu_rq(cpu));
1110 return NOTIFY_OK;
1111 }
1112
1113 return NOTIFY_DONE;
1114}
1115
1116static __init void init_hrtick(void)
1117{
1118 hotcpu_notifier(hotplug_hrtick, 0);
1119}
1120#else
1121
1122
1123
1124
1125
1126static void hrtick_start(struct rq *rq, u64 delay)
1127{
1128 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1129 HRTIMER_MODE_REL_PINNED, 0);
1130}
1131
1132static inline void init_hrtick(void)
1133{
1134}
1135#endif
1136
1137static void init_rq_hrtick(struct rq *rq)
1138{
1139#ifdef CONFIG_SMP
1140 rq->hrtick_csd_pending = 0;
1141
1142 rq->hrtick_csd.flags = 0;
1143 rq->hrtick_csd.func = __hrtick_start;
1144 rq->hrtick_csd.info = rq;
1145#endif
1146
1147 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1148 rq->hrtick_timer.function = hrtick;
1149}
1150#else
/* Without CONFIG_SCHED_HRTICK all hrtick hooks are empty. */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
1162#endif
1163
1164
1165
1166
1167
1168
1169
1170
1171#ifdef CONFIG_SMP
1172
1173#ifndef tsk_is_polling
1174#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1175#endif
1176
1177static void resched_task(struct task_struct *p)
1178{
1179 int cpu;
1180
1181 assert_spin_locked(&task_rq(p)->lock);
1182
1183 if (test_tsk_need_resched(p))
1184 return;
1185
1186 set_tsk_need_resched(p);
1187
1188 cpu = task_cpu(p);
1189 if (cpu == smp_processor_id())
1190 return;
1191
1192
1193 smp_mb();
1194 if (!tsk_is_polling(p))
1195 smp_send_reschedule(cpu);
1196}
1197
1198static void resched_cpu(int cpu)
1199{
1200 struct rq *rq = cpu_rq(cpu);
1201 unsigned long flags;
1202
1203 if (!spin_trylock_irqsave(&rq->lock, flags))
1204 return;
1205 resched_task(cpu_curr(cpu));
1206 spin_unlock_irqrestore(&rq->lock, flags);
1207}
1208
1209#ifdef CONFIG_NO_HZ
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220void wake_up_idle_cpu(int cpu)
1221{
1222 struct rq *rq = cpu_rq(cpu);
1223
1224 if (cpu == smp_processor_id())
1225 return;
1226
1227
1228
1229
1230
1231
1232
1233
1234 if (rq->curr != rq->idle)
1235 return;
1236
1237
1238
1239
1240
1241
1242 set_tsk_need_resched(rq->idle);
1243
1244
1245 smp_mb();
1246 if (!tsk_is_polling(rq->idle))
1247 smp_send_reschedule(cpu);
1248}
1249#endif
1250
1251static u64 sched_avg_period(void)
1252{
1253 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1254}
1255
1256static void sched_avg_update(struct rq *rq)
1257{
1258 s64 period = sched_avg_period();
1259
1260 while ((s64)(rq->clock - rq->age_stamp) > period) {
1261 rq->age_stamp += period;
1262 rq->rt_avg /= 2;
1263 }
1264}
1265
1266static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1267{
1268 rq->rt_avg += rt_delta;
1269 sched_avg_update(rq);
1270}
1271
1272#else
1273static void resched_task(struct task_struct *p)
1274{
1275 assert_spin_locked(&task_rq(p)->lock);
1276 set_tsk_need_resched(p);
1277}
1278
1279static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1280{
1281}
1282#endif
1283
/*
 * Fixed-point constants for calc_delta_mine(): inverse weights are
 * scaled by WMULT_CONST and results shifted back by WMULT_SHIFT bits.
 * On 32-bit, WMULT_CONST is the largest unsigned long instead of 2^32.
 */
#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/* Shift right by @y bits, rounding to nearest (half is added first). */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1296
1297
1298
1299
1300static unsigned long
1301calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1302 struct load_weight *lw)
1303{
1304 u64 tmp;
1305
1306 if (!lw->inv_weight) {
1307 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1308 lw->inv_weight = 1;
1309 else
1310 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1311 / (lw->weight+1);
1312 }
1313
1314 tmp = (u64)delta_exec * weight;
1315
1316
1317
1318 if (unlikely(tmp > WMULT_CONST))
1319 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1320 WMULT_SHIFT/2);
1321 else
1322 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1323
1324 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1325}
1326
1327static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1328{
1329 lw->weight += inc;
1330 lw->inv_weight = 0;
1331}
1332
1333static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1334{
1335 lw->weight -= dec;
1336 lw->inv_weight = 0;
1337}
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
/*
 * Weight and matching inverse weight for idle-priority tasks.
 * 1431655765 is 2^32 / 3 rounded down, i.e. the inverse of weight 3.
 */
#define WEIGHT_IDLEPRIO		3
#define WMULT_IDLEPRIO		1431655765
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
/*
 * Load weight per priority slot, strictly decreasing.  Slot 20 holds
 * 1024.  Each row comment gives the index of the row's first entry.
 */
static const int prio_to_weight[40] = {
	/*  0 */ 88761, 71755, 56483, 46273, 36291,
	/*  5 */ 29154, 23254, 18705, 14949, 11916,
	/* 10 */  9548,  7620,  6100,  4904,  3906,
	/* 15 */  3121,  2501,  1991,  1586,  1277,
	/* 20 */  1024,   820,   655,   526,   423,
	/* 25 */   335,   272,   215,   172,   137,
	/* 30 */   110,    87,    70,    56,    45,
	/* 35 */    36,    29,    23,    18,    15,
};
1373
1374
1375
1376
1377
1378
1379
1380
1381static const u32 prio_to_wmult[40] = {
1382 48388, 59856, 76040, 92818, 118348,
1383 147320, 184698, 229616, 287308, 360437,
1384 449829, 563644, 704093, 875809, 1099582,
1385 1376151, 1717300, 2157191, 2708050, 3363326,
1386 4194304, 5237765, 6557202, 8165337, 10153587,
1387 12820798, 15790321, 19976592, 24970740, 31350126,
1388 39045157, 49367440, 61356676, 76695844, 95443717,
1389 119304647, 148102320, 186737708, 238609294, 286331153,
1390};
1391
1392static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1393
1394
1395
1396
1397
1398
/*
 * Task iterator handed to the load-balancing helpers: start() and next()
 * each take the opaque @arg cookie and return a task pointer.
 */
struct rq_iterator {
	void			*arg;
	struct task_struct	*(*start)(void *);
	struct task_struct	*(*next)(void *);
};
1404
1405#ifdef CONFIG_SMP
1406static unsigned long
1407balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1408 unsigned long max_load_move, struct sched_domain *sd,
1409 enum cpu_idle_type idle, int *all_pinned,
1410 int *this_best_prio, struct rq_iterator *iterator);
1411
1412static int
1413iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1414 struct sched_domain *sd, enum cpu_idle_type idle,
1415 struct rq_iterator *iterator);
1416#endif
1417
1418
/* Indices of the statistics passed to cpuacct_update_stats(). */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER,	/* first index, 0 */
	CPUACCT_STAT_SYSTEM,

	CPUACCT_STAT_NSTATS,	/* number of indices above */
};
1425
1426#ifdef CONFIG_CGROUP_CPUACCT
1427static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1428static void cpuacct_update_stats(struct task_struct *tsk,
1429 enum cpuacct_stat_index idx, cputime_t val);
1430#else
1431static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1432static inline void cpuacct_update_stats(struct task_struct *tsk,
1433 enum cpuacct_stat_index idx, cputime_t val) {}
1434#endif
1435
1436static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1437{
1438 update_load_add(&rq->load, load);
1439}
1440
1441static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1442{
1443 update_load_sub(&rq->load, load);
1444}
1445
1446#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1447typedef int (*tg_visitor)(struct task_group *, void *);
1448
1449
1450
1451
1452
1453static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1454{
1455 struct task_group *parent, *child;
1456 int ret;
1457
1458 rcu_read_lock();
1459 parent = &root_task_group;
1460down:
1461 ret = (*down)(parent, data);
1462 if (ret)
1463 goto out_unlock;
1464 list_for_each_entry_rcu(child, &parent->children, siblings) {
1465 parent = child;
1466 goto down;
1467
1468up:
1469 continue;
1470 }
1471 ret = (*up)(parent, data);
1472 if (ret)
1473 goto out_unlock;
1474
1475 child = parent;
1476 parent = parent->parent;
1477 if (parent)
1478 goto up;
1479out_unlock:
1480 rcu_read_unlock();
1481
1482 return ret;
1483}
1484
/* Visitor that does nothing; used where walk_tg_tree() needs no callback. */
static int tg_nop(struct task_group *tg, void *data)
{
	const int keep_walking = 0;

	return keep_walking;
}
1489#endif
1490
1491#ifdef CONFIG_SMP
1492
1493static unsigned long weighted_cpuload(const int cpu)
1494{
1495 return cpu_rq(cpu)->load.weight;
1496}
1497
1498
1499
1500
1501
1502
1503
1504
1505static unsigned long source_load(int cpu, int type)
1506{
1507 struct rq *rq = cpu_rq(cpu);
1508 unsigned long total = weighted_cpuload(cpu);
1509
1510 if (type == 0 || !sched_feat(LB_BIAS))
1511 return total;
1512
1513 return min(rq->cpu_load[type-1], total);
1514}
1515
1516
1517
1518
1519
1520static unsigned long target_load(int cpu, int type)
1521{
1522 struct rq *rq = cpu_rq(cpu);
1523 unsigned long total = weighted_cpuload(cpu);
1524
1525 if (type == 0 || !sched_feat(LB_BIAS))
1526 return total;
1527
1528 return max(rq->cpu_load[type-1], total);
1529}
1530
1531static struct sched_group *group_of(int cpu)
1532{
1533 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1534
1535 if (!sd)
1536 return NULL;
1537
1538 return sd->groups;
1539}
1540
1541static unsigned long power_of(int cpu)
1542{
1543 struct sched_group *group = group_of(cpu);
1544
1545 if (!group)
1546 return SCHED_LOAD_SCALE;
1547
1548 return group->cpu_power;
1549}
1550
1551static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1552
1553static unsigned long cpu_avg_load_per_task(int cpu)
1554{
1555 struct rq *rq = cpu_rq(cpu);
1556 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1557
1558 if (nr_running)
1559 rq->avg_load_per_task = rq->load.weight / nr_running;
1560 else
1561 rq->avg_load_per_task = 0;
1562
1563 return rq->avg_load_per_task;
1564}
1565
1566#ifdef CONFIG_FAIR_GROUP_SCHED
1567
1568static __read_mostly unsigned long *update_shares_data;
1569
1570static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1571
1572
1573
1574
1575static void update_group_shares_cpu(struct task_group *tg, int cpu,
1576 unsigned long sd_shares,
1577 unsigned long sd_rq_weight,
1578 unsigned long *usd_rq_weight)
1579{
1580 unsigned long shares, rq_weight;
1581 int boost = 0;
1582
1583 rq_weight = usd_rq_weight[cpu];
1584 if (!rq_weight) {
1585 boost = 1;
1586 rq_weight = NICE_0_LOAD;
1587 }
1588
1589
1590
1591
1592
1593
1594 shares = (sd_shares * rq_weight) / sd_rq_weight;
1595 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1596
1597 if (abs(shares - tg->se[cpu]->load.weight) >
1598 sysctl_sched_shares_thresh) {
1599 struct rq *rq = cpu_rq(cpu);
1600 unsigned long flags;
1601
1602 spin_lock_irqsave(&rq->lock, flags);
1603 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1604 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1605 __set_se_shares(tg->se[cpu], shares);
1606 spin_unlock_irqrestore(&rq->lock, flags);
1607 }
1608}
1609
1610
1611
1612
1613
1614
/*
 * Task-group visitor (passed as the second callback to walk_tg_tree() by
 * update_shares()): redistribute @tg's shares over the CPUs spanned by the
 * sched domain given in @data.
 *
 * Always returns 0.
 */
static int tg_shares_up(struct task_group *tg, void *data)
{
	unsigned long weight, rq_weight = 0, shares = 0;
	unsigned long *usd_rq_weight;
	struct sched_domain *sd = data;
	unsigned long flags;
	int i;

	/* Groups without a scheduling entity on CPU 0 are skipped. */
	if (!tg->se[0])
		return 0;

	/* Interrupts stay off while this CPU's scratch array is in use. */
	local_irq_save(flags);
	usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());

	for_each_cpu(i, sched_domain_span(sd)) {
		weight = tg->cfs_rq[i]->load.weight;
		/* Keep the raw sample for update_group_shares_cpu(). */
		usd_rq_weight[i] = weight;

		/* A weightless CPU counts as NICE_0_LOAD in the domain total. */
		if (!weight)
			weight = NICE_0_LOAD;

		rq_weight += weight;
		shares += tg->cfs_rq[i]->shares;
	}

	/* No recorded shares yet, or more than configured: use tg->shares. */
	if ((!shares && rq_weight) || shares > tg->shares)
		shares = tg->shares;

	/* No load-balancing parent domain: distribute the full tg->shares. */
	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
		shares = tg->shares;

	for_each_cpu(i, sched_domain_span(sd))
		update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);

	local_irq_restore(flags);

	return 0;
}
1658
1659
1660
1661
1662
1663
1664static int tg_load_down(struct task_group *tg, void *data)
1665{
1666 unsigned long load;
1667 long cpu = (long)data;
1668
1669 if (!tg->parent) {
1670 load = cpu_rq(cpu)->load.weight;
1671 } else {
1672 load = tg->parent->cfs_rq[cpu]->h_load;
1673 load *= tg->cfs_rq[cpu]->shares;
1674 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1675 }
1676
1677 tg->cfs_rq[cpu]->h_load = load;
1678
1679 return 0;
1680}
1681
1682static void update_shares(struct sched_domain *sd)
1683{
1684 s64 elapsed;
1685 u64 now;
1686
1687 if (root_task_group_empty())
1688 return;
1689
1690 now = cpu_clock(raw_smp_processor_id());
1691 elapsed = now - sd->last_update;
1692
1693 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1694 sd->last_update = now;
1695 walk_tg_tree(tg_nop, tg_shares_up, sd);
1696 }
1697}
1698
/*
 * Variant of update_shares() for callers that hold @rq->lock: the lock is
 * dropped around the update and re-taken before returning.
 */
static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
{
	if (root_task_group_empty())
		return;

	spin_unlock(&rq->lock);
	update_shares(sd);
	spin_lock(&rq->lock);
}
1708
1709static void update_h_load(long cpu)
1710{
1711 if (root_task_group_empty())
1712 return;
1713
1714 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1715}
1716
1717#else
1718
/* No-op stand-in used when the group-shares code above is compiled out. */
static inline void update_shares(struct sched_domain *sd)
{
}
1722
/* No-op stand-in used when the group-shares code above is compiled out. */
static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
{
}
1726
1727#endif
1728
1729#ifdef CONFIG_PREEMPT
1730
1731static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1732
1733
1734
1735
1736
1737
1738
1739
1740
/*
 * CONFIG_PREEMPT variant: unconditionally drop this_rq->lock and take both
 * runqueue locks through double_rq_lock().
 *
 * Returns 1, telling the caller that this_rq->lock was released in between.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	spin_unlock(&this_rq->lock);
	double_rq_lock(this_rq, busiest);

	return 1;
}
1751
1752#else
1753
1754
1755
1756
1757
1758
1759
/*
 * Non-preempt variant: try to take busiest->lock while holding
 * this_rq->lock; on contention fall back to address ordering.
 *
 * Returns 1 if this_rq->lock had to be dropped and re-taken, 0 otherwise.
 */
static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	int ret = 0;

	if (unlikely(!spin_trylock(&busiest->lock))) {
		if (busiest < this_rq) {
			/* Lower address first: release ours, then lock both in order. */
			spin_unlock(&this_rq->lock);
			spin_lock(&busiest->lock);
			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
			ret = 1;
		} else
			/* busiest is the higher address: safe to block on it. */
			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
	}
	return ret;
}
1778
1779#endif
1780
1781
1782
1783
/*
 * Lock @busiest in addition to the already-held @this_rq lock.
 *
 * Must be called with interrupts disabled; otherwise this_rq->lock is
 * released and the kernel BUGs. Returns the value of _double_lock_balance().
 */
static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
	if (unlikely(!irqs_disabled())) {
		/* Drop the lock before BUG_ON() fires. */
		spin_unlock(&this_rq->lock);
		BUG_ON(1);
	}

	return _double_lock_balance(this_rq, busiest);
}
1794
/*
 * Undo double_lock_balance(): release busiest->lock and set the subclass
 * recorded in this_rq->lock's dep_map back to 0. this_rq->lock stays held.
 */
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(busiest->lock)
{
	spin_unlock(&busiest->lock);
	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
1801#endif
1802
1803#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Record @shares in @cfs_rq. The field is only written when CONFIG_SMP is
 * set; otherwise this function does nothing.
 */
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
#ifdef CONFIG_SMP
	cfs_rq->shares = shares;
#endif
}
1810#endif
1811
1812static void calc_load_account_active(struct rq *this_rq);
1813
1814#include "sched_stats.h"
1815#include "sched_idletask.c"
1816#include "sched_fair.c"
1817#include "sched_rt.c"
1818#ifdef CONFIG_SCHED_DEBUG
1819# include "sched_debug.c"
1820#endif
1821
/*
 * Scheduling classes form a singly linked list via ->next; iteration starts
 * at the RT class and stops at the NULL terminator.
 */
#define sched_class_highest (&rt_sched_class)
#define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)
1825
1826static void inc_nr_running(struct rq *rq)
1827{
1828 rq->nr_running++;
1829}
1830
1831static void dec_nr_running(struct rq *rq)
1832{
1833 rq->nr_running--;
1834}
1835
1836static void set_load_weight(struct task_struct *p)
1837{
1838 if (task_has_rt_policy(p)) {
1839 p->se.load.weight = prio_to_weight[0] * 2;
1840 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1841 return;
1842 }
1843
1844
1845
1846
1847 if (p->policy == SCHED_IDLE) {
1848 p->se.load.weight = WEIGHT_IDLEPRIO;
1849 p->se.load.inv_weight = WMULT_IDLEPRIO;
1850 return;
1851 }
1852
1853 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1854 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1855}
1856
1857static void update_avg(u64 *avg, u64 sample)
1858{
1859 s64 diff = sample - *avg;
1860 *avg += diff >> 3;
1861}
1862
/*
 * Put @p on @rq through its scheduling class and mark it as queued.
 * On a wakeup, start_runtime is snapshotted from sum_exec_runtime first.
 */
static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
{
	if (wakeup)
		p->se.start_runtime = p->se.sum_exec_runtime;

	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, wakeup);
	p->se.on_rq = 1;
}
1872
1873static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1874{
1875 if (sleep) {
1876 if (p->se.last_wakeup) {
1877 update_avg(&p->se.avg_overlap,
1878 p->se.sum_exec_runtime - p->se.last_wakeup);
1879 p->se.last_wakeup = 0;
1880 } else {
1881 update_avg(&p->se.avg_wakeup,
1882 sysctl_sched_wakeup_granularity);
1883 }
1884 }
1885
1886 sched_info_dequeued(p);
1887 p->sched_class->dequeue_task(rq, p, sleep);
1888 p->se.on_rq = 0;
1889}
1890
1891
1892
1893
/* Normal priority of a non-RT task: simply its static priority. */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}
1898
1899
1900
1901
1902
1903
1904
1905
1906static inline int normal_prio(struct task_struct *p)
1907{
1908 int prio;
1909
1910 if (task_has_rt_policy(p))
1911 prio = MAX_RT_PRIO-1 - p->rt_priority;
1912 else
1913 prio = __normal_prio(p);
1914 return prio;
1915}
1916
1917
1918
1919
1920
1921
1922
1923
1924static int effective_prio(struct task_struct *p)
1925{
1926 p->normal_prio = normal_prio(p);
1927
1928
1929
1930
1931
1932 if (!rt_prio(p->prio))
1933 return p->normal_prio;
1934 return p->prio;
1935}
1936
1937
1938
1939
/*
 * Move @p onto @rq: undo its nr_uninterruptible contribution if it had one,
 * enqueue it and bump the runqueue's running count.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, wakeup);
	inc_nr_running(rq);
}
1948
1949
1950
1951
/*
 * Take @p off @rq: count it in nr_uninterruptible if it contributes to
 * load, dequeue it and drop the runqueue's running count.
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, sleep);
	dec_nr_running(rq);
}
1960
1961
1962
1963
1964
/* Return non-zero if @p is the task currently running on its CPU. */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}
1969
/*
 * Low-level CPU assignment: attach @p to @cpu's runqueue structures and, on
 * SMP, record the CPU number in its thread_info.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * Write barrier: the stores made by set_task_rq() are ordered before
	 * the store to ->cpu below.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}
1983
1984static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1985 const struct sched_class *prev_class,
1986 int oldprio, int running)
1987{
1988 if (prev_class != p->sched_class) {
1989 if (prev_class->switched_from)
1990 prev_class->switched_from(rq, p, running);
1991 p->sched_class->switched_to(rq, p, running);
1992 } else
1993 p->sched_class->prio_changed(rq, p, oldprio, running);
1994}
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
/*
 * Bind thread @p to @cpu.
 *
 * @p must be inactive in TASK_UNINTERRUPTIBLE; if wait_task_inactive()
 * reports otherwise, a warning is emitted and nothing is changed. On
 * success the task is moved to @cpu, its allowed mask is narrowed to that
 * single CPU and PF_THREAD_BOUND is set, all under @cpu's runqueue lock.
 */
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	/* The thread has to be off-CPU before its CPU can be changed. */
	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
		WARN_ON(1);
		return;
	}

	spin_lock_irqsave(&rq->lock, flags);
	set_task_cpu(p, cpu);
	p->cpus_allowed = cpumask_of_cpu(cpu);
	p->rt.nr_cpus_allowed = 1;
	p->flags |= PF_THREAD_BOUND;
	spin_unlock_irqrestore(&rq->lock, flags);
}
2026EXPORT_SYMBOL(kthread_bind);
2027
2028#ifdef CONFIG_SMP
2029
2030
2031
2032static int
2033task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2034{
2035 s64 delta;
2036
2037
2038
2039
2040 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2041 (&p->se == cfs_rq_of(&p->se)->next ||
2042 &p->se == cfs_rq_of(&p->se)->last))
2043 return 1;
2044
2045 if (p->sched_class != &fair_sched_class)
2046 return 0;
2047
2048 if (sysctl_sched_migration_cost == -1)
2049 return 1;
2050 if (sysctl_sched_migration_cost == 0)
2051 return 0;
2052
2053 delta = now - p->se.exec_start;
2054
2055 return delta < (s64)sysctl_sched_migration_cost;
2056}
2057
2058
/*
 * Move @p's scheduler bookkeeping from its current CPU to @new_cpu.
 *
 * Timestamps and vruntime are kept relative to a runqueue, so they are
 * rebased by the difference between the old and new runqueue clocks and
 * min_vruntime values. Migration counters and the perf software event are
 * only updated when the CPU actually changes.
 */
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
	int old_cpu = task_cpu(p);
	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
	struct cfs_rq *old_cfsrq = task_cfs_rq(p),
		      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
	u64 clock_offset;

	clock_offset = old_rq->clock - new_rq->clock;

	trace_sched_migrate_task(p, new_cpu);

#ifdef CONFIG_SCHEDSTATS
	/* Only rebase timestamps that are in use (non-zero). */
	if (p->se.wait_start)
		p->se.wait_start -= clock_offset;
	if (p->se.sleep_start)
		p->se.sleep_start -= clock_offset;
	if (p->se.block_start)
		p->se.block_start -= clock_offset;
#endif
	if (old_cpu != new_cpu) {
		p->se.nr_migrations++;
		new_rq->nr_migrations_in++;
#ifdef CONFIG_SCHEDSTATS
		/* Migrating a task that task_hot() still considers hot. */
		if (task_hot(p, old_rq->clock, NULL))
			schedstat_inc(p, se.nr_forced2_migrations);
#endif
		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
				     1, 1, NULL, 0);
	}
	/* Re-express vruntime relative to the destination cfs_rq. */
	p->se.vruntime -= old_cfsrq->min_vruntime -
					 new_cfsrq->min_vruntime;

	__set_task_cpu(p, new_cpu);
}
2094
/* A queued request to move one task to another CPU (see migrate_task()). */
struct migration_req {
	/* Link in the source runqueue's migration_queue. */
	struct list_head list;

	/* Task to move and the CPU it should end up on. */
	struct task_struct *task;
	int dest_cpu;

	/* Initialised by migrate_task(); sched_migrate_task() waits on it. */
	struct completion done;
};
2103
2104
2105
2106
2107
/*
 * Arrange for @p to run on @dest_cpu.
 *
 * Returns 0 if the task was neither queued nor running and could be moved
 * on the spot. Returns 1 if @req was filled in and added to the runqueue's
 * migration_queue; the caller then has to get the request processed
 * (sched_migrate_task() wakes rq->migration_thread and waits on req->done).
 */
static int
migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
{
	struct rq *rq = task_rq(p);

	/* Not on a runqueue and not executing: just retarget it. */
	if (!p->se.on_rq && !task_running(rq, p)) {
		set_task_cpu(p, dest_cpu);
		return 0;
	}

	init_completion(&req->done);
	req->task = p;
	req->dest_cpu = dest_cpu;
	list_add(&req->list, &rq->migration_queue);

	return 1;
}
2129
2130
2131
2132
2133
2134
2135
/*
 * Busy-wait until @p is no longer running on a CPU, or until either of its
 * context-switch counters (nvcsw / nivcsw) has advanced by more than one
 * since entry.
 */
void wait_task_context_switch(struct task_struct *p)
{
	unsigned long nvcsw, nivcsw, flags;
	int running;
	struct rq *rq;

	/* Baselines for detecting context switches during the wait. */
	nvcsw	= p->nvcsw;
	nivcsw	= p->nivcsw;
	for (;;) {
		/* Sample the running state under the task's runqueue lock. */
		rq = task_rq_lock(p, &flags);
		running = task_running(rq, p);
		task_rq_unlock(rq, &flags);

		if (likely(!running))
			break;

		/* Still running: stop anyway once it has switched enough. */
		if ((p->nvcsw - nvcsw) > 1)
			break;
		if ((p->nivcsw - nivcsw) > 1)
			break;

		cpu_relax();
	}
}
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
/*
 * Wait until @p is neither running nor queued on a runqueue.
 *
 * If @match_state is non-zero, the wait is abandoned (returning 0) as soon
 * as p->state is observed to differ from it. On success the return value is
 * p->nvcsw with the LONG_MIN bit or-ed in, i.e. always non-zero.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, on_rq;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/* Unlocked peek at the runqueue; re-validated under the lock below. */
		rq = task_rq(p);

		/* Spin without the lock while the task is on a CPU. */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/* Take the lock and sample a consistent view of the task. */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(rq, p);
		running = task_running(rq, p);
		on_rq = p->se.on_rq;
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, &flags);

		/* State did not match: give up, returning 0. */
		if (unlikely(!ncsw))
			break;

		/* It started running again in the meantime: retry. */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/* Off-CPU but still queued: sleep one tick, then retry. */
		if (unlikely(on_rq)) {
			schedule_timeout_uninterruptible(1);
			continue;
		}

		/* Neither running nor queued: done. */
		break;
	}

	return ncsw;
}
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
/*
 * If @p is currently executing on another CPU, send that CPU a reschedule
 * IPI. Preemption is disabled so the local CPU id stays valid meanwhile.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
2301EXPORT_SYMBOL_GPL(kick_process);
2302#endif
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
/*
 * Call @func(@info) on the CPU where @p runs, but only if @p is the current
 * task there at the time of the check. The call is made with
 * smp_call_function_single() and its last argument set to 1.
 */
void task_oncpu_function_call(struct task_struct *p,
			      void (*func) (void *info), void *info)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if (task_curr(p))
		smp_call_function_single(cpu, func, info, 1);
	preempt_enable();
}
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339static int try_to_wake_up(struct task_struct *p, unsigned int state,
2340 int wake_flags)
2341{
2342 int cpu, orig_cpu, this_cpu, success = 0;
2343 unsigned long flags;
2344 struct rq *rq, *orig_rq;
2345
2346 if (!sched_feat(SYNC_WAKEUPS))
2347 wake_flags &= ~WF_SYNC;
2348
2349 this_cpu = get_cpu();
2350
2351 smp_wmb();
2352 rq = orig_rq = task_rq_lock(p, &flags);
2353 update_rq_clock(rq);
2354 if (!(p->state & state))
2355 goto out;
2356
2357 if (p->se.on_rq)
2358 goto out_running;
2359
2360 cpu = task_cpu(p);
2361 orig_cpu = cpu;
2362
2363#ifdef CONFIG_SMP
2364 if (unlikely(task_running(rq, p)))
2365 goto out_activate;
2366
2367
2368
2369
2370
2371
2372
2373 if (task_contributes_to_load(p))
2374 rq->nr_uninterruptible--;
2375 p->state = TASK_WAKING;
2376 task_rq_unlock(rq, &flags);
2377
2378 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2379 if (cpu != orig_cpu)
2380 set_task_cpu(p, cpu);
2381
2382 rq = task_rq_lock(p, &flags);
2383
2384 if (rq != orig_rq)
2385 update_rq_clock(rq);
2386
2387 WARN_ON(p->state != TASK_WAKING);
2388 cpu = task_cpu(p);
2389
2390#ifdef CONFIG_SCHEDSTATS
2391 schedstat_inc(rq, ttwu_count);
2392 if (cpu == this_cpu)
2393 schedstat_inc(rq, ttwu_local);
2394 else {
2395 struct sched_domain *sd;
2396 for_each_domain(this_cpu, sd) {
2397 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2398 schedstat_inc(sd, ttwu_wake_remote);
2399 break;
2400 }
2401 }
2402 }
2403#endif
2404
2405out_activate:
2406#endif
2407 schedstat_inc(p, se.nr_wakeups);
2408 if (wake_flags & WF_SYNC)
2409 schedstat_inc(p, se.nr_wakeups_sync);
2410 if (orig_cpu != cpu)
2411 schedstat_inc(p, se.nr_wakeups_migrate);
2412 if (cpu == this_cpu)
2413 schedstat_inc(p, se.nr_wakeups_local);
2414 else
2415 schedstat_inc(p, se.nr_wakeups_remote);
2416 activate_task(rq, p, 1);
2417 success = 1;
2418
2419
2420
2421
2422 if (!in_interrupt()) {
2423 struct sched_entity *se = ¤t->se;
2424 u64 sample = se->sum_exec_runtime;
2425
2426 if (se->last_wakeup)
2427 sample -= se->last_wakeup;
2428 else
2429 sample -= se->start_runtime;
2430 update_avg(&se->avg_wakeup, sample);
2431
2432 se->last_wakeup = se->sum_exec_runtime;
2433 }
2434
2435out_running:
2436 trace_sched_wakeup(rq, p, success);
2437 check_preempt_curr(rq, p, wake_flags);
2438
2439 p->state = TASK_RUNNING;
2440#ifdef CONFIG_SMP
2441 if (p->sched_class->task_wake_up)
2442 p->sched_class->task_wake_up(rq, p);
2443#endif
2444out:
2445 task_rq_unlock(rq, &flags);
2446 put_cpu();
2447
2448 return success;
2449}
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
/*
 * Wake up @p from any state covered by TASK_ALL.
 * Returns the result of try_to_wake_up(): 1 if the task was activated,
 * 0 otherwise.
 */
int wake_up_process(struct task_struct *p)
{
	return try_to_wake_up(p, TASK_ALL, 0);
}
2466EXPORT_SYMBOL(wake_up_process);
2467
/*
 * Wake up @p only if its state matches one of the bits in @state.
 * Returns the result of try_to_wake_up().
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}
2472
2473
2474
2475
2476
2477
2478
/*
 * Reset the scheduler-private state of a newly created task @p: runtime
 * accounting, statistics, list heads and run state.
 */
static void __sched_fork(struct task_struct *p)
{
	p->se.exec_start		= 0;
	p->se.sum_exec_runtime		= 0;
	p->se.prev_sum_exec_runtime	= 0;
	p->se.nr_migrations		= 0;
	p->se.last_wakeup		= 0;
	p->se.avg_overlap		= 0;
	p->se.start_runtime		= 0;
	/* avg_wakeup starts at the wakeup granularity rather than zero. */
	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity;
	p->se.avg_running		= 0;

#ifdef CONFIG_SCHEDSTATS
	p->se.wait_start			= 0;
	p->se.wait_max				= 0;
	p->se.wait_count			= 0;
	p->se.wait_sum				= 0;

	p->se.sleep_start			= 0;
	p->se.sleep_max				= 0;
	p->se.sum_sleep_runtime			= 0;

	p->se.block_start			= 0;
	p->se.block_max				= 0;
	p->se.exec_max				= 0;
	p->se.slice_max				= 0;

	p->se.nr_migrations_cold		= 0;
	p->se.nr_failed_migrations_affine	= 0;
	p->se.nr_failed_migrations_running	= 0;
	p->se.nr_failed_migrations_hot		= 0;
	p->se.nr_forced_migrations		= 0;
	p->se.nr_forced2_migrations		= 0;

	p->se.nr_wakeups			= 0;
	p->se.nr_wakeups_sync			= 0;
	p->se.nr_wakeups_migrate		= 0;
	p->se.nr_wakeups_local			= 0;
	p->se.nr_wakeups_remote			= 0;
	p->se.nr_wakeups_affine			= 0;
	p->se.nr_wakeups_affine_attempts	= 0;
	p->se.nr_wakeups_passive		= 0;
	p->se.nr_wakeups_idle			= 0;

#endif

	INIT_LIST_HEAD(&p->rt.run_list);
	p->se.on_rq = 0;
	INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

	/*
	 * The task is marked TASK_RUNNING although on_rq is 0, i.e. it is not
	 * on any runqueue yet; wake_up_new_task() BUGs if the state changed.
	 */
	p->state = TASK_RUNNING;
}
2541
2542
2543
2544
/*
 * Scheduler setup for a freshly forked task @p: reset its scheduler state,
 * honour sched_reset_on_fork, pick its priority/class and place it on a CPU.
 *
 * @clone_flags is not used by this function.
 */
void sched_fork(struct task_struct *p, int clone_flags)
{
	int cpu = get_cpu();

	__sched_fork(p);

	/* Revert to default policy/priority on fork if requested. */
	if (unlikely(p->sched_reset_on_fork)) {
		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
			p->policy = SCHED_NORMAL;
			p->normal_prio = p->static_prio;
		}

		/* Negative nice values are reset to nice 0. */
		if (PRIO_TO_NICE(p->static_prio) < 0) {
			p->static_prio = NICE_TO_PRIO(0);
			p->normal_prio = p->static_prio;
			set_load_weight(p);
		}

		/* The flag is one-shot: clear it once it has been applied. */
		p->sched_reset_on_fork = 0;
	}

	/* The child starts from the parent's normal (not current) priority. */
	p->prio = current->normal_prio;

	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

#ifdef CONFIG_SMP
	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
#endif
	set_task_cpu(p, cpu);

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	p->oncpu = 0;
#endif
#ifdef CONFIG_PREEMPT
	/* The new task starts with a preempt count of 1. */
	task_thread_info(p)->preempt_count = 1;
#endif
	plist_node_init(&p->pushable_tasks, MAX_PRIO);

	put_cpu();
}
2601
2602
2603
2604
2605
2606
2607
2608
/*
 * Put a newly created task @p on its runqueue for the first time.
 *
 * @p must be in TASK_RUNNING (BUG otherwise). @clone_flags is not used by
 * this function.
 */
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(p, &flags);
	BUG_ON(p->state != TASK_RUNNING);
	update_rq_clock(rq);

	if (!p->sched_class->task_new || !current->se.on_rq) {
		/* No class hook, or the parent is not queued: plain activation. */
		activate_task(rq, p, 0);
	} else {
		/*
		 * Let the class's ->task_new() do the initial placement; only
		 * the running count is updated here.
		 */
		p->sched_class->task_new(rq, p);
		inc_nr_running(rq);
	}
	trace_sched_wakeup_new(rq, p, 1);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
		p->sched_class->task_wake_up(rq, p);
#endif
	task_rq_unlock(rq, &flags);
}
2636
2637#ifdef CONFIG_PREEMPT_NOTIFIERS
2638
2639
2640
2641
2642
2643void preempt_notifier_register(struct preempt_notifier *notifier)
2644{
2645 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
2646}
2647EXPORT_SYMBOL_GPL(preempt_notifier_register);
2648
2649
2650
2651
2652
2653
2654
2655void preempt_notifier_unregister(struct preempt_notifier *notifier)
2656{
2657 hlist_del(¬ifier->link);
2658}
2659EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2660
/*
 * Invoke the ->sched_in() callback of every preempt notifier registered on
 * @curr, passing the current CPU number.
 */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
2669
/*
 * Invoke the ->sched_out() callback of every preempt notifier registered on
 * @curr, passing the task @next that is about to run.
 */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}
2680
2681#else
2682
/* No-op when CONFIG_PREEMPT_NOTIFIERS is not set. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
2686
/* No-op when CONFIG_PREEMPT_NOTIFIERS is not set. */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}
2692
2693#endif
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
/*
 * Work done before switching from @prev to @next on @rq: fire @prev's
 * sched-out preempt notifiers, then run the lock and architecture
 * pre-switch hooks for @next.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
/*
 * Clean-up after a context switch, run by the task that was switched in.
 *
 * @rq:   the runqueue the switch happened on
 * @prev: the task that was switched out
 *
 * Drops the mm reference parked in rq->prev_mm (if any), completes the lock
 * and architecture switch hooks, fires sched-in notifiers and releases
 * @prev's task_struct reference if it was TASK_DEAD.
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * prev->state is sampled before finish_lock_switch(), so the
	 * TASK_DEAD test below uses the value seen at switch time.
	 * NOTE(review): presumably because prev may be rescheduled elsewhere
	 * once the lock switch completes - confirm.
	 */
	prev_state = prev->state;
	finish_arch_switch(prev);
	perf_event_task_sched_in(current, cpu_of(rq));
	finish_lock_switch(rq, prev);

	fire_sched_in_preempt_notifiers(current);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		/* Flush kprobes state and drop the task_struct reference. */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}
}
2768
2769#ifdef CONFIG_SMP
2770
2771
/* Call the optional ->pre_schedule() hook of @prev's scheduling class. */
static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
{
	if (prev->sched_class->pre_schedule)
		prev->sched_class->pre_schedule(rq, prev);
}
2777
2778
2779static inline void post_schedule(struct rq *rq)
2780{
2781 if (rq->post_schedule) {
2782 unsigned long flags;
2783
2784 spin_lock_irqsave(&rq->lock, flags);
2785 if (rq->curr->sched_class->post_schedule)
2786 rq->curr->sched_class->post_schedule(rq);
2787 spin_unlock_irqrestore(&rq->lock, flags);
2788
2789 rq->post_schedule = 0;
2790 }
2791}
2792
2793#else
2794
/* No-op when CONFIG_SMP is not set. */
static inline void pre_schedule(struct rq *rq, struct task_struct *p)
{
}
2798
/* No-op when CONFIG_SMP is not set. */
static inline void post_schedule(struct rq *rq)
{
}
2802
2803#endif
2804
2805
2806
2807
2808
/*
 * Finish the context switch that brought the calling task onto the CPU.
 * NOTE(review): presumably the first code a newly forked task runs -
 * confirm against the architecture entry code.
 *
 * @prev: the task that was running before the switch
 */
asmlinkage void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();

	finish_task_switch(rq, prev);

	/* Run any post-schedule work flagged on this runqueue. */
	post_schedule(rq);

#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	/* Balances a preempt-disable taken elsewhere on these architectures. */
	preempt_enable();
#endif
	/* Report the task's pid (in its own namespace) to user space. */
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}
2829
2830
2831
2832
2833
/*
 * Switch the CPU from @prev to @next: address space first, then register
 * and stack state, then post-switch clean-up.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);
	trace_sched_switch(rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;

	arch_start_context_switch(prev);

	if (unlikely(!mm)) {
		/*
		 * @next has no mm of its own: it borrows prev's active mm and
		 * takes a reference on it.
		 */
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (unlikely(!prev->mm)) {
		/*
		 * @prev was borrowing an mm: hand it to finish_task_switch()
		 * through rq->prev_mm, where the reference is dropped.
		 */
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}

	/*
	 * Tell lockdep that rq->lock is released here; the matching unlock
	 * happens on the other side of the switch.
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here the register state and the stack are switched. */
	switch_to(prev, next, prev);

	barrier();

	/*
	 * this_rq() is re-evaluated instead of reusing @rq.
	 * NOTE(review): presumably because the task may resume on a different
	 * CPU than the one it was switched out on - confirm.
	 */
	finish_task_switch(this_rq(), prev);
}
2883
2884
2885
2886
2887
2888
2889
2890
2891unsigned long nr_running(void)
2892{
2893 unsigned long i, sum = 0;
2894
2895 for_each_online_cpu(i)
2896 sum += cpu_rq(i)->nr_running;
2897
2898 return sum;
2899}
2900
2901unsigned long nr_uninterruptible(void)
2902{
2903 unsigned long i, sum = 0;
2904
2905 for_each_possible_cpu(i)
2906 sum += cpu_rq(i)->nr_uninterruptible;
2907
2908
2909
2910
2911
2912 if (unlikely((long)sum < 0))
2913 sum = 0;
2914
2915 return sum;
2916}
2917
2918unsigned long long nr_context_switches(void)
2919{
2920 int i;
2921 unsigned long long sum = 0;
2922
2923 for_each_possible_cpu(i)
2924 sum += cpu_rq(i)->nr_switches;
2925
2926 return sum;
2927}
2928
2929unsigned long nr_iowait(void)
2930{
2931 unsigned long i, sum = 0;
2932
2933 for_each_possible_cpu(i)
2934 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2935
2936 return sum;
2937}
2938
2939unsigned long nr_iowait_cpu(void)
2940{
2941 struct rq *this = this_rq();
2942 return atomic_read(&this->nr_iowait);
2943}
2944
2945unsigned long this_cpu_load(void)
2946{
2947 struct rq *this = this_rq();
2948 return this->cpu_load[0];
2949}
2950
2951
2952
/* Global active-task count, fed by calc_load_account_active() deltas. */
static atomic_long_t calc_load_tasks;
/* Jiffies timestamp gating the next avenrun[] refresh in calc_global_load(). */
static unsigned long calc_load_update;
/* Load averages, updated with EXP_1, EXP_5 and EXP_15 respectively. */
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun);
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2967{
2968 loads[0] = (avenrun[0] + offset) << shift;
2969 loads[1] = (avenrun[1] + offset) << shift;
2970 loads[2] = (avenrun[2] + offset) << shift;
2971}
2972
2973static unsigned long
2974calc_load(unsigned long load, unsigned long exp, unsigned long active)
2975{
2976 load *= exp;
2977 load += active * (FIXED_1 - exp);
2978 return load >> FSHIFT;
2979}
2980
2981
2982
2983
2984
2985void calc_global_load(void)
2986{
2987 unsigned long upd = calc_load_update + 10;
2988 long active;
2989
2990 if (time_before(jiffies, upd))
2991 return;
2992
2993 active = atomic_long_read(&calc_load_tasks);
2994 active = active > 0 ? active * FIXED_1 : 0;
2995
2996 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2997 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2998 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2999
3000 calc_load_update += LOAD_FREQ;
3001}
3002
3003
3004
3005
3006static void calc_load_account_active(struct rq *this_rq)
3007{
3008 long nr_active, delta;
3009
3010 nr_active = this_rq->nr_running;
3011 nr_active += (long) this_rq->nr_uninterruptible;
3012
3013 if (nr_active != this_rq->calc_load_active) {
3014 delta = nr_active - this_rq->calc_load_active;
3015 this_rq->calc_load_active = nr_active;
3016 atomic_long_add(delta, &calc_load_tasks);
3017 }
3018}
3019
3020
3021
3022
3023
/*
 * Return the nr_migrations_in counter of @cpu's runqueue, i.e. the number
 * of tasks set_task_cpu() has moved onto that CPU from another one.
 */
u64 cpu_nr_migrations(int cpu)
{
	return cpu_rq(cpu)->nr_migrations_in;
}
3028
3029
3030
3031
3032
/*
 * Update the cpu_load[] history of @this_rq from its current load weight
 * and, when due, account its active tasks for the global load average.
 */
static void update_cpu_load(struct rq *this_rq)
{
	unsigned long this_load = this_rq->load.weight;
	int i, scale;

	this_rq->nr_load_updates++;

	/* Index i uses scale = 2^i: higher indexes react more slowly. */
	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load, new_load;

		/* scale is effectively 1 << i now, and >> i divides by scale */

		old_load = this_rq->cpu_load[i];
		new_load = this_load;
		/*
		 * Round up when the load is increasing, so the averaged value
		 * is not held below the new load by truncation in the shift.
		 */
		if (new_load > old_load)
			new_load += scale-1;
		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
	}

	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
		this_rq->calc_load_update += LOAD_FREQ;
		calc_load_account_active(this_rq);
	}
}
3063
3064#ifdef CONFIG_SMP
3065
3066
3067
3068
3069
3070
3071
/*
 * Lock two runqueues and update both of their clocks.
 *
 * Must be called with interrupts disabled (BUG otherwise). When the two
 * runqueues differ, the one at the lower address is always locked first.
 */
static void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	if (rq1 == rq2) {
		spin_lock(&rq1->lock);
		__acquire(rq2->lock);	/* Fake it out ;) */
	} else {
		if (rq1 < rq2) {
			spin_lock(&rq1->lock);
			spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
		} else {
			spin_lock(&rq2->lock);
			spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
		}
	}
	update_rq_clock(rq1);
	update_rq_clock(rq2);
}
3092
3093
3094
3095
3096
3097
3098
/*
 * Release the locks taken by double_rq_lock(). When both arguments are the
 * same runqueue, only one real unlock is performed; the second release is
 * an annotation only.
 */
static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	spin_unlock(&rq1->lock);
	if (rq1 != rq2)
		spin_unlock(&rq2->lock);
	else
		__release(rq2->lock);
}
3109
3110
3111
3112
3113
3114
3115
/*
 * Move @p to @dest_cpu if that CPU is allowed for the task and active.
 *
 * If the task cannot be moved on the spot, a migration request is queued,
 * the runqueue's migration thread is woken and this function sleeps until
 * the request completes.
 */
static void sched_migrate_task(struct task_struct *p, int dest_cpu)
{
	struct migration_req req;
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(p, &flags);
	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
	    || unlikely(!cpu_active(dest_cpu)))
		goto out;

	/* A non-zero return means the request was queued for the thread. */
	if (migrate_task(p, dest_cpu, &req)) {
		/* Pin the migration thread across the unlock and wakeup. */
		struct task_struct *mt = rq->migration_thread;

		get_task_struct(mt);
		task_rq_unlock(rq, &flags);
		wake_up_process(mt);
		put_task_struct(mt);
		wait_for_completion(&req.done);

		return;
	}
out:
	task_rq_unlock(rq, &flags);
}
3143
3144
3145
3146
3147
/*
 * Ask the current task's scheduling class for a CPU with SD_BALANCE_EXEC
 * and migrate the task there if it differs from the CPU it is on.
 */
void sched_exec(void)
{
	int new_cpu, this_cpu = get_cpu();
	new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
	put_cpu();
	if (new_cpu != this_cpu)
		sched_migrate_task(current, new_cpu);
}
3156
3157
3158
3159
3160
/*
 * Move @p from @src_rq to @this_rq (on @this_cpu): dequeue, retarget,
 * enqueue, then check whether it should preempt this_rq's current task.
 * NOTE(review): callers presumably hold both runqueue locks - confirm.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);
	/* The pulled task may be more eligible than what runs here now. */
	check_preempt_curr(this_rq, p, 0);
}
3173
3174
3175
3176
3177static
3178int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3179 struct sched_domain *sd, enum cpu_idle_type idle,
3180 int *all_pinned)
3181{
3182 int tsk_cache_hot = 0;
3183
3184
3185
3186
3187
3188
3189 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3190 schedstat_inc(p, se.nr_failed_migrations_affine);
3191 return 0;
3192 }
3193 *all_pinned = 0;
3194
3195 if (task_running(rq, p)) {
3196 schedstat_inc(p, se.nr_failed_migrations_running);
3197 return 0;
3198 }
3199
3200
3201
3202
3203
3204
3205
3206 tsk_cache_hot = task_hot(p, rq->clock, sd);
3207 if (!tsk_cache_hot ||
3208 sd->nr_balance_failed > sd->cache_nice_tries) {
3209#ifdef CONFIG_SCHEDSTATS
3210 if (tsk_cache_hot) {
3211 schedstat_inc(sd, lb_hot_gained[idle]);
3212 schedstat_inc(p, se.nr_forced_migrations);
3213 }
3214#endif
3215 return 1;
3216 }
3217
3218 if (tsk_cache_hot) {
3219 schedstat_inc(p, se.nr_failed_migrations_hot);
3220 return 0;
3221 }
3222 return 1;
3223}
3224
/*
 * Pull tasks from @busiest to @this_rq, using @iterator to walk the
 * candidates, until about @max_load_move weight has been moved, the
 * iterator is exhausted or more than sysctl_sched_nr_migrate candidates
 * have been examined.
 *
 * *@all_pinned (if non-NULL) is set to 1 only if load was requested and no
 * examined task passed the affinity check. *@this_best_prio is lowered to
 * the best priority among pulled tasks after which more load remained.
 *
 * Returns the amount of load weight actually moved.
 */
static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
	      unsigned long max_load_move, struct sched_domain *sd,
	      enum cpu_idle_type idle, int *all_pinned,
	      int *this_best_prio, struct rq_iterator *iterator)
{
	int loops = 0, pulled = 0, pinned = 0;
	struct task_struct *p;
	long rem_load_move = max_load_move;

	if (max_load_move == 0)
		goto out;

	/* Assume everything is pinned until can_migrate_task() says otherwise. */
	pinned = 1;

	/* Start the load-balancing iterator. */
	p = iterator->start(iterator->arg);
next:
	if (!p || loops++ > sysctl_sched_nr_migrate)
		goto out;

	/* Skip tasks weighing more than twice the remaining load, or unmovable ones. */
	if ((p->se.load.weight >> 1) > rem_load_move ||
	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
		p = iterator->next(iterator->arg);
		goto next;
	}

	pull_task(busiest, p, this_rq, this_cpu);
	pulled++;
	rem_load_move -= p->se.load.weight;

#ifdef CONFIG_PREEMPT
	/* With CONFIG_PREEMPT, a newly idle CPU pulls at most one task. */
	if (idle == CPU_NEWLY_IDLE)
		goto out;
#endif

	/* Keep going only while load remains to be moved. */
	if (rem_load_move > 0) {
		if (p->prio < *this_best_prio)
			*this_best_prio = p->prio;
		p = iterator->next(iterator->arg);
		goto next;
	}
out:
	/* Account the number of tasks gained in the domain statistics. */
	schedstat_add(sd, lb_gained[idle], pulled);

	if (all_pinned)
		*all_pinned = pinned;

	return max_load_move - rem_load_move;
}
3290
3291
3292
3293
3294
3295
3296
3297
/*
 * Try to move up to @max_load_move weighted load from @busiest to
 * @this_rq, asking each scheduling class in turn (highest first) through
 * its ->load_balance() method.
 *
 * Returns 1 if any load was moved, 0 otherwise.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned)
{
	const struct sched_class *class = sched_class_highest;
	unsigned long total_load_moved = 0;
	int this_best_prio = this_rq->curr->prio;

	do {
		total_load_moved +=
			class->load_balance(this_rq, this_cpu, busiest,
				max_load_move - total_load_moved,
				sd, idle, all_pinned, &this_best_prio);
		class = class->next;

#ifdef CONFIG_PREEMPT
		/* A newly idle CPU stops as soon as it has something to run. */
		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
			break;
#endif
	} while (class && max_load_move > total_load_moved);

	return total_load_moved > 0;
}
3327
3328static int
3329iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3330 struct sched_domain *sd, enum cpu_idle_type idle,
3331 struct rq_iterator *iterator)
3332{
3333 struct task_struct *p = iterator->start(iterator->arg);
3334 int pinned = 0;
3335
3336 while (p) {
3337 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3338 pull_task(busiest, p, this_rq, this_cpu);
3339
3340
3341
3342
3343
3344 schedstat_inc(sd, lb_gained[idle]);
3345
3346 return 1;
3347 }
3348 p = iterator->next(iterator->arg);
3349 }
3350
3351 return 0;
3352}
3353
3354
3355
3356
3357
3358
3359
3360
3361static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3362 struct sched_domain *sd, enum cpu_idle_type idle)
3363{
3364 const struct sched_class *class;
3365
3366 for_each_class(class) {
3367 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3368 return 1;
3369 }
3370
3371 return 0;
3372}
3373
3374
3375
3376
3377
/*
 * sd_lb_stats - statistics gathered over a whole sched_domain during one
 * load-balancing pass (filled in by update_sd_lb_stats()).
 */
struct sd_lb_stats {
	struct sched_group *busiest;	/* group chosen as busiest, if any */
	struct sched_group *this;	/* group containing this_cpu */
	unsigned long total_load;	/* sum of group_load over all groups */
	unsigned long total_pwr;	/* sum of cpu_power over all groups */
	unsigned long avg_load;		/* total_load scaled by total_pwr */

	/* Statistics of the local group */
	unsigned long this_load;
	unsigned long this_load_per_task;
	unsigned long this_nr_running;

	/* Statistics of the busiest group */
	unsigned long max_load;
	unsigned long busiest_load_per_task;
	unsigned long busiest_nr_running;

	int group_imb;			/* group_imb flag of the busiest group */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	/* State used only by the power-savings balance helpers */
	int power_savings_balance;	/* non-zero while power-savings balance is considered */
	struct sched_group *group_min;	/* below-capacity group with fewest tasks */
	struct sched_group *group_leader; /* below-capacity group with most tasks */
	unsigned long min_load_per_task; /* load per task in group_min */
	unsigned long leader_nr_running; /* tasks running in group_leader */
	unsigned long min_nr_running;	/* tasks running in group_min */
#endif
};
3405
3406
3407
3408
/*
 * sg_lb_stats - statistics of a single sched_group, filled in by
 * update_sg_lb_stats().
 */
struct sg_lb_stats {
	unsigned long avg_load;		/* group_load scaled by the group's cpu_power */
	unsigned long group_load;	/* sum of the per-CPU loads in the group */
	unsigned long sum_nr_running;	/* sum of rq->nr_running in the group */
	unsigned long sum_weighted_load; /* sum of weighted_cpuload() in the group */
	unsigned long group_capacity;	/* cpu_power / SCHED_LOAD_SCALE, rounded to closest */
	int group_imb;			/* set when the per-CPU load spread is large */
};
3417
3418
3419
3420
3421
/*
 * group_first_cpu - return the first CPU set in @group's cpumask.
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	const struct cpumask *members = sched_group_cpus(group);

	return cpumask_first(members);
}
3426
3427
3428
3429
3430
3431
3432static inline int get_sd_load_idx(struct sched_domain *sd,
3433 enum cpu_idle_type idle)
3434{
3435 int load_idx;
3436
3437 switch (idle) {
3438 case CPU_NOT_IDLE:
3439 load_idx = sd->busy_idx;
3440 break;
3441
3442 case CPU_NEWLY_IDLE:
3443 load_idx = sd->newidle_idx;
3444 break;
3445 default:
3446 load_idx = sd->idle_idx;
3447 break;
3448 }
3449
3450 return load_idx;
3451}
3452
3453
3454#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3455
3456
3457
3458
3459
3460
3461
3462
3463static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3464 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3465{
3466
3467
3468
3469
3470 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3471 sds->power_savings_balance = 0;
3472 else {
3473 sds->power_savings_balance = 1;
3474 sds->min_nr_running = ULONG_MAX;
3475 sds->leader_nr_running = 0;
3476 }
3477}
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489static inline void update_sd_power_savings_stats(struct sched_group *group,
3490 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3491{
3492
3493 if (!sds->power_savings_balance)
3494 return;
3495
3496
3497
3498
3499
3500 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3501 !sds->this_nr_running))
3502 sds->power_savings_balance = 0;
3503
3504
3505
3506
3507
3508 if (!sds->power_savings_balance ||
3509 sgs->sum_nr_running >= sgs->group_capacity ||
3510 !sgs->sum_nr_running)
3511 return;
3512
3513
3514
3515
3516
3517
3518 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3519 (sgs->sum_nr_running == sds->min_nr_running &&
3520 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3521 sds->group_min = group;
3522 sds->min_nr_running = sgs->sum_nr_running;
3523 sds->min_load_per_task = sgs->sum_weighted_load /
3524 sgs->sum_nr_running;
3525 }
3526
3527
3528
3529
3530
3531
3532 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3533 return;
3534
3535 if (sgs->sum_nr_running > sds->leader_nr_running ||
3536 (sgs->sum_nr_running == sds->leader_nr_running &&
3537 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3538 sds->group_leader = group;
3539 sds->leader_nr_running = sgs->sum_nr_running;
3540 }
3541}
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3559 int this_cpu, unsigned long *imbalance)
3560{
3561 if (!sds->power_savings_balance)
3562 return 0;
3563
3564 if (sds->this != sds->group_leader ||
3565 sds->group_leader == sds->group_min)
3566 return 0;
3567
3568 *imbalance = sds->min_load_per_task;
3569 sds->busiest = sds->group_min;
3570
3571 return 1;
3572
3573}
3574#else
3575static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3576 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3577{
3578 return;
3579}
3580
/* No-op variant used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set. */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
}
3586
/*
 * Variant used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set:
 * never requests a power-savings pull.
 */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
					int this_cpu, unsigned long *imbalance)
{
	return 0;
}
3592#endif
3593
3594
/*
 * Default frequency scale factor: always SCHED_LOAD_SCALE, i.e. no scaling
 * once update_cpu_power() shifts the product down by SCHED_LOAD_SHIFT.
 * Both arguments are ignored.
 */
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return SCHED_LOAD_SCALE;
}
3599
/*
 * Weak hook for the frequency scale factor; this fallback simply forwards
 * to default_scale_freq_power().  Being __weak, it can be replaced by a
 * non-weak definition elsewhere.
 */
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return default_scale_freq_power(sd, cpu);
}
3604
3605unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3606{
3607 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3608 unsigned long smt_gain = sd->smt_gain;
3609
3610 smt_gain /= weight;
3611
3612 return smt_gain;
3613}
3614
/*
 * Weak hook for the SMT scale factor; this fallback simply forwards to
 * default_scale_smt_power().  Being __weak, it can be replaced by a
 * non-weak definition elsewhere.
 */
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
	return default_scale_smt_power(sd, cpu);
}
3619
3620unsigned long scale_rt_power(int cpu)
3621{
3622 struct rq *rq = cpu_rq(cpu);
3623 u64 total, available;
3624
3625 sched_avg_update(rq);
3626
3627 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3628 available = total - rq->rt_avg;
3629
3630 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3631 total = SCHED_LOAD_SCALE;
3632
3633 total >>= SCHED_LOAD_SHIFT;
3634
3635 return div_u64(available, total);
3636}
3637
3638static void update_cpu_power(struct sched_domain *sd, int cpu)
3639{
3640 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3641 unsigned long power = SCHED_LOAD_SCALE;
3642 struct sched_group *sdg = sd->groups;
3643
3644 if (sched_feat(ARCH_POWER))
3645 power *= arch_scale_freq_power(sd, cpu);
3646 else
3647 power *= default_scale_freq_power(sd, cpu);
3648
3649 power >>= SCHED_LOAD_SHIFT;
3650
3651 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3652 if (sched_feat(ARCH_POWER))
3653 power *= arch_scale_smt_power(sd, cpu);
3654 else
3655 power *= default_scale_smt_power(sd, cpu);
3656
3657 power >>= SCHED_LOAD_SHIFT;
3658 }
3659
3660 power *= scale_rt_power(cpu);
3661 power >>= SCHED_LOAD_SHIFT;
3662
3663 if (!power)
3664 power = 1;
3665
3666 sdg->cpu_power = power;
3667}
3668
3669static void update_group_power(struct sched_domain *sd, int cpu)
3670{
3671 struct sched_domain *child = sd->child;
3672 struct sched_group *group, *sdg = sd->groups;
3673 unsigned long power;
3674
3675 if (!child) {
3676 update_cpu_power(sd, cpu);
3677 return;
3678 }
3679
3680 power = 0;
3681
3682 group = child->groups;
3683 do {
3684 power += group->cpu_power;
3685 group = group->next;
3686 } while (group != child->groups);
3687
3688 sdg->cpu_power = power;
3689}
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704static inline void update_sg_lb_stats(struct sched_domain *sd,
3705 struct sched_group *group, int this_cpu,
3706 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3707 int local_group, const struct cpumask *cpus,
3708 int *balance, struct sg_lb_stats *sgs)
3709{
3710 unsigned long load, max_cpu_load, min_cpu_load;
3711 int i;
3712 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3713 unsigned long sum_avg_load_per_task;
3714 unsigned long avg_load_per_task;
3715
3716 if (local_group) {
3717 balance_cpu = group_first_cpu(group);
3718 if (balance_cpu == this_cpu)
3719 update_group_power(sd, this_cpu);
3720 }
3721
3722
3723 sum_avg_load_per_task = avg_load_per_task = 0;
3724 max_cpu_load = 0;
3725 min_cpu_load = ~0UL;
3726
3727 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3728 struct rq *rq = cpu_rq(i);
3729
3730 if (*sd_idle && rq->nr_running)
3731 *sd_idle = 0;
3732
3733
3734 if (local_group) {
3735 if (idle_cpu(i) && !first_idle_cpu) {
3736 first_idle_cpu = 1;
3737 balance_cpu = i;
3738 }
3739
3740 load = target_load(i, load_idx);
3741 } else {
3742 load = source_load(i, load_idx);
3743 if (load > max_cpu_load)
3744 max_cpu_load = load;
3745 if (min_cpu_load > load)
3746 min_cpu_load = load;
3747 }
3748
3749 sgs->group_load += load;
3750 sgs->sum_nr_running += rq->nr_running;
3751 sgs->sum_weighted_load += weighted_cpuload(i);
3752
3753 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3754 }
3755
3756
3757
3758
3759
3760
3761
3762 if (idle != CPU_NEWLY_IDLE && local_group &&
3763 balance_cpu != this_cpu && balance) {
3764 *balance = 0;
3765 return;
3766 }
3767
3768
3769 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3782 group->cpu_power;
3783
3784 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3785 sgs->group_imb = 1;
3786
3787 sgs->group_capacity =
3788 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3789}
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3802 enum cpu_idle_type idle, int *sd_idle,
3803 const struct cpumask *cpus, int *balance,
3804 struct sd_lb_stats *sds)
3805{
3806 struct sched_domain *child = sd->child;
3807 struct sched_group *group = sd->groups;
3808 struct sg_lb_stats sgs;
3809 int load_idx, prefer_sibling = 0;
3810
3811 if (child && child->flags & SD_PREFER_SIBLING)
3812 prefer_sibling = 1;
3813
3814 init_sd_power_savings_stats(sd, sds, idle);
3815 load_idx = get_sd_load_idx(sd, idle);
3816
3817 do {
3818 int local_group;
3819
3820 local_group = cpumask_test_cpu(this_cpu,
3821 sched_group_cpus(group));
3822 memset(&sgs, 0, sizeof(sgs));
3823 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3824 local_group, cpus, balance, &sgs);
3825
3826 if (local_group && balance && !(*balance))
3827 return;
3828
3829 sds->total_load += sgs.group_load;
3830 sds->total_pwr += group->cpu_power;
3831
3832
3833
3834
3835
3836
3837 if (prefer_sibling)
3838 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3839
3840 if (local_group) {
3841 sds->this_load = sgs.avg_load;
3842 sds->this = group;
3843 sds->this_nr_running = sgs.sum_nr_running;
3844 sds->this_load_per_task = sgs.sum_weighted_load;
3845 } else if (sgs.avg_load > sds->max_load &&
3846 (sgs.sum_nr_running > sgs.group_capacity ||
3847 sgs.group_imb)) {
3848 sds->max_load = sgs.avg_load;
3849 sds->busiest = group;
3850 sds->busiest_nr_running = sgs.sum_nr_running;
3851 sds->busiest_load_per_task = sgs.sum_weighted_load;
3852 sds->group_imb = sgs.group_imb;
3853 }
3854
3855 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3856 group = group->next;
3857 } while (group != sd->groups);
3858}
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3869 int this_cpu, unsigned long *imbalance)
3870{
3871 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3872 unsigned int imbn = 2;
3873
3874 if (sds->this_nr_running) {
3875 sds->this_load_per_task /= sds->this_nr_running;
3876 if (sds->busiest_load_per_task >
3877 sds->this_load_per_task)
3878 imbn = 1;
3879 } else
3880 sds->this_load_per_task =
3881 cpu_avg_load_per_task(this_cpu);
3882
3883 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3884 sds->busiest_load_per_task * imbn) {
3885 *imbalance = sds->busiest_load_per_task;
3886 return;
3887 }
3888
3889
3890
3891
3892
3893
3894
3895 pwr_now += sds->busiest->cpu_power *
3896 min(sds->busiest_load_per_task, sds->max_load);
3897 pwr_now += sds->this->cpu_power *
3898 min(sds->this_load_per_task, sds->this_load);
3899 pwr_now /= SCHED_LOAD_SCALE;
3900
3901
3902 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3903 sds->busiest->cpu_power;
3904 if (sds->max_load > tmp)
3905 pwr_move += sds->busiest->cpu_power *
3906 min(sds->busiest_load_per_task, sds->max_load - tmp);
3907
3908
3909 if (sds->max_load * sds->busiest->cpu_power <
3910 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3911 tmp = (sds->max_load * sds->busiest->cpu_power) /
3912 sds->this->cpu_power;
3913 else
3914 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3915 sds->this->cpu_power;
3916 pwr_move += sds->this->cpu_power *
3917 min(sds->this_load_per_task, sds->this_load + tmp);
3918 pwr_move /= SCHED_LOAD_SCALE;
3919
3920
3921 if (pwr_move > pwr_now)
3922 *imbalance = sds->busiest_load_per_task;
3923}
3924
3925
3926
3927
3928
3929
3930
3931
3932static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3933 unsigned long *imbalance)
3934{
3935 unsigned long max_pull;
3936
3937
3938
3939
3940
3941 if (sds->max_load < sds->avg_load) {
3942 *imbalance = 0;
3943 return fix_small_imbalance(sds, this_cpu, imbalance);
3944 }
3945
3946
3947 max_pull = min(sds->max_load - sds->avg_load,
3948 sds->max_load - sds->busiest_load_per_task);
3949
3950
3951 *imbalance = min(max_pull * sds->busiest->cpu_power,
3952 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3953 / SCHED_LOAD_SCALE;
3954
3955
3956
3957
3958
3959
3960
3961 if (*imbalance < sds->busiest_load_per_task)
3962 return fix_small_imbalance(sds, this_cpu, imbalance);
3963
3964}
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992static struct sched_group *
3993find_busiest_group(struct sched_domain *sd, int this_cpu,
3994 unsigned long *imbalance, enum cpu_idle_type idle,
3995 int *sd_idle, const struct cpumask *cpus, int *balance)
3996{
3997 struct sd_lb_stats sds;
3998
3999 memset(&sds, 0, sizeof(sds));
4000
4001
4002
4003
4004
4005 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4006 balance, &sds);
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018 if (balance && !(*balance))
4019 goto ret;
4020
4021 if (!sds.busiest || sds.busiest_nr_running == 0)
4022 goto out_balanced;
4023
4024 if (sds.this_load >= sds.max_load)
4025 goto out_balanced;
4026
4027 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4028
4029 if (sds.this_load >= sds.avg_load)
4030 goto out_balanced;
4031
4032 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4033 goto out_balanced;
4034
4035 sds.busiest_load_per_task /= sds.busiest_nr_running;
4036 if (sds.group_imb)
4037 sds.busiest_load_per_task =
4038 min(sds.busiest_load_per_task, sds.avg_load);
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051 if (sds.max_load <= sds.busiest_load_per_task)
4052 goto out_balanced;
4053
4054
4055 calculate_imbalance(&sds, this_cpu, imbalance);
4056 return sds.busiest;
4057
4058out_balanced:
4059
4060
4061
4062
4063 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4064 return sds.busiest;
4065ret:
4066 *imbalance = 0;
4067 return NULL;
4068}
4069
4070
4071
4072
4073static struct rq *
4074find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4075 unsigned long imbalance, const struct cpumask *cpus)
4076{
4077 struct rq *busiest = NULL, *rq;
4078 unsigned long max_load = 0;
4079 int i;
4080
4081 for_each_cpu(i, sched_group_cpus(group)) {
4082 unsigned long power = power_of(i);
4083 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4084 unsigned long wl;
4085
4086 if (!cpumask_test_cpu(i, cpus))
4087 continue;
4088
4089 rq = cpu_rq(i);
4090 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4091 wl /= power;
4092
4093 if (capacity && rq->nr_running == 1 && wl > imbalance)
4094 continue;
4095
4096 if (wl > max_load) {
4097 max_load = wl;
4098 busiest = rq;
4099 }
4100 }
4101
4102 return busiest;
4103}
4104
4105
4106
4107
4108
/*
 * Upper bound used by load_balance() when backing off balance_interval
 * because all tasks on the busiest runqueue were pinned.
 */
#define MAX_PINNED_INTERVAL	512
4110
4111
/* Per-CPU scratch cpumask used by load_balance() and load_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4113
4114
4115
4116
4117
4118static int load_balance(int this_cpu, struct rq *this_rq,
4119 struct sched_domain *sd, enum cpu_idle_type idle,
4120 int *balance)
4121{
4122 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4123 struct sched_group *group;
4124 unsigned long imbalance;
4125 struct rq *busiest;
4126 unsigned long flags;
4127 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4128
4129 cpumask_setall(cpus);
4130
4131
4132
4133
4134
4135
4136
4137 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4138 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4139 sd_idle = 1;
4140
4141 schedstat_inc(sd, lb_count[idle]);
4142
4143redo:
4144 update_shares(sd);
4145 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4146 cpus, balance);
4147
4148 if (*balance == 0)
4149 goto out_balanced;
4150
4151 if (!group) {
4152 schedstat_inc(sd, lb_nobusyg[idle]);
4153 goto out_balanced;
4154 }
4155
4156 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4157 if (!busiest) {
4158 schedstat_inc(sd, lb_nobusyq[idle]);
4159 goto out_balanced;
4160 }
4161
4162 BUG_ON(busiest == this_rq);
4163
4164 schedstat_add(sd, lb_imbalance[idle], imbalance);
4165
4166 ld_moved = 0;
4167 if (busiest->nr_running > 1) {
4168
4169
4170
4171
4172
4173
4174 local_irq_save(flags);
4175 double_rq_lock(this_rq, busiest);
4176 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4177 imbalance, sd, idle, &all_pinned);
4178 double_rq_unlock(this_rq, busiest);
4179 local_irq_restore(flags);
4180
4181
4182
4183
4184 if (ld_moved && this_cpu != smp_processor_id())
4185 resched_cpu(this_cpu);
4186
4187
4188 if (unlikely(all_pinned)) {
4189 cpumask_clear_cpu(cpu_of(busiest), cpus);
4190 if (!cpumask_empty(cpus))
4191 goto redo;
4192 goto out_balanced;
4193 }
4194 }
4195
4196 if (!ld_moved) {
4197 schedstat_inc(sd, lb_failed[idle]);
4198 sd->nr_balance_failed++;
4199
4200 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4201
4202 spin_lock_irqsave(&busiest->lock, flags);
4203
4204
4205
4206
4207 if (!cpumask_test_cpu(this_cpu,
4208 &busiest->curr->cpus_allowed)) {
4209 spin_unlock_irqrestore(&busiest->lock, flags);
4210 all_pinned = 1;
4211 goto out_one_pinned;
4212 }
4213
4214 if (!busiest->active_balance) {
4215 busiest->active_balance = 1;
4216 busiest->push_cpu = this_cpu;
4217 active_balance = 1;
4218 }
4219 spin_unlock_irqrestore(&busiest->lock, flags);
4220 if (active_balance)
4221 wake_up_process(busiest->migration_thread);
4222
4223
4224
4225
4226
4227 sd->nr_balance_failed = sd->cache_nice_tries+1;
4228 }
4229 } else
4230 sd->nr_balance_failed = 0;
4231
4232 if (likely(!active_balance)) {
4233
4234 sd->balance_interval = sd->min_interval;
4235 } else {
4236
4237
4238
4239
4240
4241
4242 if (sd->balance_interval < sd->max_interval)
4243 sd->balance_interval *= 2;
4244 }
4245
4246 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4247 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4248 ld_moved = -1;
4249
4250 goto out;
4251
4252out_balanced:
4253 schedstat_inc(sd, lb_balanced[idle]);
4254
4255 sd->nr_balance_failed = 0;
4256
4257out_one_pinned:
4258
4259 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4260 (sd->balance_interval < sd->max_interval))
4261 sd->balance_interval *= 2;
4262
4263 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4264 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4265 ld_moved = -1;
4266 else
4267 ld_moved = 0;
4268out:
4269 if (ld_moved)
4270 update_shares(sd);
4271 return ld_moved;
4272}
4273
4274
4275
4276
4277
4278
4279
4280
4281static int
4282load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4283{
4284 struct sched_group *group;
4285 struct rq *busiest = NULL;
4286 unsigned long imbalance;
4287 int ld_moved = 0;
4288 int sd_idle = 0;
4289 int all_pinned = 0;
4290 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4291
4292 cpumask_setall(cpus);
4293
4294
4295
4296
4297
4298
4299
4300 if (sd->flags & SD_SHARE_CPUPOWER &&
4301 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4302 sd_idle = 1;
4303
4304 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4305redo:
4306 update_shares_locked(this_rq, sd);
4307 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4308 &sd_idle, cpus, NULL);
4309 if (!group) {
4310 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4311 goto out_balanced;
4312 }
4313
4314 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4315 if (!busiest) {
4316 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4317 goto out_balanced;
4318 }
4319
4320 BUG_ON(busiest == this_rq);
4321
4322 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4323
4324 ld_moved = 0;
4325 if (busiest->nr_running > 1) {
4326
4327 double_lock_balance(this_rq, busiest);
4328
4329 update_rq_clock(busiest);
4330 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4331 imbalance, sd, CPU_NEWLY_IDLE,
4332 &all_pinned);
4333 double_unlock_balance(this_rq, busiest);
4334
4335 if (unlikely(all_pinned)) {
4336 cpumask_clear_cpu(cpu_of(busiest), cpus);
4337 if (!cpumask_empty(cpus))
4338 goto redo;
4339 }
4340 }
4341
4342 if (!ld_moved) {
4343 int active_balance = 0;
4344
4345 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4346 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4347 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4348 return -1;
4349
4350 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4351 return -1;
4352
4353 if (sd->nr_balance_failed++ < 2)
4354 return -1;
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379 double_lock_balance(this_rq, busiest);
4380
4381
4382
4383
4384
4385 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4386 double_unlock_balance(this_rq, busiest);
4387 all_pinned = 1;
4388 return ld_moved;
4389 }
4390
4391 if (!busiest->active_balance) {
4392 busiest->active_balance = 1;
4393 busiest->push_cpu = this_cpu;
4394 active_balance = 1;
4395 }
4396
4397 double_unlock_balance(this_rq, busiest);
4398
4399
4400
4401 spin_unlock(&this_rq->lock);
4402 if (active_balance)
4403 wake_up_process(busiest->migration_thread);
4404 spin_lock(&this_rq->lock);
4405
4406 } else
4407 sd->nr_balance_failed = 0;
4408
4409 update_shares_locked(this_rq, sd);
4410 return ld_moved;
4411
4412out_balanced:
4413 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4414 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4415 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4416 return -1;
4417 sd->nr_balance_failed = 0;
4418
4419 return 0;
4420}
4421
4422
4423
4424
4425
4426static void idle_balance(int this_cpu, struct rq *this_rq)
4427{
4428 struct sched_domain *sd;
4429 int pulled_task = 0;
4430 unsigned long next_balance = jiffies + HZ;
4431
4432 for_each_domain(this_cpu, sd) {
4433 unsigned long interval;
4434
4435 if (!(sd->flags & SD_LOAD_BALANCE))
4436 continue;
4437
4438 if (sd->flags & SD_BALANCE_NEWIDLE)
4439
4440 pulled_task = load_balance_newidle(this_cpu, this_rq,
4441 sd);
4442
4443 interval = msecs_to_jiffies(sd->balance_interval);
4444 if (time_after(next_balance, sd->last_balance + interval))
4445 next_balance = sd->last_balance + interval;
4446 if (pulled_task)
4447 break;
4448 }
4449 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4450
4451
4452
4453
4454 this_rq->next_balance = next_balance;
4455 }
4456}
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4467{
4468 int target_cpu = busiest_rq->push_cpu;
4469 struct sched_domain *sd;
4470 struct rq *target_rq;
4471
4472
4473 if (busiest_rq->nr_running <= 1)
4474 return;
4475
4476 target_rq = cpu_rq(target_cpu);
4477
4478
4479
4480
4481
4482
4483 BUG_ON(busiest_rq == target_rq);
4484
4485
4486 double_lock_balance(busiest_rq, target_rq);
4487 update_rq_clock(busiest_rq);
4488 update_rq_clock(target_rq);
4489
4490
4491 for_each_domain(target_cpu, sd) {
4492 if ((sd->flags & SD_LOAD_BALANCE) &&
4493 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4494 break;
4495 }
4496
4497 if (likely(sd)) {
4498 schedstat_inc(sd, alb_count);
4499
4500 if (move_one_task(target_rq, target_cpu, busiest_rq,
4501 sd, CPU_IDLE))
4502 schedstat_inc(sd, alb_pushed);
4503 else
4504 schedstat_inc(sd, alb_failed);
4505 }
4506 double_unlock_balance(busiest_rq, target_rq);
4507}
4508
4509#ifdef CONFIG_NO_HZ
/*
 * State shared by the CONFIG_NO_HZ idle load-balancing code:
 * @load_balancer:     CPU currently acting as idle load balancer, or -1
 *                     when there is none
 * @cpu_mask:          CPUs registered through select_nohz_load_balancer()
 * @ilb_grp_nohz_mask: scratch mask used by is_semi_idle_group()
 */
static struct {
	atomic_t load_balancer;
	cpumask_var_t cpu_mask;
	cpumask_var_t ilb_grp_nohz_mask;
} nohz ____cacheline_aligned = {
	.load_balancer = ATOMIC_INIT(-1),
};
4517
/* Return the CPU currently recorded as idle load balancer, or -1 if none. */
int get_nohz_load_balancer(void)
{
	return atomic_read(&nohz.load_balancer);
}
4522
4523#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4534{
4535 struct sched_domain *sd;
4536
4537 for_each_domain(cpu, sd)
4538 if (sd && (sd->flags & flag))
4539 break;
4540
4541 return sd;
4542}
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
/*
 * for_each_flag_domain - iterate over the domains of @cpu that have @flag
 * set, starting at lowest_flag_domain() and following ->parent for as long
 * as the flag stays set.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for (sd = lowest_flag_domain(cpu, flag); \
		(sd && (sd->flags & flag)); sd = sd->parent)
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568static inline int is_semi_idle_group(struct sched_group *ilb_group)
4569{
4570 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4571 sched_group_cpus(ilb_group));
4572
4573
4574
4575
4576
4577 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4578 return 0;
4579
4580 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4581 return 0;
4582
4583 return 1;
4584}
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597static int find_new_ilb(int cpu)
4598{
4599 struct sched_domain *sd;
4600 struct sched_group *ilb_group;
4601
4602
4603
4604
4605
4606 if (!(sched_smt_power_savings || sched_mc_power_savings))
4607 goto out_done;
4608
4609
4610
4611
4612
4613 if (cpumask_weight(nohz.cpu_mask) < 2)
4614 goto out_done;
4615
4616 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4617 ilb_group = sd->groups;
4618
4619 do {
4620 if (is_semi_idle_group(ilb_group))
4621 return cpumask_first(nohz.ilb_grp_nohz_mask);
4622
4623 ilb_group = ilb_group->next;
4624
4625 } while (ilb_group != sd->groups);
4626 }
4627
4628out_done:
4629 return cpumask_first(nohz.cpu_mask);
4630}
4631#else
/*
 * Variant used without CONFIG_SCHED_MC/CONFIG_SCHED_SMT: simply nominate
 * the first CPU in nohz.cpu_mask.  @call_cpu is ignored.
 */
static inline int find_new_ilb(int call_cpu)
{
	return cpumask_first(nohz.cpu_mask);
}
4636#endif
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658int select_nohz_load_balancer(int stop_tick)
4659{
4660 int cpu = smp_processor_id();
4661
4662 if (stop_tick) {
4663 cpu_rq(cpu)->in_nohz_recently = 1;
4664
4665 if (!cpu_active(cpu)) {
4666 if (atomic_read(&nohz.load_balancer) != cpu)
4667 return 0;
4668
4669
4670
4671
4672
4673 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4674 BUG();
4675
4676 return 0;
4677 }
4678
4679 cpumask_set_cpu(cpu, nohz.cpu_mask);
4680
4681
4682 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4683 if (atomic_read(&nohz.load_balancer) == cpu)
4684 atomic_set(&nohz.load_balancer, -1);
4685 return 0;
4686 }
4687
4688 if (atomic_read(&nohz.load_balancer) == -1) {
4689
4690 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4691 return 1;
4692 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4693 int new_ilb;
4694
4695 if (!(sched_smt_power_savings ||
4696 sched_mc_power_savings))
4697 return 1;
4698
4699
4700
4701
4702 new_ilb = find_new_ilb(cpu);
4703 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4704 atomic_set(&nohz.load_balancer, -1);
4705 resched_cpu(new_ilb);
4706 return 0;
4707 }
4708 return 1;
4709 }
4710 } else {
4711 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4712 return 0;
4713
4714 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4715
4716 if (atomic_read(&nohz.load_balancer) == cpu)
4717 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4718 BUG();
4719 }
4720 return 0;
4721}
4722#endif
4723
/* Serializes balancing of domains that have SD_SERIALIZE set (see rebalance_domains()). */
static DEFINE_SPINLOCK(balancing);
4725
4726
4727
4728
4729
4730
4731
4732static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4733{
4734 int balance = 1;
4735 struct rq *rq = cpu_rq(cpu);
4736 unsigned long interval;
4737 struct sched_domain *sd;
4738
4739 unsigned long next_balance = jiffies + 60*HZ;
4740 int update_next_balance = 0;
4741 int need_serialize;
4742
4743 for_each_domain(cpu, sd) {
4744 if (!(sd->flags & SD_LOAD_BALANCE))
4745 continue;
4746
4747 interval = sd->balance_interval;
4748 if (idle != CPU_IDLE)
4749 interval *= sd->busy_factor;
4750
4751
4752 interval = msecs_to_jiffies(interval);
4753 if (unlikely(!interval))
4754 interval = 1;
4755 if (interval > HZ*NR_CPUS/10)
4756 interval = HZ*NR_CPUS/10;
4757
4758 need_serialize = sd->flags & SD_SERIALIZE;
4759
4760 if (need_serialize) {
4761 if (!spin_trylock(&balancing))
4762 goto out;
4763 }
4764
4765 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4766 if (load_balance(cpu, rq, sd, idle, &balance)) {
4767
4768
4769
4770
4771
4772 idle = CPU_NOT_IDLE;
4773 }
4774 sd->last_balance = jiffies;
4775 }
4776 if (need_serialize)
4777 spin_unlock(&balancing);
4778out:
4779 if (time_after(next_balance, sd->last_balance + interval)) {
4780 next_balance = sd->last_balance + interval;
4781 update_next_balance = 1;
4782 }
4783
4784
4785
4786
4787
4788
4789 if (!balance)
4790 break;
4791 }
4792
4793
4794
4795
4796
4797
4798 if (likely(update_next_balance))
4799 rq->next_balance = next_balance;
4800}
4801
4802
4803
4804
4805
4806
4807static void run_rebalance_domains(struct softirq_action *h)
4808{
4809 int this_cpu = smp_processor_id();
4810 struct rq *this_rq = cpu_rq(this_cpu);
4811 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4812 CPU_IDLE : CPU_NOT_IDLE;
4813
4814 rebalance_domains(this_cpu, idle);
4815
4816#ifdef CONFIG_NO_HZ
4817
4818
4819
4820
4821
4822 if (this_rq->idle_at_tick &&
4823 atomic_read(&nohz.load_balancer) == this_cpu) {
4824 struct rq *rq;
4825 int balance_cpu;
4826
4827 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4828 if (balance_cpu == this_cpu)
4829 continue;
4830
4831
4832
4833
4834
4835
4836 if (need_resched())
4837 break;
4838
4839 rebalance_domains(balance_cpu, CPU_IDLE);
4840
4841 rq = cpu_rq(balance_cpu);
4842 if (time_after(this_rq->next_balance, rq->next_balance))
4843 this_rq->next_balance = rq->next_balance;
4844 }
4845 }
4846#endif
4847}
4848
4849static inline int on_null_domain(int cpu)
4850{
4851 return !rcu_dereference(cpu_rq(cpu)->sd);
4852}
4853
4854
4855
4856
4857
4858
4859
4860
4861static inline void trigger_load_balance(struct rq *rq, int cpu)
4862{
4863#ifdef CONFIG_NO_HZ
4864
4865
4866
4867
4868
4869 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4870 rq->in_nohz_recently = 0;
4871
4872 if (atomic_read(&nohz.load_balancer) == cpu) {
4873 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4874 atomic_set(&nohz.load_balancer, -1);
4875 }
4876
4877 if (atomic_read(&nohz.load_balancer) == -1) {
4878 int ilb = find_new_ilb(cpu);
4879
4880 if (ilb < nr_cpu_ids)
4881 resched_cpu(ilb);
4882 }
4883 }
4884
4885
4886
4887
4888
4889 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4890 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4891 resched_cpu(cpu);
4892 return;
4893 }
4894
4895
4896
4897
4898
4899 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4900 cpumask_test_cpu(cpu, nohz.cpu_mask))
4901 return;
4902#endif
4903
4904 if (time_after_eq(jiffies, rq->next_balance) &&
4905 likely(!on_null_domain(cpu)))
4906 raise_softirq(SCHED_SOFTIRQ);
4907}
4908
4909#else
4910
4911
4912
4913
/*
 * Empty stand-in used by the #else side of a configuration conditional
 * whose opening #if is outside this view (presumably CONFIG_SMP).
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
	/* Intentionally does nothing. */
}
4917
4918#endif
4919
/*
 * Per-CPU instance of struct kernel_stat, exported as a per-CPU symbol.
 * NOTE(review): the accounting helpers below go through kstat_this_cpu,
 * which presumably resolves to this variable - confirm in kernel_stat.h.
 */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
4923
4924
4925
4926
4927
4928
4929
4930static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4931{
4932 u64 ns = 0;
4933
4934 if (task_current(rq, p)) {
4935 update_rq_clock(rq);
4936 ns = rq->clock - p->se.exec_start;
4937 if ((s64)ns < 0)
4938 ns = 0;
4939 }
4940
4941 return ns;
4942}
4943
4944unsigned long long task_delta_exec(struct task_struct *p)
4945{
4946 unsigned long flags;
4947 struct rq *rq;
4948 u64 ns = 0;
4949
4950 rq = task_rq_lock(p, &flags);
4951 ns = do_task_delta_exec(p, rq);
4952 task_rq_unlock(rq, &flags);
4953
4954 return ns;
4955}
4956
4957
4958
4959
4960
4961
4962unsigned long long task_sched_runtime(struct task_struct *p)
4963{
4964 unsigned long flags;
4965 struct rq *rq;
4966 u64 ns = 0;
4967
4968 rq = task_rq_lock(p, &flags);
4969 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4970 task_rq_unlock(rq, &flags);
4971
4972 return ns;
4973}
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984unsigned long long thread_group_sched_runtime(struct task_struct *p)
4985{
4986 struct task_cputime totals;
4987 unsigned long flags;
4988 struct rq *rq;
4989 u64 ns;
4990
4991 rq = task_rq_lock(p, &flags);
4992 thread_group_cputime(p, &totals);
4993 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4994 task_rq_unlock(rq, &flags);
4995
4996 return ns;
4997}
4998
4999
5000
5001
5002
5003
5004
5005void account_user_time(struct task_struct *p, cputime_t cputime,
5006 cputime_t cputime_scaled)
5007{
5008 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5009 cputime64_t tmp;
5010
5011
5012 p->utime = cputime_add(p->utime, cputime);
5013 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
5014 account_group_user_time(p, cputime);
5015
5016
5017 tmp = cputime_to_cputime64(cputime);
5018 if (TASK_NICE(p) > 0)
5019 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5020 else
5021 cpustat->user = cputime64_add(cpustat->user, tmp);
5022
5023 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
5024
5025 acct_update_integrals(p);
5026}
5027
5028
5029
5030
5031
5032
5033
5034static void account_guest_time(struct task_struct *p, cputime_t cputime,
5035 cputime_t cputime_scaled)
5036{
5037 cputime64_t tmp;
5038 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5039
5040 tmp = cputime_to_cputime64(cputime);
5041
5042
5043 p->utime = cputime_add(p->utime, cputime);
5044 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
5045 account_group_user_time(p, cputime);
5046 p->gtime = cputime_add(p->gtime, cputime);
5047
5048
5049 cpustat->user = cputime64_add(cpustat->user, tmp);
5050 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5051}
5052
5053
5054
5055
5056
5057
5058
5059
5060void account_system_time(struct task_struct *p, int hardirq_offset,
5061 cputime_t cputime, cputime_t cputime_scaled)
5062{
5063 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5064 cputime64_t tmp;
5065
5066 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
5067 account_guest_time(p, cputime, cputime_scaled);
5068 return;
5069 }
5070
5071
5072 p->stime = cputime_add(p->stime, cputime);
5073 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
5074 account_group_system_time(p, cputime);
5075
5076
5077 tmp = cputime_to_cputime64(cputime);
5078 if (hardirq_count() - hardirq_offset)
5079 cpustat->irq = cputime64_add(cpustat->irq, tmp);
5080 else if (softirq_count())
5081 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
5082 else
5083 cpustat->system = cputime64_add(cpustat->system, tmp);
5084
5085 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
5086
5087
5088 acct_update_integrals(p);
5089}
5090
5091
5092
5093
5094
5095void account_steal_time(cputime_t cputime)
5096{
5097 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5098 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5099
5100 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
5101}
5102
5103
5104
5105
5106
5107void account_idle_time(cputime_t cputime)
5108{
5109 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5110 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5111 struct rq *rq = this_rq();
5112
5113 if (atomic_read(&rq->nr_iowait) > 0)
5114 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
5115 else
5116 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
5117}
5118
5119#ifndef CONFIG_VIRT_CPU_ACCOUNTING
5120
5121
5122
5123
5124
5125
5126void account_process_tick(struct task_struct *p, int user_tick)
5127{
5128 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
5129 struct rq *rq = this_rq();
5130
5131 if (user_tick)
5132 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
5133 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5134 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
5135 one_jiffy_scaled);
5136 else
5137 account_idle_time(cputime_one_jiffy);
5138}
5139
5140
5141
5142
5143
5144
5145void account_steal_ticks(unsigned long ticks)
5146{
5147 account_steal_time(jiffies_to_cputime(ticks));
5148}
5149
5150
5151
5152
5153
5154void account_idle_ticks(unsigned long ticks)
5155{
5156 account_idle_time(jiffies_to_cputime(ticks));
5157}
5158
5159#endif
5160
5161
5162
5163
5164#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5165cputime_t task_utime(struct task_struct *p)
5166{
5167 return p->utime;
5168}
5169
5170cputime_t task_stime(struct task_struct *p)
5171{
5172 return p->stime;
5173}
5174#else
5175cputime_t task_utime(struct task_struct *p)
5176{
5177 clock_t utime = cputime_to_clock_t(p->utime),
5178 total = utime + cputime_to_clock_t(p->stime);
5179 u64 temp;
5180
5181
5182
5183
5184 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
5185
5186 if (total) {
5187 temp *= utime;
5188 do_div(temp, total);
5189 }
5190 utime = (clock_t)temp;
5191
5192 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
5193 return p->prev_utime;
5194}
5195
5196cputime_t task_stime(struct task_struct *p)
5197{
5198 clock_t stime;
5199
5200
5201
5202
5203
5204
5205 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5206 cputime_to_clock_t(task_utime(p));
5207
5208 if (stime >= 0)
5209 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
5210
5211 return p->prev_stime;
5212}
5213#endif
5214
5215inline cputime_t task_gtime(struct task_struct *p)
5216{
5217 return p->gtime;
5218}
5219
5220
5221
5222
5223
5224
5225
5226
5227void scheduler_tick(void)
5228{
5229 int cpu = smp_processor_id();
5230 struct rq *rq = cpu_rq(cpu);
5231 struct task_struct *curr = rq->curr;
5232
5233 sched_clock_tick();
5234
5235 spin_lock(&rq->lock);
5236 update_rq_clock(rq);
5237 update_cpu_load(rq);
5238 curr->sched_class->task_tick(rq, curr, 0);
5239 spin_unlock(&rq->lock);
5240
5241 perf_event_task_tick(curr, cpu);
5242
5243#ifdef CONFIG_SMP
5244 rq->idle_at_tick = idle_cpu(cpu);
5245 trigger_load_balance(rq, cpu);
5246#endif
5247}
5248
5249notrace unsigned long get_parent_ip(unsigned long addr)
5250{
5251 if (in_lock_functions(addr)) {
5252 addr = CALLER_ADDR2;
5253 if (in_lock_functions(addr))
5254 addr = CALLER_ADDR3;
5255 }
5256 return addr;
5257}
5258
5259#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
5260 defined(CONFIG_PREEMPT_TRACER))
5261
5262void __kprobes add_preempt_count(int val)
5263{
5264#ifdef CONFIG_DEBUG_PREEMPT
5265
5266
5267
5268 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5269 return;
5270#endif
5271 preempt_count() += val;
5272#ifdef CONFIG_DEBUG_PREEMPT
5273
5274
5275
5276 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
5277 PREEMPT_MASK - 10);
5278#endif
5279 if (preempt_count() == val)
5280 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
5281}
5282EXPORT_SYMBOL(add_preempt_count);
5283
5284void __kprobes sub_preempt_count(int val)
5285{
5286#ifdef CONFIG_DEBUG_PREEMPT
5287
5288
5289
5290 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
5291 return;
5292
5293
5294
5295 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
5296 !(preempt_count() & PREEMPT_MASK)))
5297 return;
5298#endif
5299
5300 if (preempt_count() == val)
5301 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
5302 preempt_count() -= val;
5303}
5304EXPORT_SYMBOL(sub_preempt_count);
5305
5306#endif
5307
5308
5309
5310
5311static noinline void __schedule_bug(struct task_struct *prev)
5312{
5313 struct pt_regs *regs = get_irq_regs();
5314
5315 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5316 prev->comm, prev->pid, preempt_count());
5317
5318 debug_show_held_locks(prev);
5319 print_modules();
5320 if (irqs_disabled())
5321 print_irqtrace_events(prev);
5322
5323 if (regs)
5324 show_regs(regs);
5325 else
5326 dump_stack();
5327}
5328
5329
5330
5331
5332static inline void schedule_debug(struct task_struct *prev)
5333{
5334
5335
5336
5337
5338
5339 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
5340 __schedule_bug(prev);
5341
5342 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
5343
5344 schedstat_inc(this_rq(), sched_count);
5345#ifdef CONFIG_SCHEDSTATS
5346 if (unlikely(prev->lock_depth >= 0)) {
5347 schedstat_inc(this_rq(), bkl_count);
5348 schedstat_inc(prev, sched_info.bkl_count);
5349 }
5350#endif
5351}
5352
5353static void put_prev_task(struct rq *rq, struct task_struct *p)
5354{
5355 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5356
5357 update_avg(&p->se.avg_running, runtime);
5358
5359 if (p->state == TASK_RUNNING) {
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5370 update_avg(&p->se.avg_overlap, runtime);
5371 } else {
5372 update_avg(&p->se.avg_running, 0);
5373 }
5374 p->sched_class->put_prev_task(rq, p);
5375}
5376
5377
5378
5379
5380static inline struct task_struct *
5381pick_next_task(struct rq *rq)
5382{
5383 const struct sched_class *class;
5384 struct task_struct *p;
5385
5386
5387
5388
5389
5390 if (likely(rq->nr_running == rq->cfs.nr_running)) {
5391 p = fair_sched_class.pick_next_task(rq);
5392 if (likely(p))
5393 return p;
5394 }
5395
5396 class = sched_class_highest;
5397 for ( ; ; ) {
5398 p = class->pick_next_task(rq);
5399 if (p)
5400 return p;
5401
5402
5403
5404
5405 class = class->next;
5406 }
5407}
5408
5409
5410
5411
5412asmlinkage void __sched schedule(void)
5413{
5414 struct task_struct *prev, *next;
5415 unsigned long *switch_count;
5416 struct rq *rq;
5417 int cpu;
5418
5419need_resched:
5420 preempt_disable();
5421 cpu = smp_processor_id();
5422 rq = cpu_rq(cpu);
5423 rcu_sched_qs(cpu);
5424 prev = rq->curr;
5425 switch_count = &prev->nivcsw;
5426
5427 release_kernel_lock(prev);
5428need_resched_nonpreemptible:
5429
5430 schedule_debug(prev);
5431
5432 if (sched_feat(HRTICK))
5433 hrtick_clear(rq);
5434
5435 spin_lock_irq(&rq->lock);
5436 update_rq_clock(rq);
5437 clear_tsk_need_resched(prev);
5438
5439 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5440 if (unlikely(signal_pending_state(prev->state, prev)))
5441 prev->state = TASK_RUNNING;
5442 else
5443 deactivate_task(rq, prev, 1);
5444 switch_count = &prev->nvcsw;
5445 }
5446
5447 pre_schedule(rq, prev);
5448
5449 if (unlikely(!rq->nr_running))
5450 idle_balance(cpu, rq);
5451
5452 put_prev_task(rq, prev);
5453 next = pick_next_task(rq);
5454
5455 if (likely(prev != next)) {
5456 sched_info_switch(prev, next);
5457 perf_event_task_sched_out(prev, next, cpu);
5458
5459 rq->nr_switches++;
5460 rq->curr = next;
5461 ++*switch_count;
5462
5463 context_switch(rq, prev, next);
5464
5465
5466
5467
5468 cpu = smp_processor_id();
5469 rq = cpu_rq(cpu);
5470 } else
5471 spin_unlock_irq(&rq->lock);
5472
5473 post_schedule(rq);
5474
5475 if (unlikely(reacquire_kernel_lock(current) < 0))
5476 goto need_resched_nonpreemptible;
5477
5478 preempt_enable_no_resched();
5479 if (need_resched())
5480 goto need_resched;
5481}
5482EXPORT_SYMBOL(schedule);
5483
5484#ifdef CONFIG_SMP
5485
5486
5487
5488
5489int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5490{
5491 unsigned int cpu;
5492 struct rq *rq;
5493
5494 if (!sched_feat(OWNER_SPIN))
5495 return 0;
5496
5497#ifdef CONFIG_DEBUG_PAGEALLOC
5498
5499
5500
5501
5502
5503 if (probe_kernel_address(&owner->cpu, cpu))
5504 goto out;
5505#else
5506 cpu = owner->cpu;
5507#endif
5508
5509
5510
5511
5512
5513 if (cpu >= nr_cpumask_bits)
5514 goto out;
5515
5516
5517
5518
5519
5520 if (!cpu_online(cpu))
5521 goto out;
5522
5523 rq = cpu_rq(cpu);
5524
5525 for (;;) {
5526
5527
5528
5529 if (lock->owner != owner)
5530 break;
5531
5532
5533
5534
5535 if (task_thread_info(rq->curr) != owner || need_resched())
5536 return 0;
5537
5538 cpu_relax();
5539 }
5540out:
5541 return 1;
5542}
5543#endif
5544
5545#ifdef CONFIG_PREEMPT
5546
5547
5548
5549
5550
5551asmlinkage void __sched preempt_schedule(void)
5552{
5553 struct thread_info *ti = current_thread_info();
5554
5555
5556
5557
5558
5559 if (likely(ti->preempt_count || irqs_disabled()))
5560 return;
5561
5562 do {
5563 add_preempt_count(PREEMPT_ACTIVE);
5564 schedule();
5565 sub_preempt_count(PREEMPT_ACTIVE);
5566
5567
5568
5569
5570
5571 barrier();
5572 } while (need_resched());
5573}
5574EXPORT_SYMBOL(preempt_schedule);
5575
5576
5577
5578
5579
5580
5581
5582asmlinkage void __sched preempt_schedule_irq(void)
5583{
5584 struct thread_info *ti = current_thread_info();
5585
5586
5587 BUG_ON(ti->preempt_count || !irqs_disabled());
5588
5589 do {
5590 add_preempt_count(PREEMPT_ACTIVE);
5591 local_irq_enable();
5592 schedule();
5593 local_irq_disable();
5594 sub_preempt_count(PREEMPT_ACTIVE);
5595
5596
5597
5598
5599
5600 barrier();
5601 } while (need_resched());
5602}
5603
5604#endif
5605
5606int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5607 void *key)
5608{
5609 return try_to_wake_up(curr->private, mode, wake_flags);
5610}
5611EXPORT_SYMBOL(default_wake_function);
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5623 int nr_exclusive, int wake_flags, void *key)
5624{
5625 wait_queue_t *curr, *next;
5626
5627 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5628 unsigned flags = curr->flags;
5629
5630 if (curr->func(curr, mode, wake_flags, key) &&
5631 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5632 break;
5633 }
5634}
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646void __wake_up(wait_queue_head_t *q, unsigned int mode,
5647 int nr_exclusive, void *key)
5648{
5649 unsigned long flags;
5650
5651 spin_lock_irqsave(&q->lock, flags);
5652 __wake_up_common(q, mode, nr_exclusive, 0, key);
5653 spin_unlock_irqrestore(&q->lock, flags);
5654}
5655EXPORT_SYMBOL(__wake_up);
5656
5657
5658
5659
5660void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5661{
5662 __wake_up_common(q, mode, 1, 0, NULL);
5663}
5664
5665void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5666{
5667 __wake_up_common(q, mode, 1, 0, key);
5668}
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5688 int nr_exclusive, void *key)
5689{
5690 unsigned long flags;
5691 int wake_flags = WF_SYNC;
5692
5693 if (unlikely(!q))
5694 return;
5695
5696 if (unlikely(!nr_exclusive))
5697 wake_flags = 0;
5698
5699 spin_lock_irqsave(&q->lock, flags);
5700 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5701 spin_unlock_irqrestore(&q->lock, flags);
5702}
5703EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5704
5705
5706
5707
5708void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5709{
5710 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5711}
5712EXPORT_SYMBOL_GPL(__wake_up_sync);
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726void complete(struct completion *x)
5727{
5728 unsigned long flags;
5729
5730 spin_lock_irqsave(&x->wait.lock, flags);
5731 x->done++;
5732 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
5733 spin_unlock_irqrestore(&x->wait.lock, flags);
5734}
5735EXPORT_SYMBOL(complete);
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746void complete_all(struct completion *x)
5747{
5748 unsigned long flags;
5749
5750 spin_lock_irqsave(&x->wait.lock, flags);
5751 x->done += UINT_MAX/2;
5752 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
5753 spin_unlock_irqrestore(&x->wait.lock, flags);
5754}
5755EXPORT_SYMBOL(complete_all);
5756
5757static inline long __sched
5758do_wait_for_common(struct completion *x, long timeout, int state)
5759{
5760 if (!x->done) {
5761 DECLARE_WAITQUEUE(wait, current);
5762
5763 wait.flags |= WQ_FLAG_EXCLUSIVE;
5764 __add_wait_queue_tail(&x->wait, &wait);
5765 do {
5766 if (signal_pending_state(state, current)) {
5767 timeout = -ERESTARTSYS;
5768 break;
5769 }
5770 __set_current_state(state);
5771 spin_unlock_irq(&x->wait.lock);
5772 timeout = schedule_timeout(timeout);
5773 spin_lock_irq(&x->wait.lock);
5774 } while (!x->done && timeout);
5775 __remove_wait_queue(&x->wait, &wait);
5776 if (!x->done)
5777 return timeout;
5778 }
5779 x->done--;
5780 return timeout ?: 1;
5781}
5782
5783static long __sched
5784wait_for_common(struct completion *x, long timeout, int state)
5785{
5786 might_sleep();
5787
5788 spin_lock_irq(&x->wait.lock);
5789 timeout = do_wait_for_common(x, timeout, state);
5790 spin_unlock_irq(&x->wait.lock);
5791 return timeout;
5792}
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804void __sched wait_for_completion(struct completion *x)
5805{
5806 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
5807}
5808EXPORT_SYMBOL(wait_for_completion);
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819unsigned long __sched
5820wait_for_completion_timeout(struct completion *x, unsigned long timeout)
5821{
5822 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
5823}
5824EXPORT_SYMBOL(wait_for_completion_timeout);
5825
5826
5827
5828
5829
5830
5831
5832
5833int __sched wait_for_completion_interruptible(struct completion *x)
5834{
5835 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
5836 if (t == -ERESTARTSYS)
5837 return t;
5838 return 0;
5839}
5840EXPORT_SYMBOL(wait_for_completion_interruptible);
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850unsigned long __sched
5851wait_for_completion_interruptible_timeout(struct completion *x,
5852 unsigned long timeout)
5853{
5854 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
5855}
5856EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
5857
5858
5859
5860
5861
5862
5863
5864
5865int __sched wait_for_completion_killable(struct completion *x)
5866{
5867 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
5868 if (t == -ERESTARTSYS)
5869 return t;
5870 return 0;
5871}
5872EXPORT_SYMBOL(wait_for_completion_killable);
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886bool try_wait_for_completion(struct completion *x)
5887{
5888 int ret = 1;
5889
5890 spin_lock_irq(&x->wait.lock);
5891 if (!x->done)
5892 ret = 0;
5893 else
5894 x->done--;
5895 spin_unlock_irq(&x->wait.lock);
5896 return ret;
5897}
5898EXPORT_SYMBOL(try_wait_for_completion);
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908bool completion_done(struct completion *x)
5909{
5910 int ret = 1;
5911
5912 spin_lock_irq(&x->wait.lock);
5913 if (!x->done)
5914 ret = 0;
5915 spin_unlock_irq(&x->wait.lock);
5916 return ret;
5917}
5918EXPORT_SYMBOL(completion_done);
5919
5920static long __sched
5921sleep_on_common(wait_queue_head_t *q, int state, long timeout)
5922{
5923 unsigned long flags;
5924 wait_queue_t wait;
5925
5926 init_waitqueue_entry(&wait, current);
5927
5928 __set_current_state(state);
5929
5930 spin_lock_irqsave(&q->lock, flags);
5931 __add_wait_queue(q, &wait);
5932 spin_unlock(&q->lock);
5933 timeout = schedule_timeout(timeout);
5934 spin_lock_irq(&q->lock);
5935 __remove_wait_queue(q, &wait);
5936 spin_unlock_irqrestore(&q->lock, flags);
5937
5938 return timeout;
5939}
5940
5941void __sched interruptible_sleep_on(wait_queue_head_t *q)
5942{
5943 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5944}
5945EXPORT_SYMBOL(interruptible_sleep_on);
5946
5947long __sched
5948interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
5949{
5950 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
5951}
5952EXPORT_SYMBOL(interruptible_sleep_on_timeout);
5953
5954void __sched sleep_on(wait_queue_head_t *q)
5955{
5956 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5957}
5958EXPORT_SYMBOL(sleep_on);
5959
5960long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
5961{
5962 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
5963}
5964EXPORT_SYMBOL(sleep_on_timeout);
5965
5966#ifdef CONFIG_RT_MUTEXES
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978void rt_mutex_setprio(struct task_struct *p, int prio)
5979{
5980 unsigned long flags;
5981 int oldprio, on_rq, running;
5982 struct rq *rq;
5983 const struct sched_class *prev_class = p->sched_class;
5984
5985 BUG_ON(prio < 0 || prio > MAX_PRIO);
5986
5987 rq = task_rq_lock(p, &flags);
5988 update_rq_clock(rq);
5989
5990 oldprio = p->prio;
5991 on_rq = p->se.on_rq;
5992 running = task_current(rq, p);
5993 if (on_rq)
5994 dequeue_task(rq, p, 0);
5995 if (running)
5996 p->sched_class->put_prev_task(rq, p);
5997
5998 if (rt_prio(prio))
5999 p->sched_class = &rt_sched_class;
6000 else
6001 p->sched_class = &fair_sched_class;
6002
6003 p->prio = prio;
6004
6005 if (running)
6006 p->sched_class->set_curr_task(rq);
6007 if (on_rq) {
6008 enqueue_task(rq, p, 0);
6009
6010 check_class_changed(rq, p, prev_class, oldprio, running);
6011 }
6012 task_rq_unlock(rq, &flags);
6013}
6014
6015#endif
6016
6017void set_user_nice(struct task_struct *p, long nice)
6018{
6019 int old_prio, delta, on_rq;
6020 unsigned long flags;
6021 struct rq *rq;
6022
6023 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
6024 return;
6025
6026
6027
6028
6029 rq = task_rq_lock(p, &flags);
6030 update_rq_clock(rq);
6031
6032
6033
6034
6035
6036
6037 if (task_has_rt_policy(p)) {
6038 p->static_prio = NICE_TO_PRIO(nice);
6039 goto out_unlock;
6040 }
6041 on_rq = p->se.on_rq;
6042 if (on_rq)
6043 dequeue_task(rq, p, 0);
6044
6045 p->static_prio = NICE_TO_PRIO(nice);
6046 set_load_weight(p);
6047 old_prio = p->prio;
6048 p->prio = effective_prio(p);
6049 delta = p->prio - old_prio;
6050
6051 if (on_rq) {
6052 enqueue_task(rq, p, 0);
6053
6054
6055
6056
6057 if (delta < 0 || (delta > 0 && task_running(rq, p)))
6058 resched_task(rq->curr);
6059 }
6060out_unlock:
6061 task_rq_unlock(rq, &flags);
6062}
6063EXPORT_SYMBOL(set_user_nice);
6064
6065
6066
6067
6068
6069
6070int can_nice(const struct task_struct *p, const int nice)
6071{
6072
6073 int nice_rlim = 20 - nice;
6074
6075 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
6076 capable(CAP_SYS_NICE));
6077}
6078
6079#ifdef __ARCH_WANT_SYS_NICE
6080
6081
6082
6083
6084
6085
6086
6087
6088SYSCALL_DEFINE1(nice, int, increment)
6089{
6090 long nice, retval;
6091
6092
6093
6094
6095
6096
6097 if (increment < -40)
6098 increment = -40;
6099 if (increment > 40)
6100 increment = 40;
6101
6102 nice = TASK_NICE(current) + increment;
6103 if (nice < -20)
6104 nice = -20;
6105 if (nice > 19)
6106 nice = 19;
6107
6108 if (increment < 0 && !can_nice(current, nice))
6109 return -EPERM;
6110
6111 retval = security_task_setnice(current, nice);
6112 if (retval)
6113 return retval;
6114
6115 set_user_nice(current, nice);
6116 return 0;
6117}
6118
6119#endif
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129int task_prio(const struct task_struct *p)
6130{
6131 return p->prio - MAX_RT_PRIO;
6132}
6133
6134
6135
6136
6137
/*
 * task_nice - return the nice value of @p, derived from its static_prio.
 */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
EXPORT_SYMBOL(task_nice);
6143
6144
6145
6146
6147
6148int idle_cpu(int cpu)
6149{
6150 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
6151}
6152
6153
6154
6155
6156
6157struct task_struct *idle_task(int cpu)
6158{
6159 return cpu_rq(cpu)->idle;
6160}
6161
6162
6163
6164
6165
6166static struct task_struct *find_process_by_pid(pid_t pid)
6167{
6168 return pid ? find_task_by_vpid(pid) : current;
6169}
6170
6171
6172static void
6173__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6174{
6175 BUG_ON(p->se.on_rq);
6176
6177 p->policy = policy;
6178 switch (p->policy) {
6179 case SCHED_NORMAL:
6180 case SCHED_BATCH:
6181 case SCHED_IDLE:
6182 p->sched_class = &fair_sched_class;
6183 break;
6184 case SCHED_FIFO:
6185 case SCHED_RR:
6186 p->sched_class = &rt_sched_class;
6187 break;
6188 }
6189
6190 p->rt_priority = prio;
6191 p->normal_prio = normal_prio(p);
6192
6193 p->prio = rt_mutex_getprio(p);
6194 set_load_weight(p);
6195}
6196
6197
6198
6199
6200static bool check_same_owner(struct task_struct *p)
6201{
6202 const struct cred *cred = current_cred(), *pcred;
6203 bool match;
6204
6205 rcu_read_lock();
6206 pcred = __task_cred(p);
6207 match = (cred->euid == pcred->euid ||
6208 cred->euid == pcred->uid);
6209 rcu_read_unlock();
6210 return match;
6211}
6212
6213static int __sched_setscheduler(struct task_struct *p, int policy,
6214 struct sched_param *param, bool user)
6215{
6216 int retval, oldprio, oldpolicy = -1, on_rq, running;
6217 unsigned long flags;
6218 const struct sched_class *prev_class = p->sched_class;
6219 struct rq *rq;
6220 int reset_on_fork;
6221
6222
6223 BUG_ON(in_interrupt());
6224recheck:
6225
6226 if (policy < 0) {
6227 reset_on_fork = p->sched_reset_on_fork;
6228 policy = oldpolicy = p->policy;
6229 } else {
6230 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6231 policy &= ~SCHED_RESET_ON_FORK;
6232
6233 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6234 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6235 policy != SCHED_IDLE)
6236 return -EINVAL;
6237 }
6238
6239
6240
6241
6242
6243
6244 if (param->sched_priority < 0 ||
6245 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
6246 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
6247 return -EINVAL;
6248 if (rt_policy(policy) != (param->sched_priority != 0))
6249 return -EINVAL;
6250
6251
6252
6253
6254 if (user && !capable(CAP_SYS_NICE)) {
6255 if (rt_policy(policy)) {
6256 unsigned long rlim_rtprio;
6257
6258 if (!lock_task_sighand(p, &flags))
6259 return -ESRCH;
6260 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
6261 unlock_task_sighand(p, &flags);
6262
6263
6264 if (policy != p->policy && !rlim_rtprio)
6265 return -EPERM;
6266
6267
6268 if (param->sched_priority > p->rt_priority &&
6269 param->sched_priority > rlim_rtprio)
6270 return -EPERM;
6271 }
6272
6273
6274
6275
6276 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
6277 return -EPERM;
6278
6279
6280 if (!check_same_owner(p))
6281 return -EPERM;
6282
6283
6284 if (p->sched_reset_on_fork && !reset_on_fork)
6285 return -EPERM;
6286 }
6287
6288 if (user) {
6289#ifdef CONFIG_RT_GROUP_SCHED
6290
6291
6292
6293
6294 if (rt_bandwidth_enabled() && rt_policy(policy) &&
6295 task_group(p)->rt_bandwidth.rt_runtime == 0)
6296 return -EPERM;
6297#endif
6298
6299 retval = security_task_setscheduler(p, policy, param);
6300 if (retval)
6301 return retval;
6302 }
6303
6304
6305
6306
6307
6308 spin_lock_irqsave(&p->pi_lock, flags);
6309
6310
6311
6312
6313 rq = __task_rq_lock(p);
6314
6315 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6316 policy = oldpolicy = -1;
6317 __task_rq_unlock(rq);
6318 spin_unlock_irqrestore(&p->pi_lock, flags);
6319 goto recheck;
6320 }
6321 update_rq_clock(rq);
6322 on_rq = p->se.on_rq;
6323 running = task_current(rq, p);
6324 if (on_rq)
6325 deactivate_task(rq, p, 0);
6326 if (running)
6327 p->sched_class->put_prev_task(rq, p);
6328
6329 p->sched_reset_on_fork = reset_on_fork;
6330
6331 oldprio = p->prio;
6332 __setscheduler(rq, p, policy, param->sched_priority);
6333
6334 if (running)
6335 p->sched_class->set_curr_task(rq);
6336 if (on_rq) {
6337 activate_task(rq, p, 0);
6338
6339 check_class_changed(rq, p, prev_class, oldprio, running);
6340 }
6341 __task_rq_unlock(rq);
6342 spin_unlock_irqrestore(&p->pi_lock, flags);
6343
6344 rt_mutex_adjust_pi(p);
6345
6346 return 0;
6347}
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357int sched_setscheduler(struct task_struct *p, int policy,
6358 struct sched_param *param)
6359{
6360 return __sched_setscheduler(p, policy, param, true);
6361}
6362EXPORT_SYMBOL_GPL(sched_setscheduler);
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375int sched_setscheduler_nocheck(struct task_struct *p, int policy,
6376 struct sched_param *param)
6377{
6378 return __sched_setscheduler(p, policy, param, false);
6379}
6380
6381static int
6382do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
6383{
6384 struct sched_param lparam;
6385 struct task_struct *p;
6386 int retval;
6387
6388 if (!param || pid < 0)
6389 return -EINVAL;
6390 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
6391 return -EFAULT;
6392
6393 rcu_read_lock();
6394 retval = -ESRCH;
6395 p = find_process_by_pid(pid);
6396 if (p != NULL)
6397 retval = sched_setscheduler(p, policy, &lparam);
6398 rcu_read_unlock();
6399
6400 return retval;
6401}
6402
6403
6404
6405
6406
6407
6408
6409SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
6410 struct sched_param __user *, param)
6411{
6412
6413 if (policy < 0)
6414 return -EINVAL;
6415
6416 return do_sched_setscheduler(pid, policy, param);
6417}
6418
6419
6420
6421
6422
6423
6424SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
6425{
6426 return do_sched_setscheduler(pid, -1, param);
6427}
6428
6429
6430
6431
6432
6433SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6434{
6435 struct task_struct *p;
6436 int retval;
6437
6438 if (pid < 0)
6439 return -EINVAL;
6440
6441 retval = -ESRCH;
6442 read_lock(&tasklist_lock);
6443 p = find_process_by_pid(pid);
6444 if (p) {
6445 retval = security_task_getscheduler(p);
6446 if (!retval)
6447 retval = p->policy
6448 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6449 }
6450 read_unlock(&tasklist_lock);
6451 return retval;
6452}
6453
6454
6455
6456
6457
6458
6459SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6460{
6461 struct sched_param lp;
6462 struct task_struct *p;
6463 int retval;
6464
6465 if (!param || pid < 0)
6466 return -EINVAL;
6467
6468 read_lock(&tasklist_lock);
6469 p = find_process_by_pid(pid);
6470 retval = -ESRCH;
6471 if (!p)
6472 goto out_unlock;
6473
6474 retval = security_task_getscheduler(p);
6475 if (retval)
6476 goto out_unlock;
6477
6478 lp.sched_priority = p->rt_priority;
6479 read_unlock(&tasklist_lock);
6480
6481
6482
6483
6484 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
6485
6486 return retval;
6487
6488out_unlock:
6489 read_unlock(&tasklist_lock);
6490 return retval;
6491}
6492
/*
 * Set the CPU affinity of the task identified by @pid to @in_mask,
 * restricted to the CPUs that cpuset_cpus_allowed() reports for the task.
 *
 * Returns 0 on success, -ESRCH if no task is found, -ENOMEM if a temporary
 * cpumask cannot be allocated, -EPERM if the caller neither passes
 * check_same_owner() nor has CAP_SYS_NICE, or the error returned by
 * security_task_setscheduler() / set_cpus_allowed_ptr().
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	cpumask_var_t cpus_allowed, new_mask;
	struct task_struct *p;
	int retval;

	get_online_cpus();
	read_lock(&tasklist_lock);

	p = find_process_by_pid(pid);
	if (!p) {
		read_unlock(&tasklist_lock);
		put_online_cpus();
		return -ESRCH;
	}

	/*
	 * Take a reference on the task so that tasklist_lock can be dropped
	 * before the GFP_KERNEL allocations below; the reference is released
	 * at out_put_task.
	 */
	get_task_struct(p);
	read_unlock(&tasklist_lock);

	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
		retval = -ENOMEM;
		goto out_put_task;
	}
	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
		retval = -ENOMEM;
		goto out_free_cpus_allowed;
	}
	retval = -EPERM;
	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
		goto out_unlock;

	retval = security_task_setscheduler(p, 0, NULL);
	if (retval)
		goto out_unlock;

	/* Requested mask, limited to what the task's cpuset allows. */
	cpuset_cpus_allowed(p, cpus_allowed);
	cpumask_and(new_mask, in_mask, cpus_allowed);
again:
	retval = set_cpus_allowed_ptr(p, new_mask);

	if (!retval) {
		cpuset_cpus_allowed(p, cpus_allowed);
		if (!cpumask_subset(new_mask, cpus_allowed)) {
			/*
			 * The cpuset's allowed CPUs changed while the new
			 * mask was being applied and no longer cover it:
			 * fall back to the cpuset's current mask and retry.
			 */
			cpumask_copy(new_mask, cpus_allowed);
			goto again;
		}
	}
	/* Despite its name, no lock is held here: this label only frees. */
out_unlock:
	free_cpumask_var(new_mask);
out_free_cpus_allowed:
	free_cpumask_var(cpus_allowed);
out_put_task:
	put_task_struct(p);
	put_online_cpus();
	return retval;
}
6559
6560static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6561 struct cpumask *new_mask)
6562{
6563 if (len < cpumask_size())
6564 cpumask_clear(new_mask);
6565 else if (len > cpumask_size())
6566 len = cpumask_size();
6567
6568 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
6569}
6570
6571
6572
6573
6574
6575
6576
6577SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6578 unsigned long __user *, user_mask_ptr)
6579{
6580 cpumask_var_t new_mask;
6581 int retval;
6582
6583 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
6584 return -ENOMEM;
6585
6586 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6587 if (retval == 0)
6588 retval = sched_setaffinity(pid, new_mask);
6589 free_cpumask_var(new_mask);
6590 return retval;
6591}
6592
6593long sched_getaffinity(pid_t pid, struct cpumask *mask)
6594{
6595 struct task_struct *p;
6596 int retval;
6597
6598 get_online_cpus();
6599 read_lock(&tasklist_lock);
6600
6601 retval = -ESRCH;
6602 p = find_process_by_pid(pid);
6603 if (!p)
6604 goto out_unlock;
6605
6606 retval = security_task_getscheduler(p);
6607 if (retval)
6608 goto out_unlock;
6609
6610 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6611
6612out_unlock:
6613 read_unlock(&tasklist_lock);
6614 put_online_cpus();
6615
6616 return retval;
6617}
6618
6619
6620
6621
6622
6623
6624
6625SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6626 unsigned long __user *, user_mask_ptr)
6627{
6628 int ret;
6629 cpumask_var_t mask;
6630
6631 if (len < cpumask_size())
6632 return -EINVAL;
6633
6634 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6635 return -ENOMEM;
6636
6637 ret = sched_getaffinity(pid, mask);
6638 if (ret == 0) {
6639 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
6640 ret = -EFAULT;
6641 else
6642 ret = cpumask_size();
6643 }
6644 free_cpumask_var(mask);
6645
6646 return ret;
6647}
6648
6649
6650
6651
6652
6653
6654
/*
 * sched_yield syscall: invoke the current task's scheduling class
 * yield_task() hook on this CPU's runqueue, then call schedule().
 * Always returns 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
	struct rq *rq = this_rq_lock();

	schedstat_inc(rq, yld_count);
	current->sched_class->yield_task(rq);

	/*
	 * The runqueue lock is dropped by hand (sparse annotation, lockdep
	 * release, raw unlock) and preemption is re-enabled without a
	 * reschedule check, since schedule() is called explicitly right
	 * below.
	 */
	__release(rq->lock);
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
	_raw_spin_unlock(&rq->lock);
	preempt_enable_no_resched();

	schedule();

	return 0;
}
6675
6676static inline int should_resched(void)
6677{
6678 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6679}
6680
/*
 * Call schedule() with PREEMPT_ACTIVE added to the preempt count for the
 * duration of the call.  While the flag is set should_resched() returns 0.
 */
static void __cond_resched(void)
{
	add_preempt_count(PREEMPT_ACTIVE);
	schedule();
	sub_preempt_count(PREEMPT_ACTIVE);
}
6687
6688int __sched _cond_resched(void)
6689{
6690 if (should_resched()) {
6691 __cond_resched();
6692 return 1;
6693 }
6694 return 0;
6695}
6696EXPORT_SYMBOL(_cond_resched);
6697
6698
6699
6700
6701
6702
6703
6704
6705
/*
 * Conditionally drop and re-take @lock, which the caller must hold
 * (checked by lockdep_assert_held()).
 *
 * The lock is released when either spin_needbreak() reports that the lock
 * should be broken or a reschedule is due.  In the latter case the task
 * reschedules while the lock is dropped, otherwise it only executes
 * cpu_relax().  The lock is held again on return.
 *
 * Returns 1 if the lock was dropped and re-acquired, 0 otherwise.
 */
int __cond_resched_lock(spinlock_t *lock)
{
	/* Sampled once so the unlock decision and the action taken agree. */
	int resched = should_resched();
	int ret = 0;

	lockdep_assert_held(lock);

	if (spin_needbreak(lock) || resched) {
		spin_unlock(lock);
		if (resched)
			__cond_resched();
		else
			cpu_relax();
		ret = 1;
		spin_lock(lock);
	}
	return ret;
}
EXPORT_SYMBOL(__cond_resched_lock);
6725
6726int __sched __cond_resched_softirq(void)
6727{
6728 BUG_ON(!in_softirq());
6729
6730 if (should_resched()) {
6731 local_bh_enable();
6732 __cond_resched();
6733 local_bh_disable();
6734 return 1;
6735 }
6736 return 0;
6737}
6738EXPORT_SYMBOL(__cond_resched_softirq);
6739
6740
6741
6742
6743
6744
6745
/*
 * In-kernel yield: mark the current task TASK_RUNNING and go through the
 * sched_yield syscall implementation.
 */
void __sched yield(void)
{
	set_current_state(TASK_RUNNING);
	sys_sched_yield();
}
EXPORT_SYMBOL(yield);
6752
6753
6754
6755
6756
/*
 * schedule() wrapper that accounts the time spent off-CPU as I/O wait:
 * block-I/O delay accounting is started, the runqueue's nr_iowait counter
 * is incremented and current->in_iowait is set before schedule(), and all
 * three are undone in reverse order afterwards.
 */
void __sched io_schedule(void)
{
	/*
	 * The runqueue is looked up once; the same rq is decremented after
	 * schedule() returns.
	 */
	struct rq *rq = raw_rq();

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	current->in_iowait = 1;
	schedule();
	current->in_iowait = 0;
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
}
EXPORT_SYMBOL(io_schedule);
6770
6771long __sched io_schedule_timeout(long timeout)
6772{
6773 struct rq *rq = raw_rq();
6774 long ret;
6775
6776 delayacct_blkio_start();
6777 atomic_inc(&rq->nr_iowait);
6778 current->in_iowait = 1;
6779 ret = schedule_timeout(timeout);
6780 current->in_iowait = 0;
6781 atomic_dec(&rq->nr_iowait);
6782 delayacct_blkio_end();
6783 return ret;
6784}
6785
6786
6787
6788
6789
6790
6791
6792
6793SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6794{
6795 int ret = -EINVAL;
6796
6797 switch (policy) {
6798 case SCHED_FIFO:
6799 case SCHED_RR:
6800 ret = MAX_USER_RT_PRIO-1;
6801 break;
6802 case SCHED_NORMAL:
6803 case SCHED_BATCH:
6804 case SCHED_IDLE:
6805 ret = 0;
6806 break;
6807 }
6808 return ret;
6809}
6810
6811
6812
6813
6814
6815
6816
6817
6818SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6819{
6820 int ret = -EINVAL;
6821
6822 switch (policy) {
6823 case SCHED_FIFO:
6824 case SCHED_RR:
6825 ret = 1;
6826 break;
6827 case SCHED_NORMAL:
6828 case SCHED_BATCH:
6829 case SCHED_IDLE:
6830 ret = 0;
6831 }
6832 return ret;
6833}
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6844 struct timespec __user *, interval)
6845{
6846 struct task_struct *p;
6847 unsigned int time_slice;
6848 int retval;
6849 struct timespec t;
6850
6851 if (pid < 0)
6852 return -EINVAL;
6853
6854 retval = -ESRCH;
6855 read_lock(&tasklist_lock);
6856 p = find_process_by_pid(pid);
6857 if (!p)
6858 goto out_unlock;
6859
6860 retval = security_task_getscheduler(p);
6861 if (retval)
6862 goto out_unlock;
6863
6864 time_slice = p->sched_class->get_rr_interval(p);
6865
6866 read_unlock(&tasklist_lock);
6867 jiffies_to_timespec(time_slice, &t);
6868 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6869 return retval;
6870
6871out_unlock:
6872 read_unlock(&tasklist_lock);
6873 return retval;
6874}
6875
/* One display character per task state; indexed as in sched_show_task(). */
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

/*
 * Print a one-line summary of task @p (command name, state character,
 * saved PC or "running", unused stack, pid, parent pid, thread-info flags)
 * followed by its stack trace.
 */
void sched_show_task(struct task_struct *p)
{
	unsigned long free = 0;
	unsigned state;

	/* Index into stat_nam: 0 for p->state == 0, else first set bit + 1. */
	state = p->state ? __ffs(p->state) + 1 : 0;
	/* Out-of-range indices are shown as '?'. */
	printk(KERN_INFO "%-13.13s %c", p->comm,
		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
	/* The PC column is 8 hex digits on 32-bit and 16 on 64-bit builds. */
#if BITS_PER_LONG == 32
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running ");
	else
		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
#else
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running task ");
	else
		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
	/* Without CONFIG_DEBUG_STACK_USAGE the free-stack column prints 0. */
#ifdef CONFIG_DEBUG_STACK_USAGE
	free = stack_not_used(p);
#endif
	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
		task_pid_nr(p), task_pid_nr(p->real_parent),
		(unsigned long)task_thread_info(p)->flags);

	show_stack(p, NULL);
}
6906
6907void show_state_filter(unsigned long state_filter)
6908{
6909 struct task_struct *g, *p;
6910
6911#if BITS_PER_LONG == 32
6912 printk(KERN_INFO
6913 " task PC stack pid father\n");
6914#else
6915 printk(KERN_INFO
6916 " task PC stack pid father\n");
6917#endif
6918 read_lock(&tasklist_lock);
6919 do_each_thread(g, p) {
6920
6921
6922
6923
6924 touch_nmi_watchdog();
6925 if (!state_filter || (p->state & state_filter))
6926 sched_show_task(p);
6927 } while_each_thread(g, p);
6928
6929 touch_all_softlockup_watchdogs();
6930
6931#ifdef CONFIG_SCHED_DEBUG
6932 sysrq_sched_debug_show();
6933#endif
6934 read_unlock(&tasklist_lock);
6935
6936
6937
6938 if (state_filter == -1)
6939 debug_show_all_locks();
6940}
6941
/* Put @idle into the idle scheduling class. */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}
6946
6947
6948
6949
6950
6951
6952
6953
6954
/*
 * Set up @idle as the idle task of @cpu.
 *
 * Under the runqueue lock the task's scheduler state is reset with
 * __sched_fork(), its priority is set to MAX_PRIO, it is restricted to
 * @cpu, and it is installed as both rq->idle and rq->curr.  Afterwards its
 * preempt count and scheduling class are initialised.
 */
void __cpuinit init_idle(struct task_struct *idle, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	spin_lock_irqsave(&rq->lock, flags);

	__sched_fork(idle);
	idle->se.exec_start = sched_clock();

	idle->prio = idle->normal_prio = MAX_PRIO;
	/* The idle task may only ever run on its own CPU. */
	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
	__set_task_cpu(idle, cpu);

	rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	idle->oncpu = 1;
#endif
	spin_unlock_irqrestore(&rq->lock, flags);

	/*
	 * Initial preempt count: with CONFIG_PREEMPT it is 1 when
	 * idle->lock_depth >= 0 and 0 otherwise; without it, always 0.
	 */
#if defined(CONFIG_PREEMPT)
	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
#else
	task_thread_info(idle)->preempt_count = 0;
#endif
	/*
	 * The idle task uses the idle scheduling class from here on.
	 */
	idle->sched_class = &idle_sched_class;
	ftrace_graph_init_task(idle);
}
6987
6988
6989
6990
6991
6992
6993
6994
/*
 * Global CPU mask; not referenced elsewhere in this part of the file.
 * NOTE(review): presumably tracks CPUs whose periodic tick is stopped
 * (NO_HZ) -- confirm against the users of this mask.
 */
cpumask_var_t nohz_cpu_mask;
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006static inline void sched_init_granularity(void)
7007{
7008 unsigned int factor = 1 + ilog2(num_online_cpus());
7009 const unsigned long limit = 200000000;
7010
7011 sysctl_sched_min_granularity *= factor;
7012 if (sysctl_sched_min_granularity > limit)
7013 sysctl_sched_min_granularity = limit;
7014
7015 sysctl_sched_latency *= factor;
7016 if (sysctl_sched_latency > limit)
7017 sysctl_sched_latency = limit;
7018
7019 sysctl_sched_wakeup_granularity *= factor;
7020
7021 sysctl_sched_shares_ratelimit *= factor;
7022}
7023
7024#ifdef CONFIG_SMP
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7051{
7052 struct migration_req req;
7053 unsigned long flags;
7054 struct rq *rq;
7055 int ret = 0;
7056
7057 rq = task_rq_lock(p, &flags);
7058 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
7059 ret = -EINVAL;
7060 goto out;
7061 }
7062
7063 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
7064 !cpumask_equal(&p->cpus_allowed, new_mask))) {
7065 ret = -EINVAL;
7066 goto out;
7067 }
7068
7069 if (p->sched_class->set_cpus_allowed)
7070 p->sched_class->set_cpus_allowed(p, new_mask);
7071 else {
7072 cpumask_copy(&p->cpus_allowed, new_mask);
7073 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
7074 }
7075
7076
7077 if (cpumask_test_cpu(task_cpu(p), new_mask))
7078 goto out;
7079
7080 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
7081
7082 struct task_struct *mt = rq->migration_thread;
7083
7084 get_task_struct(mt);
7085 task_rq_unlock(rq, &flags);
7086 wake_up_process(rq->migration_thread);
7087 put_task_struct(mt);
7088 wait_for_completion(&req.done);
7089 tlb_migrate_finish(p->mm);
7090 return 0;
7091 }
7092out:
7093 task_rq_unlock(rq, &flags);
7094
7095 return ret;
7096}
7097EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
/*
 * Move task @p from @src_cpu's runqueue to @dest_cpu's.
 *
 * Returns 1 when the task was moved, or when it was no longer on @src_cpu
 * by the time both runqueue locks were held.  Returns 0 when @dest_cpu is
 * not active or is not in the task's cpus_allowed mask.
 *
 * Both runqueue locks are taken with double_rq_lock().
 * NOTE(review): callers in this file disable interrupts first -- confirm
 * that is a requirement of double_rq_lock().
 */
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
	struct rq *rq_dest, *rq_src;
	int ret = 0, on_rq;

	if (unlikely(!cpu_active(dest_cpu)))
		return ret;

	rq_src = cpu_rq(src_cpu);
	rq_dest = cpu_rq(dest_cpu);

	double_rq_lock(rq_src, rq_dest);
	/* Already moved. */
	if (task_cpu(p) != src_cpu)
		goto done;
	/* Affinity changed (again). */
	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
		goto fail;

	/* A queued task is dequeued here and re-queued on the destination. */
	on_rq = p->se.on_rq;
	if (on_rq)
		deactivate_task(rq_src, p, 0);

	set_task_cpu(p, dest_cpu);
	if (on_rq) {
		activate_task(rq_dest, p, 0);
		check_preempt_curr(rq_dest, p, 0);
	}
done:
	ret = 1;
fail:
	double_rq_unlock(rq_src, rq_dest);
	return ret;
}
7144
/*
 * Status codes stored in migration_req::dest_cpu for requests that carry no
 * task (see migration_thread(), which writes GOT_QS or MUST_SYNC).
 * NOTE(review): IDLE and NEED_QS are not used in this part of the file;
 * their meaning should be confirmed at the site that queues such requests.
 */
#define RCU_MIGRATION_IDLE	0
#define RCU_MIGRATION_NEED_QS	1
#define RCU_MIGRATION_GOT_QS	2
#define RCU_MIGRATION_MUST_SYNC	3
7149
7150
7151
7152
7153
7154
/*
 * Per-CPU migration kthread.  @data carries the CPU number.
 *
 * The thread loops until kthread_should_stop() or until its CPU is found
 * offline.  On each pass it runs active load balancing if requested and
 * then services one request from the runqueue's migration_queue, sleeping
 * in TASK_INTERRUPTIBLE when the queue is empty.  Returns 0.
 */
static int migration_thread(void *data)
{
	int badcpu;
	int cpu = (long)data;
	struct rq *rq;

	rq = cpu_rq(cpu);
	BUG_ON(rq->migration_thread != current);

	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		struct migration_req *req;
		struct list_head *head;

		spin_lock_irq(&rq->lock);

		if (cpu_is_offline(cpu)) {
			spin_unlock_irq(&rq->lock);
			break;
		}

		if (rq->active_balance) {
			active_load_balance(rq, cpu);
			rq->active_balance = 0;
		}

		head = &rq->migration_queue;

		/* Nothing queued: sleep until woken, then re-check. */
		if (list_empty(head)) {
			spin_unlock_irq(&rq->lock);
			schedule();
			set_current_state(TASK_INTERRUPTIBLE);
			continue;
		}
		req = list_entry(head->next, struct migration_req, list);
		list_del_init(head->next);

		/*
		 * All three branches drop rq->lock with plain spin_unlock();
		 * interrupts stay disabled until local_irq_enable() below, so
		 * __migrate_task() runs with interrupts off.
		 */
		if (req->task != NULL) {
			spin_unlock(&rq->lock);
			__migrate_task(req->task, cpu, req->dest_cpu);
		} else if (likely(cpu == (badcpu = smp_processor_id()))) {
			/* Task-less request serviced on the expected CPU. */
			req->dest_cpu = RCU_MIGRATION_GOT_QS;
			spin_unlock(&rq->lock);
		} else {
			/* Running on an unexpected CPU: report it and warn. */
			req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
			spin_unlock(&rq->lock);
			WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
		}
		local_irq_enable();

		/* Wake whoever is waiting on this request. */
		complete(&req->done);
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}
7211
7212#ifdef CONFIG_HOTPLUG_CPU
7213
/*
 * Run __migrate_task() with local interrupts disabled.  Interrupts are
 * unconditionally re-enabled afterwards.  Returns __migrate_task()'s result.
 */
static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
{
	int ret;

	local_irq_disable();
	ret = __migrate_task(p, src_cpu, dest_cpu);
	local_irq_enable();
	return ret;
}
7223
7224
7225
7226
/*
 * Find a new CPU for task @p, which is on @dead_cpu, and migrate it there.
 * The selection is retried until __migrate_task_irq() succeeds.
 */
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
	int dest_cpu;
	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));

again:
	/* First choice: an allowed, online CPU on the dead CPU's node. */
	for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
			goto move;

	/* Second choice: any allowed, online CPU. */
	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
	if (dest_cpu < nr_cpu_ids)
		goto move;

	/*
	 * Last resort: no allowed CPU is online, so replace the task's
	 * affinity mask with what its cpuset allows and pick from that.
	 */
	if (dest_cpu >= nr_cpu_ids) {
		cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
		dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);

		/*
		 * The affinity break is only logged, rate-limited, for tasks
		 * that have an mm.
		 */
		if (p->mm && printk_ratelimit()) {
			printk(KERN_INFO "process %d (%s) no "
			       "longer affine to cpu%d\n",
			       task_pid_nr(p), p->comm, dead_cpu);
		}
	}

move:
	/* It can have affinity changed while we were choosing. */
	if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
		goto again;
}
7265
7266
7267
7268
7269
7270
7271
7272
/*
 * Fold @rq_src's nr_uninterruptible count into the runqueue of an online
 * CPU picked with cpumask_any(), and reset it to zero on @rq_src.  Both
 * runqueues are locked, with interrupts disabled, during the transfer.
 */
static void migrate_nr_uninterruptible(struct rq *rq_src)
{
	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
	unsigned long flags;

	local_irq_save(flags);
	double_rq_lock(rq_src, rq_dest);
	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
	rq_src->nr_uninterruptible = 0;
	double_rq_unlock(rq_src, rq_dest);
	local_irq_restore(flags);
}
7285
7286
/*
 * Walk all threads under tasklist_lock and move every task that is on
 * @src_cpu, other than the caller itself, to another CPU.
 */
static void migrate_live_tasks(int src_cpu)
{
	struct task_struct *p, *t;

	read_lock(&tasklist_lock);

	do_each_thread(t, p) {
		/* The caller is not migrated by this walk. */
		if (p == current)
			continue;

		if (task_cpu(p) == src_cpu)
			move_task_off_dead_cpu(src_cpu, p);
	} while_each_thread(t, p);

	read_unlock(&tasklist_lock);
}
7303
7304
7305
7306
7307
7308
/*
 * Make this CPU's idle task a SCHED_FIFO task at priority MAX_RT_PRIO-1 and
 * put it on the runqueue.  Must be called on a CPU that is already marked
 * offline.
 */
void sched_idle_next(void)
{
	int this_cpu = smp_processor_id();
	struct rq *rq = cpu_rq(this_cpu);
	struct task_struct *p = rq->idle;
	unsigned long flags;

	/* cpu has to be offline */
	BUG_ON(cpu_online(this_cpu));

	/*
	 * The policy change and the enqueue are done together under the
	 * runqueue lock.
	 */
	spin_lock_irqsave(&rq->lock, flags);

	__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);

	update_rq_clock(rq);
	activate_task(rq, p, 0);

	spin_unlock_irqrestore(&rq->lock, flags);
}
7332
7333
7334
7335
7336
/*
 * Called on an offline CPU: if the current task's active_mm is not init_mm,
 * switch to init_mm; then drop the reference on the mm that was in use.
 */
void idle_task_exit(void)
{
	struct mm_struct *mm = current->active_mm;

	BUG_ON(cpu_online(smp_processor_id()));

	if (mm != &init_mm)
		switch_mm(mm, &init_mm, current);
	mmdrop(mm);
}
7347
7348
/*
 * Move an exiting task @p off @dead_cpu.
 * Called with @dead_cpu's rq->lock held and interrupts disabled; the lock
 * is dropped around the migration and re-taken before returning.
 */
static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
{
	struct rq *rq = cpu_rq(dead_cpu);

	/* Must be exiting, otherwise would be on tasklist. */
	BUG_ON(!p->exit_state);

	/* Cannot have done final schedule yet: would have vanished. */
	BUG_ON(p->state == TASK_DEAD);

	get_task_struct(p);

	/*
	 * rq->lock is released while the task is moved, so a reference is
	 * held across that window to keep the task_struct valid.
	 */
	spin_unlock_irq(&rq->lock);
	move_task_off_dead_cpu(dead_cpu, p);
	spin_lock_irq(&rq->lock);

	put_task_struct(p);
}
7372
7373
7374static void migrate_dead_tasks(unsigned int dead_cpu)
7375{
7376 struct rq *rq = cpu_rq(dead_cpu);
7377 struct task_struct *next;
7378
7379 for ( ; ; ) {
7380 if (!rq->nr_running)
7381 break;
7382 update_rq_clock(rq);
7383 next = pick_next_task(rq);
7384 if (!next)
7385 break;
7386 next->sched_class->put_prev_task(rq, next);
7387 migrate_dead(dead_cpu, next);
7388
7389 }
7390}
7391
7392
7393
7394
/*
 * Subtract @rq's calc_load_active contribution from the global
 * calc_load_tasks counter and reset the per-runqueue value to zero.
 */
static void calc_global_load_remove(struct rq *rq)
{
	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
	rq->calc_load_active = 0;
}
7400#endif
7401
7402#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
7403
/*
 * "sched_domain" sysctl directory.  Its .child pointer is filled in by
 * register_sched_domain_sysctl() and cleared again on unregister.
 */
static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
		.mode		= 0555,
	},
	{0, },	/* table terminator */
};
7411
/* Root table passed to register_sysctl_table(): "kernel" -> sd_ctl_dir. */
static struct ctl_table sd_ctl_root[] = {
	{
		.ctl_name	= CTL_KERN,
		.procname	= "kernel",
		.mode		= 0555,
		.child		= sd_ctl_dir,
	},
	{0, },	/* table terminator */
};
7421
7422static struct ctl_table *sd_alloc_ctl_entry(int n)
7423{
7424 struct ctl_table *entry =
7425 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
7426
7427 return entry;
7428}
7429
/*
 * Recursively free the sysctl table at *@tablep and set *@tablep to NULL.
 */
static void sd_free_ctl_entry(struct ctl_table **tablep)
{
	struct ctl_table *entry;

	/*
	 * The table ends at the first entry with a zero mode.  An entry
	 * without a proc_handler is a directory whose procname was allocated
	 * with kstrdup(), so the name is freed here; entries with a handler
	 * use string literals for their names.
	 */
	for (entry = *tablep; entry->mode; entry++) {
		if (entry->child)
			sd_free_ctl_entry(&entry->child);
		if (entry->proc_handler == NULL)
			kfree(entry->procname);
	}

	kfree(*tablep);
	*tablep = NULL;
}
7450
/*
 * Fill in one sysctl table @entry with the given name, data pointer,
 * maximum length, access mode and handler.  @procname is stored by pointer,
 * not copied.
 */
static void
set_table_entry(struct ctl_table *entry,
		const char *procname, void *data, int maxlen,
		mode_t mode, proc_handler *proc_handler)
{
	entry->procname = procname;
	entry->data = data;
	entry->maxlen = maxlen;
	entry->mode = mode;
	entry->proc_handler = proc_handler;
}
7462
/*
 * Build the sysctl table exposing the tunables of sched domain @sd.
 * Returns the new table, or NULL if it cannot be allocated.
 *
 * 13 entries are allocated and 12 are filled in; the last one stays zeroed
 * and terminates the table.
 */
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
	struct ctl_table *table = sd_alloc_ctl_entry(13);

	if (table == NULL)
		return NULL;

	set_table_entry(&table[0], "min_interval", &sd->min_interval,
		sizeof(long), 0644, proc_doulongvec_minmax);
	set_table_entry(&table[1], "max_interval", &sd->max_interval,
		sizeof(long), 0644, proc_doulongvec_minmax);
	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[9], "cache_nice_tries",
		&sd->cache_nice_tries,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[10], "flags", &sd->flags,
		sizeof(int), 0644, proc_dointvec_minmax);
	/* The domain name is exported read-only. */
	set_table_entry(&table[11], "name", sd->name,
		CORENAME_MAX_SIZE, 0444, proc_dostring);
	/* &table[12] is terminator */

	return table;
}
7500
7501static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7502{
7503 struct ctl_table *entry, *table;
7504 struct sched_domain *sd;
7505 int domain_num = 0, i;
7506 char buf[32];
7507
7508 for_each_domain(cpu, sd)
7509 domain_num++;
7510 entry = table = sd_alloc_ctl_entry(domain_num + 1);
7511 if (table == NULL)
7512 return NULL;
7513
7514 i = 0;
7515 for_each_domain(cpu, sd) {
7516 snprintf(buf, 32, "domain%d", i);
7517 entry->procname = kstrdup(buf, GFP_KERNEL);
7518 entry->mode = 0555;
7519 entry->child = sd_alloc_ctl_domain_table(sd);
7520 entry++;
7521 i++;
7522 }
7523 return table;
7524}
7525
7526static struct ctl_table_header *sd_sysctl_header;
7527static void register_sched_domain_sysctl(void)
7528{
7529 int i, cpu_num = num_online_cpus();
7530 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7531 char buf[32];
7532
7533 WARN_ON(sd_ctl_dir[0].child);
7534 sd_ctl_dir[0].child = entry;
7535
7536 if (entry == NULL)
7537 return;
7538
7539 for_each_online_cpu(i) {
7540 snprintf(buf, 32, "cpu%d", i);
7541 entry->procname = kstrdup(buf, GFP_KERNEL);
7542 entry->mode = 0555;
7543 entry->child = sd_alloc_ctl_cpu_table(i);
7544 entry++;
7545 }
7546
7547 WARN_ON(sd_sysctl_header);
7548 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
7549}
7550
7551
7552static void unregister_sched_domain_sysctl(void)
7553{
7554 if (sd_sysctl_header)
7555 unregister_sysctl_table(sd_sysctl_header);
7556 sd_sysctl_header = NULL;
7557 if (sd_ctl_dir[0].child)
7558 sd_free_ctl_entry(&sd_ctl_dir[0].child);
7559}
7560#else
/*
 * No-op stubs used when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are not both
 * enabled.
 */
static void register_sched_domain_sysctl(void)
{
}
static void unregister_sched_domain_sysctl(void)
{
}
7567#endif
7568
7569static void set_rq_online(struct rq *rq)
7570{
7571 if (!rq->online) {
7572 const struct sched_class *class;
7573
7574 cpumask_set_cpu(rq->cpu, rq->rd->online);
7575 rq->online = 1;
7576
7577 for_each_class(class) {
7578 if (class->rq_online)
7579 class->rq_online(rq);
7580 }
7581 }
7582}
7583
7584static void set_rq_offline(struct rq *rq)
7585{
7586 if (rq->online) {
7587 const struct sched_class *class;
7588
7589 for_each_class(class) {
7590 if (class->rq_offline)
7591 class->rq_offline(rq);
7592 }
7593
7594 cpumask_clear_cpu(rq->cpu, rq->rd->online);
7595 rq->online = 0;
7596 }
7597}
7598
7599
7600
7601
7602
/*
 * CPU hotplug notifier callback for the scheduler's migration machinery.
 * @hcpu carries the CPU number; @action selects the hotplug phase.
 *
 * Returns NOTIFY_BAD if the migration thread cannot be created during
 * CPU_UP_PREPARE, NOTIFY_OK otherwise (including for unhandled actions).
 */
static int __cpuinit
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	struct task_struct *p;
	int cpu = (long)hcpu;
	unsigned long flags;
	struct rq *rq;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		/* Create the CPU's migration thread and bind it to the CPU. */
		p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
		if (IS_ERR(p))
			return NOTIFY_BAD;
		kthread_bind(p, cpu);
		/* Must be high prio: stop_machine expects to yield to it. */
		rq = task_rq_lock(p, &flags);
		__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
		task_rq_unlock(rq, &flags);
		/* The reference is dropped when the thread is stopped. */
		get_task_struct(p);
		cpu_rq(cpu)->migration_thread = p;
		rq->calc_load_update = calc_load_update;
		break;

	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		/* Strictly unnecessary, as first user will wake it. */
		wake_up_process(cpu_rq(cpu)->migration_thread);

		/* Update our root-domain */
		rq = cpu_rq(cpu);
		spin_lock_irqsave(&rq->lock, flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

			set_rq_online(rq);
		}
		spin_unlock_irqrestore(&rq->lock, flags);
		break;

#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		if (!cpu_rq(cpu)->migration_thread)
			break;
		/*
		 * The thread is re-bound to some online CPU before being
		 * stopped, then its reference is dropped and the runqueue's
		 * pointer cleared.
		 */
		kthread_bind(cpu_rq(cpu)->migration_thread,
			     cpumask_any(cpu_online_mask));
		kthread_stop(cpu_rq(cpu)->migration_thread);
		put_task_struct(cpu_rq(cpu)->migration_thread);
		cpu_rq(cpu)->migration_thread = NULL;
		break;

	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
		migrate_live_tasks(cpu);
		rq = cpu_rq(cpu);
		kthread_stop(rq->migration_thread);
		put_task_struct(rq->migration_thread);
		rq->migration_thread = NULL;
		/*
		 * Take the idle task off the runqueue and return it to the
		 * idle scheduling class (sched_idle_next() had made it a
		 * SCHED_FIFO task), then migrate the remaining tasks.
		 */
		spin_lock_irq(&rq->lock);
		update_rq_clock(rq);
		deactivate_task(rq, rq->idle, 0);
		rq->idle->static_prio = MAX_PRIO;
		__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
		rq->idle->sched_class = &idle_sched_class;
		migrate_dead_tasks(cpu);
		spin_unlock_irq(&rq->lock);
		cpuset_unlock();
		migrate_nr_uninterruptible(rq);
		BUG_ON(rq->nr_running != 0);
		calc_global_load_remove(rq);
		/*
		 * No migration thread is left to service this runqueue's
		 * queue, so complete every pending request here to release
		 * its waiter.  The lock is dropped around each complete().
		 */
		spin_lock_irq(&rq->lock);
		while (!list_empty(&rq->migration_queue)) {
			struct migration_req *req;

			req = list_entry(rq->migration_queue.next,
					 struct migration_req, list);
			list_del_init(&req->list);
			spin_unlock_irq(&rq->lock);
			complete(&req->done);
			spin_lock_irq(&rq->lock);
		}
		spin_unlock_irq(&rq->lock);
		break;

	case CPU_DYING:
	case CPU_DYING_FROZEN:
		/* Update our root-domain */
		rq = cpu_rq(cpu);
		spin_lock_irqsave(&rq->lock, flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
			set_rq_offline(rq);
		}
		spin_unlock_irqrestore(&rq->lock, flags);
		break;
#endif
	}
	return NOTIFY_OK;
}
7712
7713
7714
7715
7716
7717
/*
 * CPU notifier that routes hotplug events to migration_call().
 * NOTE(review): priority 10 is presumably chosen so this callback runs
 * ahead of default-priority notifiers -- confirm the ordering rules of the
 * CPU notifier chain.
 */
static struct notifier_block __cpuinitdata migration_notifier = {
	.notifier_call = migration_call,
	.priority = 10
};
7722
/*
 * Early initcall: bring up the migration thread for the CPU running this
 * code by invoking migration_call() by hand for CPU_UP_PREPARE and
 * CPU_ONLINE, then register the notifier for later hotplug events.
 * Always returns 0; failure to create the thread is a BUG.
 */
static int __init migration_init(void)
{
	void *cpu = (void *)(long)smp_processor_id();
	int err;

	/* Start one for the boot CPU: */
	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
	BUG_ON(err == NOTIFY_BAD);
	migration_call(&migration_notifier, CPU_ONLINE, cpu);
	register_cpu_notifier(&migration_notifier);

	return 0;
}
early_initcall(migration_init);
7737#endif
7738
7739#ifdef CONFIG_SMP
7740
7741#ifdef CONFIG_SCHED_DEBUG
7742
/*
 * Print and sanity-check one level of @cpu's sched-domain hierarchy.
 *
 * @sd:        domain to describe
 * @cpu:       CPU the hierarchy belongs to
 * @level:     nesting depth, used for indentation and the "domain N" label
 * @groupmask: scratch mask, cleared here and used to accumulate the CPUs
 *             of all groups seen
 *
 * Returns -1 when the domain does not load-balance (the caller stops
 * walking up the hierarchy), 0 otherwise.  Consistency problems are only
 * reported with KERN_ERR messages; they do not change the return value.
 */
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				  struct cpumask *groupmask)
{
	struct sched_group *group = sd->groups;
	char str[256];

	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
	cpumask_clear(groupmask);

	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);

	if (!(sd->flags & SD_LOAD_BALANCE)) {
		printk("does not load-balance\n");
		if (sd->parent)
			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
					" has parent");
		return -1;
	}

	printk(KERN_CONT "span %s level %s\n", str, sd->name);

	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
		printk(KERN_ERR "ERROR: domain->span does not contain "
				"CPU%d\n", cpu);
	}
	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
		printk(KERN_ERR "ERROR: domain->groups does not contain"
				" CPU%d\n", cpu);
	}

	printk(KERN_DEBUG "%*s groups:", level + 1, "");
	/*
	 * Walk the circular group list once.  The walk is aborted at the
	 * first malformed group.
	 */
	do {
		if (!group) {
			printk("\n");
			printk(KERN_ERR "ERROR: group is NULL\n");
			break;
		}

		if (!group->cpu_power) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: domain->cpu_power not "
					"set\n");
			break;
		}

		if (!cpumask_weight(sched_group_cpus(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: empty group\n");
			break;
		}

		/* A CPU must not appear in more than one group. */
		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: repeated CPUs\n");
			break;
		}

		cpumask_or(groupmask, groupmask, sched_group_cpus(group));

		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));

		printk(KERN_CONT " %s", str);
		/* cpu_power is only printed when it is not SCHED_LOAD_SCALE. */
		if (group->cpu_power != SCHED_LOAD_SCALE) {
			printk(KERN_CONT " (cpu_power = %d)",
				group->cpu_power);
		}

		group = group->next;
	} while (group != sd->groups);
	printk(KERN_CONT "\n");

	/* Together the groups must cover exactly the domain's span. */
	if (!cpumask_equal(sched_domain_span(sd), groupmask))
		printk(KERN_ERR "ERROR: groups don't span domain->span\n");

	if (sd->parent &&
	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
		printk(KERN_ERR "ERROR: parent span is not a superset "
			"of domain->span\n");
	return 0;
}
7823
7824static void sched_domain_debug(struct sched_domain *sd, int cpu)
7825{
7826 cpumask_var_t groupmask;
7827 int level = 0;
7828
7829 if (!sd) {
7830 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7831 return;
7832 }
7833
7834 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
7835
7836 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
7837 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
7838 return;
7839 }
7840
7841 for (;;) {
7842 if (sched_domain_debug_one(sd, cpu, level, groupmask))
7843 break;
7844 level++;
7845 sd = sd->parent;
7846 if (!sd)
7847 break;
7848 }
7849 free_cpumask_var(groupmask);
7850}
7851#else
7852# define sched_domain_debug(sd, cpu) do { } while (0)
7853#endif
7854
7855static int sd_degenerate(struct sched_domain *sd)
7856{
7857 if (cpumask_weight(sched_domain_span(sd)) == 1)
7858 return 1;
7859
7860
7861 if (sd->flags & (SD_LOAD_BALANCE |
7862 SD_BALANCE_NEWIDLE |
7863 SD_BALANCE_FORK |
7864 SD_BALANCE_EXEC |
7865 SD_SHARE_CPUPOWER |
7866 SD_SHARE_PKG_RESOURCES)) {
7867 if (sd->groups != sd->groups->next)
7868 return 0;
7869 }
7870
7871
7872 if (sd->flags & (SD_WAKE_AFFINE))
7873 return 0;
7874
7875 return 1;
7876}
7877
7878static int
7879sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7880{
7881 unsigned long cflags = sd->flags, pflags = parent->flags;
7882
7883 if (sd_degenerate(parent))
7884 return 1;
7885
7886 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7887 return 0;
7888
7889
7890 if (parent->groups == parent->groups->next) {
7891 pflags &= ~(SD_LOAD_BALANCE |
7892 SD_BALANCE_NEWIDLE |
7893 SD_BALANCE_FORK |
7894 SD_BALANCE_EXEC |
7895 SD_SHARE_CPUPOWER |
7896 SD_SHARE_PKG_RESOURCES);
7897 if (nr_node_ids == 1)
7898 pflags &= ~SD_SERIALIZE;
7899 }
7900 if (~cflags & pflags)
7901 return 0;
7902
7903 return 1;
7904}
7905
/*
 * Release everything owned by root domain @rd (cpupri state and the three
 * cpumasks) and then free @rd itself with kfree().
 */
static void free_rootdomain(struct root_domain *rd)
{
	cpupri_cleanup(&rd->cpupri);

	free_cpumask_var(rd->rto_mask);
	free_cpumask_var(rd->online);
	free_cpumask_var(rd->span);
	kfree(rd);
}
7915
/*
 * Attach runqueue @rq to root domain @rd, detaching it from its previous
 * root domain if it had one.  The previous root domain is freed when this
 * was its last reference.
 */
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *old_rd = NULL;
	unsigned long flags;

	spin_lock_irqsave(&rq->lock, flags);

	if (rq->rd) {
		old_rd = rq->rd;

		if (cpumask_test_cpu(rq->cpu, old_rd->online))
			set_rq_offline(rq);

		cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * old_rd is kept non-NULL only when the last reference was
		 * just dropped; it is then freed below, after rq->lock has
		 * been released.
		 */
		if (!atomic_dec_and_test(&old_rd->refcount))
			old_rd = NULL;
	}

	atomic_inc(&rd->refcount);
	rq->rd = rd;

	cpumask_set_cpu(rq->cpu, rd->span);
	/* Only runqueues of active CPUs are marked online in the new rd. */
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
		set_rq_online(rq);

	spin_unlock_irqrestore(&rq->lock, flags);

	if (old_rd)
		free_rootdomain(old_rd);
}
7952
7953static int init_rootdomain(struct root_domain *rd, bool bootmem)
7954{
7955 gfp_t gfp = GFP_KERNEL;
7956
7957 memset(rd, 0, sizeof(*rd));
7958
7959 if (bootmem)
7960 gfp = GFP_NOWAIT;
7961
7962 if (!alloc_cpumask_var(&rd->span, gfp))
7963 goto out;
7964 if (!alloc_cpumask_var(&rd->online, gfp))
7965 goto free_span;
7966 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
7967 goto free_online;
7968
7969 if (cpupri_init(&rd->cpupri, bootmem) != 0)
7970 goto free_rto_mask;
7971 return 0;
7972
7973free_rto_mask:
7974 free_cpumask_var(rd->rto_mask);
7975free_online:
7976 free_cpumask_var(rd->online);
7977free_span:
7978 free_cpumask_var(rd->span);
7979out:
7980 return -ENOMEM;
7981}
7982
7983static void init_defrootdomain(void)
7984{
7985 init_rootdomain(&def_root_domain, true);
7986
7987 atomic_set(&def_root_domain.refcount, 1);
7988}
7989
7990static struct root_domain *alloc_rootdomain(void)
7991{
7992 struct root_domain *rd;
7993
7994 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
7995 if (!rd)
7996 return NULL;
7997
7998 if (init_rootdomain(rd, false) != 0) {
7999 kfree(rd);
8000 return NULL;
8001 }
8002
8003 return rd;
8004}
8005
8006
8007
8008
8009
8010static void
8011cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
8012{
8013 struct rq *rq = cpu_rq(cpu);
8014 struct sched_domain *tmp;
8015
8016
8017 for (tmp = sd; tmp; ) {
8018 struct sched_domain *parent = tmp->parent;
8019 if (!parent)
8020 break;
8021
8022 if (sd_parent_degenerate(tmp, parent)) {
8023 tmp->parent = parent->parent;
8024 if (parent->parent)
8025 parent->parent->child = tmp;
8026 } else
8027 tmp = tmp->parent;
8028 }
8029
8030 if (sd && sd_degenerate(sd)) {
8031 sd = sd->parent;
8032 if (sd)
8033 sd->child = NULL;
8034 }
8035
8036 sched_domain_debug(sd, cpu);
8037
8038 rq_attach_root(rq, rd);
8039 rcu_assign_pointer(rq->sd, sd);
8040}
8041
8042
8043static cpumask_var_t cpu_isolated_map;
8044
8045
8046static int __init isolated_cpu_setup(char *str)
8047{
8048 cpulist_parse(str, cpu_isolated_map);
8049 return 1;
8050}
8051
8052__setup("isolcpus=", isolated_cpu_setup);
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064static void
8065init_sched_build_groups(const struct cpumask *span,
8066 const struct cpumask *cpu_map,
8067 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
8068 struct sched_group **sg,
8069 struct cpumask *tmpmask),
8070 struct cpumask *covered, struct cpumask *tmpmask)
8071{
8072 struct sched_group *first = NULL, *last = NULL;
8073 int i;
8074
8075 cpumask_clear(covered);
8076
8077 for_each_cpu(i, span) {
8078 struct sched_group *sg;
8079 int group = group_fn(i, cpu_map, &sg, tmpmask);
8080 int j;
8081
8082 if (cpumask_test_cpu(i, covered))
8083 continue;
8084
8085 cpumask_clear(sched_group_cpus(sg));
8086 sg->cpu_power = 0;
8087
8088 for_each_cpu(j, span) {
8089 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
8090 continue;
8091
8092 cpumask_set_cpu(j, covered);
8093 cpumask_set_cpu(j, sched_group_cpus(sg));
8094 }
8095 if (!first)
8096 first = sg;
8097 if (last)
8098 last->next = sg;
8099 last = sg;
8100 }
8101 last->next = first;
8102}
8103
/* Upper bound on the number of nodes folded into one node-level span. */
#define SD_NODES_PER_DOMAIN 16

#ifdef CONFIG_NUMA

/*
 * find_next_best_node - pick the closest not-yet-used node that has CPUs
 * @node:       node whose neighbourhood is being built
 * @used_nodes: nodes already chosen; the result is added to this set
 *
 * Candidates are scanned starting at @node and wrapping around
 * nr_node_ids.  Nodes without CPUs and nodes already in @used_nodes are
 * skipped; among the rest the one with the smallest node_distance() from
 * @node wins (first one found wins ties).
 *
 * Returns the chosen node, or -1 when no candidate is left.  Previously the
 * function returned node 0 in that case and marked it used, which made the
 * "nothing found" outcome indistinguishable from a real pick of node 0.
 */
static int find_next_best_node(int node, nodemask_t *used_nodes)
{
	int i, n, val, min_val, best_node = -1;

	min_val = INT_MAX;

	for (i = 0; i < nr_node_ids; i++) {
		/* Start at @node and wrap around. */
		n = (node + i) % nr_node_ids;

		/* Nodes without CPUs cannot contribute to a span. */
		if (!nr_cpus_node(n))
			continue;

		/* Skip nodes that were picked earlier. */
		if (node_isset(n, *used_nodes))
			continue;

		val = node_distance(node, n);

		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}

	if (best_node != -1)
		node_set(best_node, *used_nodes);

	return best_node;
}

/*
 * sched_domain_node_span - compute the CPU span of @node's node-level domain
 * @node: node to build the span for
 * @span: resulting cpumask (overwritten)
 *
 * The span holds the CPUs of @node plus those of up to
 * SD_NODES_PER_DOMAIN - 1 further nodes, chosen nearest-first by
 * find_next_best_node().  The search stops early once no eligible node
 * remains.
 */
static void sched_domain_node_span(int node, struct cpumask *span)
{
	nodemask_t used_nodes;
	int i;

	cpumask_clear(span);
	nodes_clear(used_nodes);

	cpumask_or(span, span, cpumask_of_node(node));
	node_set(node, used_nodes);

	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
		int next_node = find_next_best_node(node, &used_nodes);

		/* Every node with CPUs is already part of the span. */
		if (next_node < 0)
			break;

		cpumask_or(span, span, cpumask_of_node(next_node));
	}
}
#endif
8175
/*
 * Power-savings balance levels, both starting at 0.  They are written by
 * sched_power_savings_store() and shown through the matching sysfs files.
 */
int sched_smt_power_savings = 0;
int sched_mc_power_savings = 0;
8177
8178
8179
8180
8181
8182
8183
8184struct static_sched_group {
8185 struct sched_group sg;
8186 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
8187};
8188
8189struct static_sched_domain {
8190 struct sched_domain sd;
8191 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8192};
8193
8194struct s_data {
8195#ifdef CONFIG_NUMA
8196 int sd_allnodes;
8197 cpumask_var_t domainspan;
8198 cpumask_var_t covered;
8199 cpumask_var_t notcovered;
8200#endif
8201 cpumask_var_t nodemask;
8202 cpumask_var_t this_sibling_map;
8203 cpumask_var_t this_core_map;
8204 cpumask_var_t send_covered;
8205 cpumask_var_t tmpmask;
8206 struct sched_group **sched_group_nodes;
8207 struct root_domain *rd;
8208};
8209
/*
 * Allocation progress markers for struct s_data.
 *
 * __visit_domain_allocation_hell() returns the marker of the last allocation
 * that succeeded; __free_domain_allocs() enters its switch at the given
 * marker and falls through, so the enumerators are listed from "everything
 * allocated" down to "nothing allocated".
 */
enum s_alloc {
	sa_sched_groups = 0,
	sa_rootdomain,
	sa_tmpmask,
	sa_send_covered,
	sa_this_core_map,
	sa_this_sibling_map,
	sa_nodemask,
	sa_sched_group_nodes,
#ifdef CONFIG_NUMA
	sa_notcovered,
	sa_covered,
	sa_domainspan,
#endif
	sa_none,
};
8226
8227
8228
8229
#ifdef CONFIG_SCHED_SMT
/* Per-CPU storage for the SMT (sibling) level domain and group. */
static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);

/*
 * Group function for the sibling level: every CPU forms its own group.
 * Stores @cpu's per-CPU sched_group in *@sg when @sg is non-NULL and returns
 * @cpu as the group number.  @cpu_map and @unused are not looked at.
 */
static int
cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
		 struct sched_group **sg, struct cpumask *unused)
{
	if (sg != NULL)
		*sg = &per_cpu(sched_group_cpus, cpu).sg;

	return cpu;
}
#endif
8243
8244
8245
8246
#ifdef CONFIG_SCHED_MC
/* Per-CPU storage for the multi-core level domain and group. */
static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
#endif

#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
/*
 * Group function for the multi-core level when SMT is also configured:
 * the group is identified by the first CPU of @cpu's thread siblings that
 * is present in @cpu_map.  @mask is clobbered as scratch space.
 * Stores that CPU's per-CPU sched_group in *@sg when @sg is non-NULL.
 */
static int
cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
		  struct sched_group **sg, struct cpumask *mask)
{
	int first_sibling;

	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
	first_sibling = cpumask_first(mask);

	if (sg != NULL)
		*sg = &per_cpu(sched_group_core, first_sibling).sg;

	return first_sibling;
}
#elif defined(CONFIG_SCHED_MC)
/*
 * Group function for the multi-core level without SMT: every CPU is its own
 * group.  Stores @cpu's per-CPU sched_group in *@sg when @sg is non-NULL.
 */
static int
cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
		  struct sched_group **sg, struct cpumask *unused)
{
	if (sg != NULL)
		*sg = &per_cpu(sched_group_core, cpu).sg;

	return cpu;
}
#endif
8275
8276static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
8277static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
8278
8279static int
8280cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
8281 struct sched_group **sg, struct cpumask *mask)
8282{
8283 int group;
8284#ifdef CONFIG_SCHED_MC
8285 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
8286 group = cpumask_first(mask);
8287#elif defined(CONFIG_SCHED_SMT)
8288 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
8289 group = cpumask_first(mask);
8290#else
8291 group = cpu;
8292#endif
8293 if (sg)
8294 *sg = &per_cpu(sched_group_phys, group).sg;
8295 return group;
8296}
8297
8298#ifdef CONFIG_NUMA
8299
8300
8301
8302
8303
8304static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
8305static struct sched_group ***sched_group_nodes_bycpu;
8306
8307static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
8308static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
8309
8310static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
8311 struct sched_group **sg,
8312 struct cpumask *nodemask)
8313{
8314 int group;
8315
8316 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
8317 group = cpumask_first(nodemask);
8318
8319 if (sg)
8320 *sg = &per_cpu(sched_group_allnodes, group).sg;
8321 return group;
8322}
8323
8324static void init_numa_sched_groups_power(struct sched_group *group_head)
8325{
8326 struct sched_group *sg = group_head;
8327 int j;
8328
8329 if (!sg)
8330 return;
8331 do {
8332 for_each_cpu(j, sched_group_cpus(sg)) {
8333 struct sched_domain *sd;
8334
8335 sd = &per_cpu(phys_domains, j).sd;
8336 if (j != group_first_cpu(sd->groups)) {
8337
8338
8339
8340
8341 continue;
8342 }
8343
8344 sg->cpu_power += sd->groups->cpu_power;
8345 }
8346 sg = sg->next;
8347 } while (sg != group_head);
8348}
8349
8350static int build_numa_sched_groups(struct s_data *d,
8351 const struct cpumask *cpu_map, int num)
8352{
8353 struct sched_domain *sd;
8354 struct sched_group *sg, *prev;
8355 int n, j;
8356
8357 cpumask_clear(d->covered);
8358 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8359 if (cpumask_empty(d->nodemask)) {
8360 d->sched_group_nodes[num] = NULL;
8361 goto out;
8362 }
8363
8364 sched_domain_node_span(num, d->domainspan);
8365 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8366
8367 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8368 GFP_KERNEL, num);
8369 if (!sg) {
8370 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8371 num);
8372 return -ENOMEM;
8373 }
8374 d->sched_group_nodes[num] = sg;
8375
8376 for_each_cpu(j, d->nodemask) {
8377 sd = &per_cpu(node_domains, j).sd;
8378 sd->groups = sg;
8379 }
8380
8381 sg->cpu_power = 0;
8382 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8383 sg->next = sg;
8384 cpumask_or(d->covered, d->covered, d->nodemask);
8385
8386 prev = sg;
8387 for (j = 0; j < nr_node_ids; j++) {
8388 n = (num + j) % nr_node_ids;
8389 cpumask_complement(d->notcovered, d->covered);
8390 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8391 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8392 if (cpumask_empty(d->tmpmask))
8393 break;
8394 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8395 if (cpumask_empty(d->tmpmask))
8396 continue;
8397 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8398 GFP_KERNEL, num);
8399 if (!sg) {
8400 printk(KERN_WARNING
8401 "Can not alloc domain group for node %d\n", j);
8402 return -ENOMEM;
8403 }
8404 sg->cpu_power = 0;
8405 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8406 sg->next = prev->next;
8407 cpumask_or(d->covered, d->covered, d->tmpmask);
8408 prev->next = sg;
8409 prev = sg;
8410 }
8411out:
8412 return 0;
8413}
8414#endif
8415
#ifdef CONFIG_NUMA
/*
 * Free the dynamically allocated node-level sched groups that belong to the
 * CPUs in @cpu_map.
 *
 * For every CPU with an entry in sched_group_nodes_bycpu[], each per-node
 * circular group list is freed, then the per-node array itself, and the
 * entry is reset to NULL.  Nodes without CPUs in @cpu_map are skipped.
 * @nodemask is scratch space.
 */
static void free_sched_groups(const struct cpumask *cpu_map,
			      struct cpumask *nodemask)
{
	int cpu, node;

	for_each_cpu(cpu, cpu_map) {
		struct sched_group **node_groups = sched_group_nodes_bycpu[cpu];

		if (!node_groups)
			continue;

		for (node = 0; node < nr_node_ids; node++) {
			struct sched_group *victim, *sg = node_groups[node];

			cpumask_and(nodemask, cpumask_of_node(node), cpu_map);
			if (cpumask_empty(nodemask))
				continue;

			if (sg == NULL)
				continue;

			/*
			 * Start behind the head and walk the ring, so that
			 * the head itself is the last element to be freed
			 * and marks the end of the walk.
			 */
			sg = sg->next;
			do {
				victim = sg;
				sg = sg->next;
				kfree(victim);
			} while (victim != node_groups[node]);
		}

		kfree(node_groups);
		sched_group_nodes_bycpu[cpu] = NULL;
	}
}
#else
/* Without CONFIG_NUMA there are no dynamically allocated groups to free. */
static void free_sched_groups(const struct cpumask *cpu_map,
			      struct cpumask *nodemask)
{
}
#endif
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8469{
8470 struct sched_domain *child;
8471 struct sched_group *group;
8472 long power;
8473 int weight;
8474
8475 WARN_ON(!sd || !sd->groups);
8476
8477 if (cpu != group_first_cpu(sd->groups))
8478 return;
8479
8480 child = sd->child;
8481
8482 sd->groups->cpu_power = 0;
8483
8484 if (!child) {
8485 power = SCHED_LOAD_SCALE;
8486 weight = cpumask_weight(sched_domain_span(sd));
8487
8488
8489
8490
8491
8492
8493 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8494 power *= sd->smt_gain;
8495 power /= weight;
8496 power >>= SCHED_LOAD_SHIFT;
8497 }
8498 sd->groups->cpu_power += power;
8499 return;
8500 }
8501
8502
8503
8504
8505 group = child->groups;
8506 do {
8507 sd->groups->cpu_power += group->cpu_power;
8508 group = group->next;
8509 } while (group != child->groups);
8510}
8511
8512
8513
8514
8515
8516
8517#ifdef CONFIG_SCHED_DEBUG
8518# define SD_INIT_NAME(sd, type) sd->name = #type
8519#else
8520# define SD_INIT_NAME(sd, type) do { } while (0)
8521#endif
8522
8523#define SD_INIT(sd, type) sd_init_##type(sd)
8524
8525#define SD_INIT_FUNC(type) \
8526static noinline void sd_init_##type(struct sched_domain *sd) \
8527{ \
8528 memset(sd, 0, sizeof(*sd)); \
8529 *sd = SD_##type##_INIT; \
8530 sd->level = SD_LV_##type; \
8531 SD_INIT_NAME(sd, type); \
8532}
8533
8534SD_INIT_FUNC(CPU)
8535#ifdef CONFIG_NUMA
8536 SD_INIT_FUNC(ALLNODES)
8537 SD_INIT_FUNC(NODE)
8538#endif
8539#ifdef CONFIG_SCHED_SMT
8540 SD_INIT_FUNC(SIBLING)
8541#endif
8542#ifdef CONFIG_SCHED_MC
8543 SD_INIT_FUNC(MC)
8544#endif
8545
8546static int default_relax_domain_level = -1;
8547
8548static int __init setup_relax_domain_level(char *str)
8549{
8550 unsigned long val;
8551
8552 val = simple_strtoul(str, NULL, 0);
8553 if (val < SD_LV_MAX)
8554 default_relax_domain_level = val;
8555
8556 return 1;
8557}
8558__setup("relax_domain_level=", setup_relax_domain_level);
8559
8560static void set_domain_attribute(struct sched_domain *sd,
8561 struct sched_domain_attr *attr)
8562{
8563 int request;
8564
8565 if (!attr || attr->relax_domain_level < 0) {
8566 if (default_relax_domain_level < 0)
8567 return;
8568 else
8569 request = default_relax_domain_level;
8570 } else
8571 request = attr->relax_domain_level;
8572 if (request < sd->level) {
8573
8574 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8575 } else {
8576
8577 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8578 }
8579}
8580
8581static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8582 const struct cpumask *cpu_map)
8583{
8584 switch (what) {
8585 case sa_sched_groups:
8586 free_sched_groups(cpu_map, d->tmpmask);
8587 d->sched_group_nodes = NULL;
8588 case sa_rootdomain:
8589 free_rootdomain(d->rd);
8590 case sa_tmpmask:
8591 free_cpumask_var(d->tmpmask);
8592 case sa_send_covered:
8593 free_cpumask_var(d->send_covered);
8594 case sa_this_core_map:
8595 free_cpumask_var(d->this_core_map);
8596 case sa_this_sibling_map:
8597 free_cpumask_var(d->this_sibling_map);
8598 case sa_nodemask:
8599 free_cpumask_var(d->nodemask);
8600 case sa_sched_group_nodes:
8601#ifdef CONFIG_NUMA
8602 kfree(d->sched_group_nodes);
8603 case sa_notcovered:
8604 free_cpumask_var(d->notcovered);
8605 case sa_covered:
8606 free_cpumask_var(d->covered);
8607 case sa_domainspan:
8608 free_cpumask_var(d->domainspan);
8609#endif
8610 case sa_none:
8611 break;
8612 }
8613}
8614
8615static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8616 const struct cpumask *cpu_map)
8617{
8618#ifdef CONFIG_NUMA
8619 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8620 return sa_none;
8621 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8622 return sa_domainspan;
8623 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8624 return sa_covered;
8625
8626 d->sched_group_nodes = kcalloc(nr_node_ids,
8627 sizeof(struct sched_group *), GFP_KERNEL);
8628 if (!d->sched_group_nodes) {
8629 printk(KERN_WARNING "Can not alloc sched group node list\n");
8630 return sa_notcovered;
8631 }
8632 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8633#endif
8634 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8635 return sa_sched_group_nodes;
8636 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8637 return sa_nodemask;
8638 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8639 return sa_this_sibling_map;
8640 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8641 return sa_this_core_map;
8642 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8643 return sa_send_covered;
8644 d->rd = alloc_rootdomain();
8645 if (!d->rd) {
8646 printk(KERN_WARNING "Cannot alloc root domain\n");
8647 return sa_tmpmask;
8648 }
8649 return sa_rootdomain;
8650}
8651
8652static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8653 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8654{
8655 struct sched_domain *sd = NULL;
8656#ifdef CONFIG_NUMA
8657 struct sched_domain *parent;
8658
8659 d->sd_allnodes = 0;
8660 if (cpumask_weight(cpu_map) >
8661 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8662 sd = &per_cpu(allnodes_domains, i).sd;
8663 SD_INIT(sd, ALLNODES);
8664 set_domain_attribute(sd, attr);
8665 cpumask_copy(sched_domain_span(sd), cpu_map);
8666 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8667 d->sd_allnodes = 1;
8668 }
8669 parent = sd;
8670
8671 sd = &per_cpu(node_domains, i).sd;
8672 SD_INIT(sd, NODE);
8673 set_domain_attribute(sd, attr);
8674 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8675 sd->parent = parent;
8676 if (parent)
8677 parent->child = sd;
8678 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8679#endif
8680 return sd;
8681}
8682
8683static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8684 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8685 struct sched_domain *parent, int i)
8686{
8687 struct sched_domain *sd;
8688 sd = &per_cpu(phys_domains, i).sd;
8689 SD_INIT(sd, CPU);
8690 set_domain_attribute(sd, attr);
8691 cpumask_copy(sched_domain_span(sd), d->nodemask);
8692 sd->parent = parent;
8693 if (parent)
8694 parent->child = sd;
8695 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8696 return sd;
8697}
8698
/*
 * Set up the multi-core level domain for CPU @i.
 *
 * Without CONFIG_SCHED_MC nothing is built and @parent is returned as is.
 * Otherwise the domain spans @i's core group restricted to @cpu_map, is
 * linked below @parent (which must be non-NULL) and gets its ->groups
 * pointer from cpu_to_core_group().  Returns the lowest domain so far.
 */
static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
	struct sched_domain *parent, int i)
{
	struct sched_domain *sd = parent;

#ifdef CONFIG_SCHED_MC
	sd = &per_cpu(core_domains, i).sd;
	SD_INIT(sd, MC);
	set_domain_attribute(sd, attr);
	cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));

	sd->parent = parent;
	parent->child = sd;

	cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif

	return sd;
}
8715
/*
 * Set up the SMT (sibling) level domain for CPU @i.
 *
 * Without CONFIG_SCHED_SMT nothing is built and @parent is returned as is.
 * Otherwise the domain spans @i's thread siblings restricted to @cpu_map, is
 * linked below @parent (which must be non-NULL) and gets its ->groups
 * pointer from cpu_to_cpu_group().  Returns the lowest domain so far.
 */
static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
	struct sched_domain *parent, int i)
{
	struct sched_domain *sd = parent;

#ifdef CONFIG_SCHED_SMT
	sd = &per_cpu(cpu_domains, i).sd;
	SD_INIT(sd, SIBLING);
	set_domain_attribute(sd, attr);
	cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));

	sd->parent = parent;
	parent->child = sd;

	cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif

	return sd;
}
8732
8733static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8734 const struct cpumask *cpu_map, int cpu)
8735{
8736 switch (l) {
8737#ifdef CONFIG_SCHED_SMT
8738 case SD_LV_SIBLING:
8739 cpumask_and(d->this_sibling_map, cpu_map,
8740 topology_thread_cpumask(cpu));
8741 if (cpu == cpumask_first(d->this_sibling_map))
8742 init_sched_build_groups(d->this_sibling_map, cpu_map,
8743 &cpu_to_cpu_group,
8744 d->send_covered, d->tmpmask);
8745 break;
8746#endif
8747#ifdef CONFIG_SCHED_MC
8748 case SD_LV_MC:
8749 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8750 if (cpu == cpumask_first(d->this_core_map))
8751 init_sched_build_groups(d->this_core_map, cpu_map,
8752 &cpu_to_core_group,
8753 d->send_covered, d->tmpmask);
8754 break;
8755#endif
8756 case SD_LV_CPU:
8757 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8758 if (!cpumask_empty(d->nodemask))
8759 init_sched_build_groups(d->nodemask, cpu_map,
8760 &cpu_to_phys_group,
8761 d->send_covered, d->tmpmask);
8762 break;
8763#ifdef CONFIG_NUMA
8764 case SD_LV_ALLNODES:
8765 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8766 d->send_covered, d->tmpmask);
8767 break;
8768#endif
8769 default:
8770 break;
8771 }
8772}
8773
8774
8775
8776
8777
8778static int __build_sched_domains(const struct cpumask *cpu_map,
8779 struct sched_domain_attr *attr)
8780{
8781 enum s_alloc alloc_state = sa_none;
8782 struct s_data d;
8783 struct sched_domain *sd;
8784 int i;
8785#ifdef CONFIG_NUMA
8786 d.sd_allnodes = 0;
8787#endif
8788
8789 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8790 if (alloc_state != sa_rootdomain)
8791 goto error;
8792 alloc_state = sa_sched_groups;
8793
8794
8795
8796
8797 for_each_cpu(i, cpu_map) {
8798 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8799 cpu_map);
8800
8801 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8802 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8803 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8804 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8805 }
8806
8807 for_each_cpu(i, cpu_map) {
8808 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8809 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8810 }
8811
8812
8813 for (i = 0; i < nr_node_ids; i++)
8814 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8815
8816#ifdef CONFIG_NUMA
8817
8818 if (d.sd_allnodes)
8819 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8820
8821 for (i = 0; i < nr_node_ids; i++)
8822 if (build_numa_sched_groups(&d, cpu_map, i))
8823 goto error;
8824#endif
8825
8826
8827#ifdef CONFIG_SCHED_SMT
8828 for_each_cpu(i, cpu_map) {
8829 sd = &per_cpu(cpu_domains, i).sd;
8830 init_sched_groups_power(i, sd);
8831 }
8832#endif
8833#ifdef CONFIG_SCHED_MC
8834 for_each_cpu(i, cpu_map) {
8835 sd = &per_cpu(core_domains, i).sd;
8836 init_sched_groups_power(i, sd);
8837 }
8838#endif
8839
8840 for_each_cpu(i, cpu_map) {
8841 sd = &per_cpu(phys_domains, i).sd;
8842 init_sched_groups_power(i, sd);
8843 }
8844
8845#ifdef CONFIG_NUMA
8846 for (i = 0; i < nr_node_ids; i++)
8847 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8848
8849 if (d.sd_allnodes) {
8850 struct sched_group *sg;
8851
8852 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8853 d.tmpmask);
8854 init_numa_sched_groups_power(sg);
8855 }
8856#endif
8857
8858
8859 for_each_cpu(i, cpu_map) {
8860#ifdef CONFIG_SCHED_SMT
8861 sd = &per_cpu(cpu_domains, i).sd;
8862#elif defined(CONFIG_SCHED_MC)
8863 sd = &per_cpu(core_domains, i).sd;
8864#else
8865 sd = &per_cpu(phys_domains, i).sd;
8866#endif
8867 cpu_attach_domain(sd, d.rd, i);
8868 }
8869
8870 d.sched_group_nodes = NULL;
8871 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8872 return 0;
8873
8874error:
8875 __free_domain_allocs(&d, alloc_state, cpu_map);
8876 return -ENOMEM;
8877}
8878
8879static int build_sched_domains(const struct cpumask *cpu_map)
8880{
8881 return __build_sched_domains(cpu_map, NULL);
8882}
8883
8884static struct cpumask *doms_cur;
8885static int ndoms_cur;
8886static struct sched_domain_attr *dattr_cur;
8887
8888
8889
8890
8891
8892
8893
8894static cpumask_var_t fallback_doms;
8895
8896
8897
8898
8899
8900
/*
 * Weak default that architectures may override.  This version reports 0;
 * partition_sched_domains() treats a non-zero return as "topology changed"
 * and then rebuilds every domain.
 */
int __attribute__((weak)) arch_update_cpu_topology(void)
{
	return 0;
}
8905
8906
8907
8908
8909
8910
8911static int arch_init_sched_domains(const struct cpumask *cpu_map)
8912{
8913 int err;
8914
8915 arch_update_cpu_topology();
8916 ndoms_cur = 1;
8917 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
8918 if (!doms_cur)
8919 doms_cur = fallback_doms;
8920 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
8921 dattr_cur = NULL;
8922 err = build_sched_domains(doms_cur);
8923 register_sched_domain_sysctl();
8924
8925 return err;
8926}
8927
/*
 * Release the dynamically allocated parts of the domains covering
 * @cpu_map; this is simply free_sched_groups().  @tmpmask is scratch space.
 */
static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
				       struct cpumask *tmpmask)
{
	free_sched_groups(cpu_map, tmpmask);
}
8933
8934
8935
8936
8937
8938static void detach_destroy_domains(const struct cpumask *cpu_map)
8939{
8940
8941 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
8942 int i;
8943
8944 for_each_cpu(i, cpu_map)
8945 cpu_attach_domain(NULL, &def_root_domain, i);
8946 synchronize_sched();
8947 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
8948}
8949
8950
8951static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8952 struct sched_domain_attr *new, int idx_new)
8953{
8954 struct sched_domain_attr tmp;
8955
8956
8957 if (!new && !cur)
8958 return 1;
8959
8960 tmp = SD_ATTR_INIT;
8961 return !memcmp(cur ? (cur + idx_cur) : &tmp,
8962 new ? (new + idx_new) : &tmp,
8963 sizeof(struct sched_domain_attr));
8964}
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8994 struct sched_domain_attr *dattr_new)
8995{
8996 int i, j, n;
8997 int new_topology;
8998
8999 mutex_lock(&sched_domains_mutex);
9000
9001
9002 unregister_sched_domain_sysctl();
9003
9004
9005 new_topology = arch_update_cpu_topology();
9006
9007 n = doms_new ? ndoms_new : 0;
9008
9009
9010 for (i = 0; i < ndoms_cur; i++) {
9011 for (j = 0; j < n && !new_topology; j++) {
9012 if (cpumask_equal(&doms_cur[i], &doms_new[j])
9013 && dattrs_equal(dattr_cur, i, dattr_new, j))
9014 goto match1;
9015 }
9016
9017 detach_destroy_domains(doms_cur + i);
9018match1:
9019 ;
9020 }
9021
9022 if (doms_new == NULL) {
9023 ndoms_cur = 0;
9024 doms_new = fallback_doms;
9025 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
9026 WARN_ON_ONCE(dattr_new);
9027 }
9028
9029
9030 for (i = 0; i < ndoms_new; i++) {
9031 for (j = 0; j < ndoms_cur && !new_topology; j++) {
9032 if (cpumask_equal(&doms_new[i], &doms_cur[j])
9033 && dattrs_equal(dattr_new, i, dattr_cur, j))
9034 goto match2;
9035 }
9036
9037 __build_sched_domains(doms_new + i,
9038 dattr_new ? dattr_new + i : NULL);
9039match2:
9040 ;
9041 }
9042
9043
9044 if (doms_cur != fallback_doms)
9045 kfree(doms_cur);
9046 kfree(dattr_cur);
9047 doms_cur = doms_new;
9048 dattr_cur = dattr_new;
9049 ndoms_cur = ndoms_new;
9050
9051 register_sched_domain_sysctl();
9052
9053 mutex_unlock(&sched_domains_mutex);
9054}
9055
9056#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
9057static void arch_reinit_sched_domains(void)
9058{
9059 get_online_cpus();
9060
9061
9062 partition_sched_domains(0, NULL, NULL);
9063
9064 rebuild_sched_domains();
9065 put_online_cpus();
9066}
9067
9068static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
9069{
9070 unsigned int level = 0;
9071
9072 if (sscanf(buf, "%u", &level) != 1)
9073 return -EINVAL;
9074
9075
9076
9077
9078
9079
9080
9081
9082 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
9083 return -EINVAL;
9084
9085 if (smt)
9086 sched_smt_power_savings = level;
9087 else
9088 sched_mc_power_savings = level;
9089
9090 arch_reinit_sched_domains();
9091
9092 return count;
9093}
9094
#ifdef CONFIG_SCHED_MC
/* sysfs show: print the current multi-core power-savings level. */
static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
					   char *page)
{
	return sprintf(page, "%u\n", sched_mc_power_savings);
}

/* sysfs store: parse and apply a new multi-core power-savings level. */
static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
					    const char *buf, size_t count)
{
	const int smt = 0;

	return sched_power_savings_store(buf, count, smt);
}

/* Attribute "sched_mc_power_savings", mode 0644. */
static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
			 sched_mc_power_savings_show,
			 sched_mc_power_savings_store);
#endif
9110
#ifdef CONFIG_SCHED_SMT
/* sysfs show: print the current SMT power-savings level. */
static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
					    char *page)
{
	return sprintf(page, "%u\n", sched_smt_power_savings);
}

/* sysfs store: parse and apply a new SMT power-savings level. */
static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
					     const char *buf, size_t count)
{
	const int smt = 1;

	return sched_power_savings_store(buf, count, smt);
}

/* Attribute "sched_smt_power_savings", mode 0644. */
static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
			 sched_smt_power_savings_show,
			 sched_smt_power_savings_store);
#endif
9126
9127int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
9128{
9129 int err = 0;
9130
9131#ifdef CONFIG_SCHED_SMT
9132 if (smt_capable())
9133 err = sysfs_create_file(&cls->kset.kobj,
9134 &attr_sched_smt_power_savings.attr);
9135#endif
9136#ifdef CONFIG_SCHED_MC
9137 if (!err && mc_capable())
9138 err = sysfs_create_file(&cls->kset.kobj,
9139 &attr_sched_mc_power_savings.attr);
9140#endif
9141 return err;
9142}
9143#endif
9144
9145#ifndef CONFIG_CPUSETS
9146
9147
9148
9149
9150static int update_sched_domains(struct notifier_block *nfb,
9151 unsigned long action, void *hcpu)
9152{
9153 switch (action) {
9154 case CPU_ONLINE:
9155 case CPU_ONLINE_FROZEN:
9156 case CPU_DEAD:
9157 case CPU_DEAD_FROZEN:
9158 partition_sched_domains(1, NULL, NULL);
9159 return NOTIFY_OK;
9160
9161 default:
9162 return NOTIFY_DONE;
9163 }
9164}
9165#endif
9166
9167static int update_runtime(struct notifier_block *nfb,
9168 unsigned long action, void *hcpu)
9169{
9170 int cpu = (int)(long)hcpu;
9171
9172 switch (action) {
9173 case CPU_DOWN_PREPARE:
9174 case CPU_DOWN_PREPARE_FROZEN:
9175 disable_runtime(cpu_rq(cpu));
9176 return NOTIFY_OK;
9177
9178 case CPU_DOWN_FAILED:
9179 case CPU_DOWN_FAILED_FROZEN:
9180 case CPU_ONLINE:
9181 case CPU_ONLINE_FROZEN:
9182 enable_runtime(cpu_rq(cpu));
9183 return NOTIFY_OK;
9184
9185 default:
9186 return NOTIFY_DONE;
9187 }
9188}
9189
9190void __init sched_init_smp(void)
9191{
9192 cpumask_var_t non_isolated_cpus;
9193
9194 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
9195 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9196
9197#if defined(CONFIG_NUMA)
9198 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
9199 GFP_KERNEL);
9200 BUG_ON(sched_group_nodes_bycpu == NULL);
9201#endif
9202 get_online_cpus();
9203 mutex_lock(&sched_domains_mutex);
9204 arch_init_sched_domains(cpu_online_mask);
9205 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9206 if (cpumask_empty(non_isolated_cpus))
9207 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
9208 mutex_unlock(&sched_domains_mutex);
9209 put_online_cpus();
9210
9211#ifndef CONFIG_CPUSETS
9212
9213 hotcpu_notifier(update_sched_domains, 0);
9214#endif
9215
9216
9217 hotcpu_notifier(update_runtime, 0);
9218
9219 init_hrtick();
9220
9221
9222 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
9223 BUG();
9224 sched_init_granularity();
9225 free_cpumask_var(non_isolated_cpus);
9226
9227 init_sched_rt_class();
9228}
9229#else
9230void __init sched_init_smp(void)
9231{
9232 sched_init_granularity();
9233}
9234#endif
9235
9236const_debug unsigned int sysctl_timer_migration = 1;
9237
9238int in_sched_functions(unsigned long addr)
9239{
9240 return in_lock_functions(addr) ||
9241 (addr >= (unsigned long)__sched_text_start
9242 && addr < (unsigned long)__sched_text_end);
9243}
9244
9245static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
9246{
9247 cfs_rq->tasks_timeline = RB_ROOT;
9248 INIT_LIST_HEAD(&cfs_rq->tasks);
9249#ifdef CONFIG_FAIR_GROUP_SCHED
9250 cfs_rq->rq = rq;
9251#endif
9252 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
9253}
9254
9255static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9256{
9257 struct rt_prio_array *array;
9258 int i;
9259
9260 array = &rt_rq->active;
9261 for (i = 0; i < MAX_RT_PRIO; i++) {
9262 INIT_LIST_HEAD(array->queue + i);
9263 __clear_bit(i, array->bitmap);
9264 }
9265
9266 __set_bit(MAX_RT_PRIO, array->bitmap);
9267
9268#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
9269 rt_rq->highest_prio.curr = MAX_RT_PRIO;
9270#ifdef CONFIG_SMP
9271 rt_rq->highest_prio.next = MAX_RT_PRIO;
9272#endif
9273#endif
9274#ifdef CONFIG_SMP
9275 rt_rq->rt_nr_migratory = 0;
9276 rt_rq->overloaded = 0;
9277 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
9278#endif
9279
9280 rt_rq->rt_time = 0;
9281 rt_rq->rt_throttled = 0;
9282 rt_rq->rt_runtime = 0;
9283 spin_lock_init(&rt_rq->rt_runtime_lock);
9284
9285#ifdef CONFIG_RT_GROUP_SCHED
9286 rt_rq->rt_nr_boosted = 0;
9287 rt_rq->rq = rq;
9288#endif
9289}
9290
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Wire up task group @tg's CFS runqueue and scheduling entity for @cpu.
 *
 * @cfs_rq: runqueue to install as tg->cfs_rq[@cpu]; it is (re)initialised.
 * @se:     entity to install as tg->se[@cpu]; may be NULL, in which case
 *          only the runqueue part is set up.
 * @add:    non-zero to put @cfs_rq on the CPU runqueue's leaf list.
 * @parent: entity of the parent group on this CPU, or NULL if @se is to be
 *          queued directly on the CPU's own cfs runqueue.
 */
static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
			      struct sched_entity *se, int cpu, int add,
			      struct sched_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	tg->cfs_rq[cpu] = cfs_rq;
	init_cfs_rq(cfs_rq, rq);
	cfs_rq->tg = tg;
	if (add)
		list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);

	tg->se[cpu] = se;

	/* No entity to set up for this group. */
	if (!se)
		return;

	/* The entity is queued on its parent's runqueue, or on the CPU's. */
	se->cfs_rq = parent ? parent->my_q : &rq->cfs;

	se->my_q = cfs_rq;
	se->load.weight = tg->shares;
	se->load.inv_weight = 0;
	se->parent = parent;
}
#endif
9319
9320#ifdef CONFIG_RT_GROUP_SCHED
9321static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9322 struct sched_rt_entity *rt_se, int cpu, int add,
9323 struct sched_rt_entity *parent)
9324{
9325 struct rq *rq = cpu_rq(cpu);
9326
9327 tg->rt_rq[cpu] = rt_rq;
9328 init_rt_rq(rt_rq, rq);
9329 rt_rq->tg = tg;
9330 rt_rq->rt_se = rt_se;
9331 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9332 if (add)
9333 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
9334
9335 tg->rt_se[cpu] = rt_se;
9336 if (!rt_se)
9337 return;
9338
9339 if (!parent)
9340 rt_se->rt_rq = &rq->rt;
9341 else
9342 rt_se->rt_rq = parent->my_q;
9343
9344 rt_se->my_q = rt_rq;
9345 rt_se->parent = parent;
9346 INIT_LIST_HEAD(&rt_se->run_list);
9347}
9348#endif
9349
/*
 * Boot-time scheduler initialisation.
 *
 * In order: allocates and slices one zeroed buffer holding the per-CPU
 * pointer arrays of the built-in task groups (and the per-CPU
 * load_balance_tmpmask with CONFIG_CPUMASK_OFFSTACK), sets up the default RT
 * bandwidth and group hierarchy, initialises the runqueue of every possible
 * CPU, runs init_idle() on the calling task for the executing CPU, allocates
 * the global cpumasks and finally sets scheduler_running.
 */
void __init sched_init(void)
{
	int i, j;
	unsigned long alloc_size = 0, ptr;

	/* Two pointer arrays (se + rq) of nr_cpu_ids slots per group class. */
#ifdef CONFIG_FAIR_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
	/* With USER_SCHED, root_task_group needs the same arrays again. */
#ifdef CONFIG_USER_SCHED
	alloc_size *= 2;
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
	alloc_size += num_possible_cpus() * cpumask_size();
#endif

	/*
	 * A single zeroed allocation is carved up below; 'ptr' walks through
	 * it.  NOTE(review): the kzalloc() result is not checked before use.
	 */
	if (alloc_size) {
		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

#ifdef CONFIG_FAIR_GROUP_SCHED
		init_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		init_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#ifdef CONFIG_USER_SCHED
		root_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
		init_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		init_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#ifdef CONFIG_USER_SCHED
		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
		/* One cpumask-sized slice per possible CPU. */
		for_each_possible_cpu(i) {
			per_cpu(load_balance_tmpmask, i) = (void *)ptr;
			ptr += cpumask_size();
		}
#endif /* CONFIG_CPUMASK_OFFSTACK */
	}

#ifdef CONFIG_SMP
	init_defrootdomain();
#endif

	/* Default RT bandwidth uses the global period/runtime settings. */
	init_rt_bandwidth(&def_rt_bandwidth,
			global_rt_period(), global_rt_runtime());

#ifdef CONFIG_RT_GROUP_SCHED
	init_rt_bandwidth(&init_task_group.rt_bandwidth,
			global_rt_period(), global_rt_runtime());
#ifdef CONFIG_USER_SCHED
	/* The root group itself gets unlimited RT runtime. */
	init_rt_bandwidth(&root_task_group.rt_bandwidth,
			global_rt_period(), RUNTIME_INF);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_GROUP_SCHED
	list_add(&init_task_group.list, &task_groups);
	INIT_LIST_HEAD(&init_task_group.children);

#ifdef CONFIG_USER_SCHED
	/* With USER_SCHED, init_task_group becomes a child of root_task_group. */
	INIT_LIST_HEAD(&root_task_group.children);
	init_task_group.parent = &root_task_group;
	list_add(&init_task_group.siblings, &root_task_group.children);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_GROUP_SCHED */

#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
	/* Per-CPU scratch array with one unsigned long per CPU id. */
	update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
					    __alignof__(unsigned long));
#endif
	/* Initialise the runqueue of every possible CPU. */
	for_each_possible_cpu(i) {
		struct rq *rq;

		rq = cpu_rq(i);
		spin_lock_init(&rq->lock);
		rq->nr_running = 0;
		rq->calc_load_active = 0;
		rq->calc_load_update = jiffies + LOAD_FREQ;
		init_cfs_rq(&rq->cfs, rq);
		init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
		init_task_group.shares = init_task_group_load;
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
#ifdef CONFIG_CGROUP_SCHED
		/*
		 * cgroup build: init_task_group uses the runqueue's own
		 * rq->cfs directly and has no scheduling entity (se == NULL).
		 */
		init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
#elif defined CONFIG_USER_SCHED
		/*
		 * USER_SCHED build: root_task_group owns rq->cfs (not added to
		 * the leaf list), while init_task_group gets its own per-CPU
		 * cfs_rq and entity, parented to root_task_group's entity
		 * (which was just set to NULL above).
		 */
		root_task_group.shares = NICE_0_LOAD;
		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
		init_tg_cfs_entry(&init_task_group,
				&per_cpu(init_tg_cfs_rq, i),
				&per_cpu(init_sched_entity, i), i, 1,
				root_task_group.se[i]);

#endif
#endif /* CONFIG_FAIR_GROUP_SCHED */

		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
#ifdef CONFIG_CGROUP_SCHED
		init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
#elif defined CONFIG_USER_SCHED
		/* Same layout as the fair-class setup above, for the RT class. */
		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
		init_tg_rt_entry(&init_task_group,
				&per_cpu(init_rt_rq, i),
				&per_cpu(init_sched_rt_entity, i), i, 1,
				root_task_group.rt_se[i]);
#endif
#endif

		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
			rq->cpu_load[j] = 0;
#ifdef CONFIG_SMP
		rq->sd = NULL;
		rq->rd = NULL;
		rq->post_schedule = 0;
		rq->active_balance = 0;
		rq->next_balance = jiffies;
		rq->push_cpu = 0;
		rq->cpu = i;
		rq->online = 0;
		rq->migration_thread = NULL;
		INIT_LIST_HEAD(&rq->migration_queue);
		/* Every runqueue starts out attached to the default root domain. */
		rq_attach_root(rq, &def_root_domain);
#endif
		init_rq_hrtick(rq);
		atomic_set(&rq->nr_iowait, 0);
	}

	set_load_weight(&init_task);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif

#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#endif

#ifdef CONFIG_RT_MUTEXES
	plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
#endif

	/*
	 * Take a reference on init_mm and call enter_lazy_tlb() on it for the
	 * current task.
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current);

	/*
	 * Run init_idle() on the calling task for the CPU we are executing on.
	 */
	init_idle(current, smp_processor_id());

	calc_load_update = jiffies + LOAD_FREQ;

	/*
	 * After init_idle(), explicitly put the current task in the fair
	 * scheduling class.
	 */
	current->sched_class = &fair_sched_class;

	/* Allocate the global cpumasks; results are not checked here. */
	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
#endif
	zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */

	perf_event_init();

	scheduler_running = 1;
}
9581
9582#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9583static inline int preempt_count_equals(int preempt_offset)
9584{
9585 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9586
9587 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9588}
9589
/*
 * Report a "sleeping function called from invalid context" bug for the call
 * site @file:@line.
 *
 * Nothing is printed when the context looks fine (preempt count matches
 * @preempt_offset and interrupts are enabled), when the system is not in
 * SYSTEM_RUNNING state, or while an oops is in progress.  Reports are also
 * rate limited to one per HZ jiffies.  The whole body compiles away when
 * in_atomic is not defined as a macro.
 */
void __might_sleep(char *file, int line, int preempt_offset)
{
#ifdef in_atomic
	/* jiffies value of the last report; 0 means nothing reported yet */
	static unsigned long prev_jiffy;

	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
	    system_state != SYSTEM_RUNNING || oops_in_progress)
		return;
	/* Rate limit: skip if the previous report is less than HZ jiffies old. */
	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	printk(KERN_ERR
		"BUG: sleeping function called from invalid context at %s:%d\n",
			file, line);
	printk(KERN_ERR
		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
			in_atomic(), irqs_disabled(),
			current->pid, current->comm);

	debug_show_held_locks(current);
	if (irqs_disabled())
		print_irqtrace_events(current);
	dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
9617#endif
9618
9619#ifdef CONFIG_MAGIC_SYSRQ
9620static void normalize_task(struct rq *rq, struct task_struct *p)
9621{
9622 int on_rq;
9623
9624 update_rq_clock(rq);
9625 on_rq = p->se.on_rq;
9626 if (on_rq)
9627 deactivate_task(rq, p, 0);
9628 __setscheduler(rq, p, SCHED_NORMAL, 0);
9629 if (on_rq) {
9630 activate_task(rq, p, 0);
9631 resched_task(rq->curr);
9632 }
9633}
9634
/*
 * Walk every thread in the system (under tasklist_lock, interrupts off) and
 * bring it back to ordinary scheduling parameters:
 *  - tasks without an mm are left untouched;
 *  - non-RT tasks with a negative nice value are reniced to 0;
 *  - RT tasks are switched to SCHED_NORMAL via normalize_task().
 * Compiled only with CONFIG_MAGIC_SYSRQ.
 */
void normalize_rt_tasks(void)
{
	struct task_struct *g, *p;
	unsigned long flags;
	struct rq *rq;

	read_lock_irqsave(&tasklist_lock, flags);
	do_each_thread(g, p) {
		/*
		 * Skip tasks that have no mm.
		 */
		if (!p->mm)
			continue;

		/* Reset the entity's start timestamps. */
		p->se.exec_start		= 0;
#ifdef CONFIG_SCHEDSTATS
		p->se.wait_start		= 0;
		p->se.sleep_start		= 0;
		p->se.block_start		= 0;
#endif

		if (!rt_task(p)) {
			/*
			 * Non-RT task: only undo a negative nice value.
			 * NOTE(review): the '&& p->mm' test is redundant,
			 * tasks without an mm were already skipped above.
			 */
			if (TASK_NICE(p) < 0 && p->mm)
				set_user_nice(p, 0);
			continue;
		}

		/* RT task: change policy under pi_lock and the runqueue lock. */
		spin_lock(&p->pi_lock);
		rq = __task_rq_lock(p);

		normalize_task(rq, p);

		__task_rq_unlock(rq);
		spin_unlock(&p->pi_lock);
	} while_each_thread(g, p);

	read_unlock_irqrestore(&tasklist_lock, flags);
}
9677
9678#endif
9679
9680#ifdef CONFIG_IA64
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
/*
 * curr_task - return the task recorded as current on @cpu.
 *
 * Plain read of cpu_curr(@cpu); no locking is done here.  Only built with
 * CONFIG_IA64.
 */
struct task_struct *curr_task(int cpu)
{
	return cpu_curr(cpu);
}
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
/*
 * set_curr_task - record @p as the current task of @cpu.
 *
 * Plain store to cpu_curr(@cpu); no locking and no scheduler bookkeeping is
 * done here, so the caller is responsible for any required consistency.
 * Only built with CONFIG_IA64.
 */
void set_curr_task(int cpu, struct task_struct *p)
{
	cpu_curr(cpu) = p;
}
9721
9722#endif
9723
9724#ifdef CONFIG_FAIR_GROUP_SCHED
9725static void free_fair_sched_group(struct task_group *tg)
9726{
9727 int i;
9728
9729 for_each_possible_cpu(i) {
9730 if (tg->cfs_rq)
9731 kfree(tg->cfs_rq[i]);
9732 if (tg->se)
9733 kfree(tg->se[i]);
9734 }
9735
9736 kfree(tg->cfs_rq);
9737 kfree(tg->se);
9738}
9739
9740static
9741int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9742{
9743 struct cfs_rq *cfs_rq;
9744 struct sched_entity *se;
9745 struct rq *rq;
9746 int i;
9747
9748 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9749 if (!tg->cfs_rq)
9750 goto err;
9751 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9752 if (!tg->se)
9753 goto err;
9754
9755 tg->shares = NICE_0_LOAD;
9756
9757 for_each_possible_cpu(i) {
9758 rq = cpu_rq(i);
9759
9760 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
9761 GFP_KERNEL, cpu_to_node(i));
9762 if (!cfs_rq)
9763 goto err;
9764
9765 se = kzalloc_node(sizeof(struct sched_entity),
9766 GFP_KERNEL, cpu_to_node(i));
9767 if (!se)
9768 goto err;
9769
9770 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9771 }
9772
9773 return 1;
9774
9775 err:
9776 return 0;
9777}
9778
/*
 * Link @tg's cfs_rq for @cpu into that CPU's leaf_cfs_rq_list using the
 * RCU-safe list insert.
 */
static inline void register_fair_sched_group(struct task_group *tg, int cpu)
{
	list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
			&cpu_rq(cpu)->leaf_cfs_rq_list);
}
9784
/*
 * Unlink @tg's cfs_rq for @cpu from the leaf list using the RCU-safe list
 * delete.
 */
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
	list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
}
9789#else
/*
 * No-op stand-ins used when CONFIG_FAIR_GROUP_SCHED is not set, so callers
 * need no #ifdefs of their own.
 */
static inline void free_fair_sched_group(struct task_group *tg)
{
}

/* Nothing to allocate: always reports success (1). */
static inline
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

/* Nothing to register without fair group scheduling. */
static inline void register_fair_sched_group(struct task_group *tg, int cpu)
{
}

/* Nothing to unregister without fair group scheduling. */
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
9807#endif
9808
9809#ifdef CONFIG_RT_GROUP_SCHED
9810static void free_rt_sched_group(struct task_group *tg)
9811{
9812 int i;
9813
9814 destroy_rt_bandwidth(&tg->rt_bandwidth);
9815
9816 for_each_possible_cpu(i) {
9817 if (tg->rt_rq)
9818 kfree(tg->rt_rq[i]);
9819 if (tg->rt_se)
9820 kfree(tg->rt_se[i]);
9821 }
9822
9823 kfree(tg->rt_rq);
9824 kfree(tg->rt_se);
9825}
9826
9827static
9828int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9829{
9830 struct rt_rq *rt_rq;
9831 struct sched_rt_entity *rt_se;
9832 struct rq *rq;
9833 int i;
9834
9835 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
9836 if (!tg->rt_rq)
9837 goto err;
9838 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
9839 if (!tg->rt_se)
9840 goto err;
9841
9842 init_rt_bandwidth(&tg->rt_bandwidth,
9843 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
9844
9845 for_each_possible_cpu(i) {
9846 rq = cpu_rq(i);
9847
9848 rt_rq = kzalloc_node(sizeof(struct rt_rq),
9849 GFP_KERNEL, cpu_to_node(i));
9850 if (!rt_rq)
9851 goto err;
9852
9853 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9854 GFP_KERNEL, cpu_to_node(i));
9855 if (!rt_se)
9856 goto err;
9857
9858 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9859 }
9860
9861 return 1;
9862
9863 err:
9864 return 0;
9865}
9866
/*
 * Link @tg's rt_rq for @cpu into that CPU's leaf_rt_rq_list using the
 * RCU-safe list insert.
 */
static inline void register_rt_sched_group(struct task_group *tg, int cpu)
{
	list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
			&cpu_rq(cpu)->leaf_rt_rq_list);
}
9872
/*
 * Unlink @tg's rt_rq for @cpu from the leaf list using the RCU-safe list
 * delete.
 */
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
	list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
}
9877#else
/*
 * No-op stand-ins used when CONFIG_RT_GROUP_SCHED is not set, so callers
 * need no #ifdefs of their own.
 */
static inline void free_rt_sched_group(struct task_group *tg)
{
}

/* Nothing to allocate: always reports success (1). */
static inline
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

/* Nothing to register without RT group scheduling. */
static inline void register_rt_sched_group(struct task_group *tg, int cpu)
{
}

/* Nothing to unregister without RT group scheduling. */
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
}
9895#endif
9896
9897#ifdef CONFIG_GROUP_SCHED
/*
 * Free a task group: first its fair and RT per-class state, then the
 * task_group structure itself.
 */
static void free_sched_group(struct task_group *tg)
{
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);
	kfree(tg);
}
9904
9905
/*
 * Allocate and publish a new task group as a child of @parent.
 *
 * Returns the new group, or ERR_PTR(-ENOMEM) when any allocation fails (in
 * which case everything allocated so far is released again).
 */
struct task_group *sched_create_group(struct task_group *parent)
{
	struct task_group *tg;
	unsigned long flags;
	int i;

	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
	if (!tg)
		return ERR_PTR(-ENOMEM);

	if (!alloc_fair_sched_group(tg, parent))
		goto err;

	if (!alloc_rt_sched_group(tg, parent))
		goto err;

	/* Publish the fully built group under task_group_lock. */
	spin_lock_irqsave(&task_group_lock, flags);
	for_each_possible_cpu(i) {
		register_fair_sched_group(tg, i);
		register_rt_sched_group(tg, i);
	}
	list_add_rcu(&tg->list, &task_groups);

	/*
	 * NOTE(review): with group scheduling enabled the alloc helpers above
	 * have already dereferenced @parent, so a NULL parent would fault
	 * before this warning can fire.
	 */
	WARN_ON(!parent); /* root should already exist */

	tg->parent = parent;
	INIT_LIST_HEAD(&tg->children);
	list_add_rcu(&tg->siblings, &parent->children);
	spin_unlock_irqrestore(&task_group_lock, flags);

	return tg;

err:
	free_sched_group(tg);
	return ERR_PTR(-ENOMEM);
}
9942
9943
/*
 * RCU callback: recover the task_group embedding @rhp and free it.
 * Queued by sched_destroy_group() through call_rcu().
 */
static void free_sched_group_rcu(struct rcu_head *rhp)
{
	/* now it should be safe to free those cfs_rqs */
	free_sched_group(container_of(rhp, struct task_group, rcu));
}
9949
9950
/*
 * Unpublish task group @tg and schedule it for freeing.
 *
 * Under task_group_lock the group's per-CPU runqueues are removed from the
 * leaf lists and the group is unlinked from the global and sibling lists.
 * The memory is released later from an RCU callback.
 */
void sched_destroy_group(struct task_group *tg)
{
	unsigned long flags;
	int i;

	spin_lock_irqsave(&task_group_lock, flags);
	for_each_possible_cpu(i) {
		unregister_fair_sched_group(tg, i);
		unregister_rt_sched_group(tg, i);
	}
	list_del_rcu(&tg->list);
	list_del_rcu(&tg->siblings);
	spin_unlock_irqrestore(&task_group_lock, flags);

	/* Defer the actual free until after an RCU grace period. */
	call_rcu(&tg->rcu, free_sched_group_rcu);
}
9968
9969
9970
9971
9972
9973
/*
 * Re-evaluate which group runqueue @tsk belongs to and move it there.
 *
 * With the task's runqueue locked, the task is dequeued (and put as
 * previous task if it is the one currently running), re-pointed via
 * set_task_rq(), and then restored to its queued/running state.
 * NOTE(review): presumably called after the task's group membership was
 * changed -- confirm against callers.
 */
void sched_move_task(struct task_struct *tsk)
{
	int on_rq, running;
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(tsk, &flags);

	update_rq_clock(rq);

	/* Remember the state we need to restore afterwards. */
	running = task_current(rq, tsk);
	on_rq = tsk->se.on_rq;

	if (on_rq)
		dequeue_task(rq, tsk, 0);
	if (unlikely(running))
		tsk->sched_class->put_prev_task(rq, tsk);

	set_task_rq(tsk, task_cpu(tsk));

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* Optional per-class hook; not every class provides it. */
	if (tsk->sched_class->moved_group)
		tsk->sched_class->moved_group(tsk);
#endif

	/* Undo the steps above in reverse order. */
	if (unlikely(running))
		tsk->sched_class->set_curr_task(rq);
	if (on_rq)
		enqueue_task(rq, tsk, 0);

	task_rq_unlock(rq, &flags);
}
10006#endif
10007
10008#ifdef CONFIG_FAIR_GROUP_SCHED
10009static void __set_se_shares(struct sched_entity *se, unsigned long shares)
10010{
10011 struct cfs_rq *cfs_rq = se->cfs_rq;
10012 int on_rq;
10013
10014 on_rq = se->on_rq;
10015 if (on_rq)
10016 dequeue_entity(cfs_rq, se, 0);
10017
10018 se->load.weight = shares;
10019 se->load.inv_weight = 0;
10020
10021 if (on_rq)
10022 enqueue_entity(cfs_rq, se, 0);
10023}
10024
10025static void set_se_shares(struct sched_entity *se, unsigned long shares)
10026{
10027 struct cfs_rq *cfs_rq = se->cfs_rq;
10028 struct rq *rq = cfs_rq->rq;
10029 unsigned long flags;
10030
10031 spin_lock_irqsave(&rq->lock, flags);
10032 __set_se_shares(se, shares);
10033 spin_unlock_irqrestore(&rq->lock, flags);
10034}
10035
/* Serialises sched_group_set_shares() callers. */
static DEFINE_MUTEX(shares_mutex);

/*
 * Set the CPU shares of task group @tg to @shares, clamped to
 * [MIN_SHARES, MAX_SHARES].
 *
 * Returns 0 on success (including when the value is unchanged) and -EINVAL
 * for a group that has no scheduling entity on CPU 0.
 */
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;
	unsigned long flags;

	/*
	 * A group without a scheduling entity cannot have its weight changed.
	 */
	if (!tg->se[0])
		return -EINVAL;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	else if (shares > MAX_SHARES)
		shares = MAX_SHARES;

	mutex_lock(&shares_mutex);
	if (tg->shares == shares)
		goto done;

	/* Temporarily hide the group from the leaf and sibling lists. */
	spin_lock_irqsave(&task_group_lock, flags);
	for_each_possible_cpu(i)
		unregister_fair_sched_group(tg, i);
	list_del_rcu(&tg->siblings);
	spin_unlock_irqrestore(&task_group_lock, flags);

	/* wait for any ongoing reference to this group to finish */
	synchronize_sched();

	/*
	 * The group is unlinked from the lists above, so the per-CPU shares
	 * can now be updated.
	 */
	tg->shares = shares;
	for_each_possible_cpu(i) {
		/*
		 * Zero the cfs_rq's shares before assigning the new weight
		 * to the entity.
		 */
		cfs_rq_set_shares(tg->cfs_rq[i], 0);
		set_se_shares(tg->se[i], shares);
	}

	/*
	 * Put the group back on each CPU's leaf list and on its parent's
	 * children list.
	 */
	spin_lock_irqsave(&task_group_lock, flags);
	for_each_possible_cpu(i)
		register_fair_sched_group(tg, i);
	list_add_rcu(&tg->siblings, &tg->parent->children);
	spin_unlock_irqrestore(&task_group_lock, flags);
done:
	mutex_unlock(&shares_mutex);
	return 0;
}
10093
/* Return the shares value currently stored in @tg (read without locking). */
unsigned long sched_group_shares(struct task_group *tg)
{
	return tg->shares;
}
10098#endif
10099
10100#ifdef CONFIG_RT_GROUP_SCHED
10101
10102
10103
/*
 * Serialises RT bandwidth changes: held around the schedulability check in
 * tg_set_bandwidth() and sched_rt_global_constraints().
 */
static DEFINE_MUTEX(rt_constraints_mutex);
10105
10106static unsigned long to_ratio(u64 period, u64 runtime)
10107{
10108 if (runtime == RUNTIME_INF)
10109 return 1ULL << 20;
10110
10111 return div64_u64(runtime << 20, period);
10112}
10113
10114
/*
 * Scan every thread in the system and report whether any real-time task is
 * queued on an rt_rq belonging to @tg.
 *
 * Returns 1 as soon as one is found, 0 otherwise.  The visible callers reach
 * this with tasklist_lock held for reading.
 */
static inline int tg_has_rt_tasks(struct task_group *tg)
{
	struct task_struct *g, *p;

	do_each_thread(g, p) {
		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
			return 1;
	} while_each_thread(g, p);

	return 0;
}
10126
/*
 * Argument bundle for tg_schedulable(): a proposed period/runtime pair that
 * should be used in place of the stored values for one particular group.
 */
struct rt_schedulable_data {
	struct task_group *tg;	/* group whose settings are being changed */
	u64 rt_period;		/* proposed period for @tg */
	u64 rt_runtime;		/* proposed runtime for @tg */
};
10132
/*
 * Tree-walk callback that validates the RT bandwidth of @tg.
 *
 * @data points to a struct rt_schedulable_data; for the group named there
 * the proposed period/runtime are used instead of the stored ones.
 *
 * Returns 0 when the settings are acceptable, -EINVAL when a limit is
 * violated, or -EBUSY when a zero runtime would apply to a group that still
 * has RT tasks.
 */
static int tg_schedulable(struct task_group *tg, void *data)
{
	struct rt_schedulable_data *d = data;
	struct task_group *child;
	unsigned long total, sum = 0;
	u64 period, runtime;

	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
	runtime = tg->rt_bandwidth.rt_runtime;

	/* Substitute the proposed values for the group being changed. */
	if (tg == d->tg) {
		period = d->rt_period;
		runtime = d->rt_runtime;
	}

#ifdef CONFIG_USER_SCHED
	/* The root group is always judged by the global settings. */
	if (tg == &root_task_group) {
		period = global_rt_period();
		runtime = global_rt_runtime();
	}
#endif

	/*
	 * Cannot have more runtime than the period.
	 */
	if (runtime > period && runtime != RUNTIME_INF)
		return -EINVAL;

	/*
	 * Refuse a zero runtime while the group still contains RT tasks.
	 */
	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
		return -EBUSY;

	total = to_ratio(period, runtime);

	/*
	 * Nobody can have more than the global setting allows.
	 */
	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
		return -EINVAL;

	/*
	 * The sum of the children's ratios must not exceed this group's own.
	 */
	list_for_each_entry_rcu(child, &tg->children, siblings) {
		period = ktime_to_ns(child->rt_bandwidth.rt_period);
		runtime = child->rt_bandwidth.rt_runtime;

		if (child == d->tg) {
			period = d->rt_period;
			runtime = d->rt_runtime;
		}

		sum += to_ratio(period, runtime);
	}

	if (sum > total)
		return -EINVAL;

	return 0;
}
10195
10196static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
10197{
10198 struct rt_schedulable_data data = {
10199 .tg = tg,
10200 .rt_period = period,
10201 .rt_runtime = runtime,
10202 };
10203
10204 return walk_tg_tree(tg_schedulable, tg_nop, &data);
10205}
10206
/*
 * Validate and apply a new RT period/runtime (both in nanoseconds) to @tg.
 *
 * The schedulability check runs under rt_constraints_mutex with
 * tasklist_lock read-held.  On success the group's rt_bandwidth and the
 * rt_runtime of its rt_rq on every possible CPU are updated.
 *
 * Returns 0 on success or the error from __rt_schedulable().
 */
static int tg_set_bandwidth(struct task_group *tg,
		u64 rt_period, u64 rt_runtime)
{
	int i, err = 0;

	mutex_lock(&rt_constraints_mutex);
	read_lock(&tasklist_lock);
	err = __rt_schedulable(tg, rt_period, rt_runtime);
	if (err)
		goto unlock;

	/* Group-wide lock (interrupts off) nests outside the per-rt_rq locks. */
	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
	tg->rt_bandwidth.rt_runtime = rt_runtime;

	for_each_possible_cpu(i) {
		struct rt_rq *rt_rq = tg->rt_rq[i];

		spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_runtime;
		spin_unlock(&rt_rq->rt_runtime_lock);
	}
	spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 unlock:
	read_unlock(&tasklist_lock);
	mutex_unlock(&rt_constraints_mutex);

	return err;
}
10236
10237int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
10238{
10239 u64 rt_runtime, rt_period;
10240
10241 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
10242 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
10243 if (rt_runtime_us < 0)
10244 rt_runtime = RUNTIME_INF;
10245
10246 return tg_set_bandwidth(tg, rt_period, rt_runtime);
10247}
10248
10249long sched_group_rt_runtime(struct task_group *tg)
10250{
10251 u64 rt_runtime_us;
10252
10253 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
10254 return -1;
10255
10256 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
10257 do_div(rt_runtime_us, NSEC_PER_USEC);
10258 return rt_runtime_us;
10259}
10260
10261int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
10262{
10263 u64 rt_runtime, rt_period;
10264
10265 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
10266 rt_runtime = tg->rt_bandwidth.rt_runtime;
10267
10268 if (rt_period == 0)
10269 return -EINVAL;
10270
10271 return tg_set_bandwidth(tg, rt_period, rt_runtime);
10272}
10273
10274long sched_group_rt_period(struct task_group *tg)
10275{
10276 u64 rt_period_us;
10277
10278 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
10279 do_div(rt_period_us, NSEC_PER_USEC);
10280 return rt_period_us;
10281}
10282
/*
 * CONFIG_RT_GROUP_SCHED variant: validate the global RT sysctl settings
 * against the current task-group tree.
 *
 * Returns 0 when acceptable, -EINVAL for a non-positive period or a runtime
 * larger than the period, or the error from the schedulability walk.
 */
static int sched_rt_global_constraints(void)
{
	u64 runtime, period;
	int ret = 0;

	if (sysctl_sched_rt_period <= 0)
		return -EINVAL;

	runtime = global_rt_runtime();
	period = global_rt_period();

	/*
	 * Sanity check on the sysctl variables.
	 */
	if (runtime > period && runtime != RUNTIME_INF)
		return -EINVAL;

	mutex_lock(&rt_constraints_mutex);
	read_lock(&tasklist_lock);
	/* NULL group: check the tree as it stands, with no proposed change. */
	ret = __rt_schedulable(NULL, 0, 0);
	read_unlock(&tasklist_lock);
	mutex_unlock(&rt_constraints_mutex);

	return ret;
}
10308
10309int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
10310{
10311
10312 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
10313 return 0;
10314
10315 return 1;
10316}
10317
10318#else
/*
 * !CONFIG_RT_GROUP_SCHED variant: validate the global RT sysctl settings and
 * push the global runtime into the rt_rq of every possible CPU.
 *
 * Returns 0 on success, -EINVAL for a non-positive period and -EBUSY for a
 * zero runtime.
 */
static int sched_rt_global_constraints(void)
{
	unsigned long flags;
	int i;

	if (sysctl_sched_rt_period <= 0)
		return -EINVAL;

	/*
	 * A runtime of exactly zero is refused in this configuration.
	 */
	if (sysctl_sched_rt_runtime == 0)
		return -EBUSY;

	/* Default-bandwidth lock (interrupts off) nests outside per-rt_rq locks. */
	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
	for_each_possible_cpu(i) {
		struct rt_rq *rt_rq = &cpu_rq(i)->rt;

		spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = global_rt_runtime();
		spin_unlock(&rt_rq->rt_runtime_lock);
	}
	spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

	return 0;
}
10346#endif
10347
/*
 * sysctl handler for the RT period/runtime knobs.
 *
 * The integer is read or written through proc_dointvec().  After a
 * successful write the new global settings are validated; if validation
 * fails both sysctl variables are rolled back to their previous values,
 * otherwise def_rt_bandwidth is refreshed from them.  Calls are serialised
 * by a function-local mutex.
 *
 * Returns 0 on success or a negative errno from proc_dointvec() or
 * sched_rt_global_constraints().
 */
int sched_rt_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;
	int old_period, old_runtime;
	static DEFINE_MUTEX(mutex);

	mutex_lock(&mutex);
	/* Snapshot the current values so a rejected write can be undone. */
	old_period = sysctl_sched_rt_period;
	old_runtime = sysctl_sched_rt_runtime;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (!ret && write) {
		ret = sched_rt_global_constraints();
		if (ret) {
			sysctl_sched_rt_period = old_period;
			sysctl_sched_rt_runtime = old_runtime;
		} else {
			def_rt_bandwidth.rt_runtime = global_rt_runtime();
			def_rt_bandwidth.rt_period =
				ns_to_ktime(global_rt_period());
		}
	}
	mutex_unlock(&mutex);

	return ret;
}
10377
10378#ifdef CONFIG_CGROUP_SCHED
10379
10380
/*
 * Map cgroup @cgrp to its task_group: look up the cgroup's state for the cpu
 * subsystem and return the task_group that embeds it.
 */
static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
{
	return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
			    struct task_group, css);
}
10386
/*
 * cgroup "create" callback for the cpu subsystem.
 *
 * A cgroup without a parent maps onto the statically allocated
 * init_task_group; any other cgroup gets a freshly created task group whose
 * parent is the parent cgroup's group.
 *
 * Returns the subsystem state of the group, or ERR_PTR(-ENOMEM) on failure.
 */
static struct cgroup_subsys_state *
cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct task_group *tg, *parent;

	if (!cgrp->parent) {
		/* This is early initialization for the top cgroup */
		return &init_task_group.css;
	}

	parent = cgroup_tg(cgrp->parent);
	tg = sched_create_group(parent);
	if (IS_ERR(tg))
		return ERR_PTR(-ENOMEM);

	return &tg->css;
}
10404
/*
 * cgroup "destroy" callback for the cpu subsystem: tear down the task group
 * that backs @cgrp.
 */
static void
cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	sched_destroy_group(cgroup_tg(cgrp));
}
10412
/*
 * Check whether the single task @tsk may be attached to @cgrp.
 *
 * With CONFIG_RT_GROUP_SCHED the decision is delegated to
 * sched_rt_can_attach(); without it only tasks in the fair scheduling class
 * are accepted.
 *
 * Returns 0 when the task may attach, -EINVAL otherwise.
 */
static int
cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
#ifdef CONFIG_RT_GROUP_SCHED
	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
		return -EINVAL;
#else
	/* We don't support RT-tasks being in separate groups */
	if (tsk->sched_class != &fair_sched_class)
		return -EINVAL;
#endif
	return 0;
}
10426
10427static int
10428cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10429 struct task_struct *tsk, bool threadgroup)
10430{
10431 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10432 if (retval)
10433 return retval;
10434 if (threadgroup) {
10435 struct task_struct *c;
10436 rcu_read_lock();
10437 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10438 retval = cpu_cgroup_can_attach_task(cgrp, c);
10439 if (retval) {
10440 rcu_read_unlock();
10441 return retval;
10442 }
10443 }
10444 rcu_read_unlock();
10445 }
10446 return 0;
10447}
10448
10449static void
10450cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10451 struct cgroup *old_cont, struct task_struct *tsk,
10452 bool threadgroup)
10453{
10454 sched_move_task(tsk);
10455 if (threadgroup) {
10456 struct task_struct *c;
10457 rcu_read_lock();
10458 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10459 sched_move_task(c);
10460 }
10461 rcu_read_unlock();
10462 }
10463}
10464
10465#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Write handler of the "shares" control file: forwards @shareval to
 * sched_group_set_shares() for the group behind @cgrp and returns its
 * result.
 */
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
				u64 shareval)
{
	return sched_group_set_shares(cgroup_tg(cgrp), shareval);
}
10471
/*
 * Read handler of the "shares" control file: returns the shares value of
 * the group behind @cgrp.
 */
static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	struct task_group *tg = cgroup_tg(cgrp);

	return (u64) tg->shares;
}
10478#endif
10479
10480#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Write handler of "rt_runtime_us": forwards @val (microseconds; negative
 * means unlimited) to sched_group_set_rt_runtime() and returns its result.
 */
static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
				s64 val)
{
	return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
}
10486
/*
 * Read handler of "rt_runtime_us": returns the group's RT runtime in
 * microseconds, or -1 when it is unlimited.
 */
static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
{
	return sched_group_rt_runtime(cgroup_tg(cgrp));
}
10491
/*
 * Write handler of "rt_period_us": forwards @rt_period_us to
 * sched_group_set_rt_period() and returns its result.
 * NOTE(review): the u64 value is implicitly converted to the callee's
 * 'long' parameter.
 */
static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
		u64 rt_period_us)
{
	return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
}
10497
/* Read handler of "rt_period_us": the group's RT period in microseconds. */
static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
{
	return sched_group_rt_period(cgroup_tg(cgrp));
}
10502#endif
10503
/*
 * Control files of the cpu cgroup subsystem.  Which entries exist depends
 * on the group-scheduling options compiled in.
 */
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* CPU shares of the group */
	{
		.name = "shares",
		.read_u64 = cpu_shares_read_u64,
		.write_u64 = cpu_shares_write_u64,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	/* RT runtime in microseconds (signed: -1 reads back as unlimited) */
	{
		.name = "rt_runtime_us",
		.read_s64 = cpu_rt_runtime_read,
		.write_s64 = cpu_rt_runtime_write,
	},
	/* RT period in microseconds */
	{
		.name = "rt_period_us",
		.read_u64 = cpu_rt_period_read_uint,
		.write_u64 = cpu_rt_period_write_uint,
	},
#endif
};
10525
/*
 * cgroup "populate" callback: add every entry of cpu_files[] to @cont and
 * return the result of cgroup_add_files().
 */
static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
}
10530
/*
 * Descriptor of the "cpu" cgroup subsystem, tying the callbacks above
 * together.  early_init is set, so the subsystem asks to be initialised
 * early.
 */
struct cgroup_subsys cpu_cgroup_subsys = {
	.name		= "cpu",
	.create		= cpu_cgroup_create,
	.destroy	= cpu_cgroup_destroy,
	.can_attach	= cpu_cgroup_can_attach,
	.attach		= cpu_cgroup_attach,
	.populate	= cpu_cgroup_populate,
	.subsys_id	= cpu_cgroup_subsys_id,
	.early_init	= 1,
};
10541
10542#endif
10543
10544#ifdef CONFIG_CGROUP_CPUACCT
10545
10546
10547
10548
10549
10550
10551
10552
10553
/* Per-cgroup state of the CPU accounting subsystem. */
struct cpuacct {
	struct cgroup_subsys_state css;
	/* per-CPU u64 usage counter (allocated with alloc_percpu) */
	u64 *cpuusage;
	/* one per-CPU counter for each CPUACCT_STAT_* statistic */
	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
	/* accounting state of the parent cgroup; NULL when there is none */
	struct cpuacct *parent;
};
10561
/* Declared ahead of its use by the accounting code below. */
struct cgroup_subsys cpuacct_subsys;
10563
10564
/* Return the CPU accounting state that belongs to cgroup @cgrp. */
static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
{
	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
			    struct cpuacct, css);
}
10570
10571
/* Return the CPU accounting state that task @tsk is charged to. */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
			    struct cpuacct, css);
}
10577
10578
/*
 * cgroup "create" callback of the accounting subsystem: allocate the
 * cpuacct state for @cgrp, its per-CPU usage counter and its statistics
 * counters, and link it to the parent cgroup's state if there is a parent.
 *
 * Returns the new subsystem state or ERR_PTR(-ENOMEM); on failure
 * everything allocated so far is released again.
 */
static struct cgroup_subsys_state *cpuacct_create(
	struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	int i;

	if (!ca)
		goto out;

	ca->cpuusage = alloc_percpu(u64);
	if (!ca->cpuusage)
		goto out_free_ca;

	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
		if (percpu_counter_init(&ca->cpustat[i], 0))
			goto out_free_counters;

	if (cgrp->parent)
		ca->parent = cgroup_ca(cgrp->parent);

	return &ca->css;

out_free_counters:
	/* 'i' is the index that failed: destroy only counters 0 .. i-1. */
	while (--i >= 0)
		percpu_counter_destroy(&ca->cpustat[i]);
	free_percpu(ca->cpuusage);
out_free_ca:
	kfree(ca);
out:
	return ERR_PTR(-ENOMEM);
}
10610
10611
/*
 * cgroup "destroy" callback of the accounting subsystem: release the
 * statistics counters, the per-CPU usage counter and the cpuacct state of
 * @cgrp.
 */
static void
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct cpuacct *ca = cgroup_ca(cgrp);
	int i;

	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
		percpu_counter_destroy(&ca->cpustat[i]);
	free_percpu(ca->cpuusage);
	kfree(ca);
}
10623
/* Return the usage counter of @ca for CPU @cpu. */
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
{
	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
	u64 data;

#ifndef CONFIG_64BIT
	/*
	 * Without CONFIG_64BIT the read is done under that CPU's runqueue
	 * lock (presumably because a 64-bit load is not atomic there).
	 */
	spin_lock_irq(&cpu_rq(cpu)->lock);
	data = *cpuusage;
	spin_unlock_irq(&cpu_rq(cpu)->lock);
#else
	data = *cpuusage;
#endif

	return data;
}
10642
/* Store @val into the usage counter of @ca for CPU @cpu. */
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);

#ifndef CONFIG_64BIT
	/*
	 * Without CONFIG_64BIT the store is done under that CPU's runqueue
	 * lock (presumably because a 64-bit store is not atomic there).
	 */
	spin_lock_irq(&cpu_rq(cpu)->lock);
	*cpuusage = val;
	spin_unlock_irq(&cpu_rq(cpu)->lock);
#else
	*cpuusage = val;
#endif
}
10658
10659
10660static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
10661{
10662 struct cpuacct *ca = cgroup_ca(cgrp);
10663 u64 totalcpuusage = 0;
10664 int i;
10665
10666 for_each_present_cpu(i)
10667 totalcpuusage += cpuacct_cpuusage_read(ca, i);
10668
10669 return totalcpuusage;
10670}
10671
10672static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
10673 u64 reset)
10674{
10675 struct cpuacct *ca = cgroup_ca(cgrp);
10676 int err = 0;
10677 int i;
10678
10679 if (reset) {
10680 err = -EINVAL;
10681 goto out;
10682 }
10683
10684 for_each_present_cpu(i)
10685 cpuacct_cpuusage_write(ca, i, 0);
10686
10687out:
10688 return err;
10689}
10690
10691static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
10692 struct seq_file *m)
10693{
10694 struct cpuacct *ca = cgroup_ca(cgroup);
10695 u64 percpu;
10696 int i;
10697
10698 for_each_present_cpu(i) {
10699 percpu = cpuacct_cpuusage_read(ca, i);
10700 seq_printf(m, "%llu ", (unsigned long long) percpu);
10701 }
10702 seq_printf(m, "\n");
10703 return 0;
10704}
10705
/*
 * Key strings for the statistics, indexed by the CPUACCT_STAT_*
 * constants; cpuacct_stats_show() passes them to the map callback.
 */
static const char *cpuacct_stat_desc[] = {
	[CPUACCT_STAT_USER] = "user",
	[CPUACCT_STAT_SYSTEM] = "system",
};
10710
10711static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
10712 struct cgroup_map_cb *cb)
10713{
10714 struct cpuacct *ca = cgroup_ca(cgrp);
10715 int i;
10716
10717 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
10718 s64 val = percpu_counter_read(&ca->cpustat[i]);
10719 val = cputime64_to_clock_t(val);
10720 cb->fill(cb, cpuacct_stat_desc[i], val);
10721 }
10722 return 0;
10723}
10724
/* Control files registered for each cgroup by cpuacct_populate(). */
static struct cftype files[] = {
	{
		/* total usage summed over present CPUs; writing 0 resets it */
		.name = "usage",
		.read_u64 = cpuusage_read,
		.write_u64 = cpuusage_write,
	},
	{
		/* one usage value per present CPU, read-only */
		.name = "usage_percpu",
		.read_seq_string = cpuacct_percpu_seq_read,
	},
	{
		/* "user" / "system" statistics as a key-value map */
		.name = "stat",
		.read_map = cpuacct_stats_show,
	},
};
10740
10741static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
10742{
10743 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
10744}
10745
10746
10747
10748
10749
10750
10751static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10752{
10753 struct cpuacct *ca;
10754 int cpu;
10755
10756 if (unlikely(!cpuacct_subsys.active))
10757 return;
10758
10759 cpu = task_cpu(tsk);
10760
10761 rcu_read_lock();
10762
10763 ca = task_ca(tsk);
10764
10765 for (; ca; ca = ca->parent) {
10766 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10767 *cpuusage += cputime;
10768 }
10769
10770 rcu_read_unlock();
10771}
10772
10773
10774
10775
10776static void cpuacct_update_stats(struct task_struct *tsk,
10777 enum cpuacct_stat_index idx, cputime_t val)
10778{
10779 struct cpuacct *ca;
10780
10781 if (unlikely(!cpuacct_subsys.active))
10782 return;
10783
10784 rcu_read_lock();
10785 ca = task_ca(tsk);
10786
10787 do {
10788 percpu_counter_add(&ca->cpustat[idx], val);
10789 ca = ca->parent;
10790 } while (ca);
10791 rcu_read_unlock();
10792}
10793
/*
 * cgroup subsystem descriptor for CPU accounting: wires the create,
 * destroy and populate callbacks defined in this file to the "cpuacct"
 * subsystem.
 */
struct cgroup_subsys cpuacct_subsys = {
	.name = "cpuacct",
	.create = cpuacct_create,
	.destroy = cpuacct_destroy,
	.populate = cpuacct_populate,
	.subsys_id = cpuacct_subsys_id,
};
10801#endif
10802
10803#ifndef CONFIG_SMP
10804
/*
 * !CONFIG_SMP variant: there is no expedited state to report, so
 * nothing is written to @page and the reported length is 0.
 */
int rcu_expedited_torture_stats(char *page)
{
	const int len = 0;

	(void)page;	/* intentionally untouched */

	return len;
}
10809EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10810
/*
 * !CONFIG_SMP variant of synchronize_sched_expedited(): the body is
 * empty, so the call returns immediately.
 */
void synchronize_sched_expedited(void)
{
}
10814EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10815
10816#else
10817
/*
 * Per-CPU request that synchronize_sched_expedited() queues on that
 * CPU's rq->migration_queue.
 */
static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
/* Held by synchronize_sched_expedited() while requests are posted and awaited. */
static DEFINE_MUTEX(rcu_sched_expedited_mutex);

/*
 * Values of rcu_expedited_state that are not CPU numbers.  While
 * waiting, synchronize_sched_expedited() stores the number of the CPU
 * it is currently waiting for instead.
 */
#define RCU_EXPEDITED_STATE_POST -2	/* requests are being posted */
#define RCU_EXPEDITED_STATE_IDLE -1	/* no expedited pass in progress */

/* Progress indicator reported by rcu_expedited_torture_stats(). */
static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10825
10826int rcu_expedited_torture_stats(char *page)
10827{
10828 int cnt = 0;
10829 int cpu;
10830
10831 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10832 for_each_online_cpu(cpu) {
10833 cnt += sprintf(&page[cnt], " %d:%d",
10834 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10835 }
10836 cnt += sprintf(&page[cnt], "\n");
10837 return cnt;
10838}
10839EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10840
10841static long synchronize_sched_expedited_count;
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853void synchronize_sched_expedited(void)
10854{
10855 int cpu;
10856 unsigned long flags;
10857 bool need_full_sync = 0;
10858 struct rq *rq;
10859 struct migration_req *req;
10860 long snap;
10861 int trycount = 0;
10862
10863 smp_mb();
10864 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10865 get_online_cpus();
10866 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10867 put_online_cpus();
10868 if (trycount++ < 10)
10869 udelay(trycount * num_online_cpus());
10870 else {
10871 synchronize_sched();
10872 return;
10873 }
10874 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10875 smp_mb();
10876 return;
10877 }
10878 get_online_cpus();
10879 }
10880 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10881 for_each_online_cpu(cpu) {
10882 rq = cpu_rq(cpu);
10883 req = &per_cpu(rcu_migration_req, cpu);
10884 init_completion(&req->done);
10885 req->task = NULL;
10886 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10887 spin_lock_irqsave(&rq->lock, flags);
10888 list_add(&req->list, &rq->migration_queue);
10889 spin_unlock_irqrestore(&rq->lock, flags);
10890 wake_up_process(rq->migration_thread);
10891 }
10892 for_each_online_cpu(cpu) {
10893 rcu_expedited_state = cpu;
10894 req = &per_cpu(rcu_migration_req, cpu);
10895 rq = cpu_rq(cpu);
10896 wait_for_completion(&req->done);
10897 spin_lock_irqsave(&rq->lock, flags);
10898 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10899 need_full_sync = 1;
10900 req->dest_cpu = RCU_MIGRATION_IDLE;
10901 spin_unlock_irqrestore(&rq->lock, flags);
10902 }
10903 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10904 mutex_unlock(&rcu_sched_expedited_mutex);
10905 put_online_cpus();
10906 if (need_full_sync)
10907 synchronize_sched();
10908}
10909EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10910
10911#endif
10912