29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/perf_event.h>
43#include <linux/security.h>
44#include <linux/notifier.h>
45#include <linux/profile.h>
46#include <linux/freezer.h>
47#include <linux/vmalloc.h>
48#include <linux/blkdev.h>
49#include <linux/delay.h>
50#include <linux/pid_namespace.h>
51#include <linux/smp.h>
52#include <linux/threads.h>
53#include <linux/timer.h>
54#include <linux/rcupdate.h>
55#include <linux/cpu.h>
56#include <linux/cpuset.h>
57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h>
60#include <linux/seq_file.h>
61#include <linux/sysctl.h>
62#include <linux/syscalls.h>
63#include <linux/times.h>
64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h>
66#include <linux/delayacct.h>
67#include <linux/unistd.h>
68#include <linux/pagemap.h>
69#include <linux/hrtimer.h>
70#include <linux/tick.h>
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
73#include <linux/ftrace.h>
74
75#include <asm/tlb.h>
76#include <asm/irq_regs.h>
77
78#include "sched_cpupri.h"
79
80#define CREATE_TRACE_POINTS
81#include <trace/events/sched.h>
82
83
84
85
86
87
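/*
 * Convert user-visible nice values [ -20 ... 0 ... 19 ] to and from the
 * static priority range [ MAX_RT_PRIO ... MAX_PRIO-1 ]: nice 0 maps to
 * MAX_RT_PRIO + 20.
 */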
88#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
89#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
90#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
91
92
93
94
95
96
97#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
98#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
99#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
100
101
102
103
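/* Convert a nanosecond interval to the kernel's jiffy resolution. */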
104#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
105
106#define NICE_0_LOAD SCHED_LOAD_SCALE
107#define NICE_0_SHIFT SCHED_LOAD_SHIFT
108
109
110
111
112
113
114
115#define DEF_TIMESLICE (100 * HZ / 1000)
116
117
118
119
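/* ~0ULL is used as "unlimited runtime": RT bandwidth throttling is disabled. */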
120#define RUNTIME_INF ((u64)~0ULL)
121
122static inline int rt_policy(int policy)
123{
124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
125 return 1;
126 return 0;
127}
128
129static inline int task_has_rt_policy(struct task_struct *p)
130{
131 return rt_policy(p->policy);
132}
133
134
135
136
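/*
 * Priority-indexed run lists of the real-time scheduling class: one list
 * head per RT priority plus a bitmap marking the non-empty queues.
 */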
137struct rt_prio_array {
138 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
139 struct list_head queue[MAX_RT_PRIO];
140};
141
142struct rt_bandwidth {
143
144 spinlock_t rt_runtime_lock;
145 ktime_t rt_period;
146 u64 rt_runtime;
147 struct hrtimer rt_period_timer;
148};
149
150static struct rt_bandwidth def_rt_bandwidth;
151
152static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
153
154static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
155{
156 struct rt_bandwidth *rt_b =
157 container_of(timer, struct rt_bandwidth, rt_period_timer);
158 ktime_t now;
159 int overrun;
160 int idle = 0;
161
162 for (;;) {
163 now = hrtimer_cb_get_time(timer);
164 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
165
166 if (!overrun)
167 break;
168
169 idle = do_sched_rt_period_timer(rt_b, overrun);
170 }
171
172 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
173}
174
175static
176void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
177{
178 rt_b->rt_period = ns_to_ktime(period);
179 rt_b->rt_runtime = runtime;
180
181 spin_lock_init(&rt_b->rt_runtime_lock);
182
183 hrtimer_init(&rt_b->rt_period_timer,
184 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
185 rt_b->rt_period_timer.function = sched_rt_period_timer;
186}
187
188static inline int rt_bandwidth_enabled(void)
189{
190 return sysctl_sched_rt_runtime >= 0;
191}
192
193static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
194{
195 ktime_t now;
196
197 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
198 return;
199
200 if (hrtimer_active(&rt_b->rt_period_timer))
201 return;
202
203 spin_lock(&rt_b->rt_runtime_lock);
204 for (;;) {
205 unsigned long delta;
206 ktime_t soft, hard;
207
208 if (hrtimer_active(&rt_b->rt_period_timer))
209 break;
210
211 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
212 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
213
214 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
215 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
216 delta = ktime_to_ns(ktime_sub(hard, soft));
217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
218 HRTIMER_MODE_ABS_PINNED, 0);
219 }
220 spin_unlock(&rt_b->rt_runtime_lock);
221}
222
223#ifdef CONFIG_RT_GROUP_SCHED
224static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
225{
226 hrtimer_cancel(&rt_b->rt_period_timer);
227}
228#endif
229
230
231
232
233
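/* Serializes construction and teardown of the scheduler domain hierarchy. */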
234static DEFINE_MUTEX(sched_domains_mutex);
235
236#ifdef CONFIG_GROUP_SCHED
237
238#include <linux/cgroup.h>
239
240struct cfs_rq;
241
242static LIST_HEAD(task_groups);
243
244
245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253
254#ifdef CONFIG_FAIR_GROUP_SCHED
255
256 struct sched_entity **se;
257
258 struct cfs_rq **cfs_rq;
259 unsigned long shares;
260#endif
261
262#ifdef CONFIG_RT_GROUP_SCHED
263 struct sched_rt_entity **rt_se;
264 struct rt_rq **rt_rq;
265
266 struct rt_bandwidth rt_bandwidth;
267#endif
268
269 struct rcu_head rcu;
270 struct list_head list;
271
272 struct task_group *parent;
273 struct list_head siblings;
274 struct list_head children;
275};
276
277#ifdef CONFIG_USER_SCHED
278
279
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285
286
287
288
289
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
302#endif
303#else
304#define root_task_group init_task_group
305#endif
306
307
308
309
310static DEFINE_SPINLOCK(task_group_lock);
311
312#ifdef CONFIG_FAIR_GROUP_SCHED
313
314#ifdef CONFIG_SMP
315static int root_task_group_empty(void)
316{
317 return list_empty(&root_task_group.children);
318}
319#endif
320
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif
326
327
328
329
330
331
332
333
334
335#define MIN_SHARES 2
336#define MAX_SHARES (1UL << 18)
337
338static int init_task_group_load = INIT_TASK_GROUP_LOAD;
339#endif
340
341
342
343
344struct task_group init_task_group;
345
346
347static inline struct task_group *task_group(struct task_struct *p)
348{
349 struct task_group *tg;
350
351#ifdef CONFIG_USER_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css);
358#else
359 tg = &init_task_group;
360#endif
361 return tg;
362}
363
364
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
366{
367#ifdef CONFIG_FAIR_GROUP_SCHED
368 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
369 p->se.parent = task_group(p)->se[cpu];
370#endif
371
372#ifdef CONFIG_RT_GROUP_SCHED
373 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
374 p->rt.parent = task_group(p)->rt_se[cpu];
375#endif
376}
377
378#else
379
380static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
381static inline struct task_group *task_group(struct task_struct *p)
382{
383 return NULL;
384}
385
386#endif
387
388
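/* CFS (fair class) run-queue: one per CPU, plus one per task group per CPU. */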
389struct cfs_rq {
390 struct load_weight load;
391 unsigned long nr_running;
392
393 u64 exec_clock;
394 u64 min_vruntime;
395
396 struct rb_root tasks_timeline;
397 struct rb_node *rb_leftmost;
398
399 struct list_head tasks;
400 struct list_head *balance_iterator;
401
402
403
404
405
406 struct sched_entity *curr, *next, *last;
407
408 unsigned int nr_spread_over;
409
410#ifdef CONFIG_FAIR_GROUP_SCHED
411 struct rq *rq;
412
413
414
415
416
417
418
419
420
421 struct list_head leaf_cfs_rq_list;
422 struct task_group *tg;
423
424#ifdef CONFIG_SMP
425
426
427
428 unsigned long task_weight;
429
430
431
432
433
434
435
436 unsigned long h_load;
437
438
439
440
441 unsigned long shares;
442
443
444
445
446 unsigned long rq_weight;
447#endif
448#endif
449};
450
451
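/* Real-time class run-queue. */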
452struct rt_rq {
453 struct rt_prio_array active;
454 unsigned long rt_nr_running;
455#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
456 struct {
457 int curr;
458#ifdef CONFIG_SMP
459 int next;
460#endif
461 } highest_prio;
462#endif
463#ifdef CONFIG_SMP
464 unsigned long rt_nr_migratory;
465 unsigned long rt_nr_total;
466 int overloaded;
467 struct plist_head pushable_tasks;
468#endif
469 int rt_throttled;
470 u64 rt_time;
471 u64 rt_runtime;
472
473 spinlock_t rt_runtime_lock;
474
475#ifdef CONFIG_RT_GROUP_SCHED
476 unsigned long rt_nr_boosted;
477
478 struct rq *rq;
479 struct list_head leaf_rt_rq_list;
480 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif
483};
484
485#ifdef CONFIG_SMP
486
487
488
489
490
491
492
493
494
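/*
 * A root domain spans the CPUs of one scheduling partition and holds the
 * state used to balance real-time tasks inside it, e.g. the mask of CPUs
 * that are currently overloaded with runnable RT tasks (rto_mask).
 */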
495struct root_domain {
496 atomic_t refcount;
497 cpumask_var_t span;
498 cpumask_var_t online;
499
500
501
502
503
504 cpumask_var_t rto_mask;
505 atomic_t rto_count;
506#ifdef CONFIG_SMP
507 struct cpupri cpupri;
508#endif
509};
510
511
512
513
514
515static struct root_domain def_root_domain;
516
517#endif
518
519
520
521
522
523
524
525
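/*
 * The main per-CPU runqueue data structure; most fields are protected
 * by rq->lock.
 */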
526struct rq {
527
528 spinlock_t lock;
529
530
531
532
533
534 unsigned long nr_running;
535 #define CPU_LOAD_IDX_MAX 5
536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
537#ifdef CONFIG_NO_HZ
538 unsigned long last_tick_seen;
539 unsigned char in_nohz_recently;
540#endif
541
542 struct load_weight load;
543 unsigned long nr_load_updates;
544 u64 nr_switches;
545 u64 nr_migrations_in;
546
547 struct cfs_rq cfs;
548 struct rt_rq rt;
549
550#ifdef CONFIG_FAIR_GROUP_SCHED
551
552 struct list_head leaf_cfs_rq_list;
553#endif
554#ifdef CONFIG_RT_GROUP_SCHED
555 struct list_head leaf_rt_rq_list;
556#endif
557
558
559
560
561
562
563
564 unsigned long nr_uninterruptible;
565
566 struct task_struct *curr, *idle;
567 unsigned long next_balance;
568 struct mm_struct *prev_mm;
569
570 u64 clock;
571
572 atomic_t nr_iowait;
573
574#ifdef CONFIG_SMP
575 struct root_domain *rd;
576 struct sched_domain *sd;
577
578 unsigned char idle_at_tick;
579
580 int post_schedule;
581 int active_balance;
582 int push_cpu;
583
584 int cpu;
585 int online;
586
587 unsigned long avg_load_per_task;
588
589 struct task_struct *migration_thread;
590 struct list_head migration_queue;
591
592 u64 rt_avg;
593 u64 age_stamp;
594#endif
595
596
597 unsigned long calc_load_update;
598 long calc_load_active;
599
600#ifdef CONFIG_SCHED_HRTICK
601#ifdef CONFIG_SMP
602 int hrtick_csd_pending;
603 struct call_single_data hrtick_csd;
604#endif
605 struct hrtimer hrtick_timer;
606#endif
607
608#ifdef CONFIG_SCHEDSTATS
609
610 struct sched_info rq_sched_info;
611 unsigned long long rq_cpu_time;
612
613
614
615 unsigned int yld_count;
616
617
618 unsigned int sched_switch;
619 unsigned int sched_count;
620 unsigned int sched_goidle;
621
622
623 unsigned int ttwu_count;
624 unsigned int ttwu_local;
625
626
627 unsigned int bkl_count;
628#endif
629};
630
631static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
632
633static inline
634void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
635{
636 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
637}
638
639static inline int cpu_of(struct rq *rq)
640{
641#ifdef CONFIG_SMP
642 return rq->cpu;
643#else
644 return 0;
645#endif
646}
647
648
649
650
651
652
653
654
655#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues))
660#define task_rq(p) cpu_rq(task_cpu(p))
661#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
662#define raw_rq() (&__raw_get_cpu_var(runqueues))
663
664inline void update_rq_clock(struct rq *rq)
665{
666 rq->clock = sched_clock_cpu(cpu_of(rq));
667}
668
669
670
671
672#ifdef CONFIG_SCHED_DEBUG
673# define const_debug __read_mostly
674#else
675# define const_debug static const
676#endif
677
678
679
680
681
682
683
684
685
686int runqueue_is_locked(int cpu)
687{
688 return spin_is_locked(&cpu_rq(cpu)->lock);
689}
690
691
692
693
694
695#define SCHED_FEAT(name, enabled) \
696 __SCHED_FEAT_##name ,
697
698enum {
699#include "sched_features.h"
700};
701
702#undef SCHED_FEAT
703
704#define SCHED_FEAT(name, enabled) \
705 (1UL << __SCHED_FEAT_##name) * enabled |
706
707const_debug unsigned int sysctl_sched_features =
708#include "sched_features.h"
709 0;
710
711#undef SCHED_FEAT
712
713#ifdef CONFIG_SCHED_DEBUG
714#define SCHED_FEAT(name, enabled) \
715 #name ,
716
717static __read_mostly char *sched_feat_names[] = {
718#include "sched_features.h"
719 NULL
720};
721
722#undef SCHED_FEAT
723
724static int sched_feat_show(struct seq_file *m, void *v)
725{
726 int i;
727
728 for (i = 0; sched_feat_names[i]; i++) {
729 if (!(sysctl_sched_features & (1UL << i)))
730 seq_puts(m, "NO_");
731 seq_printf(m, "%s ", sched_feat_names[i]);
732 }
733 seq_puts(m, "\n");
734
735 return 0;
736}
737
738static ssize_t
739sched_feat_write(struct file *filp, const char __user *ubuf,
740 size_t cnt, loff_t *ppos)
741{
742 char buf[64];
743 char *cmp = buf;
744 int neg = 0;
745 int i;
746
747 if (cnt > 63)
748 cnt = 63;
749
750 if (copy_from_user(&buf, ubuf, cnt))
751 return -EFAULT;
752
753 buf[cnt] = 0;
754
755 if (strncmp(buf, "NO_", 3) == 0) {
756 neg = 1;
757 cmp += 3;
758 }
759
760 for (i = 0; sched_feat_names[i]; i++) {
761 int len = strlen(sched_feat_names[i]);
762
763 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
764 if (neg)
765 sysctl_sched_features &= ~(1UL << i);
766 else
767 sysctl_sched_features |= (1UL << i);
768 break;
769 }
770 }
771
772 if (!sched_feat_names[i])
773 return -EINVAL;
774
775 filp->f_pos += cnt;
776
777 return cnt;
778}
779
780static int sched_feat_open(struct inode *inode, struct file *filp)
781{
782 return single_open(filp, sched_feat_show, NULL);
783}
784
785static const struct file_operations sched_feat_fops = {
786 .open = sched_feat_open,
787 .write = sched_feat_write,
788 .read = seq_read,
789 .llseek = seq_lseek,
790 .release = single_release,
791};
792
793static __init int sched_init_debug(void)
794{
795 debugfs_create_file("sched_features", 0644, NULL, NULL,
796 &sched_feat_fops);
797
798 return 0;
799}
800late_initcall(sched_init_debug);
801
802#endif
803
804#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
805
806
807
808
809
810const_debug unsigned int sysctl_sched_nr_migrate = 32;
811
812
813
814
815
816unsigned int sysctl_sched_shares_ratelimit = 250000;
817
818
819
820
821
822
823unsigned int sysctl_sched_shares_thresh = 4;
824
825
826
827
828
829
830
831const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
832
833
834
835
836
837unsigned int sysctl_sched_rt_period = 1000000;
838
839static __read_mostly int scheduler_running;
840
841
842
843
844
845int sysctl_sched_rt_runtime = 950000;
846
847static inline u64 global_rt_period(void)
848{
849 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
850}
851
852static inline u64 global_rt_runtime(void)
853{
854 if (sysctl_sched_rt_runtime < 0)
855 return RUNTIME_INF;
856
857 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
858}
859
860#ifndef prepare_arch_switch
861# define prepare_arch_switch(next) do { } while (0)
862#endif
863#ifndef finish_arch_switch
864# define finish_arch_switch(prev) do { } while (0)
865#endif
866
867static inline int task_current(struct rq *rq, struct task_struct *p)
868{
869 return rq->curr == p;
870}
871
872#ifndef __ARCH_WANT_UNLOCKED_CTXSW
873static inline int task_running(struct rq *rq, struct task_struct *p)
874{
875 return task_current(rq, p);
876}
877
878static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
879{
880}
881
882static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
883{
884#ifdef CONFIG_DEBUG_SPINLOCK
885
886 rq->lock.owner = current;
887#endif
888
889
890
891
892
893 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
894
895 spin_unlock_irq(&rq->lock);
896}
897
898#else
899static inline int task_running(struct rq *rq, struct task_struct *p)
900{
901#ifdef CONFIG_SMP
902 return p->oncpu;
903#else
904 return task_current(rq, p);
905#endif
906}
907
908static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
909{
910#ifdef CONFIG_SMP
911
912
913
914
915
916 next->oncpu = 1;
917#endif
918#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
919 spin_unlock_irq(&rq->lock);
920#else
921 spin_unlock(&rq->lock);
922#endif
923}
924
925static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
926{
927#ifdef CONFIG_SMP
928
929
930
931
932
933 smp_wmb();
934 prev->oncpu = 0;
935#endif
936#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
937 local_irq_enable();
938#endif
939}
940#endif
941
942
943
944
945
946static inline struct rq *__task_rq_lock(struct task_struct *p)
947 __acquires(rq->lock)
948{
949 for (;;) {
950 struct rq *rq = task_rq(p);
951 spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p)))
953 return rq;
954 spin_unlock(&rq->lock);
955 }
956}
957
958
959
960
961
962
963static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
964 __acquires(rq->lock)
965{
966 struct rq *rq;
967
968 for (;;) {
969 local_irq_save(*flags);
970 rq = task_rq(p);
971 spin_lock(&rq->lock);
972 if (likely(rq == task_rq(p)))
973 return rq;
974 spin_unlock_irqrestore(&rq->lock, *flags);
975 }
976}
977
978void task_rq_unlock_wait(struct task_struct *p)
979{
980 struct rq *rq = task_rq(p);
981
982 smp_mb();
983 spin_unlock_wait(&rq->lock);
984}
985
986static void __task_rq_unlock(struct rq *rq)
987 __releases(rq->lock)
988{
989 spin_unlock(&rq->lock);
990}
991
992static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
993 __releases(rq->lock)
994{
995 spin_unlock_irqrestore(&rq->lock, *flags);
996}
997
998
999
1000
1001static struct rq *this_rq_lock(void)
1002 __acquires(rq->lock)
1003{
1004 struct rq *rq;
1005
1006 local_irq_disable();
1007 rq = this_rq();
1008 spin_lock(&rq->lock);
1009
1010 return rq;
1011}
1012
1013#ifdef CONFIG_SCHED_HRTICK
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
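/*
 * High-resolution preemption tick: hrtick is usable only when the
 * feature bit is set and high-resolution timers are active on this CPU.
 */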
1030static inline int hrtick_enabled(struct rq *rq)
1031{
1032 if (!sched_feat(HRTICK))
1033 return 0;
1034 if (!cpu_active(cpu_of(rq)))
1035 return 0;
1036 return hrtimer_is_hres_active(&rq->hrtick_timer);
1037}
1038
1039static void hrtick_clear(struct rq *rq)
1040{
1041 if (hrtimer_active(&rq->hrtick_timer))
1042 hrtimer_cancel(&rq->hrtick_timer);
1043}
1044
1045
1046
1047
1048
1049static enum hrtimer_restart hrtick(struct hrtimer *timer)
1050{
1051 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1052
1053 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1054
1055 spin_lock(&rq->lock);
1056 update_rq_clock(rq);
1057 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1058 spin_unlock(&rq->lock);
1059
1060 return HRTIMER_NORESTART;
1061}
1062
1063#ifdef CONFIG_SMP
1064
1065
1066
1067static void __hrtick_start(void *arg)
1068{
1069 struct rq *rq = arg;
1070
1071 spin_lock(&rq->lock);
1072 hrtimer_restart(&rq->hrtick_timer);
1073 rq->hrtick_csd_pending = 0;
1074 spin_unlock(&rq->lock);
1075}
1076
1077
1078
1079
1080
1081
1082static void hrtick_start(struct rq *rq, u64 delay)
1083{
1084 struct hrtimer *timer = &rq->hrtick_timer;
1085 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1086
1087 hrtimer_set_expires(timer, time);
1088
1089 if (rq == this_rq()) {
1090 hrtimer_restart(timer);
1091 } else if (!rq->hrtick_csd_pending) {
1092 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1093 rq->hrtick_csd_pending = 1;
1094 }
1095}
1096
1097static int
1098hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1099{
1100 int cpu = (int)(long)hcpu;
1101
1102 switch (action) {
1103 case CPU_UP_CANCELED:
1104 case CPU_UP_CANCELED_FROZEN:
1105 case CPU_DOWN_PREPARE:
1106 case CPU_DOWN_PREPARE_FROZEN:
1107 case CPU_DEAD:
1108 case CPU_DEAD_FROZEN:
1109 hrtick_clear(cpu_rq(cpu));
1110 return NOTIFY_OK;
1111 }
1112
1113 return NOTIFY_DONE;
1114}
1115
1116static __init void init_hrtick(void)
1117{
1118 hotcpu_notifier(hotplug_hrtick, 0);
1119}
1120#else
1121
1122
1123
1124
1125
1126static void hrtick_start(struct rq *rq, u64 delay)
1127{
1128 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1129 HRTIMER_MODE_REL_PINNED, 0);
1130}
1131
1132static inline void init_hrtick(void)
1133{
1134}
1135#endif
1136
1137static void init_rq_hrtick(struct rq *rq)
1138{
1139#ifdef CONFIG_SMP
1140 rq->hrtick_csd_pending = 0;
1141
1142 rq->hrtick_csd.flags = 0;
1143 rq->hrtick_csd.func = __hrtick_start;
1144 rq->hrtick_csd.info = rq;
1145#endif
1146
1147 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1148 rq->hrtick_timer.function = hrtick;
1149}
1150#else
1151static inline void hrtick_clear(struct rq *rq)
1152{
1153}
1154
1155static inline void init_rq_hrtick(struct rq *rq)
1156{
1157}
1158
1159static inline void init_hrtick(void)
1160{
1161}
1162#endif
1163
1164
1165
1166
1167
1168
1169
1170
1171#ifdef CONFIG_SMP
1172
1173#ifndef tsk_is_polling
1174#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1175#endif
1176
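/*
 * Mark a task as needing to be rescheduled; if it runs on another CPU
 * that is not polling need_resched, kick that CPU with a reschedule IPI.
 */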
1177static void resched_task(struct task_struct *p)
1178{
1179 int cpu;
1180
1181 assert_spin_locked(&task_rq(p)->lock);
1182
1183 if (test_tsk_need_resched(p))
1184 return;
1185
1186 set_tsk_need_resched(p);
1187
1188 cpu = task_cpu(p);
1189 if (cpu == smp_processor_id())
1190 return;
1191
1192
1193 smp_mb();
1194 if (!tsk_is_polling(p))
1195 smp_send_reschedule(cpu);
1196}
1197
1198static void resched_cpu(int cpu)
1199{
1200 struct rq *rq = cpu_rq(cpu);
1201 unsigned long flags;
1202
1203 if (!spin_trylock_irqsave(&rq->lock, flags))
1204 return;
1205 resched_task(cpu_curr(cpu));
1206 spin_unlock_irqrestore(&rq->lock, flags);
1207}
1208
1209#ifdef CONFIG_NO_HZ
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220void wake_up_idle_cpu(int cpu)
1221{
1222 struct rq *rq = cpu_rq(cpu);
1223
1224 if (cpu == smp_processor_id())
1225 return;
1226
1227
1228
1229
1230
1231
1232
1233
1234 if (rq->curr != rq->idle)
1235 return;
1236
1237
1238
1239
1240
1241
1242 set_tsk_need_resched(rq->idle);
1243
1244
1245 smp_mb();
1246 if (!tsk_is_polling(rq->idle))
1247 smp_send_reschedule(cpu);
1248}
1249#endif
1250
1251static u64 sched_avg_period(void)
1252{
1253 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1254}
1255
1256static void sched_avg_update(struct rq *rq)
1257{
1258 s64 period = sched_avg_period();
1259
1260 while ((s64)(rq->clock - rq->age_stamp) > period) {
1261 rq->age_stamp += period;
1262 rq->rt_avg /= 2;
1263 }
1264}
1265
1266static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1267{
1268 rq->rt_avg += rt_delta;
1269 sched_avg_update(rq);
1270}
1271
1272#else
1273static void resched_task(struct task_struct *p)
1274{
1275 assert_spin_locked(&task_rq(p)->lock);
1276 set_tsk_need_resched(p);
1277}
1278
1279static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1280{
1281}
1282#endif
1283
1284#if BITS_PER_LONG == 32
1285# define WMULT_CONST (~0UL)
1286#else
1287# define WMULT_CONST (1UL << 32)
1288#endif
1289
1290#define WMULT_SHIFT 32
1291
1292
1293
1294
1295#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1296
1297
1298
1299
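/*
 * Scale delta_exec by weight/lw->weight using a cached 32-bit fixed-point
 * inverse: delta * weight * (2^32 / lw->weight) >> 32.
 */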
1300static unsigned long
1301calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1302 struct load_weight *lw)
1303{
1304 u64 tmp;
1305
1306 if (!lw->inv_weight) {
1307 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1308 lw->inv_weight = 1;
1309 else
1310 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1311 / (lw->weight+1);
1312 }
1313
1314 tmp = (u64)delta_exec * weight;
1315
1316
1317
1318 if (unlikely(tmp > WMULT_CONST))
1319 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1320 WMULT_SHIFT/2);
1321 else
1322 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1323
1324 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1325}
1326
1327static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1328{
1329 lw->weight += inc;
1330 lw->inv_weight = 0;
1331}
1332
1333static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1334{
1335 lw->weight -= dec;
1336 lw->inv_weight = 0;
1337}
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348#define WEIGHT_IDLEPRIO 3
1349#define WMULT_IDLEPRIO 1431655765
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
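/*
 * Nice-level to load-weight table: adjacent nice levels differ by roughly
 * 25% in CPU share, and nice 0 maps to NICE_0_LOAD (1024).
 */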
1363static const int prio_to_weight[40] = {
1364 88761, 71755, 56483, 46273, 36291,
1365 29154, 23254, 18705, 14949, 11916,
1366 9548, 7620, 6100, 4904, 3906,
1367 3121, 2501, 1991, 1586, 1277,
1368 1024, 820, 655, 526, 423,
1369 335, 272, 215, 172, 137,
1370 110, 87, 70, 56, 45,
1371 36, 29, 23, 18, 15,
1372};
1373
1374
1375
1376
1377
1378
1379
1380
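/* Precalculated inverse weights: 2^32 / prio_to_weight[i]. */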
1381static const u32 prio_to_wmult[40] = {
1382 48388, 59856, 76040, 92818, 118348,
1383 147320, 184698, 229616, 287308, 360437,
1384 449829, 563644, 704093, 875809, 1099582,
1385 1376151, 1717300, 2157191, 2708050, 3363326,
1386 4194304, 5237765, 6557202, 8165337, 10153587,
1387 12820798, 15790321, 19976592, 24970740, 31350126,
1388 39045157, 49367440, 61356676, 76695844, 95443717,
1389 119304647, 148102320, 186737708, 238609294, 286331153,
1390};
1391
1392static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1393
1394
1395
1396
1397
1398
1399struct rq_iterator {
1400 void *arg;
1401 struct task_struct *(*start)(void *);
1402 struct task_struct *(*next)(void *);
1403};
1404
1405#ifdef CONFIG_SMP
1406static unsigned long
1407balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1408 unsigned long max_load_move, struct sched_domain *sd,
1409 enum cpu_idle_type idle, int *all_pinned,
1410 int *this_best_prio, struct rq_iterator *iterator);
1411
1412static int
1413iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1414 struct sched_domain *sd, enum cpu_idle_type idle,
1415 struct rq_iterator *iterator);
1416#endif
1417
1418
1419enum cpuacct_stat_index {
1420 CPUACCT_STAT_USER,
1421 CPUACCT_STAT_SYSTEM,
1422
1423 CPUACCT_STAT_NSTATS,
1424};
1425
1426#ifdef CONFIG_CGROUP_CPUACCT
1427static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1428static void cpuacct_update_stats(struct task_struct *tsk,
1429 enum cpuacct_stat_index idx, cputime_t val);
1430#else
1431static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1432static inline void cpuacct_update_stats(struct task_struct *tsk,
1433 enum cpuacct_stat_index idx, cputime_t val) {}
1434#endif
1435
1436static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1437{
1438 update_load_add(&rq->load, load);
1439}
1440
1441static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1442{
1443 update_load_sub(&rq->load, load);
1444}
1445
1446#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1447typedef int (*tg_visitor)(struct task_group *, void *);
1448
1449
1450
1451
1452
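/*
 * Walk the task_group tree under RCU, calling @down when entering a node
 * and @up when leaving it for the last time; a non-zero return aborts
 * the walk.
 */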
1453static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1454{
1455 struct task_group *parent, *child;
1456 int ret;
1457
1458 rcu_read_lock();
1459 parent = &root_task_group;
1460down:
1461 ret = (*down)(parent, data);
1462 if (ret)
1463 goto out_unlock;
1464 list_for_each_entry_rcu(child, &parent->children, siblings) {
1465 parent = child;
1466 goto down;
1467
1468up:
1469 continue;
1470 }
1471 ret = (*up)(parent, data);
1472 if (ret)
1473 goto out_unlock;
1474
1475 child = parent;
1476 parent = parent->parent;
1477 if (parent)
1478 goto up;
1479out_unlock:
1480 rcu_read_unlock();
1481
1482 return ret;
1483}
1484
1485static int tg_nop(struct task_group *tg, void *data)
1486{
1487 return 0;
1488}
1489#endif
1490
1491#ifdef CONFIG_SMP
1492
1493static unsigned long weighted_cpuload(const int cpu)
1494{
1495 return cpu_rq(cpu)->load.weight;
1496}
1497
1498
1499
1500
1501
1502
1503
1504
1505static unsigned long source_load(int cpu, int type)
1506{
1507 struct rq *rq = cpu_rq(cpu);
1508 unsigned long total = weighted_cpuload(cpu);
1509
1510 if (type == 0 || !sched_feat(LB_BIAS))
1511 return total;
1512
1513 return min(rq->cpu_load[type-1], total);
1514}
1515
1516
1517
1518
1519
1520static unsigned long target_load(int cpu, int type)
1521{
1522 struct rq *rq = cpu_rq(cpu);
1523 unsigned long total = weighted_cpuload(cpu);
1524
1525 if (type == 0 || !sched_feat(LB_BIAS))
1526 return total;
1527
1528 return max(rq->cpu_load[type-1], total);
1529}
1530
1531static struct sched_group *group_of(int cpu)
1532{
1533 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1534
1535 if (!sd)
1536 return NULL;
1537
1538 return sd->groups;
1539}
1540
1541static unsigned long power_of(int cpu)
1542{
1543 struct sched_group *group = group_of(cpu);
1544
1545 if (!group)
1546 return SCHED_LOAD_SCALE;
1547
1548 return group->cpu_power;
1549}
1550
1551static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1552
1553static unsigned long cpu_avg_load_per_task(int cpu)
1554{
1555 struct rq *rq = cpu_rq(cpu);
1556 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1557
1558 if (nr_running)
1559 rq->avg_load_per_task = rq->load.weight / nr_running;
1560 else
1561 rq->avg_load_per_task = 0;
1562
1563 return rq->avg_load_per_task;
1564}
1565
1566#ifdef CONFIG_FAIR_GROUP_SCHED
1567
1568static __read_mostly unsigned long *update_shares_data;
1569
1570static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1571
1572
1573
1574
1575static void update_group_shares_cpu(struct task_group *tg, int cpu,
1576 unsigned long sd_shares,
1577 unsigned long sd_rq_weight,
1578 unsigned long *usd_rq_weight)
1579{
1580 unsigned long shares, rq_weight;
1581 int boost = 0;
1582
1583 rq_weight = usd_rq_weight[cpu];
1584 if (!rq_weight) {
1585 boost = 1;
1586 rq_weight = NICE_0_LOAD;
1587 }
1588
1589
1590
1591
1592
1593
1594 shares = (sd_shares * rq_weight) / sd_rq_weight;
1595 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1596
1597 if (abs(shares - tg->se[cpu]->load.weight) >
1598 sysctl_sched_shares_thresh) {
1599 struct rq *rq = cpu_rq(cpu);
1600 unsigned long flags;
1601
1602 spin_lock_irqsave(&rq->lock, flags);
1603 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1604 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1605 __set_se_shares(tg->se[cpu], shares);
1606 spin_unlock_irqrestore(&rq->lock, flags);
1607 }
1608}
1609
1610
1611
1612
1613
1614
1615static int tg_shares_up(struct task_group *tg, void *data)
1616{
1617 unsigned long weight, rq_weight = 0, shares = 0;
1618 unsigned long *usd_rq_weight;
1619 struct sched_domain *sd = data;
1620 unsigned long flags;
1621 int i;
1622
1623 if (!tg->se[0])
1624 return 0;
1625
1626 local_irq_save(flags);
1627 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1628
1629 for_each_cpu(i, sched_domain_span(sd)) {
1630 weight = tg->cfs_rq[i]->load.weight;
1631 usd_rq_weight[i] = weight;
1632
1633
1634
1635
1636
1637
1638 if (!weight)
1639 weight = NICE_0_LOAD;
1640
1641 rq_weight += weight;
1642 shares += tg->cfs_rq[i]->shares;
1643 }
1644
1645 if ((!shares && rq_weight) || shares > tg->shares)
1646 shares = tg->shares;
1647
1648 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1649 shares = tg->shares;
1650
1651 for_each_cpu(i, sched_domain_span(sd))
1652 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1653
1654 local_irq_restore(flags);
1655
1656 return 0;
1657}
1658
1659
1660
1661
1662
1663
1664static int tg_load_down(struct task_group *tg, void *data)
1665{
1666 unsigned long load;
1667 long cpu = (long)data;
1668
1669 if (!tg->parent) {
1670 load = cpu_rq(cpu)->load.weight;
1671 } else {
1672 load = tg->parent->cfs_rq[cpu]->h_load;
1673 load *= tg->cfs_rq[cpu]->shares;
1674 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1675 }
1676
1677 tg->cfs_rq[cpu]->h_load = load;
1678
1679 return 0;
1680}
1681
1682static void update_shares(struct sched_domain *sd)
1683{
1684 s64 elapsed;
1685 u64 now;
1686
1687 if (root_task_group_empty())
1688 return;
1689
1690 now = cpu_clock(raw_smp_processor_id());
1691 elapsed = now - sd->last_update;
1692
1693 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1694 sd->last_update = now;
1695 walk_tg_tree(tg_nop, tg_shares_up, sd);
1696 }
1697}
1698
1699static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1700{
1701 if (root_task_group_empty())
1702 return;
1703
1704 spin_unlock(&rq->lock);
1705 update_shares(sd);
1706 spin_lock(&rq->lock);
1707}
1708
1709static void update_h_load(long cpu)
1710{
1711 if (root_task_group_empty())
1712 return;
1713
1714 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1715}
1716
1717#else
1718
1719static inline void update_shares(struct sched_domain *sd)
1720{
1721}
1722
1723static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1724{
1725}
1726
1727#endif
1728
1729#ifdef CONFIG_PREEMPT
1730
1731static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1742 __releases(this_rq->lock)
1743 __acquires(busiest->lock)
1744 __acquires(this_rq->lock)
1745{
1746 spin_unlock(&this_rq->lock);
1747 double_rq_lock(this_rq, busiest);
1748
1749 return 1;
1750}
1751
1752#else
1753
1754
1755
1756
1757
1758
1759
1760static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1761 __releases(this_rq->lock)
1762 __acquires(busiest->lock)
1763 __acquires(this_rq->lock)
1764{
1765 int ret = 0;
1766
1767 if (unlikely(!spin_trylock(&busiest->lock))) {
1768 if (busiest < this_rq) {
1769 spin_unlock(&this_rq->lock);
1770 spin_lock(&busiest->lock);
1771 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1772 ret = 1;
1773 } else
1774 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1775 }
1776 return ret;
1777}
1778
1779#endif
1780
1781
1782
1783
1784static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1785{
1786 if (unlikely(!irqs_disabled())) {
1787
1788 spin_unlock(&this_rq->lock);
1789 BUG_ON(1);
1790 }
1791
1792 return _double_lock_balance(this_rq, busiest);
1793}
1794
1795static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1796 __releases(busiest->lock)
1797{
1798 spin_unlock(&busiest->lock);
1799 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1800}
1801#endif
1802
1803#ifdef CONFIG_FAIR_GROUP_SCHED
1804static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1805{
1806#ifdef CONFIG_SMP
1807 cfs_rq->shares = shares;
1808#endif
1809}
1810#endif
1811
1812static void calc_load_account_active(struct rq *this_rq);
1813
1814#include "sched_stats.h"
1815#include "sched_idletask.c"
1816#include "sched_fair.c"
1817#include "sched_rt.c"
1818#ifdef CONFIG_SCHED_DEBUG
1819# include "sched_debug.c"
1820#endif
1821
1822#define sched_class_highest (&rt_sched_class)
1823#define for_each_class(class) \
1824 for (class = sched_class_highest; class; class = class->next)
1825
1826static void inc_nr_running(struct rq *rq)
1827{
1828 rq->nr_running++;
1829}
1830
1831static void dec_nr_running(struct rq *rq)
1832{
1833 rq->nr_running--;
1834}
1835
1836static void set_load_weight(struct task_struct *p)
1837{
1838 if (task_has_rt_policy(p)) {
1839 p->se.load.weight = prio_to_weight[0] * 2;
1840 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1841 return;
1842 }
1843
1844
1845
1846
1847 if (p->policy == SCHED_IDLE) {
1848 p->se.load.weight = WEIGHT_IDLEPRIO;
1849 p->se.load.inv_weight = WMULT_IDLEPRIO;
1850 return;
1851 }
1852
1853 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1854 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1855}
1856
1857static void update_avg(u64 *avg, u64 sample)
1858{
1859 s64 diff = sample - *avg;
1860 *avg += diff >> 3;
1861}
1862
1863static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1864{
1865 if (wakeup)
1866 p->se.start_runtime = p->se.sum_exec_runtime;
1867
1868 sched_info_queued(p);
1869 p->sched_class->enqueue_task(rq, p, wakeup);
1870 p->se.on_rq = 1;
1871}
1872
1873static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1874{
1875 if (sleep) {
1876 if (p->se.last_wakeup) {
1877 update_avg(&p->se.avg_overlap,
1878 p->se.sum_exec_runtime - p->se.last_wakeup);
1879 p->se.last_wakeup = 0;
1880 } else {
1881 update_avg(&p->se.avg_wakeup,
1882 sysctl_sched_wakeup_granularity);
1883 }
1884 }
1885
1886 sched_info_dequeued(p);
1887 p->sched_class->dequeue_task(rq, p, sleep);
1888 p->se.on_rq = 0;
1889}
1890
1891
1892
1893
1894static inline int __normal_prio(struct task_struct *p)
1895{
1896 return p->static_prio;
1897}
1898
1899
1900
1901
1902
1903
1904
1905
1906static inline int normal_prio(struct task_struct *p)
1907{
1908 int prio;
1909
1910 if (task_has_rt_policy(p))
1911 prio = MAX_RT_PRIO-1 - p->rt_priority;
1912 else
1913 prio = __normal_prio(p);
1914 return prio;
1915}
1916
1917
1918
1919
1920
1921
1922
1923
1924static int effective_prio(struct task_struct *p)
1925{
1926 p->normal_prio = normal_prio(p);
1927
1928
1929
1930
1931
1932 if (!rt_prio(p->prio))
1933 return p->normal_prio;
1934 return p->prio;
1935}
1936
1937
1938
1939
1940static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1941{
1942 if (task_contributes_to_load(p))
1943 rq->nr_uninterruptible--;
1944
1945 enqueue_task(rq, p, wakeup);
1946 inc_nr_running(rq);
1947}
1948
1949
1950
1951
1952static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1953{
1954 if (task_contributes_to_load(p))
1955 rq->nr_uninterruptible++;
1956
1957 dequeue_task(rq, p, sleep);
1958 dec_nr_running(rq);
1959}
1960
1961
1962
1963
1964
1965inline int task_curr(const struct task_struct *p)
1966{
1967 return cpu_curr(task_cpu(p)) == p;
1968}
1969
1970static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1971{
1972 set_task_rq(p, cpu);
1973#ifdef CONFIG_SMP
1974
1975
1976
1977
1978
1979 smp_wmb();
1980 task_thread_info(p)->cpu = cpu;
1981#endif
1982}
1983
1984static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1985 const struct sched_class *prev_class,
1986 int oldprio, int running)
1987{
1988 if (prev_class != p->sched_class) {
1989 if (prev_class->switched_from)
1990 prev_class->switched_from(rq, p, running);
1991 p->sched_class->switched_to(rq, p, running);
1992 } else
1993 p->sched_class->prio_changed(rq, p, oldprio, running);
1994}
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008void kthread_bind(struct task_struct *p, unsigned int cpu)
2009{
2010 struct rq *rq = cpu_rq(cpu);
2011 unsigned long flags;
2012
2013
2014 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2015 WARN_ON(1);
2016 return;
2017 }
2018
2019 spin_lock_irqsave(&rq->lock, flags);
2020 set_task_cpu(p, cpu);
2021 p->cpus_allowed = cpumask_of_cpu(cpu);
2022 p->rt.nr_cpus_allowed = 1;
2023 p->flags |= PF_THREAD_BOUND;
2024 spin_unlock_irqrestore(&rq->lock, flags);
2025}
2026EXPORT_SYMBOL(kthread_bind);
2027
2028#ifdef CONFIG_SMP
2029
2030
2031
2032static int
2033task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2034{
2035 s64 delta;
2036
2037
2038
2039
2040 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2041 (&p->se == cfs_rq_of(&p->se)->next ||
2042 &p->se == cfs_rq_of(&p->se)->last))
2043 return 1;
2044
2045 if (p->sched_class != &fair_sched_class)
2046 return 0;
2047
2048 if (sysctl_sched_migration_cost == -1)
2049 return 1;
2050 if (sysctl_sched_migration_cost == 0)
2051 return 0;
2052
2053 delta = now - p->se.exec_start;
2054
2055 return delta < (s64)sysctl_sched_migration_cost;
2056}
2057
2058
2059void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2060{
2061 int old_cpu = task_cpu(p);
2062 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2063 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2064 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2065 u64 clock_offset;
2066
2067 clock_offset = old_rq->clock - new_rq->clock;
2068
2069 trace_sched_migrate_task(p, new_cpu);
2070
2071#ifdef CONFIG_SCHEDSTATS
2072 if (p->se.wait_start)
2073 p->se.wait_start -= clock_offset;
2074 if (p->se.sleep_start)
2075 p->se.sleep_start -= clock_offset;
2076 if (p->se.block_start)
2077 p->se.block_start -= clock_offset;
2078#endif
2079 if (old_cpu != new_cpu) {
2080 p->se.nr_migrations++;
2081 new_rq->nr_migrations_in++;
2082#ifdef CONFIG_SCHEDSTATS
2083 if (task_hot(p, old_rq->clock, NULL))
2084 schedstat_inc(p, se.nr_forced2_migrations);
2085#endif
2086 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2087 1, 1, NULL, 0);
2088 }
2089 p->se.vruntime -= old_cfsrq->min_vruntime -
2090 new_cfsrq->min_vruntime;
2091
2092 __set_task_cpu(p, new_cpu);
2093}
2094
2095struct migration_req {
2096 struct list_head list;
2097
2098 struct task_struct *task;
2099 int dest_cpu;
2100
2101 struct completion done;
2102};
2103
2104
2105
2106
2107
2108static int
2109migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2110{
2111 struct rq *rq = task_rq(p);
2112
2113
2114
2115
2116
2117 if (!p->se.on_rq && !task_running(rq, p)) {
2118 set_task_cpu(p, dest_cpu);
2119 return 0;
2120 }
2121
2122 init_completion(&req->done);
2123 req->task = p;
2124 req->dest_cpu = dest_cpu;
2125 list_add(&req->list, &rq->migration_queue);
2126
2127 return 1;
2128}
2129
2130
2131
2132
2133
2134
2135
2136void wait_task_context_switch(struct task_struct *p)
2137{
2138 unsigned long nvcsw, nivcsw, flags;
2139 int running;
2140 struct rq *rq;
2141
2142 nvcsw = p->nvcsw;
2143 nivcsw = p->nivcsw;
2144 for (;;) {
2145
2146
2147
2148
2149
2150
2151
2152
2153 rq = task_rq_lock(p, &flags);
2154 running = task_running(rq, p);
2155 task_rq_unlock(rq, &flags);
2156
2157 if (likely(!running))
2158 break;
2159
2160
2161
2162
2163
2164 if ((p->nvcsw - nvcsw) > 1)
2165 break;
2166 if ((p->nivcsw - nivcsw) > 1)
2167 break;
2168
2169 cpu_relax();
2170 }
2171}
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2190{
2191 unsigned long flags;
2192 int running, on_rq;
2193 unsigned long ncsw;
2194 struct rq *rq;
2195
2196 for (;;) {
2197
2198
2199
2200
2201
2202
2203 rq = task_rq(p);
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216 while (task_running(rq, p)) {
2217 if (match_state && unlikely(p->state != match_state))
2218 return 0;
2219 cpu_relax();
2220 }
2221
2222
2223
2224
2225
2226
2227 rq = task_rq_lock(p, &flags);
2228 trace_sched_wait_task(rq, p);
2229 running = task_running(rq, p);
2230 on_rq = p->se.on_rq;
2231 ncsw = 0;
2232 if (!match_state || p->state == match_state)
2233 ncsw = p->nvcsw | LONG_MIN;
2234 task_rq_unlock(rq, &flags);
2235
2236
2237
2238
2239 if (unlikely(!ncsw))
2240 break;
2241
2242
2243
2244
2245
2246
2247
2248 if (unlikely(running)) {
2249 cpu_relax();
2250 continue;
2251 }
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262 if (unlikely(on_rq)) {
2263 schedule_timeout_uninterruptible(1);
2264 continue;
2265 }
2266
2267
2268
2269
2270
2271
2272 break;
2273 }
2274
2275 return ncsw;
2276}
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291void kick_process(struct task_struct *p)
2292{
2293 int cpu;
2294
2295 preempt_disable();
2296 cpu = task_cpu(p);
2297 if ((cpu != smp_processor_id()) && task_curr(p))
2298 smp_send_reschedule(cpu);
2299 preempt_enable();
2300}
2301EXPORT_SYMBOL_GPL(kick_process);
2302#endif
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313void task_oncpu_function_call(struct task_struct *p,
2314 void (*func) (void *info), void *info)
2315{
2316 int cpu;
2317
2318 preempt_disable();
2319 cpu = task_cpu(p);
2320 if (task_curr(p))
2321 smp_call_function_single(cpu, func, info, 1);
2322 preempt_enable();
2323}
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
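/*
 * try_to_wake_up: if @p is in one of the @state sleep states, pick a
 * runqueue for it, activate it there and preempt the current task if
 * needed. Returns 1 if the task was woken, 0 if it was already runnable
 * or not in a matching state.
 */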
2339static int try_to_wake_up(struct task_struct *p, unsigned int state,
2340 int wake_flags)
2341{
2342 int cpu, orig_cpu, this_cpu, success = 0;
2343 unsigned long flags;
2344 struct rq *rq, *orig_rq;
2345
2346 if (!sched_feat(SYNC_WAKEUPS))
2347 wake_flags &= ~WF_SYNC;
2348
2349 this_cpu = get_cpu();
2350
2351 smp_wmb();
2352 rq = orig_rq = task_rq_lock(p, &flags);
2353 update_rq_clock(rq);
2354 if (!(p->state & state))
2355 goto out;
2356
2357 if (p->se.on_rq)
2358 goto out_running;
2359
2360 cpu = task_cpu(p);
2361 orig_cpu = cpu;
2362
2363#ifdef CONFIG_SMP
2364 if (unlikely(task_running(rq, p)))
2365 goto out_activate;
2366
2367
2368
2369
2370
2371
2372
2373 if (task_contributes_to_load(p))
2374 rq->nr_uninterruptible--;
2375 p->state = TASK_WAKING;
2376 task_rq_unlock(rq, &flags);
2377
2378 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2379 if (cpu != orig_cpu)
2380 set_task_cpu(p, cpu);
2381
2382 rq = task_rq_lock(p, &flags);
2383
2384 if (rq != orig_rq)
2385 update_rq_clock(rq);
2386
2387 WARN_ON(p->state != TASK_WAKING);
2388 cpu = task_cpu(p);
2389
2390#ifdef CONFIG_SCHEDSTATS
2391 schedstat_inc(rq, ttwu_count);
2392 if (cpu == this_cpu)
2393 schedstat_inc(rq, ttwu_local);
2394 else {
2395 struct sched_domain *sd;
2396 for_each_domain(this_cpu, sd) {
2397 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2398 schedstat_inc(sd, ttwu_wake_remote);
2399 break;
2400 }
2401 }
2402 }
2403#endif
2404
2405out_activate:
2406#endif
2407 schedstat_inc(p, se.nr_wakeups);
2408 if (wake_flags & WF_SYNC)
2409 schedstat_inc(p, se.nr_wakeups_sync);
2410 if (orig_cpu != cpu)
2411 schedstat_inc(p, se.nr_wakeups_migrate);
2412 if (cpu == this_cpu)
2413 schedstat_inc(p, se.nr_wakeups_local);
2414 else
2415 schedstat_inc(p, se.nr_wakeups_remote);
2416 activate_task(rq, p, 1);
2417 success = 1;
2418
2419
2420
2421
2422 if (!in_interrupt()) {
 struct sched_entity *se = &current->se;
2424 u64 sample = se->sum_exec_runtime;
2425
2426 if (se->last_wakeup)
2427 sample -= se->last_wakeup;
2428 else
2429 sample -= se->start_runtime;
2430 update_avg(&se->avg_wakeup, sample);
2431
2432 se->last_wakeup = se->sum_exec_runtime;
2433 }
2434
2435out_running:
2436 trace_sched_wakeup(rq, p, success);
2437 check_preempt_curr(rq, p, wake_flags);
2438
2439 p->state = TASK_RUNNING;
2440#ifdef CONFIG_SMP
2441 if (p->sched_class->task_wake_up)
2442 p->sched_class->task_wake_up(rq, p);
2443#endif
2444out:
2445 task_rq_unlock(rq, &flags);
2446 put_cpu();
2447
2448 return success;
2449}
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462int wake_up_process(struct task_struct *p)
2463{
2464 return try_to_wake_up(p, TASK_ALL, 0);
2465}
2466EXPORT_SYMBOL(wake_up_process);
2467
2468int wake_up_state(struct task_struct *p, unsigned int state)
2469{
2470 return try_to_wake_up(p, state, 0);
2471}
2472
2473
2474
2475
2476
2477
2478
2479static void __sched_fork(struct task_struct *p)
2480{
2481 p->se.exec_start = 0;
2482 p->se.sum_exec_runtime = 0;
2483 p->se.prev_sum_exec_runtime = 0;
2484 p->se.nr_migrations = 0;
2485 p->se.last_wakeup = 0;
2486 p->se.avg_overlap = 0;
2487 p->se.start_runtime = 0;
2488 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2489 p->se.avg_running = 0;
2490
2491#ifdef CONFIG_SCHEDSTATS
2492 p->se.wait_start = 0;
2493 p->se.wait_max = 0;
2494 p->se.wait_count = 0;
2495 p->se.wait_sum = 0;
2496
2497 p->se.sleep_start = 0;
2498 p->se.sleep_max = 0;
2499 p->se.sum_sleep_runtime = 0;
2500
2501 p->se.block_start = 0;
2502 p->se.block_max = 0;
2503 p->se.exec_max = 0;
2504 p->se.slice_max = 0;
2505
2506 p->se.nr_migrations_cold = 0;
2507 p->se.nr_failed_migrations_affine = 0;
2508 p->se.nr_failed_migrations_running = 0;
2509 p->se.nr_failed_migrations_hot = 0;
2510 p->se.nr_forced_migrations = 0;
2511 p->se.nr_forced2_migrations = 0;
2512
2513 p->se.nr_wakeups = 0;
2514 p->se.nr_wakeups_sync = 0;
2515 p->se.nr_wakeups_migrate = 0;
2516 p->se.nr_wakeups_local = 0;
2517 p->se.nr_wakeups_remote = 0;
2518 p->se.nr_wakeups_affine = 0;
2519 p->se.nr_wakeups_affine_attempts = 0;
2520 p->se.nr_wakeups_passive = 0;
2521 p->se.nr_wakeups_idle = 0;
2522
2523#endif
2524
2525 INIT_LIST_HEAD(&p->rt.run_list);
2526 p->se.on_rq = 0;
2527 INIT_LIST_HEAD(&p->se.group_node);
2528
2529#ifdef CONFIG_PREEMPT_NOTIFIERS
2530 INIT_HLIST_HEAD(&p->preempt_notifiers);
2531#endif
2532
2533
2534
2535
2536
2537
2538
2539 p->state = TASK_RUNNING;
2540}
2541
2542
2543
2544
2545void sched_fork(struct task_struct *p, int clone_flags)
2546{
2547 int cpu = get_cpu();
2548
2549 __sched_fork(p);
2550
2551
2552
2553
2554 if (unlikely(p->sched_reset_on_fork)) {
2555 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2556 p->policy = SCHED_NORMAL;
2557 p->normal_prio = p->static_prio;
2558 }
2559
2560 if (PRIO_TO_NICE(p->static_prio) < 0) {
2561 p->static_prio = NICE_TO_PRIO(0);
2562 p->normal_prio = p->static_prio;
2563 set_load_weight(p);
2564 }
2565
2566
2567
2568
2569
2570 p->sched_reset_on_fork = 0;
2571 }
2572
2573
2574
2575
2576 p->prio = current->normal_prio;
2577
2578 if (!rt_prio(p->prio))
2579 p->sched_class = &fair_sched_class;
2580
2581#ifdef CONFIG_SMP
2582 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2583#endif
2584 set_task_cpu(p, cpu);
2585
2586#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2587 if (likely(sched_info_on()))
2588 memset(&p->sched_info, 0, sizeof(p->sched_info));
2589#endif
2590#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2591 p->oncpu = 0;
2592#endif
2593#ifdef CONFIG_PREEMPT
2594
2595 task_thread_info(p)->preempt_count = 1;
2596#endif
2597 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2598
2599 put_cpu();
2600}
2601
2602
2603
2604
2605
2606
2607
2608
2609void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2610{
2611 unsigned long flags;
2612 struct rq *rq;
2613
2614 rq = task_rq_lock(p, &flags);
2615 BUG_ON(p->state != TASK_RUNNING);
2616 update_rq_clock(rq);
2617
2618 if (!p->sched_class->task_new || !current->se.on_rq) {
2619 activate_task(rq, p, 0);
2620 } else {
2621
2622
2623
2624
2625 p->sched_class->task_new(rq, p);
2626 inc_nr_running(rq);
2627 }
2628 trace_sched_wakeup_new(rq, p, 1);
2629 check_preempt_curr(rq, p, WF_FORK);
2630#ifdef CONFIG_SMP
2631 if (p->sched_class->task_wake_up)
2632 p->sched_class->task_wake_up(rq, p);
2633#endif
2634 task_rq_unlock(rq, &flags);
2635}
2636
2637#ifdef CONFIG_PREEMPT_NOTIFIERS
2638
2639
2640
2641
2642
2643void preempt_notifier_register(struct preempt_notifier *notifier)
2644{
 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2646}
2647EXPORT_SYMBOL_GPL(preempt_notifier_register);
2648
2649
2650
2651
2652
2653
2654
2655void preempt_notifier_unregister(struct preempt_notifier *notifier)
2656{
 hlist_del(&notifier->link);
2658}
2659EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2660
2661static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2662{
2663 struct preempt_notifier *notifier;
2664 struct hlist_node *node;
2665
2666 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2667 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2668}
2669
2670static void
2671fire_sched_out_preempt_notifiers(struct task_struct *curr,
2672 struct task_struct *next)
2673{
2674 struct preempt_notifier *notifier;
2675 struct hlist_node *node;
2676
2677 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2678 notifier->ops->sched_out(notifier, next);
2679}
2680
2681#else
2682
2683static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2684{
2685}
2686
2687static void
2688fire_sched_out_preempt_notifiers(struct task_struct *curr,
2689 struct task_struct *next)
2690{
2691}
2692
2693#endif
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
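/*
 * prepare_task_switch: called with the runqueue lock held and interrupts
 * disabled, just before switching from @prev to @next; paired with
 * finish_task_switch() after the switch completes.
 */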
2708static inline void
2709prepare_task_switch(struct rq *rq, struct task_struct *prev,
2710 struct task_struct *next)
2711{
2712 fire_sched_out_preempt_notifiers(prev, next);
2713 prepare_lock_switch(rq, next);
2714 prepare_arch_switch(next);
2715}
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2733 __releases(rq->lock)
2734{
2735 struct mm_struct *mm = rq->prev_mm;
2736 long prev_state;
2737
2738 rq->prev_mm = NULL;
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751 prev_state = prev->state;
2752 finish_arch_switch(prev);
2753 perf_event_task_sched_in(current, cpu_of(rq));
2754 finish_lock_switch(rq, prev);
2755
2756 fire_sched_in_preempt_notifiers(current);
2757 if (mm)
2758 mmdrop(mm);
2759 if (unlikely(prev_state == TASK_DEAD)) {
2760
2761
2762
2763
2764 kprobe_flush_task(prev);
2765 put_task_struct(prev);
2766 }
2767}
2768
2769#ifdef CONFIG_SMP
2770
2771
2772static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2773{
2774 if (prev->sched_class->pre_schedule)
2775 prev->sched_class->pre_schedule(rq, prev);
2776}
2777
2778
2779static inline void post_schedule(struct rq *rq)
2780{
2781 if (rq->post_schedule) {
2782 unsigned long flags;
2783
2784 spin_lock_irqsave(&rq->lock, flags);
2785 if (rq->curr->sched_class->post_schedule)
2786 rq->curr->sched_class->post_schedule(rq);
2787 spin_unlock_irqrestore(&rq->lock, flags);
2788
2789 rq->post_schedule = 0;
2790 }
2791}
2792
2793#else
2794
2795static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2796{
2797}
2798
2799static inline void post_schedule(struct rq *rq)
2800{
2801}
2802
2803#endif
2804
2805
2806
2807
2808
2809asmlinkage void schedule_tail(struct task_struct *prev)
2810 __releases(rq->lock)
2811{
2812 struct rq *rq = this_rq();
2813
2814 finish_task_switch(rq, prev);
2815
2816
2817
2818
2819
2820 post_schedule(rq);
2821
2822#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2823
2824 preempt_enable();
2825#endif
2826 if (current->set_child_tid)
2827 put_user(task_pid_vnr(current), current->set_child_tid);
2828}
2829
2830
2831
2832
2833
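/*
 * context_switch: switch to @next's mm (kernel threads borrow the old
 * active mm) and then to @next's register state via switch_to().
 */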
2834static inline void
2835context_switch(struct rq *rq, struct task_struct *prev,
2836 struct task_struct *next)
2837{
2838 struct mm_struct *mm, *oldmm;
2839
2840 prepare_task_switch(rq, prev, next);
2841 trace_sched_switch(rq, prev, next);
2842 mm = next->mm;
2843 oldmm = prev->active_mm;
2844
2845
2846
2847
2848
2849 arch_start_context_switch(prev);
2850
2851 if (unlikely(!mm)) {
2852 next->active_mm = oldmm;
2853 atomic_inc(&oldmm->mm_count);
2854 enter_lazy_tlb(oldmm, next);
2855 } else
2856 switch_mm(oldmm, mm, next);
2857
2858 if (unlikely(!prev->mm)) {
2859 prev->active_mm = NULL;
2860 rq->prev_mm = oldmm;
2861 }
2862
2863
2864
2865
2866
2867
2868#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2869 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2870#endif
2871
2872
2873 switch_to(prev, next, prev);
2874
2875 barrier();
2876
2877
2878
2879
2880
2881 finish_task_switch(this_rq(), prev);
2882}
2883
2884
2885
2886
2887
2888
2889
2890
2891unsigned long nr_running(void)
2892{
2893 unsigned long i, sum = 0;
2894
2895 for_each_online_cpu(i)
2896 sum += cpu_rq(i)->nr_running;
2897
2898 return sum;
2899}
2900
2901unsigned long nr_uninterruptible(void)
2902{
2903 unsigned long i, sum = 0;
2904
2905 for_each_possible_cpu(i)
2906 sum += cpu_rq(i)->nr_uninterruptible;
2907
2908
2909
2910
2911
2912 if (unlikely((long)sum < 0))
2913 sum = 0;
2914
2915 return sum;
2916}
2917
2918unsigned long long nr_context_switches(void)
2919{
2920 int i;
2921 unsigned long long sum = 0;
2922
2923 for_each_possible_cpu(i)
2924 sum += cpu_rq(i)->nr_switches;
2925
2926 return sum;
2927}
2928
2929unsigned long nr_iowait(void)
2930{
2931 unsigned long i, sum = 0;
2932
2933 for_each_possible_cpu(i)
2934 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2935
2936 return sum;
2937}
2938
2939unsigned long nr_iowait_cpu(void)
2940{
2941 struct rq *this = this_rq();
2942 return atomic_read(&this->nr_iowait);
2943}
2944
2945unsigned long this_cpu_load(void)
2946{
2947 struct rq *this = this_rq();
2948 return this->cpu_load[0];
2949}
2950
2951
2952
2953static atomic_long_t calc_load_tasks;
2954static unsigned long calc_load_update;
2955unsigned long avenrun[3];
2956EXPORT_SYMBOL(avenrun);
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2967{
2968 loads[0] = (avenrun[0] + offset) << shift;
2969 loads[1] = (avenrun[1] + offset) << shift;
2970 loads[2] = (avenrun[2] + offset) << shift;
2971}
2972
2973static unsigned long
2974calc_load(unsigned long load, unsigned long exp, unsigned long active)
2975{
2976 load *= exp;
2977 load += active * (FIXED_1 - exp);
2978 return load >> FSHIFT;
2979}
2980
2981
2982
2983
2984
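/*
 * calc_global_load - update the global avenrun[] load estimates.
 *
 * Runs once per LOAD_FREQ interval, 10 ticks after the boundary so the
 * per-cpu calc_load_tasks contributions have been folded in. Each entry
 * decays as avenrun[n] = avenrun[n]*exp_n + active*(FIXED_1 - exp_n),
 * in FSHIFT fixed point (see calc_load() above).
 */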
2985void calc_global_load(void)
2986{
2987 unsigned long upd = calc_load_update + 10;
2988 long active;
2989
2990 if (time_before(jiffies, upd))
2991 return;
2992
2993 active = atomic_long_read(&calc_load_tasks);
2994 active = active > 0 ? active * FIXED_1 : 0;
2995
2996 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2997 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2998 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2999
3000 calc_load_update += LOAD_FREQ;
3001}
3002
3003
3004
3005
3006static void calc_load_account_active(struct rq *this_rq)
3007{
3008 long nr_active, delta;
3009
3010 nr_active = this_rq->nr_running;
3011 nr_active += (long) this_rq->nr_uninterruptible;
3012
3013 if (nr_active != this_rq->calc_load_active) {
3014 delta = nr_active - this_rq->calc_load_active;
3015 this_rq->calc_load_active = nr_active;
3016 atomic_long_add(delta, &calc_load_tasks);
3017 }
3018}
3019
3020
3021
3022
3023
3024u64 cpu_nr_migrations(int cpu)
3025{
3026 return cpu_rq(cpu)->nr_migrations_in;
3027}
3028
3029
3030
3031
3032
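/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick. Each cpu_load[i] is a decaying average with progressively
 * longer history: index 0 tracks the instantaneous load, higher indexes
 * respond more slowly (the new load is weighted by 1/2^i).
 */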
3033static void update_cpu_load(struct rq *this_rq)
3034{
3035 unsigned long this_load = this_rq->load.weight;
3036 int i, scale;
3037
3038 this_rq->nr_load_updates++;
3039
3040
3041 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3042 unsigned long old_load, new_load;
3043
3044
3045
3046 old_load = this_rq->cpu_load[i];
3047 new_load = this_load;
3048
3049
3050
3051
3052
3053 if (new_load > old_load)
3054 new_load += scale-1;
3055 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3056 }
3057
3058 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3059 this_rq->calc_load_update += LOAD_FREQ;
3060 calc_load_account_active(this_rq);
3061 }
3062}
3063
3064#ifdef CONFIG_SMP
3065
3066
3067
3068
3069
3070
3071
3072static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3073 __acquires(rq1->lock)
3074 __acquires(rq2->lock)
3075{
3076 BUG_ON(!irqs_disabled());
3077 if (rq1 == rq2) {
3078 spin_lock(&rq1->lock);
3079 __acquire(rq2->lock);
3080 } else {
3081 if (rq1 < rq2) {
3082 spin_lock(&rq1->lock);
3083 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3084 } else {
3085 spin_lock(&rq2->lock);
3086 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3087 }
3088 }
3089 update_rq_clock(rq1);
3090 update_rq_clock(rq2);
3091}
3092
3093
3094
3095
3096
3097
3098
3099static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3100 __releases(rq1->lock)
3101 __releases(rq2->lock)
3102{
3103 spin_unlock(&rq1->lock);
3104 if (rq1 != rq2)
3105 spin_unlock(&rq2->lock);
3106 else
3107 __release(rq2->lock);
3108}
3109
3110
3111
3112
3113
3114
3115
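/*
 * sched_migrate_task - move @p to @dest_cpu, provided that cpu is in the
 * task's cpus_allowed mask and is active. The work is handed off to the
 * source runqueue's migration thread via a migration_req, and we block
 * until the request has completed.
 */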
3116static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3117{
3118 struct migration_req req;
3119 unsigned long flags;
3120 struct rq *rq;
3121
3122 rq = task_rq_lock(p, &flags);
3123 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3124 || unlikely(!cpu_active(dest_cpu)))
3125 goto out;
3126
3127
3128 if (migrate_task(p, dest_cpu, &req)) {
3129
3130 struct task_struct *mt = rq->migration_thread;
3131
3132 get_task_struct(mt);
3133 task_rq_unlock(rq, &flags);
3134 wake_up_process(mt);
3135 put_task_struct(mt);
3136 wait_for_completion(&req.done);
3137
3138 return;
3139 }
3140out:
3141 task_rq_unlock(rq, &flags);
3142}
3143
3144
3145
3146
3147
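/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */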
3148void sched_exec(void)
3149{
3150 int new_cpu, this_cpu = get_cpu();
3151 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3152 put_cpu();
3153 if (new_cpu != this_cpu)
3154 sched_migrate_task(current, new_cpu);
3155}
3156
3157
3158
3159
3160
3161static void pull_task(struct rq *src_rq, struct task_struct *p,
3162 struct rq *this_rq, int this_cpu)
3163{
3164 deactivate_task(src_rq, p, 0);
3165 set_task_cpu(p, this_cpu);
3166 activate_task(this_rq, p, 0);
3167
3168
3169
3170
3171 check_preempt_curr(this_rq, p, 0);
3172}
3173
3174
3175
3176
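/*
 * can_migrate_task - may task @p from runqueue @rq be migrated to @this_cpu?
 *
 * A task cannot be pulled if it is not allowed on this_cpu, if it is
 * currently running, or if it is still considered cache hot on its CPU
 * (unless the domain has repeatedly failed to balance).
 */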
3177static
3178int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3179 struct sched_domain *sd, enum cpu_idle_type idle,
3180 int *all_pinned)
3181{
3182 int tsk_cache_hot = 0;
3183
3184
3185
3186
3187
3188
3189 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3190 schedstat_inc(p, se.nr_failed_migrations_affine);
3191 return 0;
3192 }
3193 *all_pinned = 0;
3194
3195 if (task_running(rq, p)) {
3196 schedstat_inc(p, se.nr_failed_migrations_running);
3197 return 0;
3198 }
3199
3200
3201
3202
3203
3204
3205
3206 tsk_cache_hot = task_hot(p, rq->clock, sd);
3207 if (!tsk_cache_hot ||
3208 sd->nr_balance_failed > sd->cache_nice_tries) {
3209#ifdef CONFIG_SCHEDSTATS
3210 if (tsk_cache_hot) {
3211 schedstat_inc(sd, lb_hot_gained[idle]);
3212 schedstat_inc(p, se.nr_forced_migrations);
3213 }
3214#endif
3215 return 1;
3216 }
3217
3218 if (tsk_cache_hot) {
3219 schedstat_inc(p, se.nr_failed_migrations_hot);
3220 return 0;
3221 }
3222 return 1;
3223}
3224
3225static unsigned long
3226balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3227 unsigned long max_load_move, struct sched_domain *sd,
3228 enum cpu_idle_type idle, int *all_pinned,
3229 int *this_best_prio, struct rq_iterator *iterator)
3230{
3231 int loops = 0, pulled = 0, pinned = 0;
3232 struct task_struct *p;
3233 long rem_load_move = max_load_move;
3234
3235 if (max_load_move == 0)
3236 goto out;
3237
3238 pinned = 1;
3239
3240
3241
3242
3243 p = iterator->start(iterator->arg);
3244next:
3245 if (!p || loops++ > sysctl_sched_nr_migrate)
3246 goto out;
3247
3248 if ((p->se.load.weight >> 1) > rem_load_move ||
3249 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3250 p = iterator->next(iterator->arg);
3251 goto next;
3252 }
3253
3254 pull_task(busiest, p, this_rq, this_cpu);
3255 pulled++;
3256 rem_load_move -= p->se.load.weight;
3257
3258#ifdef CONFIG_PREEMPT
3259
3260
3261
3262
3263
3264 if (idle == CPU_NEWLY_IDLE)
3265 goto out;
3266#endif
3267
3268
3269
3270
3271 if (rem_load_move > 0) {
3272 if (p->prio < *this_best_prio)
3273 *this_best_prio = p->prio;
3274 p = iterator->next(iterator->arg);
3275 goto next;
3276 }
3277out:
3278
3279
3280
3281
3282
3283 schedstat_add(sd, lb_gained[idle], pulled);
3284
3285 if (all_pinned)
3286 *all_pinned = pinned;
3287
3288 return max_load_move - rem_load_move;
3289}
3290
3291
3292
3293
3294
3295
3296
3297
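/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */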
3298static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3299 unsigned long max_load_move,
3300 struct sched_domain *sd, enum cpu_idle_type idle,
3301 int *all_pinned)
3302{
3303 const struct sched_class *class = sched_class_highest;
3304 unsigned long total_load_moved = 0;
3305 int this_best_prio = this_rq->curr->prio;
3306
3307 do {
3308 total_load_moved +=
3309 class->load_balance(this_rq, this_cpu, busiest,
3310 max_load_move - total_load_moved,
3311 sd, idle, all_pinned, &this_best_prio);
3312 class = class->next;
3313
3314#ifdef CONFIG_PREEMPT
3315
3316
3317
3318
3319
3320 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3321 break;
3322#endif
3323 } while (class && max_load_move > total_load_moved);
3324
3325 return total_load_moved > 0;
3326}
3327
3328static int
3329iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3330 struct sched_domain *sd, enum cpu_idle_type idle,
3331 struct rq_iterator *iterator)
3332{
3333 struct task_struct *p = iterator->start(iterator->arg);
3334 int pinned = 0;
3335
3336 while (p) {
3337 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3338 pull_task(busiest, p, this_rq, this_cpu);
3339
3340
3341
3342
3343
3344 schedstat_inc(sd, lb_gained[idle]);
3345
3346 return 1;
3347 }
3348 p = iterator->next(iterator->arg);
3349 }
3350
3351 return 0;
3352}
3353
3354
3355
3356
3357
3358
3359
3360
3361static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3362 struct sched_domain *sd, enum cpu_idle_type idle)
3363{
3364 const struct sched_class *class;
3365
3366 for_each_class(class) {
3367 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3368 return 1;
3369 }
3370
3371 return 0;
3372}
3373
3374
3375
3376
3377
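/*
 * sd_lb_stats - statistics of a sched_domain gathered during load balancing:
 * domain-wide totals plus the local ("this") and busiest group details.
 */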
3378struct sd_lb_stats {
3379 struct sched_group *busiest;
3380 struct sched_group *this;
3381 unsigned long total_load;
3382 unsigned long total_pwr;
3383 unsigned long avg_load;
3384
3385
3386 unsigned long this_load;
3387 unsigned long this_load_per_task;
3388 unsigned long this_nr_running;
3389
3390
3391 unsigned long max_load;
3392 unsigned long busiest_load_per_task;
3393 unsigned long busiest_nr_running;
3394
3395 int group_imb;
3396#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3397 int power_savings_balance;
3398 struct sched_group *group_min;
3399 struct sched_group *group_leader;
3400 unsigned long min_load_per_task;
3401 unsigned long leader_nr_running;
3402 unsigned long min_nr_running;
3403#endif
3404};
3405
3406
3407
3408
3409struct sg_lb_stats {
3410 unsigned long avg_load;
3411 unsigned long group_load;
3412 unsigned long sum_nr_running;
3413 unsigned long sum_weighted_load;
3414 unsigned long group_capacity;
3415 int group_imb;
3416};
3417
3418
3419
3420
3421
3422static inline unsigned int group_first_cpu(struct sched_group *group)
3423{
3424 return cpumask_first(sched_group_cpus(group));
3425}
3426
3427
3428
3429
3430
3431
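/*
 * get_sd_load_idx - Obtain the load index for a given sched domain,
 * depending on whether the balancing cpu is busy, newly idle or idle.
 */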
3432static inline int get_sd_load_idx(struct sched_domain *sd,
3433 enum cpu_idle_type idle)
3434{
3435 int load_idx;
3436
3437 switch (idle) {
3438 case CPU_NOT_IDLE:
3439 load_idx = sd->busy_idx;
3440 break;
3441
3442 case CPU_NEWLY_IDLE:
3443 load_idx = sd->newidle_idx;
3444 break;
3445 default:
3446 load_idx = sd->idle_idx;
3447 break;
3448 }
3449
3450 return load_idx;
3451}
3452
3453
3454#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3455
3456
3457
3458
3459
3460
3461
3462
3463static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3464 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3465{
3466
3467
3468
3469
3470 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3471 sds->power_savings_balance = 0;
3472 else {
3473 sds->power_savings_balance = 1;
3474 sds->min_nr_running = ULONG_MAX;
3475 sds->leader_nr_running = 0;
3476 }
3477}
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489static inline void update_sd_power_savings_stats(struct sched_group *group,
3490 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3491{
3492
3493 if (!sds->power_savings_balance)
3494 return;
3495
3496
3497
3498
3499
3500 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3501 !sds->this_nr_running))
3502 sds->power_savings_balance = 0;
3503
3504
3505
3506
3507
3508 if (!sds->power_savings_balance ||
3509 sgs->sum_nr_running >= sgs->group_capacity ||
3510 !sgs->sum_nr_running)
3511 return;
3512
3513
3514
3515
3516
3517
3518 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3519 (sgs->sum_nr_running == sds->min_nr_running &&
3520 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3521 sds->group_min = group;
3522 sds->min_nr_running = sgs->sum_nr_running;
3523 sds->min_load_per_task = sgs->sum_weighted_load /
3524 sgs->sum_nr_running;
3525 }
3526
3527
3528
3529
3530
3531
3532 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3533 return;
3534
3535 if (sgs->sum_nr_running > sds->leader_nr_running ||
3536 (sgs->sum_nr_running == sds->leader_nr_running &&
3537 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3538 sds->group_leader = group;
3539 sds->leader_nr_running = sgs->sum_nr_running;
3540 }
3541}
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3559 int this_cpu, unsigned long *imbalance)
3560{
3561 if (!sds->power_savings_balance)
3562 return 0;
3563
3564 if (sds->this != sds->group_leader ||
3565 sds->group_leader == sds->group_min)
3566 return 0;
3567
3568 *imbalance = sds->min_load_per_task;
3569 sds->busiest = sds->group_min;
3570
3571 return 1;
3572
3573}
3574#else
3575static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3576 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3577{
3578 return;
3579}
3580
3581static inline void update_sd_power_savings_stats(struct sched_group *group,
3582 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3583{
3584 return;
3585}
3586
3587static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3588 int this_cpu, unsigned long *imbalance)
3589{
3590 return 0;
3591}
3592#endif
3593
3594
3595unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3596{
3597 return SCHED_LOAD_SCALE;
3598}
3599
3600unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3601{
3602 return default_scale_freq_power(sd, cpu);
3603}
3604
3605unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3606{
3607 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3608 unsigned long smt_gain = sd->smt_gain;
3609
3610 smt_gain /= weight;
3611
3612 return smt_gain;
3613}
3614
3615unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3616{
3617 return default_scale_smt_power(sd, cpu);
3618}
3619
3620unsigned long scale_rt_power(int cpu)
3621{
3622 struct rq *rq = cpu_rq(cpu);
3623 u64 total, available;
3624
3625 sched_avg_update(rq);
3626
3627 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3628 available = total - rq->rt_avg;
3629
3630 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3631 total = SCHED_LOAD_SCALE;
3632
3633 total >>= SCHED_LOAD_SHIFT;
3634
3635 return div_u64(available, total);
3636}
3637
3638static void update_cpu_power(struct sched_domain *sd, int cpu)
3639{
3640 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3641 unsigned long power = SCHED_LOAD_SCALE;
3642 struct sched_group *sdg = sd->groups;
3643
3644 if (sched_feat(ARCH_POWER))
3645 power *= arch_scale_freq_power(sd, cpu);
3646 else
3647 power *= default_scale_freq_power(sd, cpu);
3648
3649 power >>= SCHED_LOAD_SHIFT;
3650
3651 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3652 if (sched_feat(ARCH_POWER))
3653 power *= arch_scale_smt_power(sd, cpu);
3654 else
3655 power *= default_scale_smt_power(sd, cpu);
3656
3657 power >>= SCHED_LOAD_SHIFT;
3658 }
3659
3660 power *= scale_rt_power(cpu);
3661 power >>= SCHED_LOAD_SHIFT;
3662
3663 if (!power)
3664 power = 1;
3665
3666 sdg->cpu_power = power;
3667}
3668
3669static void update_group_power(struct sched_domain *sd, int cpu)
3670{
3671 struct sched_domain *child = sd->child;
3672 struct sched_group *group, *sdg = sd->groups;
3673 unsigned long power;
3674
3675 if (!child) {
3676 update_cpu_power(sd, cpu);
3677 return;
3678 }
3679
3680 power = 0;
3681
3682 group = child->groups;
3683 do {
3684 power += group->cpu_power;
3685 group = group->next;
3686 } while (group != child->groups);
3687
3688 sdg->cpu_power = power;
3689}
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
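/*
 * update_sg_lb_stats - Update a sched_group's statistics for load balancing.
 * Accumulates per-cpu load, nr_running and weighted load for the group,
 * elects the balance_cpu for the local group and flags a group imbalance
 * when per-cpu loads differ by more than twice the average task load.
 */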
3704static inline void update_sg_lb_stats(struct sched_domain *sd,
3705 struct sched_group *group, int this_cpu,
3706 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3707 int local_group, const struct cpumask *cpus,
3708 int *balance, struct sg_lb_stats *sgs)
3709{
3710 unsigned long load, max_cpu_load, min_cpu_load;
3711 int i;
3712 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3713 unsigned long sum_avg_load_per_task;
3714 unsigned long avg_load_per_task;
3715
3716 if (local_group) {
3717 balance_cpu = group_first_cpu(group);
3718 if (balance_cpu == this_cpu)
3719 update_group_power(sd, this_cpu);
3720 }
3721
3722
3723 sum_avg_load_per_task = avg_load_per_task = 0;
3724 max_cpu_load = 0;
3725 min_cpu_load = ~0UL;
3726
3727 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3728 struct rq *rq = cpu_rq(i);
3729
3730 if (*sd_idle && rq->nr_running)
3731 *sd_idle = 0;
3732
3733
3734 if (local_group) {
3735 if (idle_cpu(i) && !first_idle_cpu) {
3736 first_idle_cpu = 1;
3737 balance_cpu = i;
3738 }
3739
3740 load = target_load(i, load_idx);
3741 } else {
3742 load = source_load(i, load_idx);
3743 if (load > max_cpu_load)
3744 max_cpu_load = load;
3745 if (min_cpu_load > load)
3746 min_cpu_load = load;
3747 }
3748
3749 sgs->group_load += load;
3750 sgs->sum_nr_running += rq->nr_running;
3751 sgs->sum_weighted_load += weighted_cpuload(i);
3752
3753 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3754 }
3755
3756
3757
3758
3759
3760
3761
3762 if (idle != CPU_NEWLY_IDLE && local_group &&
3763 balance_cpu != this_cpu && balance) {
3764 *balance = 0;
3765 return;
3766 }
3767
3768
3769 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3782 group->cpu_power;
3783
3784 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3785 sgs->group_imb = 1;
3786
3787 sgs->group_capacity =
3788 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3789}
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
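/*
 * update_sd_lb_stats - Update the sched_domain's statistics for load
 * balancing. Walks every group in the domain and fills in the local and
 * busiest group statistics along with the domain-wide totals.
 */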
3801static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3802 enum cpu_idle_type idle, int *sd_idle,
3803 const struct cpumask *cpus, int *balance,
3804 struct sd_lb_stats *sds)
3805{
3806 struct sched_domain *child = sd->child;
3807 struct sched_group *group = sd->groups;
3808 struct sg_lb_stats sgs;
3809 int load_idx, prefer_sibling = 0;
3810
3811 if (child && child->flags & SD_PREFER_SIBLING)
3812 prefer_sibling = 1;
3813
3814 init_sd_power_savings_stats(sd, sds, idle);
3815 load_idx = get_sd_load_idx(sd, idle);
3816
3817 do {
3818 int local_group;
3819
3820 local_group = cpumask_test_cpu(this_cpu,
3821 sched_group_cpus(group));
3822 memset(&sgs, 0, sizeof(sgs));
3823 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3824 local_group, cpus, balance, &sgs);
3825
3826 if (local_group && balance && !(*balance))
3827 return;
3828
3829 sds->total_load += sgs.group_load;
3830 sds->total_pwr += group->cpu_power;
3831
3832
3833
3834
3835
3836
3837 if (prefer_sibling)
3838 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3839
3840 if (local_group) {
3841 sds->this_load = sgs.avg_load;
3842 sds->this = group;
3843 sds->this_nr_running = sgs.sum_nr_running;
3844 sds->this_load_per_task = sgs.sum_weighted_load;
3845 } else if (sgs.avg_load > sds->max_load &&
3846 (sgs.sum_nr_running > sgs.group_capacity ||
3847 sgs.group_imb)) {
3848 sds->max_load = sgs.avg_load;
3849 sds->busiest = group;
3850 sds->busiest_nr_running = sgs.sum_nr_running;
3851 sds->busiest_load_per_task = sgs.sum_weighted_load;
3852 sds->group_imb = sgs.group_imb;
3853 }
3854
3855 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3856 group = group->next;
3857 } while (group != sd->groups);
3858}
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3869 int this_cpu, unsigned long *imbalance)
3870{
3871 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3872 unsigned int imbn = 2;
3873
3874 if (sds->this_nr_running) {
3875 sds->this_load_per_task /= sds->this_nr_running;
3876 if (sds->busiest_load_per_task >
3877 sds->this_load_per_task)
3878 imbn = 1;
3879 } else
3880 sds->this_load_per_task =
3881 cpu_avg_load_per_task(this_cpu);
3882
3883 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3884 sds->busiest_load_per_task * imbn) {
3885 *imbalance = sds->busiest_load_per_task;
3886 return;
3887 }
3888
3889
3890
3891
3892
3893
3894
3895 pwr_now += sds->busiest->cpu_power *
3896 min(sds->busiest_load_per_task, sds->max_load);
3897 pwr_now += sds->this->cpu_power *
3898 min(sds->this_load_per_task, sds->this_load);
3899 pwr_now /= SCHED_LOAD_SCALE;
3900
3901
3902 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3903 sds->busiest->cpu_power;
3904 if (sds->max_load > tmp)
3905 pwr_move += sds->busiest->cpu_power *
3906 min(sds->busiest_load_per_task, sds->max_load - tmp);
3907
3908
3909 if (sds->max_load * sds->busiest->cpu_power <
3910 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3911 tmp = (sds->max_load * sds->busiest->cpu_power) /
3912 sds->this->cpu_power;
3913 else
3914 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3915 sds->this->cpu_power;
3916 pwr_move += sds->this->cpu_power *
3917 min(sds->this_load_per_task, sds->this_load + tmp);
3918 pwr_move /= SCHED_LOAD_SCALE;
3919
3920
3921 if (pwr_move > pwr_now)
3922 *imbalance = sds->busiest_load_per_task;
3923}
3924
3925
3926
3927
3928
3929
3930
3931
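/*
 * calculate_imbalance - Calculate the amount of imbalance present within
 * the groups of a given sched_domain during load balance. The result is
 * capped so we never pull more than the busiest group's excess over the
 * domain average, nor more than would lift this group above that average.
 */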
3932static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3933 unsigned long *imbalance)
3934{
3935 unsigned long max_pull;
3936
3937
3938
3939
3940
3941 if (sds->max_load < sds->avg_load) {
3942 *imbalance = 0;
3943 return fix_small_imbalance(sds, this_cpu, imbalance);
3944 }
3945
3946
3947 max_pull = min(sds->max_load - sds->avg_load,
3948 sds->max_load - sds->busiest_load_per_task);
3949
3950
3951 *imbalance = min(max_pull * sds->busiest->cpu_power,
3952 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3953 / SCHED_LOAD_SCALE;
3954
3955
3956
3957
3958
3959
3960
3961 if (*imbalance < sds->busiest_load_per_task)
3962 return fix_small_imbalance(sds, this_cpu, imbalance);
3963
3964}
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
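/*
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance. If there isn't an imbalance, and the user has
 * opted for power-savings, it returns a group whose CPUs can be put to
 * idle by rebalancing those tasks elsewhere, if such a group exists.
 *
 * Also calculates the amount of weighted load which should be moved
 * to restore balance.
 */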
3992static struct sched_group *
3993find_busiest_group(struct sched_domain *sd, int this_cpu,
3994 unsigned long *imbalance, enum cpu_idle_type idle,
3995 int *sd_idle, const struct cpumask *cpus, int *balance)
3996{
3997 struct sd_lb_stats sds;
3998
3999 memset(&sds, 0, sizeof(sds));
4000
4001
4002
4003
4004
4005 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4006 balance, &sds);
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018 if (balance && !(*balance))
4019 goto ret;
4020
4021 if (!sds.busiest || sds.busiest_nr_running == 0)
4022 goto out_balanced;
4023
4024 if (sds.this_load >= sds.max_load)
4025 goto out_balanced;
4026
4027 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4028
4029 if (sds.this_load >= sds.avg_load)
4030 goto out_balanced;
4031
4032 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4033 goto out_balanced;
4034
4035 sds.busiest_load_per_task /= sds.busiest_nr_running;
4036 if (sds.group_imb)
4037 sds.busiest_load_per_task =
4038 min(sds.busiest_load_per_task, sds.avg_load);
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051 if (sds.max_load <= sds.busiest_load_per_task)
4052 goto out_balanced;
4053
4054
4055 calculate_imbalance(&sds, this_cpu, imbalance);
4056 return sds.busiest;
4057
4058out_balanced:
4059
4060
4061
4062
4063 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4064 return sds.busiest;
4065ret:
4066 *imbalance = 0;
4067 return NULL;
4068}
4069
4070
4071
4072
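/*
 * find_busiest_queue - find the busiest runqueue among the cpus in the group.
 */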
4073static struct rq *
4074find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4075 unsigned long imbalance, const struct cpumask *cpus)
4076{
4077 struct rq *busiest = NULL, *rq;
4078 unsigned long max_load = 0;
4079 int i;
4080
4081 for_each_cpu(i, sched_group_cpus(group)) {
4082 unsigned long power = power_of(i);
4083 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4084 unsigned long wl;
4085
4086 if (!cpumask_test_cpu(i, cpus))
4087 continue;
4088
4089 rq = cpu_rq(i);
4090 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4091 wl /= power;
4092
4093 if (capacity && rq->nr_running == 1 && wl > imbalance)
4094 continue;
4095
4096 if (wl > max_load) {
4097 max_load = wl;
4098 busiest = rq;
4099 }
4100 }
4101
4102 return busiest;
4103}
4104
4105
4106
4107
4108
4109#define MAX_PINNED_INTERVAL 512
4110
4111
4112static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4113
4114
4115
4116
4117
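/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */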
4118static int load_balance(int this_cpu, struct rq *this_rq,
4119 struct sched_domain *sd, enum cpu_idle_type idle,
4120 int *balance)
4121{
4122 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4123 struct sched_group *group;
4124 unsigned long imbalance;
4125 struct rq *busiest;
4126 unsigned long flags;
4127 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4128
4129 cpumask_setall(cpus);
4130
4131
4132
4133
4134
4135
4136
4137 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4138 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4139 sd_idle = 1;
4140
4141 schedstat_inc(sd, lb_count[idle]);
4142
4143redo:
4144 update_shares(sd);
4145 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4146 cpus, balance);
4147
4148 if (*balance == 0)
4149 goto out_balanced;
4150
4151 if (!group) {
4152 schedstat_inc(sd, lb_nobusyg[idle]);
4153 goto out_balanced;
4154 }
4155
4156 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4157 if (!busiest) {
4158 schedstat_inc(sd, lb_nobusyq[idle]);
4159 goto out_balanced;
4160 }
4161
4162 BUG_ON(busiest == this_rq);
4163
4164 schedstat_add(sd, lb_imbalance[idle], imbalance);
4165
4166 ld_moved = 0;
4167 if (busiest->nr_running > 1) {
4168
4169
4170
4171
4172
4173
4174 local_irq_save(flags);
4175 double_rq_lock(this_rq, busiest);
4176 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4177 imbalance, sd, idle, &all_pinned);
4178 double_rq_unlock(this_rq, busiest);
4179 local_irq_restore(flags);
4180
4181
4182
4183
4184 if (ld_moved && this_cpu != smp_processor_id())
4185 resched_cpu(this_cpu);
4186
4187
4188 if (unlikely(all_pinned)) {
4189 cpumask_clear_cpu(cpu_of(busiest), cpus);
4190 if (!cpumask_empty(cpus))
4191 goto redo;
4192 goto out_balanced;
4193 }
4194 }
4195
4196 if (!ld_moved) {
4197 schedstat_inc(sd, lb_failed[idle]);
4198 sd->nr_balance_failed++;
4199
4200 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4201
4202 spin_lock_irqsave(&busiest->lock, flags);
4203
4204
4205
4206
4207 if (!cpumask_test_cpu(this_cpu,
4208 &busiest->curr->cpus_allowed)) {
4209 spin_unlock_irqrestore(&busiest->lock, flags);
4210 all_pinned = 1;
4211 goto out_one_pinned;
4212 }
4213
4214 if (!busiest->active_balance) {
4215 busiest->active_balance = 1;
4216 busiest->push_cpu = this_cpu;
4217 active_balance = 1;
4218 }
4219 spin_unlock_irqrestore(&busiest->lock, flags);
4220 if (active_balance)
4221 wake_up_process(busiest->migration_thread);
4222
4223
4224
4225
4226
4227 sd->nr_balance_failed = sd->cache_nice_tries+1;
4228 }
4229 } else
4230 sd->nr_balance_failed = 0;
4231
4232 if (likely(!active_balance)) {
4233
4234 sd->balance_interval = sd->min_interval;
4235 } else {
4236
4237
4238
4239
4240
4241
4242 if (sd->balance_interval < sd->max_interval)
4243 sd->balance_interval *= 2;
4244 }
4245
4246 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4247 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4248 ld_moved = -1;
4249
4250 goto out;
4251
4252out_balanced:
4253 schedstat_inc(sd, lb_balanced[idle]);
4254
4255 sd->nr_balance_failed = 0;
4256
4257out_one_pinned:
4258
4259 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4260 (sd->balance_interval < sd->max_interval))
4261 sd->balance_interval *= 2;
4262
4263 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4264 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4265 ld_moved = -1;
4266 else
4267 ld_moved = 0;
4268out:
4269 if (ld_moved)
4270 update_shares(sd);
4271 return ld_moved;
4272}
4273
4274
4275
4276
4277
4278
4279
4280
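/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
 * Called from schedule() when this_rq is about to become idle
 * (CPU_NEWLY_IDLE), with this_rq locked.
 */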
4281static int
4282load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4283{
4284 struct sched_group *group;
4285 struct rq *busiest = NULL;
4286 unsigned long imbalance;
4287 int ld_moved = 0;
4288 int sd_idle = 0;
4289 int all_pinned = 0;
4290 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4291
4292 cpumask_setall(cpus);
4293
4294
4295
4296
4297
4298
4299
4300 if (sd->flags & SD_SHARE_CPUPOWER &&
4301 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4302 sd_idle = 1;
4303
4304 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4305redo:
4306 update_shares_locked(this_rq, sd);
4307 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4308 &sd_idle, cpus, NULL);
4309 if (!group) {
4310 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4311 goto out_balanced;
4312 }
4313
4314 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4315 if (!busiest) {
4316 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4317 goto out_balanced;
4318 }
4319
4320 BUG_ON(busiest == this_rq);
4321
4322 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4323
4324 ld_moved = 0;
4325 if (busiest->nr_running > 1) {
4326
4327 double_lock_balance(this_rq, busiest);
4328
4329 update_rq_clock(busiest);
4330 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4331 imbalance, sd, CPU_NEWLY_IDLE,
4332 &all_pinned);
4333 double_unlock_balance(this_rq, busiest);
4334
4335 if (unlikely(all_pinned)) {
4336 cpumask_clear_cpu(cpu_of(busiest), cpus);
4337 if (!cpumask_empty(cpus))
4338 goto redo;
4339 }
4340 }
4341
4342 if (!ld_moved) {
4343 int active_balance = 0;
4344
4345 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4346 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4347 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4348 return -1;
4349
4350 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4351 return -1;
4352
4353 if (sd->nr_balance_failed++ < 2)
4354 return -1;
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
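 /*
 * No load could be pulled, and sched_mc power savings is at the
 * wakeup level: fall back to active balancing. Wake the busiest
 * cpu's migration thread so it can push its running task towards
 * this (soon to be idle) cpu and let the other package go idle.
 */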
4379 double_lock_balance(this_rq, busiest);
4380
4381
4382
4383
4384
4385 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4386 double_unlock_balance(this_rq, busiest);
4387 all_pinned = 1;
4388 return ld_moved;
4389 }
4390
4391 if (!busiest->active_balance) {
4392 busiest->active_balance = 1;
4393 busiest->push_cpu = this_cpu;
4394 active_balance = 1;
4395 }
4396
4397 double_unlock_balance(this_rq, busiest);
4398
4399
4400
4401 spin_unlock(&this_rq->lock);
4402 if (active_balance)
4403 wake_up_process(busiest->migration_thread);
4404 spin_lock(&this_rq->lock);
4405
4406 } else
4407 sd->nr_balance_failed = 0;
4408
4409 update_shares_locked(this_rq, sd);
4410 return ld_moved;
4411
4412out_balanced:
4413 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4414 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4415 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4416 return -1;
4417 sd->nr_balance_failed = 0;
4418
4419 return 0;
4420}
4421
4422
4423
4424
4425
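/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */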
4426static void idle_balance(int this_cpu, struct rq *this_rq)
4427{
4428 struct sched_domain *sd;
4429 int pulled_task = 0;
4430 unsigned long next_balance = jiffies + HZ;
4431
4432 for_each_domain(this_cpu, sd) {
4433 unsigned long interval;
4434
4435 if (!(sd->flags & SD_LOAD_BALANCE))
4436 continue;
4437
4438 if (sd->flags & SD_BALANCE_NEWIDLE)
4439
4440 pulled_task = load_balance_newidle(this_cpu, this_rq,
4441 sd);
4442
4443 interval = msecs_to_jiffies(sd->balance_interval);
4444 if (time_after(next_balance, sd->last_balance + interval))
4445 next_balance = sd->last_balance + interval;
4446 if (pulled_task)
4447 break;
4448 }
4449 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4450
4451
4452
4453
4454 this_rq->next_balance = next_balance;
4455 }
4456}
4457
4458
4459
4460
4461
4462
4463
4464
4465
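/*
 * active_load_balance is run by migration threads. It pushes a running
 * task off the cpu. It can be required to correctly have at least 1 task
 * running on each physical CPU where possible, and avoids physical /
 * logical imbalances.
 *
 * Called with busiest_rq locked.
 */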
4466static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4467{
4468 int target_cpu = busiest_rq->push_cpu;
4469 struct sched_domain *sd;
4470 struct rq *target_rq;
4471
4472
4473 if (busiest_rq->nr_running <= 1)
4474 return;
4475
4476 target_rq = cpu_rq(target_cpu);
4477
4478
4479
4480
4481
4482
4483 BUG_ON(busiest_rq == target_rq);
4484
4485
4486 double_lock_balance(busiest_rq, target_rq);
4487 update_rq_clock(busiest_rq);
4488 update_rq_clock(target_rq);
4489
4490
4491 for_each_domain(target_cpu, sd) {
4492 if ((sd->flags & SD_LOAD_BALANCE) &&
4493 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4494 break;
4495 }
4496
4497 if (likely(sd)) {
4498 schedstat_inc(sd, alb_count);
4499
4500 if (move_one_task(target_rq, target_cpu, busiest_rq,
4501 sd, CPU_IDLE))
4502 schedstat_inc(sd, alb_pushed);
4503 else
4504 schedstat_inc(sd, alb_failed);
4505 }
4506 double_unlock_balance(busiest_rq, target_rq);
4507}
4508
4509#ifdef CONFIG_NO_HZ
4510static struct {
4511 atomic_t load_balancer;
4512 cpumask_var_t cpu_mask;
4513 cpumask_var_t ilb_grp_nohz_mask;
4514} nohz ____cacheline_aligned = {
4515 .load_balancer = ATOMIC_INIT(-1),
4516};
4517
4518int get_nohz_load_balancer(void)
4519{
4520 return atomic_read(&nohz.load_balancer);
4521}
4522
4523#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4534{
4535 struct sched_domain *sd;
4536
4537 for_each_domain(cpu, sd)
4538 if (sd && (sd->flags & flag))
4539 break;
4540
4541 return sd;
4542}
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554#define for_each_flag_domain(cpu, sd, flag) \
4555 for (sd = lowest_flag_domain(cpu, flag); \
4556 (sd && (sd->flags & flag)); sd = sd->parent)
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568static inline int is_semi_idle_group(struct sched_group *ilb_group)
4569{
4570 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4571 sched_group_cpus(ilb_group));
4572
4573
4574
4575
4576
4577 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4578 return 0;
4579
4580 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4581 return 0;
4582
4583 return 1;
4584}
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597static int find_new_ilb(int cpu)
4598{
4599 struct sched_domain *sd;
4600 struct sched_group *ilb_group;
4601
4602
4603
4604
4605
4606 if (!(sched_smt_power_savings || sched_mc_power_savings))
4607 goto out_done;
4608
4609
4610
4611
4612
4613 if (cpumask_weight(nohz.cpu_mask) < 2)
4614 goto out_done;
4615
4616 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4617 ilb_group = sd->groups;
4618
4619 do {
4620 if (is_semi_idle_group(ilb_group))
4621 return cpumask_first(nohz.ilb_grp_nohz_mask);
4622
4623 ilb_group = ilb_group->next;
4624
4625 } while (ilb_group != sd->groups);
4626 }
4627
4628out_done:
4629 return cpumask_first(nohz.cpu_mask);
4630}
4631#else
4632static inline int find_new_ilb(int call_cpu)
4633{
4634 return cpumask_first(nohz.cpu_mask);
4635}
4636#endif
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
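/*
 * This routine nominates the idle load balancing (ilb) owner among the
 * cpus whose ticks are stopped; that owner does the idle load balancing
 * on behalf of them all. Called when the local cpu is about to stop its
 * tick (stop_tick set) or has restarted it. Returns 1 if this cpu
 * becomes or remains the ilb owner, 0 otherwise.
 */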
4658int select_nohz_load_balancer(int stop_tick)
4659{
4660 int cpu = smp_processor_id();
4661
4662 if (stop_tick) {
4663 cpu_rq(cpu)->in_nohz_recently = 1;
4664
4665 if (!cpu_active(cpu)) {
4666 if (atomic_read(&nohz.load_balancer) != cpu)
4667 return 0;
4668
4669
4670
4671
4672
4673 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4674 BUG();
4675
4676 return 0;
4677 }
4678
4679 cpumask_set_cpu(cpu, nohz.cpu_mask);
4680
4681
4682 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4683 if (atomic_read(&nohz.load_balancer) == cpu)
4684 atomic_set(&nohz.load_balancer, -1);
4685 return 0;
4686 }
4687
4688 if (atomic_read(&nohz.load_balancer) == -1) {
4689
4690 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4691 return 1;
4692 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4693 int new_ilb;
4694
4695 if (!(sched_smt_power_savings ||
4696 sched_mc_power_savings))
4697 return 1;
4698
4699
4700
4701
4702 new_ilb = find_new_ilb(cpu);
4703 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4704 atomic_set(&nohz.load_balancer, -1);
4705 resched_cpu(new_ilb);
4706 return 0;
4707 }
4708 return 1;
4709 }
4710 } else {
4711 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4712 return 0;
4713
4714 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4715
4716 if (atomic_read(&nohz.load_balancer) == cpu)
4717 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4718 BUG();
4719 }
4720 return 0;
4721}
4722#endif
4723
4724static DEFINE_SPINLOCK(balancing);
4725
4726
4727
4728
4729
4730
4731
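/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */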
4732static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4733{
4734 int balance = 1;
4735 struct rq *rq = cpu_rq(cpu);
4736 unsigned long interval;
4737 struct sched_domain *sd;
4738
4739 unsigned long next_balance = jiffies + 60*HZ;
4740 int update_next_balance = 0;
4741 int need_serialize;
4742
4743 for_each_domain(cpu, sd) {
4744 if (!(sd->flags & SD_LOAD_BALANCE))
4745 continue;
4746
4747 interval = sd->balance_interval;
4748 if (idle != CPU_IDLE)
4749 interval *= sd->busy_factor;
4750
4751
4752 interval = msecs_to_jiffies(interval);
4753 if (unlikely(!interval))
4754 interval = 1;
4755 if (interval > HZ*NR_CPUS/10)
4756 interval = HZ*NR_CPUS/10;
4757
4758 need_serialize = sd->flags & SD_SERIALIZE;
4759
4760 if (need_serialize) {
4761 if (!spin_trylock(&balancing))
4762 goto out;
4763 }
4764
4765 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4766 if (load_balance(cpu, rq, sd, idle, &balance)) {
4767
4768
4769
4770
4771
4772 idle = CPU_NOT_IDLE;
4773 }
4774 sd->last_balance = jiffies;
4775 }
4776 if (need_serialize)
4777 spin_unlock(&balancing);
4778out:
4779 if (time_after(next_balance, sd->last_balance + interval)) {
4780 next_balance = sd->last_balance + interval;
4781 update_next_balance = 1;
4782 }
4783
4784
4785
4786
4787
4788
4789 if (!balance)
4790 break;
4791 }
4792
4793
4794
4795
4796
4797
4798 if (likely(update_next_balance))
4799 rq->next_balance = next_balance;
4800}
4801
4802
4803
4804
4805
4806
4807static void run_rebalance_domains(struct softirq_action *h)
4808{
4809 int this_cpu = smp_processor_id();
4810 struct rq *this_rq = cpu_rq(this_cpu);
4811 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4812 CPU_IDLE : CPU_NOT_IDLE;
4813
4814 rebalance_domains(this_cpu, idle);
4815
4816#ifdef CONFIG_NO_HZ
4817
4818
4819
4820
4821
4822 if (this_rq->idle_at_tick &&
4823 atomic_read(&nohz.load_balancer) == this_cpu) {
4824 struct rq *rq;
4825 int balance_cpu;
4826
4827 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4828 if (balance_cpu == this_cpu)
4829 continue;
4830
4831
4832
4833
4834
4835
4836 if (need_resched())
4837 break;
4838
4839 rebalance_domains(balance_cpu, CPU_IDLE);
4840
4841 rq = cpu_rq(balance_cpu);
4842 if (time_after(this_rq->next_balance, rq->next_balance))
4843 this_rq->next_balance = rq->next_balance;
4844 }
4845 }
4846#endif
4847}
4848
4849static inline int on_null_domain(int cpu)
4850{
4851 return !rcu_dereference(cpu_rq(cpu)->sd);
4852}
4853
4854
4855
4856
4857
4858
4859
4860
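/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 *
 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
 * idle load balancing owner or decide that the balancing will be done
 * on this cpu's behalf by the nominated owner.
 */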
4861static inline void trigger_load_balance(struct rq *rq, int cpu)
4862{
4863#ifdef CONFIG_NO_HZ
4864
4865
4866
4867
4868
4869 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4870 rq->in_nohz_recently = 0;
4871
4872 if (atomic_read(&nohz.load_balancer) == cpu) {
4873 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4874 atomic_set(&nohz.load_balancer, -1);
4875 }
4876
4877 if (atomic_read(&nohz.load_balancer) == -1) {
4878 int ilb = find_new_ilb(cpu);
4879
4880 if (ilb < nr_cpu_ids)
4881 resched_cpu(ilb);
4882 }
4883 }
4884
4885
4886
4887
4888
4889 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4890 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4891 resched_cpu(cpu);
4892 return;
4893 }
4894
4895
4896
4897
4898
4899 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4900 cpumask_test_cpu(cpu, nohz.cpu_mask))
4901 return;
4902#endif
4903
4904 if (time_after_eq(jiffies, rq->next_balance) &&
4905 likely(!on_null_domain(cpu)))
4906 raise_softirq(SCHED_SOFTIRQ);
4907}
4908
4909#else
4910
4911
4912
4913
4914static inline void idle_balance(int cpu, struct rq *rq)
4915{
4916}
4917
4918#endif
4919
4920DEFINE_PER_CPU(struct kernel_stat, kstat);
4921
4922EXPORT_PER_CPU_SYMBOL(kstat);
4923
4924
4925
4926
4927
4928
4929
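/*
 * Return any ns on the sched_clock that have not yet been accounted in
 * @p in case that task is currently running.
 *
 * Called with task_rq_lock() held on @rq.
 */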
4930static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4931{
4932 u64 ns = 0;
4933
4934 if (task_current(rq, p)) {
4935 update_rq_clock(rq);
4936 ns = rq->clock - p->se.exec_start;
4937 if ((s64)ns < 0)
4938 ns = 0;
4939 }
4940
4941 return ns;
4942}
4943
4944unsigned long long task_delta_exec(struct task_struct *p)
4945{
4946 unsigned long flags;
4947 struct rq *rq;
4948 u64 ns = 0;
4949
4950 rq = task_rq_lock(p, &flags);
4951 ns = do_task_delta_exec(p, rq);
4952 task_rq_unlock(rq, &flags);
4953
4954 return ns;
4955}
4956
4957
4958
4959
4960
4961
4962unsigned long long task_sched_runtime(struct task_struct *p)
4963{
4964 unsigned long flags;
4965 struct rq *rq;
4966 u64 ns = 0;
4967
4968 rq = task_rq_lock(p, &flags);
4969 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4970 task_rq_unlock(rq, &flags);
4971
4972 return ns;
4973}
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984unsigned long long thread_group_sched_runtime(struct task_struct *p)
4985{
4986 struct task_cputime totals;
4987 unsigned long flags;
4988 struct rq *rq;
4989 u64 ns;
4990
4991 rq = task_rq_lock(p, &flags);
4992 thread_group_cputime(p, &totals);
4993 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4994 task_rq_unlock(rq, &flags);
4995
4996 return ns;
4997}
4998
4999
5000
5001
5002
5003
5004
5005void account_user_time(struct task_struct *p, cputime_t cputime,
5006 cputime_t cputime_scaled)
5007{
5008 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5009 cputime64_t tmp;
5010
5011
5012 p->utime = cputime_add(p->utime, cputime);
5013 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
5014 account_group_user_time(p, cputime);
5015
5016
5017 tmp = cputime_to_cputime64(cputime);
5018 if (TASK_NICE(p) > 0)
5019 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5020 else
5021 cpustat->user = cputime64_add(cpustat->user, tmp);
5022
5023 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
5024
5025 acct_update_integrals(p);
5026}
5027
5028
5029
5030
5031
5032
5033
5034static void account_guest_time(struct task_struct *p, cputime_t cputime,
5035 cputime_t cputime_scaled)
5036{
5037 cputime64_t tmp;
5038 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5039
5040 tmp = cputime_to_cputime64(cputime);
5041
5042
5043 p->utime = cputime_add(p->utime, cputime);
5044 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
5045 account_group_user_time(p, cputime);
5046 p->gtime = cputime_add(p->gtime, cputime);
5047
5048
5049 cpustat->user = cputime64_add(cpustat->user, tmp);
5050 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5051}
5052
5053
5054
5055
5056
5057
5058
5059
5060void account_system_time(struct task_struct *p, int hardirq_offset,
5061 cputime_t cputime, cputime_t cputime_scaled)
5062{
5063 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5064 cputime64_t tmp;
5065
5066 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
5067 account_guest_time(p, cputime, cputime_scaled);
5068 return;
5069 }
5070
5071
5072 p->stime = cputime_add(p->stime, cputime);
5073 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
5074 account_group_system_time(p, cputime);
5075
5076
5077 tmp = cputime_to_cputime64(cputime);
5078 if (hardirq_count() - hardirq_offset)
5079 cpustat->irq = cputime64_add(cpustat->irq, tmp);
5080 else if (softirq_count())
5081 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
5082 else
5083 cpustat->system = cputime64_add(cpustat->system, tmp);
5084
5085 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
5086
5087
5088 acct_update_integrals(p);
5089}
5090
5091
5092
5093
5094
5095void account_steal_time(cputime_t cputime)
5096{
5097 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5098 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5099
5100 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
5101}
5102
5103
5104
5105
5106
5107void account_idle_time(cputime_t cputime)
5108{
5109 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5110 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5111 struct rq *rq = this_rq();
5112
5113 if (atomic_read(&rq->nr_iowait) > 0)
5114 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
5115 else
5116 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
5117}
5118
5119#ifndef CONFIG_VIRT_CPU_ACCOUNTING
5120
5121
5122
5123
5124
5125
5126void account_process_tick(struct task_struct *p, int user_tick)
5127{
5128 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
5129 struct rq *rq = this_rq();
5130
5131 if (user_tick)
5132 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
5133 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5134 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
5135 one_jiffy_scaled);
5136 else
5137 account_idle_time(cputime_one_jiffy);
5138}
5139
5140
5141
5142
5143
5144
5145void account_steal_ticks(unsigned long ticks)
5146{
5147 account_steal_time(jiffies_to_cputime(ticks));
5148}
5149
5150
5151
5152
5153
5154void account_idle_ticks(unsigned long ticks)
5155{
5156 account_idle_time(jiffies_to_cputime(ticks));
5157}
5158
5159#endif
5160
5161
5162
5163
5164#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5165cputime_t task_utime(struct task_struct *p)
5166{
5167 return p->utime;
5168}
5169
5170cputime_t task_stime(struct task_struct *p)
5171{
5172 return p->stime;
5173}
5174#else
5175cputime_t task_utime(struct task_struct *p)
5176{
5177 clock_t utime = cputime_to_clock_t(p->utime),
5178 total = utime + cputime_to_clock_t(p->stime);
5179 u64 temp;
5180
5181
5182
5183
5184 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
5185
5186 if (total) {
5187 temp *= utime;
5188 do_div(temp, total);
5189 }
5190 utime = (clock_t)temp;
5191
5192 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
5193 return p->prev_utime;
5194}
5195
5196cputime_t task_stime(struct task_struct *p)
5197{
5198 clock_t stime;
5199
5200
5201
5202
5203
5204
5205 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5206 cputime_to_clock_t(task_utime(p));
5207
5208 if (stime >= 0)
5209 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
5210
5211 return p->prev_stime;
5212}
5213#endif
5214
5215inline cputime_t task_gtime(struct task_struct *p)
5216{
5217 return p->gtime;
5218}
5219
5220
5221
5222
5223
5224
5225
5226
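/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 */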
5227void scheduler_tick(void)
5228{
5229 int cpu = smp_processor_id();
5230 struct rq *rq = cpu_rq(cpu);
5231 struct task_struct *curr = rq->curr;
5232
5233 sched_clock_tick();
5234
5235 spin_lock(&rq->lock);
5236 update_rq_clock(rq);
5237 update_cpu_load(rq);
5238 curr->sched_class->task_tick(rq, curr, 0);
5239 spin_unlock(&rq->lock);
5240
5241 perf_event_task_tick(curr, cpu);
5242
5243#ifdef CONFIG_SMP
5244 rq->idle_at_tick = idle_cpu(cpu);
5245 trigger_load_balance(rq, cpu);
5246#endif
5247}
5248
5249notrace unsigned long get_parent_ip(unsigned long addr)
5250{
5251 if (in_lock_functions(addr)) {
5252 addr = CALLER_ADDR2;
5253 if (in_lock_functions(addr))
5254 addr = CALLER_ADDR3;
5255 }
5256 return addr;
5257}
5258
5259#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
5260 defined(CONFIG_PREEMPT_TRACER))
5261
5262void __kprobes add_preempt_count(int val)
5263{
5264#ifdef CONFIG_DEBUG_PREEMPT
5265
5266
5267
5268 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5269 return;
5270#endif
5271 preempt_count() += val;
5272#ifdef CONFIG_DEBUG_PREEMPT
5273
5274
5275
5276 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
5277 PREEMPT_MASK - 10);
5278#endif
5279 if (preempt_count() == val)
5280 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
5281}
5282EXPORT_SYMBOL(add_preempt_count);
5283
5284void __kprobes sub_preempt_count(int val)
5285{
5286#ifdef CONFIG_DEBUG_PREEMPT
5287
5288
5289
5290 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
5291 return;
5292
5293
5294
5295 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
5296 !(preempt_count() & PREEMPT_MASK)))
5297 return;
5298#endif
5299
5300 if (preempt_count() == val)
5301 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
5302 preempt_count() -= val;
5303}
5304EXPORT_SYMBOL(sub_preempt_count);
5305
5306#endif
5307
5308
5309
5310
5311static noinline void __schedule_bug(struct task_struct *prev)
5312{
5313 struct pt_regs *regs = get_irq_regs();
5314
5315 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5316 prev->comm, prev->pid, preempt_count());
5317
5318 debug_show_held_locks(prev);
5319 print_modules();
5320 if (irqs_disabled())
5321 print_irqtrace_events(prev);
5322
5323 if (regs)
5324 show_regs(regs);
5325 else
5326 dump_stack();
5327}
5328
5329
5330
5331
5332static inline void schedule_debug(struct task_struct *prev)
5333{
5334
5335
5336
5337
5338
5339 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
5340 __schedule_bug(prev);
5341
5342 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
5343
5344 schedstat_inc(this_rq(), sched_count);
5345#ifdef CONFIG_SCHEDSTATS
5346 if (unlikely(prev->lock_depth >= 0)) {
5347 schedstat_inc(this_rq(), bkl_count);
5348 schedstat_inc(prev, sched_info.bkl_count);
5349 }
5350#endif
5351}
5352
5353static void put_prev_task(struct rq *rq, struct task_struct *p)
5354{
5355 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5356
5357 update_avg(&p->se.avg_running, runtime);
5358
5359 if (p->state == TASK_RUNNING) {
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5370 update_avg(&p->se.avg_overlap, runtime);
5371 } else {
5372 update_avg(&p->se.avg_running, 0);
5373 }
5374 p->sched_class->put_prev_task(rq, p);
5375}
5376
5377
5378
5379
5380static inline struct task_struct *
5381pick_next_task(struct rq *rq)
5382{
5383 const struct sched_class *class;
5384 struct task_struct *p;
5385
5386
5387
5388
5389
5390 if (likely(rq->nr_running == rq->cfs.nr_running)) {
5391 p = fair_sched_class.pick_next_task(rq);
5392 if (likely(p))
5393 return p;
5394 }
5395
5396 class = sched_class_highest;
5397 for ( ; ; ) {
5398 p = class->pick_next_task(rq);
5399 if (p)
5400 return p;
5401
5402
5403
5404
5405 class = class->next;
5406 }
5407}
5408
5409
5410
5411
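/*
 * schedule() is the main scheduler function.
 */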
5412asmlinkage void __sched schedule(void)
5413{
5414 struct task_struct *prev, *next;
5415 unsigned long *switch_count;
5416 struct rq *rq;
5417 int cpu;
5418
5419need_resched:
5420 preempt_disable();
5421 cpu = smp_processor_id();
5422 rq = cpu_rq(cpu);
5423 rcu_sched_qs(cpu);
5424 prev = rq->curr;
5425 switch_count = &prev->nivcsw;
5426
5427 release_kernel_lock(prev);
5428need_resched_nonpreemptible:
5429
5430 schedule_debug(prev);
5431
5432 if (sched_feat(HRTICK))
5433 hrtick_clear(rq);
5434
5435 spin_lock_irq(&rq->lock);
5436 update_rq_clock(rq);
5437 clear_tsk_need_resched(prev);
5438
5439 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5440 if (unlikely(signal_pending_state(prev->state, prev)))
5441 prev->state = TASK_RUNNING;
5442 else
5443 deactivate_task(rq, prev, 1);
5444 switch_count = &prev->nvcsw;
5445 }
5446
5447 pre_schedule(rq, prev);
5448
5449 if (unlikely(!rq->nr_running))
5450 idle_balance(cpu, rq);
5451
5452 put_prev_task(rq, prev);
5453 next = pick_next_task(rq);
5454
5455 if (likely(prev != next)) {
5456 sched_info_switch(prev, next);
5457 perf_event_task_sched_out(prev, next, cpu);
5458
5459 rq->nr_switches++;
5460 rq->curr = next;
5461 ++*switch_count;
5462
5463 context_switch(rq, prev, next);
5464
5465
5466
5467
5468 cpu = smp_processor_id();
5469 rq = cpu_rq(cpu);
5470 } else
5471 spin_unlock_irq(&rq->lock);
5472
5473 post_schedule(rq);
5474
5475 if (unlikely(reacquire_kernel_lock(current) < 0))
5476 goto need_resched_nonpreemptible;
5477
5478 preempt_enable_no_resched();
5479 if (need_resched())
5480 goto need_resched;
5481}
5482EXPORT_SYMBOL(schedule);
5483
5484#ifdef CONFIG_SMP
5485
5486
5487
5488
5489int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5490{
5491 unsigned int cpu;
5492 struct rq *rq;
5493
5494 if (!sched_feat(OWNER_SPIN))
5495 return 0;
5496
5497#ifdef CONFIG_DEBUG_PAGEALLOC
5498
5499
5500
5501
5502
5503 if (probe_kernel_address(&owner->cpu, cpu))
5504 goto out;
5505#else
5506 cpu = owner->cpu;
5507#endif
5508
5509
5510
5511
5512
5513 if (cpu >= nr_cpumask_bits)
5514 goto out;
5515
5516
5517
5518
5519
5520 if (!cpu_online(cpu))
5521 goto out;
5522
5523 rq = cpu_rq(cpu);
5524
5525 for (;;) {
5526
5527
5528
5529 if (lock->owner != owner)
5530 break;
5531
5532
5533
5534
5535 if (task_thread_info(rq->curr) != owner || need_resched())
5536 return 0;
5537
5538 cpu_relax();
5539 }
5540out:
5541 return 1;
5542}
5543#endif
5544
5545#ifdef CONFIG_PREEMPT
5546
5547
5548
5549
5550
5551asmlinkage void __sched preempt_schedule(void)
5552{
5553 struct thread_info *ti = current_thread_info();
5554
5555
5556
5557
5558
5559 if (likely(ti->preempt_count || irqs_disabled()))
5560 return;
5561
5562 do {
5563 add_preempt_count(PREEMPT_ACTIVE);
5564 schedule();
5565 sub_preempt_count(PREEMPT_ACTIVE);
5566
5567
5568
5569
5570
5571 barrier();
5572 } while (need_resched());
5573}
5574EXPORT_SYMBOL(preempt_schedule);
5575
5576
5577
5578
5579
5580
5581
5582asmlinkage void __sched preempt_schedule_irq(void)
5583{
5584 struct thread_info *ti = current_thread_info();
5585
5586
5587 BUG_ON(ti->preempt_count || !irqs_disabled());
5588
5589 do {
5590 add_preempt_count(PREEMPT_ACTIVE);
5591 local_irq_enable();
5592 schedule();
5593 local_irq_disable();
5594 sub_preempt_count(PREEMPT_ACTIVE);
5595
5596
5597
5598
5599
5600 barrier();
5601 } while (need_resched());
5602}
5603
5604#endif
5605
5606int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5607 void *key)
5608{
5609 return try_to_wake_up(curr->private, mode, wake_flags);
5610}
5611EXPORT_SYMBOL(default_wake_function);
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5623 int nr_exclusive, int wake_flags, void *key)
5624{
5625 wait_queue_t *curr, *next;
5626
5627 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5628 unsigned flags = curr->flags;
5629
5630 if (curr->func(curr, mode, wake_flags, key) &&
5631 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5632 break;
5633 }
5634}
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646void __wake_up(wait_queue_head_t *q, unsigned int mode,
5647 int nr_exclusive, void *key)
5648{
5649 unsigned long flags;
5650
5651 spin_lock_irqsave(&q->lock, flags);
5652 __wake_up_common(q, mode, nr_exclusive, 0, key);
5653 spin_unlock_irqrestore(&q->lock, flags);
5654}
5655EXPORT_SYMBOL(__wake_up);
5656
5657
5658
5659
5660void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5661{
5662 __wake_up_common(q, mode, 1, 0, NULL);
5663}
5664
5665void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5666{
5667 __wake_up_common(q, mode, 1, 0, key);
5668}
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5688 int nr_exclusive, void *key)
5689{
5690 unsigned long flags;
5691 int wake_flags = WF_SYNC;
5692
5693 if (unlikely(!q))
5694 return;
5695
5696 if (unlikely(!nr_exclusive))
5697 wake_flags = 0;
5698
5699 spin_lock_irqsave(&q->lock, flags);
5700 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5701 spin_unlock_irqrestore(&q->lock, flags);
5702}
5703EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5704
5705
5706
5707
5708void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5709{
5710 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5711}
5712EXPORT_SYMBOL_GPL(__wake_up_sync);
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
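/*
 * complete: - signals a single thread waiting on this completion
 * @x:  holds the state of this particular completion
 *
 * This will wake up a single thread waiting on this completion. Threads
 * are awakened in the same order in which they were queued.
 */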
5726void complete(struct completion *x)
5727{
5728 unsigned long flags;
5729
5730 spin_lock_irqsave(&x->wait.lock, flags);
5731 x->done++;
5732 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
5733 spin_unlock_irqrestore(&x->wait.lock, flags);
5734}
5735EXPORT_SYMBOL(complete);
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746void complete_all(struct completion *x)
5747{
5748 unsigned long flags;
5749
5750 spin_lock_irqsave(&x->wait.lock, flags);
5751 x->done += UINT_MAX/2;
5752 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
5753 spin_unlock_irqrestore(&x->wait.lock, flags);
5754}
5755EXPORT_SYMBOL(complete_all);
5756
5757static inline long __sched
5758do_wait_for_common(struct completion *x, long timeout, int state)
5759{
5760 if (!x->done) {
5761 DECLARE_WAITQUEUE(wait, current);
5762
5763 wait.flags |= WQ_FLAG_EXCLUSIVE;
5764 __add_wait_queue_tail(&x->wait, &wait);
5765 do {
5766 if (signal_pending_state(state, current)) {
5767 timeout = -ERESTARTSYS;
5768 break;
5769 }
5770 __set_current_state(state);
5771 spin_unlock_irq(&x->wait.lock);
5772 timeout = schedule_timeout(timeout);
5773 spin_lock_irq(&x->wait.lock);
5774 } while (!x->done && timeout);
5775 __remove_wait_queue(&x->wait, &wait);
5776 if (!x->done)
5777 return timeout;
5778 }
5779 x->done--;
5780 return timeout ?: 1;
5781}
5782
5783static long __sched
5784wait_for_common(struct completion *x, long timeout, int state)
5785{
5786 might_sleep();
5787
5788 spin_lock_irq(&x->wait.lock);
5789 timeout = do_wait_for_common(x, timeout, state);
5790 spin_unlock_irq(&x->wait.lock);
5791 return timeout;
5792}
5793
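/**
 * wait_for_completion - waits for completion of a task
 * @x: holds the state of this particular completion
 *
 * Waits until the completion is signalled; the wait is not interruptible
 * and there is no timeout. See also wait_for_completion_timeout() and the
 * interruptible/killable variants below, and complete().
 */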
5804void __sched wait_for_completion(struct completion *x)
5805{
5806 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
5807}
5808EXPORT_SYMBOL(wait_for_completion);
5809
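/**
 * wait_for_completion_timeout - waits for completion of a task (w/ timeout)
 * @x: holds the state of this particular completion
 * @timeout: timeout value in jiffies
 *
 * Waits (uninterruptibly) until the completion is signalled or the timeout
 * expires. Returns 0 on timeout, otherwise the remaining jiffies (at least 1).
 */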
5819unsigned long __sched
5820wait_for_completion_timeout(struct completion *x, unsigned long timeout)
5821{
5822 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
5823}
5824EXPORT_SYMBOL(wait_for_completion_timeout);
5825
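/**
 * wait_for_completion_interruptible - waits for completion of a task (w/ intr)
 * @x: holds the state of this particular completion
 *
 * Waits until the completion is signalled or a signal arrives.
 * Returns 0 on completion, -ERESTARTSYS if interrupted.
 */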
5833int __sched wait_for_completion_interruptible(struct completion *x)
5834{
5835 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
5836 if (t == -ERESTARTSYS)
5837 return t;
5838 return 0;
5839}
5840EXPORT_SYMBOL(wait_for_completion_interruptible);
5841
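/**
 * wait_for_completion_interruptible_timeout - waits for completion (w/ timeout, intr)
 * @x: holds the state of this particular completion
 * @timeout: timeout value in jiffies
 *
 * Waits until the completion is signalled, a signal arrives or the
 * timeout expires.
 */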
5850unsigned long __sched
5851wait_for_completion_interruptible_timeout(struct completion *x,
5852 unsigned long timeout)
5853{
5854 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
5855}
5856EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
5857
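/**
 * wait_for_completion_killable - waits for completion of a task (killable)
 * @x: holds the state of this particular completion
 *
 * Waits until the completion is signalled; only a fatal signal can
 * interrupt the wait. Returns 0 on completion, -ERESTARTSYS if killed.
 */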
5865int __sched wait_for_completion_killable(struct completion *x)
5866{
5867 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
5868 if (t == -ERESTARTSYS)
5869 return t;
5870 return 0;
5871}
5872EXPORT_SYMBOL(wait_for_completion_killable);
5873
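/**
 * try_wait_for_completion - try to decrement a completion without blocking
 * @x: completion structure
 *
 * Returns 0 if a decrement cannot be done without blocking, 1 if a
 * decrement succeeded. Useful when a completion is used as a counting
 * completion and waiting must be avoided.
 */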
5886bool try_wait_for_completion(struct completion *x)
5887{
5888 int ret = 1;
5889
5890 spin_lock_irq(&x->wait.lock);
5891 if (!x->done)
5892 ret = 0;
5893 else
5894 x->done--;
5895 spin_unlock_irq(&x->wait.lock);
5896 return ret;
5897}
5898EXPORT_SYMBOL(try_wait_for_completion);
5899
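/**
 * completion_done - test whether a completion has been signalled
 * @x: completion structure
 *
 * Returns 1 if x->done is non-zero (a waiter would not block), 0 otherwise.
 * The completion is not consumed.
 */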
5908bool completion_done(struct completion *x)
5909{
5910 int ret = 1;
5911
5912 spin_lock_irq(&x->wait.lock);
5913 if (!x->done)
5914 ret = 0;
5915 spin_unlock_irq(&x->wait.lock);
5916 return ret;
5917}
5918EXPORT_SYMBOL(completion_done);
5919
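/*
 * Old-style sleep_on() primitives. They cannot check a condition and go to
 * sleep atomically (the task state is set before the task is queued), so a
 * wakeup that races with the caller's condition check can be lost; new code
 * generally uses wait_event() and friends instead.
 */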
5920static long __sched
5921sleep_on_common(wait_queue_head_t *q, int state, long timeout)
5922{
5923 unsigned long flags;
5924 wait_queue_t wait;
5925
5926 init_waitqueue_entry(&wait, current);
5927
5928 __set_current_state(state);
5929
5930 spin_lock_irqsave(&q->lock, flags);
5931 __add_wait_queue(q, &wait);
5932 spin_unlock(&q->lock);
5933 timeout = schedule_timeout(timeout);
5934 spin_lock_irq(&q->lock);
5935 __remove_wait_queue(q, &wait);
5936 spin_unlock_irqrestore(&q->lock, flags);
5937
5938 return timeout;
5939}
5940
5941void __sched interruptible_sleep_on(wait_queue_head_t *q)
5942{
5943 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5944}
5945EXPORT_SYMBOL(interruptible_sleep_on);
5946
5947long __sched
5948interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
5949{
5950 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
5951}
5952EXPORT_SYMBOL(interruptible_sleep_on_timeout);
5953
5954void __sched sleep_on(wait_queue_head_t *q)
5955{
5956 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5957}
5958EXPORT_SYMBOL(sleep_on);
5959
5960long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
5961{
5962 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
5963}
5964EXPORT_SYMBOL(sleep_on_timeout);
5965
5966#ifdef CONFIG_RT_MUTEXES
5967
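/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * Changes the 'effective' priority of a task; unlike __setscheduler()
 * it does not touch ->normal_prio.
 *
 * Used by the rt_mutex code to implement priority inheritance.
 */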
5978void rt_mutex_setprio(struct task_struct *p, int prio)
5979{
5980 unsigned long flags;
5981 int oldprio, on_rq, running;
5982 struct rq *rq;
5983 const struct sched_class *prev_class = p->sched_class;
5984
5985 BUG_ON(prio < 0 || prio > MAX_PRIO);
5986
5987 rq = task_rq_lock(p, &flags);
5988 update_rq_clock(rq);
5989
5990 oldprio = p->prio;
5991 on_rq = p->se.on_rq;
5992 running = task_current(rq, p);
5993 if (on_rq)
5994 dequeue_task(rq, p, 0);
5995 if (running)
5996 p->sched_class->put_prev_task(rq, p);
5997
5998 if (rt_prio(prio))
5999 p->sched_class = &rt_sched_class;
6000 else
6001 p->sched_class = &fair_sched_class;
6002
6003 p->prio = prio;
6004
6005 if (running)
6006 p->sched_class->set_curr_task(rq);
6007 if (on_rq) {
6008 enqueue_task(rq, p, 0);
6009
6010 check_class_changed(rq, p, prev_class, oldprio, running);
6011 }
6012 task_rq_unlock(rq, &flags);
6013}
6014
6015#endif /* CONFIG_RT_MUTEXES */
6016
6017void set_user_nice(struct task_struct *p, long nice)
6018{
6019 int old_prio, delta, on_rq;
6020 unsigned long flags;
6021 struct rq *rq;
6022
6023 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
6024 return;
6025
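 /*
 * We have to be careful: if called from sys_setpriority(), the task
 * might be in the middle of scheduling on another CPU.
 */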
6029 rq = task_rq_lock(p, &flags);
6030 update_rq_clock(rq);
6031
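 /*
 * The RT priorities are set via sched_setscheduler(), but we still
 * allow the 'normal' nice value to be set; it has no effect on
 * scheduling while the task keeps an RT policy:
 */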
6037 if (task_has_rt_policy(p)) {
6038 p->static_prio = NICE_TO_PRIO(nice);
6039 goto out_unlock;
6040 }
6041 on_rq = p->se.on_rq;
6042 if (on_rq)
6043 dequeue_task(rq, p, 0);
6044
6045 p->static_prio = NICE_TO_PRIO(nice);
6046 set_load_weight(p);
6047 old_prio = p->prio;
6048 p->prio = effective_prio(p);
6049 delta = p->prio - old_prio;
6050
6051 if (on_rq) {
6052 enqueue_task(rq, p, 0);
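 /*
 * If the task increased its priority or is running and
 * lowered its priority, then reschedule its CPU:
 */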
6057 if (delta < 0 || (delta > 0 && task_running(rq, p)))
6058 resched_task(rq->curr);
6059 }
6060out_unlock:
6061 task_rq_unlock(rq, &flags);
6062}
6063EXPORT_SYMBOL(set_user_nice);
6064
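/*
 * can_nice - check whether a task is allowed to set a given nice value
 * @p: task
 * @nice: nice value
 */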
6070int can_nice(const struct task_struct *p, const int nice)
6071{
6072
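 /* Convert nice value [19,-20] to rlimit style value [1,40]: */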
6073 int nice_rlim = 20 - nice;
6074
6075 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
6076 capable(CAP_SYS_NICE));
6077}
6078
6079#ifdef __ARCH_WANT_SYS_NICE
6080
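/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority() is a more generic, but much slower, function that
 * does similar things.
 */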
6088SYSCALL_DEFINE1(nice, int, increment)
6089{
6090 long nice, retval;
6091
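 /*
 * Setpriority might change our priority at the same moment.
 * We don't have to worry: conceptually one call occurs first
 * and we have a single winner.
 */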
6097 if (increment < -40)
6098 increment = -40;
6099 if (increment > 40)
6100 increment = 40;
6101
6102 nice = TASK_NICE(current) + increment;
6103 if (nice < -20)
6104 nice = -20;
6105 if (nice > 19)
6106 nice = 19;
6107
6108 if (increment < 0 && !can_nice(current, nice))
6109 return -EPERM;
6110
6111 retval = security_task_setnice(current, nice);
6112 if (retval)
6113 return retval;
6114
6115 set_user_nice(current, nice);
6116 return 0;
6117}
6118
6119#endif /* __ARCH_WANT_SYS_NICE */
6120
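/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * This is the kernel prio offset by MAX_RT_PRIO: RT tasks come out
 * negative, nice values map onto 0..39.
 */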
6129int task_prio(const struct task_struct *p)
6130{
6131 return p->prio - MAX_RT_PRIO;
6132}
6133
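/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 */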
6138int task_nice(const struct task_struct *p)
6139{
6140 return TASK_NICE(p);
6141}
6142EXPORT_SYMBOL(task_nice);
6143
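/**
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 */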
6148int idle_cpu(int cpu)
6149{
6150 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
6151}
6152
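/**
 * idle_task - return the idle task for a given cpu.
 * @cpu: the processor in question.
 */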
6157struct task_struct *idle_task(int cpu)
6158{
6159 return cpu_rq(cpu)->idle;
6160}
6161
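/**
 * find_process_by_pid - find a process with a matching PID value.
 * @pid: the pid in question, or 0 for the current task.
 */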
6166static struct task_struct *find_process_by_pid(pid_t pid)
6167{
6168 return pid ? find_task_by_vpid(pid) : current;
6169}
6170
6171
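/* Actually do the priority/policy change: must hold the rq lock. */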
6172static void
6173__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6174{
6175 BUG_ON(p->se.on_rq);
6176
6177 p->policy = policy;
6178 switch (p->policy) {
6179 case SCHED_NORMAL:
6180 case SCHED_BATCH:
6181 case SCHED_IDLE:
6182 p->sched_class = &fair_sched_class;
6183 break;
6184 case SCHED_FIFO:
6185 case SCHED_RR:
6186 p->sched_class = &rt_sched_class;
6187 break;
6188 }
6189
6190 p->rt_priority = prio;
6191 p->normal_prio = normal_prio(p);
6192
6193 p->prio = rt_mutex_getprio(p);
6194 set_load_weight(p);
6195}
6196
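/*
 * Check whether the target task's UID matches the calling task's:
 * the caller's effective UID must equal the target's real or effective UID.
 */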
6200static bool check_same_owner(struct task_struct *p)
6201{
6202 const struct cred *cred = current_cred(), *pcred;
6203 bool match;
6204
6205 rcu_read_lock();
6206 pcred = __task_cred(p);
6207 match = (cred->euid == pcred->euid ||
6208 cred->euid == pcred->uid);
6209 rcu_read_unlock();
6210 return match;
6211}
6212
6213static int __sched_setscheduler(struct task_struct *p, int policy,
6214 struct sched_param *param, bool user)
6215{
6216 int retval, oldprio, oldpolicy = -1, on_rq, running;
6217 unsigned long flags;
6218 const struct sched_class *prev_class = p->sched_class;
6219 struct rq *rq;
6220 int reset_on_fork;
6221
6222
6223 BUG_ON(in_interrupt());
6224recheck:
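 /* Double-check the policy: a negative value means 'keep the current policy'. */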
6226 if (policy < 0) {
6227 reset_on_fork = p->sched_reset_on_fork;
6228 policy = oldpolicy = p->policy;
6229 } else {
6230 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6231 policy &= ~SCHED_RESET_ON_FORK;
6232
6233 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6234 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6235 policy != SCHED_IDLE)
6236 return -EINVAL;
6237 }
6238
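 /*
 * Valid priorities for SCHED_FIFO and SCHED_RR are
 * 1..MAX_USER_RT_PRIO-1 (kernel threads may go up to MAX_RT_PRIO-1);
 * the only valid priority for SCHED_NORMAL, SCHED_BATCH and
 * SCHED_IDLE is 0.
 */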
6244 if (param->sched_priority < 0 ||
6245 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
6246 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
6247 return -EINVAL;
6248 if (rt_policy(policy) != (param->sched_priority != 0))
6249 return -EINVAL;
6250
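 /*
 * Allow unprivileged RT tasks to decrease priority:
 */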
6254 if (user && !capable(CAP_SYS_NICE)) {
6255 if (rt_policy(policy)) {
6256 unsigned long rlim_rtprio;
6257
6258 if (!lock_task_sighand(p, &flags))
6259 return -ESRCH;
6260 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
6261 unlock_task_sighand(p, &flags);
6262
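 /* Can't set/change the rt policy: */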
6264 if (policy != p->policy && !rlim_rtprio)
6265 return -