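/*
 * Core scheduler: per-CPU runqueue management, CFS and real-time
 * bandwidth control, task wake-up and CPU selection.  The per-class
 * implementations (sched_fair.c, sched_rt.c, sched_idletask.c,
 * sched_stoptask.c) are #included further below.
 */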
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h>
61#include <linux/syscalls.h>
62#include <linux/times.h>
63#include <linux/tsacct_kern.h>
64#include <linux/kprobes.h>
65#include <linux/delayacct.h>
66#include <linux/unistd.h>
67#include <linux/pagemap.h>
68#include <linux/hrtimer.h>
69#include <linux/tick.h>
70#include <linux/debugfs.h>
71#include <linux/ctype.h>
72#include <linux/ftrace.h>
73#include <linux/slab.h>
74#include <linux/init_task.h>
75
76#include <asm/tlb.h>
77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
79#ifdef CONFIG_PARAVIRT
80#include <asm/paravirt.h>
81#endif
82
83#include "sched_cpupri.h"
84#include "workqueue_sched.h"
85#include "sched_autogroup.h"
86
87#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h>
89
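/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO ... MAX_RT_PRIO+39 ],
 * and back.
 */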
95#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
96#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
97#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
98
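/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters;
 * it is a [ 0 ... 39 ] range.
 */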
104#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
105#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
106#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
107
108
109
110
111#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
112
113#define NICE_0_LOAD SCHED_LOAD_SCALE
114#define NICE_0_SHIFT SCHED_LOAD_SHIFT
115
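/*
 * Default timeslice is 100 msecs, expressed in jiffies (used by the
 * SCHED_RR round-robin policy; CFS tasks have no fixed timeslice).
 */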
122#define DEF_TIMESLICE (100 * HZ / 1000)
123
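/*
 * A runtime of RUNTIME_INF (all ones) means "runtime == period",
 * i.e. unlimited bandwidth: the bandwidth timers are never armed for it.
 */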
127#define RUNTIME_INF ((u64)~0ULL)
128
129static inline int rt_policy(int policy)
130{
131 if (policy == SCHED_FIFO || policy == SCHED_RR)
132 return 1;
133 return 0;
134}
135
136static inline int task_has_rt_policy(struct task_struct *p)
137{
138 return rt_policy(p->policy);
139}
140
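/*
 * This is the priority-queue data structure of the RT scheduling class:
 * one list head per RT priority level, plus a bitmap recording which
 * levels currently have queued tasks.
 */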
144struct rt_prio_array {
145 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
146 struct list_head queue[MAX_RT_PRIO];
147};
148
149struct rt_bandwidth {
150
151 raw_spinlock_t rt_runtime_lock;
152 ktime_t rt_period;
153 u64 rt_runtime;
154 struct hrtimer rt_period_timer;
155};
156
157static struct rt_bandwidth def_rt_bandwidth;
158
159static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
160
161static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
162{
163 struct rt_bandwidth *rt_b =
164 container_of(timer, struct rt_bandwidth, rt_period_timer);
165 ktime_t now;
166 int overrun;
167 int idle = 0;
168
169 for (;;) {
170 now = hrtimer_cb_get_time(timer);
171 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
172
173 if (!overrun)
174 break;
175
176 idle = do_sched_rt_period_timer(rt_b, overrun);
177 }
178
179 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
180}
181
182static
183void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
184{
185 rt_b->rt_period = ns_to_ktime(period);
186 rt_b->rt_runtime = runtime;
187
188 raw_spin_lock_init(&rt_b->rt_runtime_lock);
189
190 hrtimer_init(&rt_b->rt_period_timer,
191 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
192 rt_b->rt_period_timer.function = sched_rt_period_timer;
193}
194
195static inline int rt_bandwidth_enabled(void)
196{
197 return sysctl_sched_rt_runtime >= 0;
198}
199
200static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
201{
202 unsigned long delta;
203 ktime_t soft, hard, now;
204
205 for (;;) {
206 if (hrtimer_active(period_timer))
207 break;
208
209 now = hrtimer_cb_get_time(period_timer);
210 hrtimer_forward(period_timer, now, period);
211
212 soft = hrtimer_get_softexpires(period_timer);
213 hard = hrtimer_get_expires(period_timer);
214 delta = ktime_to_ns(ktime_sub(hard, soft));
215 __hrtimer_start_range_ns(period_timer, soft, delta,
216 HRTIMER_MODE_ABS_PINNED, 0);
217 }
218}
219
220static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
221{
222 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
223 return;
224
225 if (hrtimer_active(&rt_b->rt_period_timer))
226 return;
227
228 raw_spin_lock(&rt_b->rt_runtime_lock);
229 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
230 raw_spin_unlock(&rt_b->rt_runtime_lock);
231}
232
233#ifdef CONFIG_RT_GROUP_SCHED
234static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235{
236 hrtimer_cancel(&rt_b->rt_period_timer);
237}
238#endif
239
240
241
242
243
244static DEFINE_MUTEX(sched_domains_mutex);
245
246#ifdef CONFIG_CGROUP_SCHED
247
248#include <linux/cgroup.h>
249
250struct cfs_rq;
251
252static LIST_HEAD(task_groups);
253
254struct cfs_bandwidth {
255#ifdef CONFIG_CFS_BANDWIDTH
256 raw_spinlock_t lock;
257 ktime_t period;
258 u64 quota, runtime;
259 s64 hierarchal_quota;
260 u64 runtime_expires;
261
262 int idle, timer_active;
263 struct hrtimer period_timer, slack_timer;
264 struct list_head throttled_cfs_rq;
265
266
267 int nr_periods, nr_throttled;
268 u64 throttled_time;
269#endif
270};
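/*
 * A task group: per-CPU scheduling entities and runqueues for the fair
 * and RT classes, the group's shares/bandwidth settings, and its place
 * in the group hierarchy (parent/siblings/children).
 */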
273struct task_group {
274 struct cgroup_subsys_state css;
275
276#ifdef CONFIG_FAIR_GROUP_SCHED
277
278 struct sched_entity **se;
279
280 struct cfs_rq **cfs_rq;
281 unsigned long shares;
282
283 atomic_t load_weight;
284#endif
285
286#ifdef CONFIG_RT_GROUP_SCHED
287 struct sched_rt_entity **rt_se;
288 struct rt_rq **rt_rq;
289
290 struct rt_bandwidth rt_bandwidth;
291#endif
292
293 struct rcu_head rcu;
294 struct list_head list;
295
296 struct task_group *parent;
297 struct list_head siblings;
298 struct list_head children;
299
300#ifdef CONFIG_SCHED_AUTOGROUP
301 struct autogroup *autogroup;
302#endif
303
304 struct cfs_bandwidth cfs_bandwidth;
305};
306
307
308static DEFINE_SPINLOCK(task_group_lock);
309
310#ifdef CONFIG_FAIR_GROUP_SCHED
311
312# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
313
314
315
316
317
318
319
320
321
322#define MIN_SHARES (1UL << 1)
323#define MAX_SHARES (1UL << 18)
324
325static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
326#endif
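/*
 * Default task group: the root of the task_group hierarchy that
 * walk_tg_tree() iterates from.
 */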
331struct task_group root_task_group;
332
333#endif
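/* CFS-related fields in a runqueue */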
336struct cfs_rq {
337 struct load_weight load;
338 unsigned long nr_running, h_nr_running;
339
340 u64 exec_clock;
341 u64 min_vruntime;
342#ifndef CONFIG_64BIT
343 u64 min_vruntime_copy;
344#endif
345
346 struct rb_root tasks_timeline;
347 struct rb_node *rb_leftmost;
348
349 struct list_head tasks;
350 struct list_head *balance_iterator;
351
352
353
354
355
356 struct sched_entity *curr, *next, *last, *skip;
357
358#ifdef CONFIG_SCHED_DEBUG
359 unsigned int nr_spread_over;
360#endif
361
362#ifdef CONFIG_FAIR_GROUP_SCHED
363 struct rq *rq;
364
365
366
367
368
369
370
371
372
373 int on_list;
374 struct list_head leaf_cfs_rq_list;
375 struct task_group *tg;
376
377#ifdef CONFIG_SMP
378
379
380
381 unsigned long task_weight;
382
383
384
385
386
387
388
389 unsigned long h_load;
390
391
392
393
394
395
396
397
398 u64 load_avg;
399 u64 load_period;
400 u64 load_stamp, load_last, load_unacc_exec_time;
401
402 unsigned long load_contribution;
403#endif
404#ifdef CONFIG_CFS_BANDWIDTH
405 int runtime_enabled;
406 u64 runtime_expires;
407 s64 runtime_remaining;
408
409 u64 throttled_timestamp;
410 int throttled, throttle_count;
411 struct list_head throttled_list;
412#endif
413#endif
414};
415
416#ifdef CONFIG_FAIR_GROUP_SCHED
417#ifdef CONFIG_CFS_BANDWIDTH
418static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
419{
420 return &tg->cfs_bandwidth;
421}
422
423static inline u64 default_cfs_period(void);
424static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
425static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
426
427static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
428{
429 struct cfs_bandwidth *cfs_b =
430 container_of(timer, struct cfs_bandwidth, slack_timer);
431 do_sched_cfs_slack_timer(cfs_b);
432
433 return HRTIMER_NORESTART;
434}
435
436static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
437{
438 struct cfs_bandwidth *cfs_b =
439 container_of(timer, struct cfs_bandwidth, period_timer);
440 ktime_t now;
441 int overrun;
442 int idle = 0;
443
444 for (;;) {
445 now = hrtimer_cb_get_time(timer);
446 overrun = hrtimer_forward(timer, now, cfs_b->period);
447
448 if (!overrun)
449 break;
450
451 idle = do_sched_cfs_period_timer(cfs_b, overrun);
452 }
453
454 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
455}
456
457static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
458{
459 raw_spin_lock_init(&cfs_b->lock);
460 cfs_b->runtime = 0;
461 cfs_b->quota = RUNTIME_INF;
462 cfs_b->period = ns_to_ktime(default_cfs_period());
463
464 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
465 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
466 cfs_b->period_timer.function = sched_cfs_period_timer;
467 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
468 cfs_b->slack_timer.function = sched_cfs_slack_timer;
469}
470
471static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
472{
473 cfs_rq->runtime_enabled = 0;
474 INIT_LIST_HEAD(&cfs_rq->throttled_list);
475}
476
477
478static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
479{
480
481
482
483
484
485
486 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
487 raw_spin_unlock(&cfs_b->lock);
488
489 hrtimer_cancel(&cfs_b->period_timer);
490
491 raw_spin_lock(&cfs_b->lock);
492
493 if (cfs_b->timer_active)
494 return;
495 }
496
497 cfs_b->timer_active = 1;
498 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
499}
500
501static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
502{
503 hrtimer_cancel(&cfs_b->period_timer);
504 hrtimer_cancel(&cfs_b->slack_timer);
505}
506#else
507static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
508static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
510
511static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
512{
513 return NULL;
514}
515#endif
516#endif
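/* Real-time class related fields in a runqueue */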
519struct rt_rq {
520 struct rt_prio_array active;
521 unsigned long rt_nr_running;
522#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
523 struct {
524 int curr;
525#ifdef CONFIG_SMP
526 int next;
527#endif
528 } highest_prio;
529#endif
530#ifdef CONFIG_SMP
531 unsigned long rt_nr_migratory;
532 unsigned long rt_nr_total;
533 int overloaded;
534 struct plist_head pushable_tasks;
535#endif
536 int rt_throttled;
537 u64 rt_time;
538 u64 rt_runtime;
539
540 raw_spinlock_t rt_runtime_lock;
541
542#ifdef CONFIG_RT_GROUP_SCHED
543 unsigned long rt_nr_boosted;
544
545 struct rq *rq;
546 struct list_head leaf_rt_rq_list;
547 struct task_group *tg;
548#endif
549};
550
551#ifdef CONFIG_SMP
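/*
 * A root domain spans one partition of the system's CPUs and holds the
 * state used for real-time balancing inside that partition: which CPUs
 * are online, which are overloaded with runnable RT tasks (rto_mask),
 * and the cpupri structure used to pick a CPU for an RT task.
 */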
561struct root_domain {
562 atomic_t refcount;
563 atomic_t rto_count;
564 struct rcu_head rcu;
565 cpumask_var_t span;
566 cpumask_var_t online;
567
568
569
570
571
572 cpumask_var_t rto_mask;
573 struct cpupri cpupri;
574};
575
576
577
578
579
580static struct root_domain def_root_domain;
581
582#endif
583
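/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: code that locks multiple runqueues (load balancing,
 * migration) must take the locks in ascending runqueue-address order;
 * see double_rq_lock() and _double_lock_balance() below.
 */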
591struct rq {
592
593 raw_spinlock_t lock;
594
595
596
597
598
599 unsigned long nr_running;
600 #define CPU_LOAD_IDX_MAX 5
601 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
602 unsigned long last_load_update_tick;
603#ifdef CONFIG_NO_HZ
604 u64 nohz_stamp;
605 unsigned char nohz_balance_kick;
606#endif
607 int skip_clock_update;
608
609
610 struct load_weight load;
611 unsigned long nr_load_updates;
612 u64 nr_switches;
613
614 struct cfs_rq cfs;
615 struct rt_rq rt;
616
617#ifdef CONFIG_FAIR_GROUP_SCHED
618
619 struct list_head leaf_cfs_rq_list;
620#endif
621#ifdef CONFIG_RT_GROUP_SCHED
622 struct list_head leaf_rt_rq_list;
623#endif
624
625
626
627
628
629
630
631 unsigned long nr_uninterruptible;
632
633 struct task_struct *curr, *idle, *stop;
634 unsigned long next_balance;
635 struct mm_struct *prev_mm;
636
637 u64 clock;
638 u64 clock_task;
639
640 atomic_t nr_iowait;
641
642#ifdef CONFIG_SMP
643 struct root_domain *rd;
644 struct sched_domain *sd;
645
646 unsigned long cpu_power;
647
648 unsigned char idle_balance;
649
650 int post_schedule;
651 int active_balance;
652 int push_cpu;
653 struct cpu_stop_work active_balance_work;
654
655 int cpu;
656 int online;
657
658 u64 rt_avg;
659 u64 age_stamp;
660 u64 idle_stamp;
661 u64 avg_idle;
662#endif
663
664#ifdef CONFIG_IRQ_TIME_ACCOUNTING
665 u64 prev_irq_time;
666#endif
667#ifdef CONFIG_PARAVIRT
668 u64 prev_steal_time;
669#endif
670#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
671 u64 prev_steal_time_rq;
672#endif
673
674
675 unsigned long calc_load_update;
676 long calc_load_active;
677
678#ifdef CONFIG_SCHED_HRTICK
679#ifdef CONFIG_SMP
680 int hrtick_csd_pending;
681 struct call_single_data hrtick_csd;
682#endif
683 struct hrtimer hrtick_timer;
684#endif
685
686#ifdef CONFIG_SCHEDSTATS
687
688 struct sched_info rq_sched_info;
689 unsigned long long rq_cpu_time;
690
691
692
693 unsigned int yld_count;
694
695
696 unsigned int sched_switch;
697 unsigned int sched_count;
698 unsigned int sched_goidle;
699
700
701 unsigned int ttwu_count;
702 unsigned int ttwu_local;
703#endif
704
705#ifdef CONFIG_SMP
706 struct llist_head wake_list;
707#endif
708};
709
710static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
711
712
713static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
714
715static inline int cpu_of(struct rq *rq)
716{
717#ifdef CONFIG_SMP
718 return rq->cpu;
719#else
720 return 0;
721#endif
722}
723
724#define rcu_dereference_check_sched_domain(p) \
725 rcu_dereference_check((p), \
726 lockdep_is_held(&sched_domains_mutex))
727
728
729
730
731
732
733
734
735#define for_each_domain(cpu, __sd) \
736 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
737
738#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
739#define this_rq() (&__get_cpu_var(runqueues))
740#define task_rq(p) cpu_rq(task_cpu(p))
741#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
742#define raw_rq() (&__raw_get_cpu_var(runqueues))
743
744#ifdef CONFIG_CGROUP_SCHED
745
746
747
748
749
750
751
752
753
754static inline struct task_group *task_group(struct task_struct *p)
755{
756 struct task_group *tg;
757 struct cgroup_subsys_state *css;
758
759 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
760 lockdep_is_held(&p->pi_lock) ||
761 lockdep_is_held(&task_rq(p)->lock));
762 tg = container_of(css, struct task_group, css);
763
764 return autogroup_task_group(p, tg);
765}
766
767
768static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
769{
770#ifdef CONFIG_FAIR_GROUP_SCHED
771 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
772 p->se.parent = task_group(p)->se[cpu];
773#endif
774
775#ifdef CONFIG_RT_GROUP_SCHED
776 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
777 p->rt.parent = task_group(p)->rt_se[cpu];
778#endif
779}
780
781#else
782
783static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
784static inline struct task_group *task_group(struct task_struct *p)
785{
786 return NULL;
787}
788
789#endif
790
791static void update_rq_clock_task(struct rq *rq, s64 delta);
792
793static void update_rq_clock(struct rq *rq)
794{
795 s64 delta;
796
797 if (rq->skip_clock_update > 0)
798 return;
799
800 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
801 rq->clock += delta;
802 update_rq_clock_task(rq, delta);
803}
804
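/*
 * Scheduler tunables marked const_debug are writable under
 * CONFIG_SCHED_DEBUG and compile down to constants otherwise:
 */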
808#ifdef CONFIG_SCHED_DEBUG
809# define const_debug __read_mostly
810#else
811# define const_debug static const
812#endif
813
814
815
816
817
818
819
820
821int runqueue_is_locked(int cpu)
822{
823 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
824}
825
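/*
 * Scheduler feature bits.  Each SCHED_FEAT() entry in sched_features.h
 * is expanded three times: into an enum bit, into the default value of
 * sysctl_sched_features, and (under CONFIG_SCHED_DEBUG) into the name
 * table backing the "sched_features" debugfs file.
 */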
830#define SCHED_FEAT(name, enabled) \
831 __SCHED_FEAT_##name ,
832
833enum {
834#include "sched_features.h"
835};
836
837#undef SCHED_FEAT
838
839#define SCHED_FEAT(name, enabled) \
840 (1UL << __SCHED_FEAT_##name) * enabled |
841
842const_debug unsigned int sysctl_sched_features =
843#include "sched_features.h"
844 0;
845
846#undef SCHED_FEAT
847
848#ifdef CONFIG_SCHED_DEBUG
849#define SCHED_FEAT(name, enabled) \
850 #name ,
851
852static __read_mostly char *sched_feat_names[] = {
853#include "sched_features.h"
854 NULL
855};
856
857#undef SCHED_FEAT
858
859static int sched_feat_show(struct seq_file *m, void *v)
860{
861 int i;
862
863 for (i = 0; sched_feat_names[i]; i++) {
864 if (!(sysctl_sched_features & (1UL << i)))
865 seq_puts(m, "NO_");
866 seq_printf(m, "%s ", sched_feat_names[i]);
867 }
868 seq_puts(m, "\n");
869
870 return 0;
871}
872
873static ssize_t
874sched_feat_write(struct file *filp, const char __user *ubuf,
875 size_t cnt, loff_t *ppos)
876{
877 char buf[64];
878 char *cmp;
879 int neg = 0;
880 int i;
881
882 if (cnt > 63)
883 cnt = 63;
884
885 if (copy_from_user(&buf, ubuf, cnt))
886 return -EFAULT;
887
888 buf[cnt] = 0;
889 cmp = strstrip(buf);
890
891 if (strncmp(cmp, "NO_", 3) == 0) {
892 neg = 1;
893 cmp += 3;
894 }
895
896 for (i = 0; sched_feat_names[i]; i++) {
897 if (strcmp(cmp, sched_feat_names[i]) == 0) {
898 if (neg)
899 sysctl_sched_features &= ~(1UL << i);
900 else
901 sysctl_sched_features |= (1UL << i);
902 break;
903 }
904 }
905
906 if (!sched_feat_names[i])
907 return -EINVAL;
908
909 *ppos += cnt;
910
911 return cnt;
912}
913
914static int sched_feat_open(struct inode *inode, struct file *filp)
915{
916 return single_open(filp, sched_feat_show, NULL);
917}
918
919static const struct file_operations sched_feat_fops = {
920 .open = sched_feat_open,
921 .write = sched_feat_write,
922 .read = seq_read,
923 .llseek = seq_lseek,
924 .release = single_release,
925};
926
927static __init int sched_init_debug(void)
928{
929 debugfs_create_file("sched_features", 0644, NULL, NULL,
930 &sched_feat_fops);
931
932 return 0;
933}
934late_initcall(sched_init_debug);
935
936#endif
937
938#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
939
940
941
942
943
944const_debug unsigned int sysctl_sched_nr_migrate = 32;
945
946
947
948
949
950
951
952const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
953
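/*
 * Period over which we measure RT task CPU usage, in microseconds.
 * Default: 1s.
 */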
958unsigned int sysctl_sched_rt_period = 1000000;
959
960static __read_mostly int scheduler_running;
961
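/*
 * Part of the period that RT tasks are allowed to run, in microseconds.
 * Default: 0.95s; a negative value means no limit (RUNTIME_INF).
 */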
966int sysctl_sched_rt_runtime = 950000;
967
968static inline u64 global_rt_period(void)
969{
970 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
971}
972
973static inline u64 global_rt_runtime(void)
974{
975 if (sysctl_sched_rt_runtime < 0)
976 return RUNTIME_INF;
977
978 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
979}
980
981#ifndef prepare_arch_switch
982# define prepare_arch_switch(next) do { } while (0)
983#endif
984#ifndef finish_arch_switch
985# define finish_arch_switch(prev) do { } while (0)
986#endif
987
988static inline int task_current(struct rq *rq, struct task_struct *p)
989{
990 return rq->curr == p;
991}
992
993static inline int task_running(struct rq *rq, struct task_struct *p)
994{
995#ifdef CONFIG_SMP
996 return p->on_cpu;
997#else
998 return task_current(rq, p);
999#endif
1000}
1001
1002#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1003static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1004{
1005#ifdef CONFIG_SMP
1006
1007
1008
1009
1010
1011 next->on_cpu = 1;
1012#endif
1013}
1014
1015static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1016{
1017#ifdef CONFIG_SMP
1018
1019
1020
1021
1022
1023 smp_wmb();
1024 prev->on_cpu = 0;
1025#endif
1026#ifdef CONFIG_DEBUG_SPINLOCK
1027
1028 rq->lock.owner = current;
1029#endif
1030
1031
1032
1033
1034
1035 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1036
1037 raw_spin_unlock_irq(&rq->lock);
1038}
1039
1040#else
1041static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1042{
1043#ifdef CONFIG_SMP
1044
1045
1046
1047
1048
1049 next->on_cpu = 1;
1050#endif
1051#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1052 raw_spin_unlock_irq(&rq->lock);
1053#else
1054 raw_spin_unlock(&rq->lock);
1055#endif
1056}
1057
1058static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1059{
1060#ifdef CONFIG_SMP
1061
1062
1063
1064
1065
1066 smp_wmb();
1067 prev->on_cpu = 0;
1068#endif
1069#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1070 local_irq_enable();
1071#endif
1072}
1073#endif
1074
1075
1076
1077
1078static inline struct rq *__task_rq_lock(struct task_struct *p)
1079 __acquires(rq->lock)
1080{
1081 struct rq *rq;
1082
1083 lockdep_assert_held(&p->pi_lock);
1084
1085 for (;;) {
1086 rq = task_rq(p);
1087 raw_spin_lock(&rq->lock);
1088 if (likely(rq == task_rq(p)))
1089 return rq;
1090 raw_spin_unlock(&rq->lock);
1091 }
1092}
1093
1094
1095
1096
1097static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
1098 __acquires(p->pi_lock)
1099 __acquires(rq->lock)
1100{
1101 struct rq *rq;
1102
1103 for (;;) {
1104 raw_spin_lock_irqsave(&p->pi_lock, *flags);
1105 rq = task_rq(p);
1106 raw_spin_lock(&rq->lock);
1107 if (likely(rq == task_rq(p)))
1108 return rq;
1109 raw_spin_unlock(&rq->lock);
1110 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1111 }
1112}
1113
1114static void __task_rq_unlock(struct rq *rq)
1115 __releases(rq->lock)
1116{
1117 raw_spin_unlock(&rq->lock);
1118}
1119
1120static inline void
1121task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
1122 __releases(rq->lock)
1123 __releases(p->pi_lock)
1124{
1125 raw_spin_unlock(&rq->lock);
1126 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1127}
1128
1129
1130
1131
1132static struct rq *this_rq_lock(void)
1133 __acquires(rq->lock)
1134{
1135 struct rq *rq;
1136
1137 local_irq_disable();
1138 rq = this_rq();
1139 raw_spin_lock(&rq->lock);
1140
1141 return rq;
1142}
1143
1144#ifdef CONFIG_SCHED_HRTICK
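/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * hrtick_enabled() reports whether the feature can be used on this CPU;
 * hrtick_start() programs the per-runqueue timer, going through an IPI
 * (__hrtick_start) when the target runqueue belongs to another CPU.
 */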
1161static inline int hrtick_enabled(struct rq *rq)
1162{
1163 if (!sched_feat(HRTICK))
1164 return 0;
1165 if (!cpu_active(cpu_of(rq)))
1166 return 0;
1167 return hrtimer_is_hres_active(&rq->hrtick_timer);
1168}
1169
1170static void hrtick_clear(struct rq *rq)
1171{
1172 if (hrtimer_active(&rq->hrtick_timer))
1173 hrtimer_cancel(&rq->hrtick_timer);
1174}
1175
1176
1177
1178
1179
1180static enum hrtimer_restart hrtick(struct hrtimer *timer)
1181{
1182 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1183
1184 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1185
1186 raw_spin_lock(&rq->lock);
1187 update_rq_clock(rq);
1188 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1189 raw_spin_unlock(&rq->lock);
1190
1191 return HRTIMER_NORESTART;
1192}
1193
1194#ifdef CONFIG_SMP
1195
1196
1197
1198static void __hrtick_start(void *arg)
1199{
1200 struct rq *rq = arg;
1201
1202 raw_spin_lock(&rq->lock);
1203 hrtimer_restart(&rq->hrtick_timer);
1204 rq->hrtick_csd_pending = 0;
1205 raw_spin_unlock(&rq->lock);
1206}
1207
1208
1209
1210
1211
1212
1213static void hrtick_start(struct rq *rq, u64 delay)
1214{
1215 struct hrtimer *timer = &rq->hrtick_timer;
1216 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1217
1218 hrtimer_set_expires(timer, time);
1219
1220 if (rq == this_rq()) {
1221 hrtimer_restart(timer);
1222 } else if (!rq->hrtick_csd_pending) {
1223 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1224 rq->hrtick_csd_pending = 1;
1225 }
1226}
1227
1228static int
1229hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1230{
1231 int cpu = (int)(long)hcpu;
1232
1233 switch (action) {
1234 case CPU_UP_CANCELED:
1235 case CPU_UP_CANCELED_FROZEN:
1236 case CPU_DOWN_PREPARE:
1237 case CPU_DOWN_PREPARE_FROZEN:
1238 case CPU_DEAD:
1239 case CPU_DEAD_FROZEN:
1240 hrtick_clear(cpu_rq(cpu));
1241 return NOTIFY_OK;
1242 }
1243
1244 return NOTIFY_DONE;
1245}
1246
1247static __init void init_hrtick(void)
1248{
1249 hotcpu_notifier(hotplug_hrtick, 0);
1250}
1251#else
1252
1253
1254
1255
1256
1257static void hrtick_start(struct rq *rq, u64 delay)
1258{
1259 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1260 HRTIMER_MODE_REL_PINNED, 0);
1261}
1262
1263static inline void init_hrtick(void)
1264{
1265}
1266#endif
1267
1268static void init_rq_hrtick(struct rq *rq)
1269{
1270#ifdef CONFIG_SMP
1271 rq->hrtick_csd_pending = 0;
1272
1273 rq->hrtick_csd.flags = 0;
1274 rq->hrtick_csd.func = __hrtick_start;
1275 rq->hrtick_csd.info = rq;
1276#endif
1277
1278 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1279 rq->hrtick_timer.function = hrtick;
1280}
1281#else
1282static inline void hrtick_clear(struct rq *rq)
1283{
1284}
1285
1286static inline void init_rq_hrtick(struct rq *rq)
1287{
1288}
1289
1290static inline void init_hrtick(void)
1291{
1292}
1293#endif
1294
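/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this merely sets the need_resched flag; on SMP it may also send
 * a reschedule IPI to the CPU the task is running on, unless that CPU is
 * polling (see tsk_is_polling()).
 */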
1302#ifdef CONFIG_SMP
1303
1304#ifndef tsk_is_polling
1305#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1306#endif
1307
1308static void resched_task(struct task_struct *p)
1309{
1310 int cpu;
1311
1312 assert_raw_spin_locked(&task_rq(p)->lock);
1313
1314 if (test_tsk_need_resched(p))
1315 return;
1316
1317 set_tsk_need_resched(p);
1318
1319 cpu = task_cpu(p);
1320 if (cpu == smp_processor_id())
1321 return;
1322
1323
1324 smp_mb();
1325 if (!tsk_is_polling(p))
1326 smp_send_reschedule(cpu);
1327}
1328
1329static void resched_cpu(int cpu)
1330{
1331 struct rq *rq = cpu_rq(cpu);
1332 unsigned long flags;
1333
1334 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1335 return;
1336 resched_task(cpu_curr(cpu));
1337 raw_spin_unlock_irqrestore(&rq->lock, flags);
1338}
1339
1340#ifdef CONFIG_NO_HZ
1341
1342
1343
1344
1345
1346
1347
1348
1349int get_nohz_timer_target(void)
1350{
1351 int cpu = smp_processor_id();
1352 int i;
1353 struct sched_domain *sd;
1354
1355 rcu_read_lock();
1356 for_each_domain(cpu, sd) {
1357 for_each_cpu(i, sched_domain_span(sd)) {
1358 if (!idle_cpu(i)) {
1359 cpu = i;
1360 goto unlock;
1361 }
1362 }
1363 }
1364unlock:
1365 rcu_read_unlock();
1366 return cpu;
1367}
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378void wake_up_idle_cpu(int cpu)
1379{
1380 struct rq *rq = cpu_rq(cpu);
1381
1382 if (cpu == smp_processor_id())
1383 return;
1384
1385
1386
1387
1388
1389
1390
1391
1392 if (rq->curr != rq->idle)
1393 return;
1394
1395
1396
1397
1398
1399
1400 set_tsk_need_resched(rq->idle);
1401
1402
1403 smp_mb();
1404 if (!tsk_is_polling(rq->idle))
1405 smp_send_reschedule(cpu);
1406}
1407
1408static inline bool got_nohz_idle_kick(void)
1409{
1410 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
1411}
1412
1413#else
1414
1415static inline bool got_nohz_idle_kick(void)
1416{
1417 return false;
1418}
1419
1420#endif
1421
1422static u64 sched_avg_period(void)
1423{
1424 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1425}
1426
1427static void sched_avg_update(struct rq *rq)
1428{
1429 s64 period = sched_avg_period();
1430
1431 while ((s64)(rq->clock - rq->age_stamp) > period) {
1432
1433
1434
1435
1436
1437 asm("" : "+rm" (rq->age_stamp));
1438 rq->age_stamp += period;
1439 rq->rt_avg /= 2;
1440 }
1441}
1442
1443static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1444{
1445 rq->rt_avg += rt_delta;
1446 sched_avg_update(rq);
1447}
1448
1449#else
1450static void resched_task(struct task_struct *p)
1451{
1452 assert_raw_spin_locked(&task_rq(p)->lock);
1453 set_tsk_need_resched(p);
1454}
1455
1456static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1457{
1458}
1459
1460static void sched_avg_update(struct rq *rq)
1461{
1462}
1463#endif
1464
1465#if BITS_PER_LONG == 32
1466# define WMULT_CONST (~0UL)
1467#else
1468# define WMULT_CONST (1UL << 32)
1469#endif
1470
1471#define WMULT_SHIFT 32
1472
1473
1474
1475
1476#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1477
1478
1479
1480
1481static unsigned long
1482calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1483 struct load_weight *lw)
1484{
1485 u64 tmp;
1486
1487
1488
1489
1490
1491
1492 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1493 tmp = (u64)delta_exec * scale_load_down(weight);
1494 else
1495 tmp = (u64)delta_exec;
1496
1497 if (!lw->inv_weight) {
1498 unsigned long w = scale_load_down(lw->weight);
1499
1500 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1501 lw->inv_weight = 1;
1502 else if (unlikely(!w))
1503 lw->inv_weight = WMULT_CONST;
1504 else
1505 lw->inv_weight = WMULT_CONST / w;
1506 }
1507
1508
1509
1510
1511 if (unlikely(tmp > WMULT_CONST))
1512 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1513 WMULT_SHIFT/2);
1514 else
1515 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1516
1517 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1518}
1519
1520static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1521{
1522 lw->weight += inc;
1523 lw->inv_weight = 0;
1524}
1525
1526static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1527{
1528 lw->weight -= dec;
1529 lw->inv_weight = 0;
1530}
1531
1532static inline void update_load_set(struct load_weight *lw, unsigned long w)
1533{
1534 lw->weight = w;
1535 lw->inv_weight = 0;
1536}
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547#define WEIGHT_IDLEPRIO 3
1548#define WMULT_IDLEPRIO 1431655765
1549
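/*
 * Nice levels are multiplicative: neighbouring entries of this table
 * differ by a factor of about 1.25, so each nice level is worth roughly
 * 10% of CPU relative to the adjacent level.  The nice-0 entry, 1024,
 * is the reference weight.
 */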
1562static const int prio_to_weight[40] = {
1563 88761, 71755, 56483, 46273, 36291,
1564 29154, 23254, 18705, 14949, 11916,
1565 9548, 7620, 6100, 4904, 3906,
1566 3121, 2501, 1991, 1586, 1277,
1567 1024, 820, 655, 526, 423,
1568 335, 272, 215, 172, 137,
1569 110, 87, 70, 56, 45,
1570 36, 29, 23, 18, 15,
1571};
1572
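/*
 * Inverse (2^32/x) values of the prio_to_weight[] table, precalculated
 * so that set_load_weight() can fill in se.load.inv_weight without a
 * division in the hot path (see calc_delta_mine()).
 */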
1580static const u32 prio_to_wmult[40] = {
1581 48388, 59856, 76040, 92818, 118348,
1582 147320, 184698, 229616, 287308, 360437,
1583 449829, 563644, 704093, 875809, 1099582,
1584 1376151, 1717300, 2157191, 2708050, 3363326,
1585 4194304, 5237765, 6557202, 8165337, 10153587,
1586 12820798, 15790321, 19976592, 24970740, 31350126,
1587 39045157, 49367440, 61356676, 76695844, 95443717,
1588 119304647, 148102320, 186737708, 238609294, 286331153,
1589};
1590
1591
1592enum cpuacct_stat_index {
1593 CPUACCT_STAT_USER,
1594 CPUACCT_STAT_SYSTEM,
1595
1596 CPUACCT_STAT_NSTATS,
1597};
1598
1599#ifdef CONFIG_CGROUP_CPUACCT
1600static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1601static void cpuacct_update_stats(struct task_struct *tsk,
1602 enum cpuacct_stat_index idx, cputime_t val);
1603#else
1604static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1605static inline void cpuacct_update_stats(struct task_struct *tsk,
1606 enum cpuacct_stat_index idx, cputime_t val) {}
1607#endif
1608
1609static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1610{
1611 update_load_add(&rq->load, load);
1612}
1613
1614static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1615{
1616 update_load_sub(&rq->load, load);
1617}
1618
1619#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1620 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1621typedef int (*tg_visitor)(struct task_group *, void *);
1622
1623
1624
1625
1626
1627
1628
1629static int walk_tg_tree_from(struct task_group *from,
1630 tg_visitor down, tg_visitor up, void *data)
1631{
1632 struct task_group *parent, *child;
1633 int ret;
1634
1635 parent = from;
1636
1637down:
1638 ret = (*down)(parent, data);
1639 if (ret)
1640 goto out;
1641 list_for_each_entry_rcu(child, &parent->children, siblings) {
1642 parent = child;
1643 goto down;
1644
1645up:
1646 continue;
1647 }
1648 ret = (*up)(parent, data);
1649 if (ret || parent == from)
1650 goto out;
1651
1652 child = parent;
1653 parent = parent->parent;
1654 if (parent)
1655 goto up;
1656out:
1657 return ret;
1658}
1659
1660
1661
1662
1663
1664
1665
1666
1667static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1668{
1669 return walk_tg_tree_from(&root_task_group, down, up, data);
1670}
1671
1672static int tg_nop(struct task_group *tg, void *data)
1673{
1674 return 0;
1675}
1676#endif
1677
1678#ifdef CONFIG_SMP
1679
1680static unsigned long weighted_cpuload(const int cpu)
1681{
1682 return cpu_rq(cpu)->load.weight;
1683}
1684
1685
1686
1687
1688
1689
1690
1691
1692static unsigned long source_load(int cpu, int type)
1693{
1694 struct rq *rq = cpu_rq(cpu);
1695 unsigned long total = weighted_cpuload(cpu);
1696
1697 if (type == 0 || !sched_feat(LB_BIAS))
1698 return total;
1699
1700 return min(rq->cpu_load[type-1], total);
1701}
1702
1703
1704
1705
1706
1707static unsigned long target_load(int cpu, int type)
1708{
1709 struct rq *rq = cpu_rq(cpu);
1710 unsigned long total = weighted_cpuload(cpu);
1711
1712 if (type == 0 || !sched_feat(LB_BIAS))
1713 return total;
1714
1715 return max(rq->cpu_load[type-1], total);
1716}
1717
1718static unsigned long power_of(int cpu)
1719{
1720 return cpu_rq(cpu)->cpu_power;
1721}
1722
1723static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1724
1725static unsigned long cpu_avg_load_per_task(int cpu)
1726{
1727 struct rq *rq = cpu_rq(cpu);
1728 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1729
1730 if (nr_running)
1731 return rq->load.weight / nr_running;
1732
1733 return 0;
1734}
1735
1736#ifdef CONFIG_PREEMPT
1737
1738static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1749 __releases(this_rq->lock)
1750 __acquires(busiest->lock)
1751 __acquires(this_rq->lock)
1752{
1753 raw_spin_unlock(&this_rq->lock);
1754 double_rq_lock(this_rq, busiest);
1755
1756 return 1;
1757}
1758
1759#else
1760
1761
1762
1763
1764
1765
1766
1767static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1768 __releases(this_rq->lock)
1769 __acquires(busiest->lock)
1770 __acquires(this_rq->lock)
1771{
1772 int ret = 0;
1773
1774 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1775 if (busiest < this_rq) {
1776 raw_spin_unlock(&this_rq->lock);
1777 raw_spin_lock(&busiest->lock);
1778 raw_spin_lock_nested(&this_rq->lock,
1779 SINGLE_DEPTH_NESTING);
1780 ret = 1;
1781 } else
1782 raw_spin_lock_nested(&busiest->lock,
1783 SINGLE_DEPTH_NESTING);
1784 }
1785 return ret;
1786}
1787
1788#endif
1789
1790
1791
1792
1793static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1794{
1795 if (unlikely(!irqs_disabled())) {
1796
1797 raw_spin_unlock(&this_rq->lock);
1798 BUG_ON(1);
1799 }
1800
1801 return _double_lock_balance(this_rq, busiest);
1802}
1803
1804static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 __releases(busiest->lock)
1806{
1807 raw_spin_unlock(&busiest->lock);
1808 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1809}
1810
1811
1812
1813
1814
1815
1816
1817static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1818 __acquires(rq1->lock)
1819 __acquires(rq2->lock)
1820{
1821 BUG_ON(!irqs_disabled());
1822 if (rq1 == rq2) {
1823 raw_spin_lock(&rq1->lock);
1824 __acquire(rq2->lock);
1825 } else {
1826 if (rq1 < rq2) {
1827 raw_spin_lock(&rq1->lock);
1828 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1829 } else {
1830 raw_spin_lock(&rq2->lock);
1831 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1832 }
1833 }
1834}
1835
1836
1837
1838
1839
1840
1841
1842static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1843 __releases(rq1->lock)
1844 __releases(rq2->lock)
1845{
1846 raw_spin_unlock(&rq1->lock);
1847 if (rq1 != rq2)
1848 raw_spin_unlock(&rq2->lock);
1849 else
1850 __release(rq2->lock);
1851}
1852
1853#else
1854
1855
1856
1857
1858
1859
1860
1861static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1862 __acquires(rq1->lock)
1863 __acquires(rq2->lock)
1864{
1865 BUG_ON(!irqs_disabled());
1866 BUG_ON(rq1 != rq2);
1867 raw_spin_lock(&rq1->lock);
1868 __acquire(rq2->lock);
1869}
1870
1871
1872
1873
1874
1875
1876
1877static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1878 __releases(rq1->lock)
1879 __releases(rq2->lock)
1880{
1881 BUG_ON(rq1 != rq2);
1882 raw_spin_unlock(&rq1->lock);
1883 __release(rq2->lock);
1884}
1885
1886#endif
1887
1888static void calc_load_account_idle(struct rq *this_rq);
1889static void update_sysctl(void);
1890static int get_update_sysctl_factor(void);
1891static void update_cpu_load(struct rq *this_rq);
1892
1893static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1894{
1895 set_task_rq(p, cpu);
1896#ifdef CONFIG_SMP
1897
1898
1899
1900
1901
1902 smp_wmb();
1903 task_thread_info(p)->cpu = cpu;
1904#endif
1905}
1906
1907static const struct sched_class rt_sched_class;
1908
1909#define sched_class_highest (&stop_sched_class)
1910#define for_each_class(class) \
1911 for (class = sched_class_highest; class; class = class->next)
1912
1913#include "sched_stats.h"
1914
1915static void inc_nr_running(struct rq *rq)
1916{
1917 rq->nr_running++;
1918}
1919
1920static void dec_nr_running(struct rq *rq)
1921{
1922 rq->nr_running--;
1923}
1924
1925static void set_load_weight(struct task_struct *p)
1926{
1927 int prio = p->static_prio - MAX_RT_PRIO;
1928 struct load_weight *load = &p->se.load;
1929
1930
1931
1932
1933 if (p->policy == SCHED_IDLE) {
1934 load->weight = scale_load(WEIGHT_IDLEPRIO);
1935 load->inv_weight = WMULT_IDLEPRIO;
1936 return;
1937 }
1938
1939 load->weight = scale_load(prio_to_weight[prio]);
1940 load->inv_weight = prio_to_wmult[prio];
1941}
1942
1943static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1944{
1945 update_rq_clock(rq);
1946 sched_info_queued(p);
1947 p->sched_class->enqueue_task(rq, p, flags);
1948}
1949
1950static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1951{
1952 update_rq_clock(rq);
1953 sched_info_dequeued(p);
1954 p->sched_class->dequeue_task(rq, p, flags);
1955}
1956
1957
1958
1959
1960static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1961{
1962 if (task_contributes_to_load(p))
1963 rq->nr_uninterruptible--;
1964
1965 enqueue_task(rq, p, flags);
1966}
1967
1968
1969
1970
1971static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1972{
1973 if (task_contributes_to_load(p))
1974 rq->nr_uninterruptible++;
1975
1976 dequeue_task(rq, p, flags);
1977}
1978
1979#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1993static DEFINE_PER_CPU(u64, cpu_softirq_time);
1994
1995static DEFINE_PER_CPU(u64, irq_start_time);
1996static int sched_clock_irqtime;
1997
1998void enable_sched_clock_irqtime(void)
1999{
2000 sched_clock_irqtime = 1;
2001}
2002
2003void disable_sched_clock_irqtime(void)
2004{
2005 sched_clock_irqtime = 0;
2006}
2007
2008#ifndef CONFIG_64BIT
2009static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
2010
2011static inline void irq_time_write_begin(void)
2012{
2013 __this_cpu_inc(irq_time_seq.sequence);
2014 smp_wmb();
2015}
2016
2017static inline void irq_time_write_end(void)
2018{
2019 smp_wmb();
2020 __this_cpu_inc(irq_time_seq.sequence);
2021}
2022
2023static inline u64 irq_time_read(int cpu)
2024{
2025 u64 irq_time;
2026 unsigned seq;
2027
2028 do {
2029 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
2030 irq_time = per_cpu(cpu_softirq_time, cpu) +
2031 per_cpu(cpu_hardirq_time, cpu);
2032 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
2033
2034 return irq_time;
2035}
2036#else
2037static inline void irq_time_write_begin(void)
2038{
2039}
2040
2041static inline void irq_time_write_end(void)
2042{
2043}
2044
2045static inline u64 irq_time_read(int cpu)
2046{
2047 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
2048}
2049#endif
2050
2051
2052
2053
2054
2055void account_system_vtime(struct task_struct *curr)
2056{
2057 unsigned long flags;
2058 s64 delta;
2059 int cpu;
2060
2061 if (!sched_clock_irqtime)
2062 return;
2063
2064 local_irq_save(flags);
2065
2066 cpu = smp_processor_id();
2067 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
2068 __this_cpu_add(irq_start_time, delta);
2069
2070 irq_time_write_begin();
2071
2072
2073
2074
2075
2076
2077 if (hardirq_count())
2078 __this_cpu_add(cpu_hardirq_time, delta);
2079 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
2080 __this_cpu_add(cpu_softirq_time, delta);
2081
2082 irq_time_write_end();
2083 local_irq_restore(flags);
2084}
2085EXPORT_SYMBOL_GPL(account_system_vtime);
2086
2087#endif
2088
2089#ifdef CONFIG_PARAVIRT
2090static inline u64 steal_ticks(u64 steal)
2091{
2092 if (unlikely(steal > NSEC_PER_SEC))
2093 return div_u64(steal, TICK_NSEC);
2094
2095 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
2096}
2097#endif
2098
2099static void update_rq_clock_task(struct rq *rq, s64 delta)
2100{
2101
2102
2103
2104
2105#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
2106 s64 steal = 0, irq_delta = 0;
2107#endif
2108#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2109 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126 if (irq_delta > delta)
2127 irq_delta = delta;
2128
2129 rq->prev_irq_time += irq_delta;
2130 delta -= irq_delta;
2131#endif
2132#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
2133 if (static_branch((&paravirt_steal_rq_enabled))) {
2134 u64 st;
2135
2136 steal = paravirt_steal_clock(cpu_of(rq));
2137 steal -= rq->prev_steal_time_rq;
2138
2139 if (unlikely(steal > delta))
2140 steal = delta;
2141
2142 st = steal_ticks(steal);
2143 steal = st * TICK_NSEC;
2144
2145 rq->prev_steal_time_rq += steal;
2146
2147 delta -= steal;
2148 }
2149#endif
2150
2151 rq->clock_task += delta;
2152
2153#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
2154 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
2155 sched_rt_avg_update(rq, irq_delta + steal);
2156#endif
2157}
2158
2159#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2160static int irqtime_account_hi_update(void)
2161{
2162 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2163 unsigned long flags;
2164 u64 latest_ns;
2165 int ret = 0;
2166
2167 local_irq_save(flags);
2168 latest_ns = this_cpu_read(cpu_hardirq_time);
2169 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
2170 ret = 1;
2171 local_irq_restore(flags);
2172 return ret;
2173}
2174
2175static int irqtime_account_si_update(void)
2176{
2177 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2178 unsigned long flags;
2179 u64 latest_ns;
2180 int ret = 0;
2181
2182 local_irq_save(flags);
2183 latest_ns = this_cpu_read(cpu_softirq_time);
2184 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
2185 ret = 1;
2186 local_irq_restore(flags);
2187 return ret;
2188}
2189
2190#else
2191
2192#define sched_clock_irqtime (0)
2193
2194#endif
2195
2196#include "sched_idletask.c"
2197#include "sched_fair.c"
2198#include "sched_rt.c"
2199#include "sched_autogroup.c"
2200#include "sched_stoptask.c"
2201#ifdef CONFIG_SCHED_DEBUG
2202# include "sched_debug.c"
2203#endif
2204
2205void sched_set_stop_task(int cpu, struct task_struct *stop)
2206{
2207 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2208 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2209
2210 if (stop) {
2211
2212
2213
2214
2215
2216
2217
2218
2219 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2220
2221 stop->sched_class = &stop_sched_class;
2222 }
2223
2224 cpu_rq(cpu)->stop = stop;
2225
2226 if (old_stop) {
2227
2228
2229
2230
2231 old_stop->sched_class = &rt_sched_class;
2232 }
2233}
2234
2235
2236
2237
2238static inline int __normal_prio(struct task_struct *p)
2239{
2240 return p->static_prio;
2241}
2242
2243
2244
2245
2246
2247
2248
2249
2250static inline int normal_prio(struct task_struct *p)
2251{
2252 int prio;
2253
2254 if (task_has_rt_policy(p))
2255 prio = MAX_RT_PRIO-1 - p->rt_priority;
2256 else
2257 prio = __normal_prio(p);
2258 return prio;
2259}
2260
2261
2262
2263
2264
2265
2266
2267
2268static int effective_prio(struct task_struct *p)
2269{
2270 p->normal_prio = normal_prio(p);
2271
2272
2273
2274
2275
2276 if (!rt_prio(p->prio))
2277 return p->normal_prio;
2278 return p->prio;
2279}
2280
2281
2282
2283
2284
2285inline int task_curr(const struct task_struct *p)
2286{
2287 return cpu_curr(task_cpu(p)) == p;
2288}
2289
2290static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2291 const struct sched_class *prev_class,
2292 int oldprio)
2293{
2294 if (prev_class != p->sched_class) {
2295 if (prev_class->switched_from)
2296 prev_class->switched_from(rq, p);
2297 p->sched_class->switched_to(rq, p);
2298 } else if (oldprio != p->prio)
2299 p->sched_class->prio_changed(rq, p, oldprio);
2300}
2301
2302static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2303{
2304 const struct sched_class *class;
2305
2306 if (p->sched_class == rq->curr->sched_class) {
2307 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2308 } else {
2309 for_each_class(class) {
2310 if (class == rq->curr->sched_class)
2311 break;
2312 if (class == p->sched_class) {
2313 resched_task(rq->curr);
2314 break;
2315 }
2316 }
2317 }
2318
2319
2320
2321
2322
2323 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2324 rq->skip_clock_update = 1;
2325}
2326
2327#ifdef CONFIG_SMP
2328
2329
2330
2331static int
2332task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2333{
2334 s64 delta;
2335
2336 if (p->sched_class != &fair_sched_class)
2337 return 0;
2338
2339 if (unlikely(p->policy == SCHED_IDLE))
2340 return 0;
2341
2342
2343
2344
2345 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2346 (&p->se == cfs_rq_of(&p->se)->next ||
2347 &p->se == cfs_rq_of(&p->se)->last))
2348 return 1;
2349
2350 if (sysctl_sched_migration_cost == -1)
2351 return 1;
2352 if (sysctl_sched_migration_cost == 0)
2353 return 0;
2354
2355 delta = now - p->se.exec_start;
2356
2357 return delta < (s64)sysctl_sched_migration_cost;
2358}
2359
2360void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2361{
2362#ifdef CONFIG_SCHED_DEBUG
2363
2364
2365
2366
2367 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2368 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2369
2370#ifdef CONFIG_LOCKDEP
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2382 lockdep_is_held(&task_rq(p)->lock)));
2383#endif
2384#endif
2385
2386 trace_sched_migrate_task(p, new_cpu);
2387
2388 if (task_cpu(p) != new_cpu) {
2389 p->se.nr_migrations++;
2390 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
2391 }
2392
2393 __set_task_cpu(p, new_cpu);
2394}
2395
2396struct migration_arg {
2397 struct task_struct *task;
2398 int dest_cpu;
2399};
2400
2401static int migration_cpu_stop(void *data);
2402
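/*
 * wait_task_inactive - wait until @p is no longer running on any CPU.
 *
 * If @match_state is non-zero and @p's state stops matching it while it
 * is still on a CPU, give up and return 0.  Otherwise wait until @p has
 * scheduled out and return a non-zero switch-count snapshot
 * (p->nvcsw | LONG_MIN) taken while it was off the runqueue, which the
 * caller can use to detect whether it has since run again.
 */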
2419unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2420{
2421 unsigned long flags;
2422 int running, on_rq;
2423 unsigned long ncsw;
2424 struct rq *rq;
2425
2426 for (;;) {
2427
2428
2429
2430
2431
2432
2433 rq = task_rq(p);
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446 while (task_running(rq, p)) {
2447 if (match_state && unlikely(p->state != match_state))
2448 return 0;
2449 cpu_relax();
2450 }
2451
2452
2453
2454
2455
2456
2457 rq = task_rq_lock(p, &flags);
2458 trace_sched_wait_task(p);
2459 running = task_running(rq, p);
2460 on_rq = p->on_rq;
2461 ncsw = 0;
2462 if (!match_state || p->state == match_state)
2463 ncsw = p->nvcsw | LONG_MIN;
2464 task_rq_unlock(rq, p, &flags);
2465
2466
2467
2468
2469 if (unlikely(!ncsw))
2470 break;
2471
2472
2473
2474
2475
2476
2477
2478 if (unlikely(running)) {
2479 cpu_relax();
2480 continue;
2481 }
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492 if (unlikely(on_rq)) {
2493 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2494
2495 set_current_state(TASK_UNINTERRUPTIBLE);
2496 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2497 continue;
2498 }
2499
2500
2501
2502
2503
2504
2505 break;
2506 }
2507
2508 return ncsw;
2509}
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524void kick_process(struct task_struct *p)
2525{
2526 int cpu;
2527
2528 preempt_disable();
2529 cpu = task_cpu(p);
2530 if ((cpu != smp_processor_id()) && task_curr(p))
2531 smp_send_reschedule(cpu);
2532 preempt_enable();
2533}
2534EXPORT_SYMBOL_GPL(kick_process);
2535#endif
2536
2537#ifdef CONFIG_SMP
2538
2539
2540
2541static int select_fallback_rq(int cpu, struct task_struct *p)
2542{
2543 int dest_cpu;
2544 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2545
2546
2547 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2548 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
2549 return dest_cpu;
2550
2551
2552 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
2553 if (dest_cpu < nr_cpu_ids)
2554 return dest_cpu;
2555
2556
2557 dest_cpu = cpuset_cpus_allowed_fallback(p);
2558
2559
2560
2561
2562
2563 if (p->mm && printk_ratelimit()) {
2564 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2565 task_pid_nr(p), p->comm, cpu);
2566 }
2567
2568 return dest_cpu;
2569}
2570
2571
2572
2573
2574static inline
2575int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2576{
2577 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
2590 !cpu_online(cpu)))
2591 cpu = select_fallback_rq(task_cpu(p), p);
2592
2593 return cpu;
2594}
2595
2596static void update_avg(u64 *avg, u64 sample)
2597{
2598 s64 diff = sample - *avg;
2599 *avg += diff >> 3;
2600}
2601#endif
2602
2603static void
2604ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2605{
2606#ifdef CONFIG_SCHEDSTATS
2607 struct rq *rq = this_rq();
2608
2609#ifdef CONFIG_SMP
2610 int this_cpu = smp_processor_id();
2611
2612 if (cpu == this_cpu) {
2613 schedstat_inc(rq, ttwu_local);
2614 schedstat_inc(p, se.statistics.nr_wakeups_local);
2615 } else {
2616 struct sched_domain *sd;
2617
2618 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2619 rcu_read_lock();
2620 for_each_domain(this_cpu, sd) {
2621 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2622 schedstat_inc(sd, ttwu_wake_remote);
2623 break;
2624 }
2625 }
2626 rcu_read_unlock();
2627 }
2628
2629 if (wake_flags & WF_MIGRATED)
2630 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2631
2632#endif
2633
2634 schedstat_inc(rq, ttwu_count);
2635 schedstat_inc(p, se.statistics.nr_wakeups);
2636
2637 if (wake_flags & WF_SYNC)
2638 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2639
2640#endif
2641}
2642
2643static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2644{
2645 activate_task(rq, p, en_flags);
2646 p->on_rq = 1;
2647
2648
2649 if (p->flags & PF_WQ_WORKER)
2650 wq_worker_waking_up(p, cpu_of(rq));
2651}
2652
2653
2654
2655
2656static void
2657ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2658{
2659 trace_sched_wakeup(p, true);
2660 check_preempt_curr(rq, p, wake_flags);
2661
2662 p->state = TASK_RUNNING;
2663#ifdef CONFIG_SMP
2664 if (p->sched_class->task_woken)
2665 p->sched_class->task_woken(rq, p);
2666
2667 if (rq->idle_stamp) {
2668 u64 delta = rq->clock - rq->idle_stamp;
2669 u64 max = 2*sysctl_sched_migration_cost;
2670
2671 if (delta > max)
2672 rq->avg_idle = max;
2673 else
2674 update_avg(&rq->avg_idle, delta);
2675 rq->idle_stamp = 0;
2676 }
2677#endif
2678}
2679
2680static void
2681ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2682{
2683#ifdef CONFIG_SMP
2684 if (p->sched_contributes_to_load)
2685 rq->nr_uninterruptible--;
2686#endif
2687
2688 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2689 ttwu_do_wakeup(rq, p, wake_flags);
2690}
2691
2692
2693
2694
2695
2696
2697
2698static int ttwu_remote(struct task_struct *p, int wake_flags)
2699{
2700 struct rq *rq;
2701 int ret = 0;
2702
2703 rq = __task_rq_lock(p);
2704 if (p->on_rq) {
2705 ttwu_do_wakeup(rq, p, wake_flags);
2706 ret = 1;
2707 }
2708 __task_rq_unlock(rq);
2709
2710 return ret;
2711}
2712
2713#ifdef CONFIG_SMP
2714static void sched_ttwu_pending(void)
2715{
2716 struct rq *rq = this_rq();
2717 struct llist_node *llist = llist_del_all(&rq->wake_list);
2718 struct task_struct *p;
2719
2720 raw_spin_lock(&rq->lock);
2721
2722 while (llist) {
2723 p = llist_entry(llist, struct task_struct, wake_entry);
2724 llist = llist_next(llist);
2725 ttwu_do_activate(rq, p, 0);
2726 }
2727
2728 raw_spin_unlock(&rq->lock);
2729}
2730
2731void scheduler_ipi(void)
2732{
2733 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2734 return;
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749 irq_enter();
2750 sched_ttwu_pending();
2751
2752
2753
2754
2755 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
2756 this_rq()->idle_balance = 1;
2757 raise_softirq_irqoff(SCHED_SOFTIRQ);
2758 }
2759 irq_exit();
2760}
2761
2762static void ttwu_queue_remote(struct task_struct *p, int cpu)
2763{
2764 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
2765 smp_send_reschedule(cpu);
2766}
2767
2768#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2769static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2770{
2771 struct rq *rq;
2772 int ret = 0;
2773
2774 rq = __task_rq_lock(p);
2775 if (p->on_cpu) {
2776 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2777 ttwu_do_wakeup(rq, p, wake_flags);
2778 ret = 1;
2779 }
2780 __task_rq_unlock(rq);
2781
2782 return ret;
2783
2784}
2785#endif
2786#endif
2787
2788static void ttwu_queue(struct task_struct *p, int cpu)
2789{
2790 struct rq *rq = cpu_rq(cpu);
2791
2792#if defined(CONFIG_SMP)
2793 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2794 sched_clock_cpu(cpu);
2795 ttwu_queue_remote(p, cpu);
2796 return;
2797 }
2798#endif
2799
2800 raw_spin_lock(&rq->lock);
2801 ttwu_do_activate(rq, p, 0);
2802 raw_spin_unlock(&rq->lock);
2803}
2804
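/*
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * If @p's state matches @state, put it on a runqueue: either directly
 * via ttwu_remote()/ttwu_queue() or, on SMP, after selecting a CPU with
 * select_task_rq().  Returns 1 if a wakeup was performed, 0 if @p's
 * state did not match @state.
 */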
2820static int
2821try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2822{
2823 unsigned long flags;
2824 int cpu, success = 0;
2825
2826 smp_wmb();
2827 raw_spin_lock_irqsave(&p->pi_lock, flags);
2828 if (!(p->state & state))
2829 goto out;
2830
2831 success = 1;
2832 cpu = task_cpu(p);
2833
2834 if (p->on_rq && ttwu_remote(p, wake_flags))
2835 goto stat;
2836
2837#ifdef CONFIG_SMP
2838
2839
2840
2841
2842 while (p->on_cpu) {
2843#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2844
2845
2846
2847
2848
2849
2850
2851 if (ttwu_activate_remote(p, wake_flags))
2852 goto stat;
2853#else
2854 cpu_relax();
2855#endif
2856 }
2857
2858
2859
2860 smp_rmb();
2861
2862 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2863 p->state = TASK_WAKING;
2864
2865 if (p->sched_class->task_waking)
2866 p->sched_class->task_waking(p);
2867
2868 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2869 if (task_cpu(p) != cpu) {
2870 wake_flags |= WF_MIGRATED;
2871 set_task_cpu(p, cpu);
2872 }
2873#endif
2874
2875 ttwu_queue(p, cpu);
2876stat:
2877 ttwu_stat(p, cpu, wake_flags);
2878out:
2879 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2880
2881 return success;
2882}
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892static void try_to_wake_up_local(struct task_struct *p)
2893{
2894 struct rq *rq = task_rq(p);
2895
2896 BUG_ON(rq != this_rq());
2897 BUG_ON(p == current);
2898 lockdep_assert_held(&rq->lock);
2899
2900 if (!raw_spin_trylock(&p->pi_lock)) {
2901 raw_spin_unlock(&rq->lock);
2902 raw_spin_lock(&p->pi_lock);
2903 raw_spin_lock(&rq->lock);
2904 }
2905
2906 if (!(p->state & TASK_NORMAL))
2907 goto out;
2908
2909 if (!p->on_rq)
2910 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2911
2912 ttwu_do_wakeup(rq, p, 0);
2913 ttwu_stat(p, smp_processor_id(), 0);
2914out:
2915 raw_spin_unlock(&p->pi_lock);
2916}
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929int wake_up_process(struct task_struct *p)
2930{
2931 return try_to_wake_up(p, TASK_ALL, 0);
2932}
2933EXPORT_SYMBOL(wake_up_process);
2934
2935int wake_up_state(struct task_struct *p, unsigned int state)
2936{
2937 return try_to_wake_up(p, state, 0);
2938}
2939
2940
2941
2942
2943
2944
2945
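/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 */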
2946static void __sched_fork(struct task_struct *p)
2947{
2948 p->on_rq = 0;
2949
2950 p->se.on_rq = 0;
2951 p->se.exec_start = 0;
2952 p->se.sum_exec_runtime = 0;
2953 p->se.prev_sum_exec_runtime = 0;
2954 p->se.nr_migrations = 0;
2955 p->se.vruntime = 0;
2956 INIT_LIST_HEAD(&p->se.group_node);
2957
2958#ifdef CONFIG_SCHEDSTATS
2959 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2960#endif
2961
2962 INIT_LIST_HEAD(&p->rt.run_list);
2963
2964#ifdef CONFIG_PREEMPT_NOTIFIERS
2965 INIT_HLIST_HEAD(&p->preempt_notifiers);
2966#endif
2967}
2968
2969
2970
2971
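/*
 * fork()/clone()-time setup:
 */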
2972void sched_fork(struct task_struct *p)
2973{
2974 unsigned long flags;
2975 int cpu = get_cpu();
2976
2977 __sched_fork(p);
2978
2979
2980
2981
2982
2983 p->state = TASK_RUNNING;
2984
2985
2986
2987
2988 p->prio = current->normal_prio;
2989
2990
2991
2992
2993 if (unlikely(p->sched_reset_on_fork)) {
2994 if (task_has_rt_policy(p)) {
2995 p->policy = SCHED_NORMAL;
2996 p->static_prio = NICE_TO_PRIO(0);
2997 p->rt_priority = 0;
2998 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2999 p->static_prio = NICE_TO_PRIO(0);
3000
3001 p->prio = p->normal_prio = __normal_prio(p);
3002 set_load_weight(p);
3003
3004
3005
3006
3007
3008 p->sched_reset_on_fork = 0;
3009 }
3010
3011 if (!rt_prio(p->prio))
3012 p->sched_class = &fair_sched_class;
3013
3014 if (p->sched_class->task_fork)
3015 p->sched_class->task_fork(p);
3016
3017
3018
3019
3020
3021
3022
3023
3024 raw_spin_lock_irqsave(&p->pi_lock, flags);
3025 set_task_cpu(p, cpu);
3026 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3027
3028#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
3029 if (likely(sched_info_on()))
3030 memset(&p->sched_info, 0, sizeof(p->sched_info));
3031#endif
3032#if defined(CONFIG_SMP)
3033 p->on_cpu = 0;
3034#endif
3035#ifdef CONFIG_PREEMPT_COUNT
3036
3037 task_thread_info(p)->preempt_count = 1;
3038#endif
3039#ifdef CONFIG_SMP
3040 plist_node_init(&p->pushable_tasks, MAX_PRIO);
3041#endif
3042
3043 put_cpu();
3044}
3045
3046
3047
3048
3049
3050
3051
3052
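/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * Picks an initial CPU for the task (SD_BALANCE_FORK), puts it on the
 * runqueue and checks whether it should preempt the currently running
 * task.
 */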
3053void wake_up_new_task(struct task_struct *p)
3054{
3055 unsigned long flags;
3056 struct rq *rq;
3057
3058 raw_spin_lock_irqsave(&p->pi_lock, flags);
3059#ifdef CONFIG_SMP
3060
3061
3062
3063
3064
3065 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
3066#endif
3067
3068 rq = __task_rq_lock(p);
3069 activate_task(rq, p, 0);
3070 p->on_rq = 1;
3071 trace_sched_wakeup_new(p, true);
3072 check_preempt_curr(rq, p, WF_FORK);
3073#ifdef CONFIG_SMP
3074 if (p->sched_class->task_woken)
3075 p->sched_class->task_woken(rq, p);
3076#endif
3077 task_rq_unlock(rq, p, &flags);
3078}
3079
3080#ifdef CONFIG_PREEMPT_NOTIFIERS
3081
3082
3083
3084
3085
3086void preempt_notifier_register(struct preempt_notifier *notifier)
3087{
3088 hlist_add_head(&notifier->link, &current->preempt_notifiers);
3089}
3090EXPORT_SYMBOL_GPL(preempt_notifier_register);
3091
3092
3093
3094
3095
3096
3097
3098void preempt_notifier_unregister(struct preempt_notifier *notifier)
3099{
3100 hlist_del(&notifier->link);
3101}
3102EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
3103
3104static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3105{
3106 struct preempt_notifier *notifier;
3107 struct hlist_node *node;
3108
3109 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
3110 notifier->ops->sched_in(notifier, raw_smp_processor_id());
3111}
3112
3113static void
3114fire_sched_out_preempt_notifiers(struct task_struct *curr,
3115 struct task_struct *next)
3116{
3117 struct preempt_notifier *notifier;
3118 struct hlist_node *node;
3119
3120 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
3121 notifier->ops->sched_out(notifier, next);
3122}
3123
3124#else
3125
3126static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3127{
3128}
3129
3130static void
3131fire_sched_out_preempt_notifiers(struct task_struct *curr,
3132 struct task_struct *next)
3133{
3134}
3135
3136#endif
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
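/*
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * Called with the rq lock held and interrupts disabled.  Must be paired
 * with a subsequent finish_task_switch() after the context switch.
 */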
3151static inline void
3152prepare_task_switch(struct rq *rq, struct task_struct *prev,
3153 struct task_struct *next)
3154{
3155 sched_info_switch(prev, next);
3156 perf_event_task_sched_out(prev, next);
3157 fire_sched_out_preempt_notifiers(prev, next);
3158 prepare_lock_switch(rq, next);
3159 prepare_arch_switch(next);
3160 trace_sched_switch(prev, next);
3161}
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
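/*
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * Paired with prepare_task_switch(): finishes the arch/lock switch
 * state (releasing rq->lock), drops the lazily-referenced mm saved in
 * rq->prev_mm and, if @prev is dead, its final task_struct reference.
 */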
3178static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3179 __releases(rq->lock)
3180{
3181 struct mm_struct *mm = rq->prev_mm;
3182 long prev_state;
3183
3184 rq->prev_mm = NULL;
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197 prev_state = prev->state;
3198 finish_arch_switch(prev);
3199#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3200 local_irq_disable();
3201#endif
3202 perf_event_task_sched_in(prev, current);
3203#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3204 local_irq_enable();
3205#endif
3206 finish_lock_switch(rq, prev);
3207
3208 fire_sched_in_preempt_notifiers(current);
3209 if (mm)
3210 mmdrop(mm);
3211 if (unlikely(prev_state == TASK_DEAD)) {
3212
3213
3214
3215
3216 kprobe_flush_task(prev);
3217 put_task_struct(prev);
3218 }
3219}
3220
3221#ifdef CONFIG_SMP
3222
3223
3224static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
3225{
3226 if (prev->sched_class->pre_schedule)
3227 prev->sched_class->pre_schedule(rq, prev);
3228}
3229
3230
3231static inline void post_schedule(struct rq *rq)
3232{
3233 if (rq->post_schedule) {
3234 unsigned long flags;
3235
3236 raw_spin_lock_irqsave(&rq->lock, flags);
3237 if (rq->curr->sched_class->post_schedule)
3238 rq->curr->sched_class->post_schedule(rq);
3239 raw_spin_unlock_irqrestore(&rq->lock, flags);
3240
3241 rq->post_schedule = 0;
3242 }
3243}
3244
3245#else
3246
3247static inline void pre_schedule(struct rq *rq, struct task_struct *p)
3248{
3249}
3250
3251static inline void post_schedule(struct rq *rq)
3252{
3253}
3254
3255#endif
3256
3257
3258
3259
3260
3261asmlinkage void schedule_tail(struct task_struct *prev)
3262 __releases(rq->lock)
3263{
3264 struct rq *rq = this_rq();
3265
3266 finish_task_switch(rq, prev);
3267
3268
3269
3270
3271
3272 post_schedule(rq);
3273
3274#ifdef __ARCH_WANT_UNLOCKED_CTXSW
3275
3276 preempt_enable();
3277#endif
3278 if (current->set_child_tid)
3279 put_user(task_pid_vnr(current), current->set_child_tid);
3280}
3281
3282
3283
3284
3285
3286static inline void
3287context_switch(struct rq *rq, struct task_struct *prev,
3288 struct task_struct *next)
3289{
3290 struct mm_struct *mm, *oldmm;
3291
3292 prepare_task_switch(rq, prev, next);
3293
3294 mm = next->mm;
3295 oldmm = prev->active_mm;
3296
3297
3298
3299
3300
3301 arch_start_context_switch(prev);
3302
3303 if (!mm) {
3304 next->active_mm = oldmm;
3305 atomic_inc(&oldmm->mm_count);
3306 enter_lazy_tlb(oldmm, next);
3307 } else
3308 switch_mm(oldmm, mm, next);
3309
3310 if (!prev->mm) {
3311 prev->active_mm = NULL;
3312 rq->prev_mm = oldmm;
3313 }
3314
3315
3316
3317
3318
3319
3320#ifndef __ARCH_WANT_UNLOCKED_CTXSW
3321 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3322#endif
3323
3324
3325 switch_to(prev, next, prev);
3326
3327 barrier();
3328
3329
3330
3331
3332
3333 finish_task_switch(this_rq(), prev);
3334}
3335
3336
3337
3338
3339
3340
3341
3342
3343unsigned long nr_running(void)
3344{
3345 unsigned long i, sum = 0;
3346
3347 for_each_online_cpu(i)
3348 sum += cpu_rq(i)->nr_running;
3349
3350 return sum;
3351}
3352
3353unsigned long nr_uninterruptible(void)
3354{
3355 unsigned long i, sum = 0;
3356
3357 for_each_possible_cpu(i)
3358 sum += cpu_rq(i)->nr_uninterruptible;
3359
3360
3361
3362
3363
3364 if (unlikely((long)sum < 0))
3365 sum = 0;
3366
3367 return sum;
3368}
3369
3370unsigned long long nr_context_switches(void)
3371{
3372 int i;
3373 unsigned long long sum = 0;
3374
3375 for_each_possible_cpu(i)
3376 sum += cpu_rq(i)->nr_switches;
3377
3378 return sum;
3379}
3380
3381unsigned long nr_iowait(void)
3382{
3383 unsigned long i, sum = 0;
3384
3385 for_each_possible_cpu(i)
3386 sum += atomic_read(&cpu_rq(i)->nr_iowait);
3387
3388 return sum;
3389}
3390
3391unsigned long nr_iowait_cpu(int cpu)
3392{
3393 struct rq *this = cpu_rq(cpu);
3394 return atomic_read(&this->nr_iowait);
3395}
3396
3397unsigned long this_cpu_load(void)
3398{
3399 struct rq *this = this_rq();
3400 return this->cpu_load[0];
3401}
3402
3403
3404
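/*
 * Global load-average bookkeeping: calc_load_tasks accumulates per-rq
 * deltas of nr_running + nr_uninterruptible, and avenrun[] holds the
 * 1/5/15 minute fixed-point load averages, recomputed every LOAD_FREQ
 * jiffies by calc_global_load().
 */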
3405static atomic_long_t calc_load_tasks;
3406static unsigned long calc_load_update;
3407unsigned long avenrun[3];
3408EXPORT_SYMBOL(avenrun);
3409
3410static long calc_load_fold_active(struct rq *this_rq)
3411{
3412 long nr_active, delta = 0;
3413
3414 nr_active = this_rq->nr_running;
3415 nr_active += (long) this_rq->nr_uninterruptible;
3416
3417 if (nr_active != this_rq->calc_load_active) {
3418 delta = nr_active - this_rq->calc_load_active;
3419 this_rq->calc_load_active = nr_active;
3420 }
3421
3422 return delta;
3423}
3424
3425static unsigned long
3426calc_load(unsigned long load, unsigned long exp, unsigned long active)
3427{
3428 load *= exp;
3429 load += active * (FIXED_1 - exp);
3430 load += 1UL << (FSHIFT - 1);
3431 return load >> FSHIFT;
3432}
3433
3434#ifdef CONFIG_NO_HZ
3435
3436
3437
3438
3439
3440static atomic_long_t calc_load_tasks_idle;
3441
3442static void calc_load_account_idle(struct rq *this_rq)
3443{
3444 long delta;
3445
3446 delta = calc_load_fold_active(this_rq);
3447 if (delta)
3448 atomic_long_add(delta, &calc_load_tasks_idle);
3449}
3450
3451static long calc_load_fold_idle(void)
3452{
3453 long delta = 0;
3454
3455
3456
3457
3458 if (atomic_long_read(&calc_load_tasks_idle))
3459 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3460
3461 return delta;
3462}
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
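/*
 * fixed_power_int - compute x^n in fixed point, where x has @frac_bits
 * fractional bits, using exponentiation by squaring in O(log n)
 * multiplications; each intermediate product is rounded and shifted
 * back into the same fixed-point format.
 */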
3479static unsigned long
3480fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3481{
3482 unsigned long result = 1UL << frac_bits;
3483
3484 if (n) for (;;) {
3485 if (n & 1) {
3486 result *= x;
3487 result += 1UL << (frac_bits - 1);
3488 result >>= frac_bits;
3489 }
3490 n >>= 1;
3491 if (!n)
3492 break;
3493 x *= x;
3494 x += 1UL << (frac_bits - 1);
3495 x >>= frac_bits;
3496 }
3497
3498 return result;
3499}
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
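/*
 * calc_load_n - the result of n consecutive calc_load() updates with
 * the same @active value, folded into a single step with decay factor
 * exp^n:
 *
 *   load_n = load * exp^n + active * (1 - exp^n)
 */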
3524static unsigned long
3525calc_load_n(unsigned long load, unsigned long exp,
3526 unsigned long active, unsigned int n)
3527{
3528
3529 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3530}
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541static void calc_global_nohz(unsigned long ticks)
3542{
3543 long delta, active, n;
3544
3545 if (time_before(jiffies, calc_load_update))
3546 return;
3547
3548
3549
3550
3551
3552
3553
3554 delta = calc_load_fold_idle();
3555 if (delta)
3556 atomic_long_add(delta, &calc_load_tasks);
3557
3558
3559
3560
3561 if (ticks >= LOAD_FREQ) {
3562 n = ticks / LOAD_FREQ;
3563
3564 active = atomic_long_read(&calc_load_tasks);
3565 active = active > 0 ? active * FIXED_1 : 0;
3566
3567 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3568 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3569 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3570
3571 calc_load_update += n * LOAD_FREQ;
3572 }
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584}
3585#else
3586static void calc_load_account_idle(struct rq *this_rq)
3587{
3588}
3589
3590static inline long calc_load_fold_idle(void)
3591{
3592 return 0;
3593}
3594
3595static void calc_global_nohz(unsigned long ticks)
3596{
3597}
3598#endif
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3609{
3610 loads[0] = (avenrun[0] + offset) << shift;
3611 loads[1] = (avenrun[1] + offset) << shift;
3612 loads[2] = (avenrun[2] + offset) << shift;
3613}
3614
3615
3616
3617
3618
3619void calc_global_load(unsigned long ticks)
3620{
3621 long active;
3622
3623 calc_global_nohz(ticks);
3624
3625 if (time_before(jiffies, calc_load_update + 10))
3626 return;
3627
3628 active = atomic_long_read(&calc_load_tasks);
3629 active = active > 0 ? active * FIXED_1 : 0;
3630
3631 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3632 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3633 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3634
3635 calc_load_update += LOAD_FREQ;
3636}
3637
3638
3639
3640
3641
3642static void calc_load_account_active(struct rq *this_rq)
3643{
3644 long delta;
3645
3646 if (time_before(jiffies, this_rq->calc_load_update))
3647 return;
3648
3649 delta = calc_load_fold_active(this_rq);
3650 delta += calc_load_fold_idle();
3651 if (delta)
3652 atomic_long_add(delta, &calc_load_tasks);
3653
3654 this_rq->calc_load_update += LOAD_FREQ;
3655}
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
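/*
 * When a CPU misses update_cpu_load() ticks (e.g. while nohz idle), its
 * cpu_load[] entries must be decayed as if those ticks had run with
 * zero load.  degrade_zero_ticks[idx] is the number of missed ticks
 * after which cpu_load[idx] is treated as zero; degrade_factor[idx][j]
 * is the decay multiplier for 2^j missed ticks on a 128-point
 * (DEGRADE_SHIFT) scale, so decay_load_missed() can apply an arbitrary
 * number of missed ticks via its binary decomposition.
 */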
3684#define DEGRADE_SHIFT 7
3685static const unsigned char
3686 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3687static const unsigned char
3688 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3689 {0, 0, 0, 0, 0, 0, 0, 0},
3690 {64, 32, 8, 0, 0, 0, 0, 0},
3691 {96, 72, 40, 12, 1, 0, 0},
3692 {112, 98, 75, 43, 15, 1, 0},
3693 {120, 112, 98, 76, 45, 16, 2} };
3694
3695
3696
3697
3698
3699
3700static unsigned long
3701decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3702{
3703 int j = 0;
3704
3705 if (!missed_updates)
3706 return load;
3707
3708 if (missed_updates >= degrade_zero_ticks[idx])
3709 return 0;
3710
3711 if (idx == 1)
3712 return load >> missed_updates;
3713
3714 while (missed_updates) {
3715 if (missed_updates % 2)
3716 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3717
3718 missed_updates >>= 1;
3719 j++;
3720 }
3721 return load;
3722}
3723
3724
3725
3726
3727
3728
3729static void update_cpu_load(struct rq *this_rq)
3730{
3731 unsigned long this_load = this_rq->load.weight;
3732 unsigned long curr_jiffies = jiffies;
3733 unsigned long pending_updates;
3734 int i, scale;
3735
3736 this_rq->nr_load_updates++;
3737
3738
3739 if (curr_jiffies == this_rq->last_load_update_tick)
3740 return;
3741
3742 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3743 this_rq->last_load_update_tick = curr_jiffies;
3744
3745
3746 this_rq->cpu_load[0] = this_load;
3747 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3748 unsigned long old_load, new_load;
3749
3750
3751
3752 old_load = this_rq->cpu_load[i];
3753 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3754 new_load = this_load;
3755
3756
3757
3758
3759
3760 if (new_load > old_load)
3761 new_load += scale - 1;
3762
3763 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3764 }
3765
3766 sched_avg_update(this_rq);
3767}
3768
3769static void update_cpu_load_active(struct rq *this_rq)
3770{
3771 update_cpu_load(this_rq);
3772
3773 calc_load_account_active(this_rq);
3774}
3775
3776#ifdef CONFIG_SMP
3777
3778
3779
3780
3781
3782void sched_exec(void)
3783{
3784 struct task_struct *p = current;
3785 unsigned long flags;
3786 int dest_cpu;
3787
3788 raw_spin_lock_irqsave(&p->pi_lock, flags);
3789 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3790 if (dest_cpu == smp_processor_id())
3791 goto unlock;
3792
3793 if (likely(cpu_active(dest_cpu))) {
3794 struct migration_arg arg = { p, dest_cpu };
3795
3796 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3797 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3798 return;
3799 }
3800unlock:
3801 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3802}
3803
3804#endif
3805
3806DEFINE_PER_CPU(struct kernel_stat, kstat);
3807
3808EXPORT_PER_CPU_SYMBOL(kstat);
3809
3810
3811
3812
3813
3814
3815
3816static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3817{
3818 u64 ns = 0;
3819
3820 if (task_current(rq, p)) {
3821 update_rq_clock(rq);
3822 ns = rq->clock_task - p->se.exec_start;
3823 if ((s64)ns < 0)
3824 ns = 0;
3825 }
3826
3827 return ns;
3828}
3829
3830unsigned long long task_delta_exec(struct task_struct *p)
3831{
3832 unsigned long flags;
3833 struct rq *rq;
3834 u64 ns = 0;
3835
3836 rq = task_rq_lock(p, &flags);
3837 ns = do_task_delta_exec(p, rq);
3838 task_rq_unlock(rq, p, &flags);
3839
3840 return ns;
3841}
3842
3843
3844
3845
3846
3847
3848unsigned long long task_sched_runtime(struct task_struct *p)
3849{
3850 unsigned long flags;
3851 struct rq *rq;
3852 u64 ns = 0;
3853
3854 rq = task_rq_lock(p, &flags);
3855 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3856 task_rq_unlock(rq, p, &flags);
3857
3858 return ns;
3859}
3860
3861
3862
3863
3864
3865
3866
3867void account_user_time(struct task_struct *p, cputime_t cputime,
3868 cputime_t cputime_scaled)
3869{
3870 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3871 cputime64_t tmp;
3872
3873
3874 p->utime = cputime_add(p->utime, cputime);
3875 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3876 account_group_user_time(p, cputime);
3877
3878
3879 tmp = cputime_to_cputime64(cputime);
3880 if (TASK_NICE(p) > 0)
3881 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3882 else
3883 cpustat->user = cputime64_add(cpustat->user, tmp);
3884
3885 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3886
3887 acct_update_integrals(p);
3888}
3889
3890
3891
3892
3893
3894
3895
3896static void account_guest_time(struct task_struct *p, cputime_t cputime,
3897 cputime_t cputime_scaled)
3898{
3899 cputime64_t tmp;
3900 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3901
3902 tmp = cputime_to_cputime64(cputime);
3903
3904
3905 p->utime = cputime_add(p->utime, cputime);
3906 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3907 account_group_user_time(p, cputime);
3908 p->gtime = cputime_add(p->gtime, cputime);
3909
3910
3911 if (TASK_NICE(p) > 0) {
3912 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3913 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3914 } else {
3915 cpustat->user = cputime64_add(cpustat->user, tmp);
3916 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3917 }
3918}
3919
3920
3921
3922
3923
3924
3925
3926
3927static inline
3928void __account_system_time(struct task_struct *p, cputime_t cputime,
3929 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3930{
3931 cputime64_t tmp = cputime_to_cputime64(cputime);
3932
3933
3934 p->stime = cputime_add(p->stime, cputime);
3935 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3936 account_group_system_time(p, cputime);
3937
3938
3939 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3940 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3941
3942
3943 acct_update_integrals(p);
3944}
3945
3946
3947
3948
3949
3950
3951
3952
3953void account_system_time(struct task_struct *p, int hardirq_offset,
3954 cputime_t cputime, cputime_t cputime_scaled)
3955{
3956 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3957 cputime64_t *target_cputime64;
3958
3959 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3960 account_guest_time(p, cputime, cputime_scaled);
3961 return;
3962 }
3963
3964 if (hardirq_count() - hardirq_offset)
3965 target_cputime64 = &cpustat->irq;
3966 else if (in_serving_softirq())
3967 target_cputime64 = &cpustat->softirq;
3968 else
3969 target_cputime64 = &cpustat->system;
3970
3971 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3972}
3973
3974
3975
3976
3977
3978void account_steal_time(cputime_t cputime)
3979{
3980 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3981 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3982
3983 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3984}
3985
3986
3987
3988
3989
3990void account_idle_time(cputime_t cputime)
3991{
3992 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3993 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3994 struct rq *rq = this_rq();
3995
3996 if (atomic_read(&rq->nr_iowait) > 0)
3997 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3998 else
3999 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
4000}
4001
4002static __always_inline bool steal_account_process_tick(void)
4003{
4004#ifdef CONFIG_PARAVIRT
4005 if (static_branch(&paravirt_steal_enabled)) {
4006 u64 steal, st = 0;
4007
4008 steal = paravirt_steal_clock(smp_processor_id());
4009 steal -= this_rq()->prev_steal_time;
4010
4011 st = steal_ticks(steal);
4012 this_rq()->prev_steal_time += st * TICK_NSEC;
4013
4014 account_steal_time(st);
4015 return st;
4016 }
4017#endif
4018 return false;
4019}
4020
4021#ifndef CONFIG_VIRT_CPU_ACCOUNTING
4022
4023#ifdef CONFIG_IRQ_TIME_ACCOUNTING
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
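/*
 * Account a single tick of cpu time under CONFIG_IRQ_TIME_ACCOUNTING.
 * The tick is charged, in priority order, to stolen time, hardirq,
 * softirq, ksoftirqd (as system time in the softirq bucket), user,
 * idle, guest or plain system time, depending on where the CPU spent
 * the bulk of the tick.
 */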
4045static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4046 struct rq *rq)
4047{
4048 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
4049 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
4050 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4051
4052 if (steal_account_process_tick())
4053 return;
4054
4055 if (irqtime_account_hi_update()) {
4056 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4057 } else if (irqtime_account_si_update()) {
4058 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4059 } else if (this_cpu_ksoftirqd() == p) {
4060
4061
4062
4063
4064
4065 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4066 &cpustat->softirq);
4067 } else if (user_tick) {
4068 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4069 } else if (p == rq->idle) {
4070 account_idle_time(cputime_one_jiffy);
4071 } else if (p->flags & PF_VCPU) {
4072 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
4073 } else {
4074 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4075 &cpustat->system);
4076 }
4077}
4078
4079static void irqtime_account_idle_ticks(int ticks)
4080{
4081 int i;
4082 struct rq *rq = this_rq();
4083
4084 for (i = 0; i < ticks; i++)
4085 irqtime_account_process_tick(current, 0, rq);
4086}
4087#else
4088static void irqtime_account_idle_ticks(int ticks) {}
4089static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4090 struct rq *rq) {}
4091#endif
4092
4093
4094
4095
4096
4097
4098void account_process_tick(struct task_struct *p, int user_tick)
4099{
4100 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
4101 struct rq *rq = this_rq();
4102
4103 if (sched_clock_irqtime) {
4104 irqtime_account_process_tick(p, user_tick, rq);
4105 return;
4106 }
4107
4108 if (steal_account_process_tick())
4109 return;
4110
4111 if (user_tick)
4112 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4113 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
4114 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
4115 one_jiffy_scaled);
4116 else
4117 account_idle_time(cputime_one_jiffy);
4118}
4119
4120
4121
4122
4123
4124
4125void account_steal_ticks(unsigned long ticks)
4126{
4127 account_steal_time(jiffies_to_cputime(ticks));
4128}
4129
4130
4131
4132
4133
4134void account_idle_ticks(unsigned long ticks)
4135{
4136
4137 if (sched_clock_irqtime) {
4138 irqtime_account_idle_ticks(ticks);
4139 return;
4140 }
4141
4142 account_idle_time(jiffies_to_cputime(ticks));
4143}
4144
4145#endif
4146
4147
4148
4149
4150#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4151void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4152{
4153 *ut = p->utime;
4154 *st = p->stime;
4155}
4156
4157void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4158{
4159 struct task_cputime cputime;
4160
4161 thread_group_cputime(p, &cputime);
4162
4163 *ut = cputime.utime;
4164 *st = cputime.stime;
4165}
4166#else
4167
4168#ifndef nsecs_to_cputime
4169# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
4170#endif
4171
4172void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4173{
4174 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
4175
4176
4177
4178
4179 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4180
4181 if (total) {
4182 u64 temp = rtime;
4183
4184 temp *= utime;
4185 do_div(temp, total);
4186 utime = (cputime_t)temp;
4187 } else
4188 utime = rtime;
4189
4190
4191
4192
4193 p->prev_utime = max(p->prev_utime, utime);
4194 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
4195
4196 *ut = p->prev_utime;
4197 *st = p->prev_stime;
4198}
4199
4200
4201
4202
4203void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4204{
4205 struct signal_struct *sig = p->signal;
4206 struct task_cputime cputime;
4207 cputime_t rtime, utime, total;
4208
4209 thread_group_cputime(p, &cputime);
4210
4211 total = cputime_add(cputime.utime, cputime.stime);
4212 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4213
4214 if (total) {
4215 u64 temp = rtime;
4216
4217 temp *= cputime.utime;
4218 do_div(temp, total);
4219 utime = (cputime_t)temp;
4220 } else
4221 utime = rtime;
4222
4223 sig->prev_utime = max(sig->prev_utime, utime);
4224 sig->prev_stime = max(sig->prev_stime,
4225 cputime_sub(rtime, sig->prev_utime));
4226
4227 *ut = sig->prev_utime;
4228 *st = sig->prev_stime;
4229}
4230#endif
4231
4232
4233
4234
4235
4236void scheduler_tick(void)
4237{
4238 int cpu = smp_processor_id();
4239 struct rq *rq = cpu_rq(cpu);
4240 struct task_struct *curr = rq->curr;
4241
4242 sched_clock_tick();
4243
4244 raw_spin_lock(&rq->lock);
4245 update_rq_clock(rq);
4246 update_cpu_load_active(rq);
4247 curr->sched_class->task_tick(rq, curr, 0);
4248 raw_spin_unlock(&rq->lock);
4249
4250 perf_event_task_tick();
4251
4252#ifdef CONFIG_SMP
4253 rq->idle_balance = idle_cpu(cpu);
4254 trigger_load_balance(rq, cpu);
4255#endif
4256}
4257
4258notrace unsigned long get_parent_ip(unsigned long addr)
4259{
4260 if (in_lock_functions(addr)) {
4261 addr = CALLER_ADDR2;
4262 if (in_lock_functions(addr))
4263 addr = CALLER_ADDR3;
4264 }
4265 return addr;
4266}
4267
4268#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4269 defined(CONFIG_PREEMPT_TRACER))
4270
4271void __kprobes add_preempt_count(int val)
4272{
4273#ifdef CONFIG_DEBUG_PREEMPT
4274
4275
4276
4277 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4278 return;
4279#endif
4280 preempt_count() += val;
4281#ifdef CONFIG_DEBUG_PREEMPT
4282
4283
4284
4285 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4286 PREEMPT_MASK - 10);
4287#endif
4288 if (preempt_count() == val)
4289 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4290}
4291EXPORT_SYMBOL(add_preempt_count);
4292
4293void __kprobes sub_preempt_count(int val)
4294{
4295#ifdef CONFIG_DEBUG_PREEMPT
4296
4297
4298
4299 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4300 return;
4301
4302
4303
4304 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4305 !(preempt_count() & PREEMPT_MASK)))
4306 return;
4307#endif
4308
4309 if (preempt_count() == val)
4310 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4311 preempt_count() -= val;
4312}
4313EXPORT_SYMBOL(sub_preempt_count);
4314
4315#endif
4316
4317
4318
4319
4320static noinline void __schedule_bug(struct task_struct *prev)
4321{
4322 struct pt_regs *regs = get_irq_regs();
4323
4324 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4325 prev->comm, prev->pid, preempt_count());
4326
4327 debug_show_held_locks(prev);
4328 print_modules();
4329 if (irqs_disabled())
4330 print_irqtrace_events(prev);
4331
4332 if (regs)
4333 show_regs(regs);
4334 else
4335 dump_stack();
4336}
4337
4338
4339
4340
4341static inline void schedule_debug(struct task_struct *prev)
4342{
4343
4344
4345
4346
4347
4348 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4349 __schedule_bug(prev);
4350 rcu_sleep_check();
4351
4352 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4353
4354 schedstat_inc(this_rq(), sched_count);
4355}
4356
4357static void put_prev_task(struct rq *rq, struct task_struct *prev)
4358{
4359 if (prev->on_rq || rq->skip_clock_update < 0)
4360 update_rq_clock(rq);
4361 prev->sched_class->put_prev_task(rq, prev);
4362}
4363
4364
4365
4366
4367static inline struct task_struct *
4368pick_next_task(struct rq *rq)
4369{
4370 const struct sched_class *class;
4371 struct task_struct *p;
4372
4373
4374
4375
4376
4377 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
4378 p = fair_sched_class.pick_next_task(rq);
4379 if (likely(p))
4380 return p;
4381 }
4382
4383 for_each_class(class) {
4384 p = class->pick_next_task(rq);
4385 if (p)
4386 return p;
4387 }
4388
4389 BUG();
4390}
4391
4392
4393
4394
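/*
 * __schedule() is the main scheduler entry point: pick the highest
 * priority runnable task and context-switch to it.  Preemption is
 * disabled across the pick and switch, re-enabled without rescheduling
 * afterwards, and the whole sequence repeats while need_resched() is
 * still set.
 */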
4395static void __sched __schedule(void)
4396{
4397 struct task_struct *prev, *next;
4398 unsigned long *switch_count;
4399 struct rq *rq;
4400 int cpu;
4401
4402need_resched:
4403 preempt_disable();
4404 cpu = smp_processor_id();
4405 rq = cpu_rq(cpu);
4406 rcu_note_context_switch(cpu);
4407 prev = rq->curr;
4408
4409 schedule_debug(prev);
4410
4411 if (sched_feat(HRTICK))
4412 hrtick_clear(rq);
4413
4414 raw_spin_lock_irq(&rq->lock);
4415
4416 switch_count = &prev->nivcsw;
4417 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4418 if (unlikely(signal_pending_state(prev->state, prev))) {
4419 prev->state = TASK_RUNNING;
4420 } else {
4421 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4422 prev->on_rq = 0;
4423
4424
4425
4426
4427
4428
4429 if (prev->flags & PF_WQ_WORKER) {
4430 struct task_struct *to_wakeup;
4431
4432 to_wakeup = wq_worker_sleeping(prev, cpu);
4433 if (to_wakeup)
4434 try_to_wake_up_local(to_wakeup);
4435 }
4436 }
4437 switch_count = &prev->nvcsw;
4438 }
4439
4440 pre_schedule(rq, prev);
4441
4442 if (unlikely(!rq->nr_running))
4443 idle_balance(cpu, rq);
4444
4445 put_prev_task(rq, prev);
4446 next = pick_next_task(rq);
4447 clear_tsk_need_resched(prev);
4448 rq->skip_clock_update = 0;
4449
4450 if (likely(prev != next)) {
4451 rq->nr_switches++;
4452 rq->curr = next;
4453 ++*switch_count;
4454
4455 context_switch(rq, prev, next);
4456
4457
4458
4459
4460
4461
4462 cpu = smp_processor_id();
4463 rq = cpu_rq(cpu);
4464 } else
4465 raw_spin_unlock_irq(&rq->lock);
4466
4467 post_schedule(rq);
4468
4469 preempt_enable_no_resched();
4470 if (need_resched())
4471 goto need_resched;
4472}
4473
4474static inline void sched_submit_work(struct task_struct *tsk)
4475{
4476 if (!tsk->state)
4477 return;
4478
4479
4480
4481
4482 if (blk_needs_flush_plug(tsk))
4483 blk_schedule_flush_plug(tsk);
4484}
4485
4486asmlinkage void __sched schedule(void)
4487{
4488 struct task_struct *tsk = current;
4489
4490 sched_submit_work(tsk);
4491 __schedule();
4492}
4493EXPORT_SYMBOL(schedule);
4494
4495#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4496
4497static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4498{
4499 if (lock->owner != owner)
4500 return false;
4501
4502
4503
4504
4505
4506
4507
4508 barrier();
4509
4510 return owner->on_cpu;
4511}
4512
4513
4514
4515
4516
4517int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4518{
4519 if (!sched_feat(OWNER_SPIN))
4520 return 0;
4521
4522 rcu_read_lock();
4523 while (owner_running(lock, owner)) {
4524 if (need_resched())
4525 break;
4526
4527 arch_mutex_cpu_relax();
4528 }
4529 rcu_read_unlock();
4530
4531
4532
4533
4534
4535
4536 return lock->owner == NULL;
4537}
4538#endif
4539
4540#ifdef CONFIG_PREEMPT
4541
4542
4543
4544
4545
4546asmlinkage void __sched notrace preempt_schedule(void)
4547{
4548 struct thread_info *ti = current_thread_info();
4549
4550
4551
4552
4553
4554 if (likely(ti->preempt_count || irqs_disabled()))
4555 return;
4556
4557 do {
4558 add_preempt_count_notrace(PREEMPT_ACTIVE);
4559 __schedule();
4560 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4561
4562
4563
4564
4565
4566 barrier();
4567 } while (need_resched());
4568}
4569EXPORT_SYMBOL(preempt_schedule);
4570
4571
4572
4573
4574
4575
4576
4577asmlinkage void __sched preempt_schedule_irq(void)
4578{
4579 struct thread_info *ti = current_thread_info();
4580
4581
4582 BUG_ON(ti->preempt_count || !irqs_disabled());
4583
4584 do {
4585 add_preempt_count(PREEMPT_ACTIVE);
4586 local_irq_enable();
4587 __schedule();
4588 local_irq_disable();
4589 sub_preempt_count(PREEMPT_ACTIVE);
4590
4591
4592
4593
4594
4595 barrier();
4596 } while (need_resched());
4597}
4598
4599#endif
4600
4601int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
4602 void *key)
4603{
4604 return try_to_wake_up(curr->private, mode, wake_flags);
4605}
4606EXPORT_SYMBOL(default_wake_function);
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
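/*
 * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0)
 * just wake everything up.  If it's an exclusive wakeup (nr_exclusive
 * == small +ve number) then we wake all the non-exclusive tasks and
 * one exclusive task.
 */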
4617static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4618 int nr_exclusive, int wake_flags, void *key)
4619{
4620 wait_queue_t *curr, *next;
4621
4622 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4623 unsigned flags = curr->flags;
4624
4625 if (curr->func(curr, mode, wake_flags, key) &&
4626 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4627 break;
4628 }
4629}
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641void __wake_up(wait_queue_head_t *q, unsigned int mode,
4642 int nr_exclusive, void *key)
4643{
4644 unsigned long flags;
4645
4646 spin_lock_irqsave(&q->lock, flags);
4647 __wake_up_common(q, mode, nr_exclusive, 0, key);
4648 spin_unlock_irqrestore(&q->lock, flags);
4649}
4650EXPORT_SYMBOL(__wake_up);
4651
4652
4653
4654
4655void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4656{
4657 __wake_up_common(q, mode, 1, 0, NULL);
4658}
4659EXPORT_SYMBOL_GPL(__wake_up_locked);
4660
4661void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4662{
4663 __wake_up_common(q, mode, 1, 0, key);
4664}
4665EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4685 int nr_exclusive, void *key)
4686{
4687 unsigned long flags;
4688 int wake_flags = WF_SYNC;
4689
4690 if (unlikely(!q))
4691 return;
4692
4693 if (unlikely(!nr_exclusive))
4694 wake_flags = 0;
4695
4696 spin_lock_irqsave(&q->lock, flags);
4697 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
4698 spin_unlock_irqrestore(&q->lock, flags);
4699}
4700EXPORT_SYMBOL_GPL(__wake_up_sync_key);
4701
4702
4703
4704
4705void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4706{
4707 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
4708}
4709EXPORT_SYMBOL_GPL(__wake_up_sync);
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723void complete(struct completion *x)
4724{
4725 unsigned long flags;
4726
4727 spin_lock_irqsave(&x->wait.lock, flags);
4728 x->done++;
4729 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4730 spin_unlock_irqrestore(&x->wait.lock, flags);
4731}
4732EXPORT_SYMBOL(complete);
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743void complete_all(struct completion *x)
4744{
4745 unsigned long flags;
4746
4747 spin_lock_irqsave(&x->wait.lock, flags);
4748 x->done += UINT_MAX/2;
4749 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4750 spin_unlock_irqrestore(&x->wait.lock, flags);
4751}
4752EXPORT_SYMBOL(complete_all);
4753
4754static inline long __sched
4755do_wait_for_common(struct completion *x, long timeout, int state)
4756{
4757 if (!x->done) {
4758 DECLARE_WAITQUEUE(wait, current);
4759
4760 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4761 do {
4762 if (signal_pending_state(state, current)) {
4763 timeout = -ERESTARTSYS;
4764 break;
4765 }
4766 __set_current_state(state);
4767 spin_unlock_irq(&x->wait.lock);
4768 timeout = schedule_timeout(timeout);
4769 spin_lock_irq(&x->wait.lock);
4770 } while (!x->done && timeout);
4771 __remove_wait_queue(&x->wait, &wait);
4772 if (!x->done)
4773 return timeout;
4774 }
4775 x->done--;
4776 return timeout ?: 1;
4777}
4778
4779static long __sched
4780wait_for_common(struct completion *x, long timeout, int state)
4781{
4782 might_sleep();
4783
4784 spin_lock_irq(&x->wait.lock);
4785 timeout = do_wait_for_common(x, timeout, state);
4786 spin_unlock_irq(&x->wait.lock);
4787 return timeout;
4788}
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800void __sched wait_for_completion(struct completion *x)
4801{
4802 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4803}
4804EXPORT_SYMBOL(wait_for_completion);
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818unsigned long __sched
4819wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4820{
4821 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4822}
4823EXPORT_SYMBOL(wait_for_completion_timeout);
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834int __sched wait_for_completion_interruptible(struct completion *x)
4835{
4836 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4837 if (t == -ERESTARTSYS)
4838 return t;
4839 return 0;
4840}
4841EXPORT_SYMBOL(wait_for_completion_interruptible);
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854long __sched
4855wait_for_completion_interruptible_timeout(struct completion *x,
4856 unsigned long timeout)
4857{
4858 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4859}
4860EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871int __sched wait_for_completion_killable(struct completion *x)
4872{
4873 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4874 if (t == -ERESTARTSYS)
4875 return t;
4876 return 0;
4877}
4878EXPORT_SYMBOL(wait_for_completion_killable);
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892long __sched
4893wait_for_completion_killable_timeout(struct completion *x,
4894 unsigned long timeout)
4895{
4896 return wait_for_common(x, timeout, TASK_KILLABLE);
4897}
4898EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912bool try_wait_for_completion(struct completion *x)
4913{
4914 unsigned long flags;
4915 int ret = 1;
4916
4917 spin_lock_irqsave(&x->wait.lock, flags);
4918 if (!x->done)
4919 ret = 0;
4920 else
4921 x->done--;
4922 spin_unlock_irqrestore(&x->wait.lock, flags);
4923 return ret;
4924}
4925EXPORT_SYMBOL(try_wait_for_completion);
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935bool completion_done(struct completion *x)
4936{
4937 unsigned long flags;
4938 int ret = 1;
4939
4940 spin_lock_irqsave(&x->wait.lock, flags);
4941 if (!x->done)
4942 ret = 0;
4943 spin_unlock_irqrestore(&x->wait.lock, flags);
4944 return ret;
4945}
4946EXPORT_SYMBOL(completion_done);
4947
4948static long __sched
4949sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4950{
4951 unsigned long flags;
4952 wait_queue_t wait;
4953
4954 init_waitqueue_entry(&wait, current);
4955
4956 __set_current_state(state);
4957
4958 spin_lock_irqsave(&q->lock, flags);
4959 __add_wait_queue(q, &wait);
4960 spin_unlock(&q->lock);
4961 timeout = schedule_timeout(timeout);
4962 spin_lock_irq(&q->lock);
4963 __remove_wait_queue(q, &wait);
4964 spin_unlock_irqrestore(&q->lock, flags);
4965
4966 return timeout;
4967}
4968
4969void __sched interruptible_sleep_on(wait_queue_head_t *q)
4970{
4971 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4972}
4973EXPORT_SYMBOL(interruptible_sleep_on);
4974
4975long __sched
4976interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4977{
4978 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4979}
4980EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4981
4982void __sched sleep_on(wait_queue_head_t *q)
4983{
4984 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4985}
4986EXPORT_SYMBOL(sleep_on);
4987
4988long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4989{
4990 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4991}
4992EXPORT_SYMBOL(sleep_on_timeout);
4993
4994#ifdef CONFIG_RT_MUTEXES
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
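/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task.  It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */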
5006void rt_mutex_setprio(struct task_struct *p, int prio)
5007{
5008 int oldprio, on_rq, running;
5009 struct rq *rq;
5010 const struct sched_class *prev_class;
5011
5012 BUG_ON(prio < 0 || prio > MAX_PRIO);
5013
5014 rq = __task_rq_lock(p);
5015
5016 trace_sched_pi_setprio(p, prio);
5017 oldprio = p->prio;
5018 prev_class = p->sched_class;
5019 on_rq = p->on_rq;
5020 running = task_current(rq, p);
5021 if (on_rq)
5022 dequeue_task(rq, p, 0);
5023 if (running)
5024 p->sched_class->put_prev_task(rq, p);
5025
5026 if (rt_prio(prio))
5027 p->sched_class = &rt_sched_class;
5028 else
5029 p->sched_class = &fair_sched_class;
5030
5031 p->prio = prio;
5032
5033 if (running)
5034 p->sched_class->set_curr_task(rq);
5035 if (on_rq)
5036 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
5037
5038 check_class_changed(rq, p, prev_class, oldprio);
5039 __task_rq_unlock(rq);
5040}
5041
5042#endif
5043
5044void set_user_nice(struct task_struct *p, long nice)
5045{
5046 int old_prio, delta, on_rq;
5047 unsigned long flags;
5048 struct rq *rq;
5049
5050 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
5051 return;
5052
5053
5054
5055
5056 rq = task_rq_lock(p, &flags);
5057
5058
5059
5060
5061
5062
5063 if (task_has_rt_policy(p)) {
5064 p->static_prio = NICE_TO_PRIO(nice);
5065 goto out_unlock;
5066 }
5067 on_rq = p->on_rq;
5068 if (on_rq)
5069 dequeue_task(rq, p, 0);
5070
5071 p->static_prio = NICE_TO_PRIO(nice);
5072 set_load_weight(p);
5073 old_prio = p->prio;
5074 p->prio = effective_prio(p);
5075 delta = p->prio - old_prio;
5076
5077 if (on_rq) {
5078 enqueue_task(rq, p, 0);
5079
5080
5081
5082
5083 if (delta < 0 || (delta > 0 && task_running(rq, p)))
5084 resched_task(rq->curr);
5085 }
5086out_unlock:
5087 task_rq_unlock(rq, p, &flags);
5088}
5089EXPORT_SYMBOL(set_user_nice);
5090
5091
5092
5093
5094
5095
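/*
 * can_nice - check whether a task is allowed to change its nice value
 * to @nice.  RLIMIT_NICE runs from 1..40 and maps to nice values
 * 19..-20, hence the "20 - nice" conversion below; CAP_SYS_NICE
 * bypasses the limit.
 */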
5096int can_nice(const struct task_struct *p, const int nice)
5097{
5098
5099 int nice_rlim = 20 - nice;
5100
5101 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
5102 capable(CAP_SYS_NICE));
5103}
5104
5105#ifdef __ARCH_WANT_SYS_NICE
5106
5107
5108
5109
5110
5111
5112
5113
5114SYSCALL_DEFINE1(nice, int, increment)
5115{
5116 long nice, retval;
5117
5118
5119
5120
5121
5122
5123 if (increment < -40)
5124 increment = -40;
5125 if (increment > 40)
5126 increment = 40;
5127
5128 nice = TASK_NICE(current) + increment;
5129 if (nice < -20)
5130 nice = -20;
5131 if (nice > 19)
5132 nice = 19;
5133
5134 if (increment < 0 && !can_nice(current, nice))
5135 return -EPERM;
5136
5137 retval = security_task_setnice(current, nice);
5138 if (retval)
5139 return retval;
5140
5141 set_user_nice(current, nice);
5142 return 0;
5143}
5144
5145#endif
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155int task_prio(const struct task_struct *p)
5156{
5157 return p->prio - MAX_RT_PRIO;
5158}
5159
5160
5161
5162
5163
5164int task_nice(const struct task_struct *p)
5165{
5166 return TASK_NICE(p);
5167}
5168EXPORT_SYMBOL(task_nice);
5169
5170
5171
5172
5173
5174int idle_cpu(int cpu)
5175{
5176 struct rq *rq = cpu_rq(cpu);
5177
5178 if (rq->curr != rq->idle)
5179 return 0;
5180
5181 if (rq->nr_running)
5182 return 0;
5183
5184#ifdef CONFIG_SMP
5185 if (!llist_empty(&rq->wake_list))
5186 return 0;
5187#endif
5188
5189 return 1;
5190}
5191
5192
5193
5194
5195
5196struct task_struct *idle_task(int cpu)
5197{
5198 return cpu_rq(cpu)->idle;
5199}
5200
5201
5202
5203
5204
5205static struct task_struct *find_process_by_pid(pid_t pid)
5206{
5207 return pid ? find_task_by_vpid(pid) : current;
5208}
5209
5210
5211static void
5212__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5213{
5214 p->policy = policy;
5215 p->rt_priority = prio;
5216 p->normal_prio = normal_prio(p);
5217
5218 p->prio = rt_mutex_getprio(p);
5219 if (rt_prio(p->prio))
5220 p->sched_class = &rt_sched_class;
5221 else
5222 p->sched_class = &fair_sched_class;
5223 set_load_weight(p);
5224}
5225
5226
5227
5228
5229static bool check_same_owner(struct task_struct *p)
5230{
5231 const struct cred *cred = current_cred(), *pcred;
5232 bool match;
5233
5234 rcu_read_lock();
5235 pcred = __task_cred(p);
5236 if (cred->user->user_ns == pcred->user->user_ns)
5237 match = (cred->euid == pcred->euid ||
5238 cred->euid == pcred->uid);
5239 else
5240 match = false;
5241 rcu_read_unlock();
5242 return match;
5243}
5244
5245static int __sched_setscheduler(struct task_struct *p, int policy,
5246 const struct sched_param *param, bool user)
5247{
5248 int retval, oldprio, oldpolicy = -1, on_rq, running;
5249 unsigned long flags;
5250 const struct sched_class *prev_class;
5251 struct rq *rq;
5252 int reset_on_fork;
5253
5254
5255 BUG_ON(in_interrupt());
5256recheck:
5257
5258 if (policy < 0) {
5259 reset_on_fork = p->sched_reset_on_fork;
5260 policy = oldpolicy = p->policy;
5261 } else {
5262 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
5263 policy &= ~SCHED_RESET_ON_FORK;
5264
5265 if (policy != SCHED_FIFO && policy != SCHED_RR &&
5266 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5267 policy != SCHED_IDLE)
5268 return -EINVAL;
5269 }
5270
5271
5272
5273
5274
5275
5276 if (param->sched_priority < 0 ||
5277 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
5278 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
5279 return -EINVAL;
5280 if (rt_policy(policy) != (param->sched_priority != 0))
5281 return -EINVAL;
5282
5283
5284
5285
5286 if (user && !capable(CAP_SYS_NICE)) {
5287 if (rt_policy(policy)) {
5288 unsigned long rlim_rtprio =
5289 task_rlimit(p, RLIMIT_RTPRIO);
5290
5291
5292 if (policy != p->policy && !rlim_rtprio)
5293 return -EPERM;
5294
5295
5296 if (param->sched_priority > p->rt_priority &&
5297 param->sched_priority > rlim_rtprio)
5298 return -EPERM;
5299 }
5300
5301
5302
5303
5304
5305 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
5306 if (!can_nice(p, TASK_NICE(p)))
5307 return -EPERM;
5308 }
5309
5310
5311 if (!check_same_owner(p))
5312 return -EPERM;
5313
5314
5315 if (p->sched_reset_on_fork && !reset_on_fork)
5316 return -EPERM;
5317 }
5318
5319 if (user) {
5320 retval = security_task_setscheduler(p);
5321 if (retval)
5322 return retval;
5323 }
5324
5325
5326
5327
5328
5329
5330
5331
5332 rq = task_rq_lock(p, &flags);
5333
5334
5335
5336
5337 if (p == rq->stop) {
5338 task_rq_unlock(rq, p, &flags);
5339 return -EINVAL;
5340 }
5341
5342
5343
5344
5345 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5346 param->sched_priority == p->rt_priority))) {
5347
5348 __task_rq_unlock(rq);
5349 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5350 return 0;
5351 }
5352
5353#ifdef CONFIG_RT_GROUP_SCHED
5354 if (user) {
5355
5356
5357
5358
5359 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5360 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5361 !task_group_is_autogroup(task_group(p))) {
5362 task_rq_unlock(rq, p, &flags);
5363 return -EPERM;
5364 }
5365 }
5366#endif
5367
5368
5369 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5370 policy = oldpolicy = -1;
5371 task_rq_unlock(rq, p, &flags);
5372 goto recheck;
5373 }
5374 on_rq = p->on_rq;
5375 running = task_current(rq, p);
5376 if (on_rq)
5377 deactivate_task(rq, p, 0);
5378 if (running)
5379 p->sched_class->put_prev_task(rq, p);
5380
5381 p->sched_reset_on_fork = reset_on_fork;
5382
5383 oldprio = p->prio;
5384 prev_class = p->sched_class;
5385 __setscheduler(rq, p, policy, param->sched_priority);
5386
5387 if (running)
5388 p->sched_class->set_curr_task(rq);
5389 if (on_rq)
5390 activate_task(rq, p, 0);
5391
5392 check_class_changed(rq, p, prev_class, oldprio);
5393 task_rq_unlock(rq, p, &flags);
5394
5395 rt_mutex_adjust_pi(p);
5396
5397 return 0;
5398}
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408int sched_setscheduler(struct task_struct *p, int policy,
5409 const struct sched_param *param)
5410{
5411 return __sched_setscheduler(p, policy, param, true);
5412}
5413EXPORT_SYMBOL_GPL(sched_setscheduler);
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5427 const struct sched_param *param)
5428{
5429 return __sched_setscheduler(p, policy, param, false);
5430}
5431
5432static int
5433do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5434{
5435 struct sched_param lparam;
5436 struct task_struct *p;
5437 int retval;
5438
5439 if (!param || pid < 0)
5440 return -EINVAL;
5441 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5442 return -EFAULT;
5443
5444 rcu_read_lock();
5445 retval = -ESRCH;
5446 p = find_process_by_pid(pid);
5447 if (p != NULL)
5448 retval = sched_setscheduler(p, policy, &lparam);
5449 rcu_read_unlock();
5450
5451 return retval;
5452}
5453
5454
5455
5456
5457
5458
5459
5460SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5461 struct sched_param __user *, param)
5462{
5463
5464 if (policy < 0)
5465 return -EINVAL;
5466
5467 return do_sched_setscheduler(pid, policy, param);
5468}
5469
5470
5471
5472
5473
5474
5475SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5476{
5477 return do_sched_setscheduler(pid, -1, param);
5478}
5479
5480
5481
5482
5483
5484SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5485{
5486 struct task_struct *p;
5487 int retval;
5488
5489 if (pid < 0)
5490 return -EINVAL;
5491
5492 retval = -ESRCH;
5493 rcu_read_lock();
5494 p = find_process_by_pid(pid);
5495 if (p) {
5496 retval = security_task_getscheduler(p);
5497 if (!retval)
5498 retval = p->policy
5499 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5500 }
5501 rcu_read_unlock();
5502 return retval;
5503}
5504
5505
5506
5507
5508
5509
5510SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5511{
5512 struct sched_param lp;
5513 struct task_struct *p;
5514 int retval;
5515
5516 if (!param || pid < 0)
5517 return -EINVAL;
5518
5519 rcu_read_lock();
5520 p = find_process_by_pid(pid);
5521 retval = -ESRCH;
5522 if (!p)
5523 goto out_unlock;
5524
5525 retval = security_task_getscheduler(p);
5526 if (retval)
5527 goto out_unlock;
5528
5529 lp.sched_priority = p->rt_priority;
5530 rcu_read_unlock();
5531
5532
5533
5534
5535 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5536
5537 return retval;
5538
5539out_unlock:
5540 rcu_read_unlock();
5541 return retval;
5542}
5543
5544long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5545{
5546 cpumask_var_t cpus_allowed, new_mask;
5547 struct task_struct *p;
5548 int retval;
5549
5550 get_online_cpus();
5551 rcu_read_lock();
5552
5553 p = find_process_by_pid(pid);
5554 if (!p) {
5555 rcu_read_unlock();
5556 put_online_cpus();
5557 return -ESRCH;
5558 }
5559
5560
5561 get_task_struct(p);
5562 rcu_read_unlock();
5563
5564 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5565 retval = -ENOMEM;
5566 goto out_put_task;
5567 }
5568 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5569 retval = -ENOMEM;
5570 goto out_free_cpus_allowed;
5571 }
5572 retval = -EPERM;
5573 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5574 goto out_unlock;
5575
5576 retval = security_task_setscheduler(p);
5577 if (retval)
5578 goto out_unlock;
5579
5580 cpuset_cpus_allowed(p, cpus_allowed);
5581 cpumask_and(new_mask, in_mask, cpus_allowed);
5582again:
5583 retval = set_cpus_allowed_ptr(p, new_mask);
5584
5585 if (!retval) {
5586 cpuset_cpus_allowed(p, cpus_allowed);
5587 if (!cpumask_subset(new_mask, cpus_allowed)) {
5588
5589
5590
5591
5592
5593 cpumask_copy(new_mask, cpus_allowed);
5594 goto again;
5595 }
5596 }
5597out_unlock:
5598 free_cpumask_var(new_mask);
5599out_free_cpus_allowed:
5600 free_cpumask_var(cpus_allowed);
5601out_put_task:
5602 put_task_struct(p);
5603 put_online_cpus();
5604 return retval;
5605}
5606
5607static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5608 struct cpumask *new_mask)
5609{
5610 if (len < cpumask_size())
5611 cpumask_clear(new_mask);
5612 else if (len > cpumask_size())
5613 len = cpumask_size();
5614
5615 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5616}
5617
5618
5619
5620
5621
5622
5623
5624SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5625 unsigned long __user *, user_mask_ptr)
5626{
5627 cpumask_var_t new_mask;
5628 int retval;
5629
5630 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5631 return -ENOMEM;
5632
5633 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5634 if (retval == 0)
5635 retval = sched_setaffinity(pid, new_mask);
5636 free_cpumask_var(new_mask);
5637 return retval;
5638}
5639
5640long sched_getaffinity(pid_t pid, struct cpumask *mask)
5641{
5642 struct task_struct *p;
5643 unsigned long flags;
5644 int retval;
5645
5646 get_online_cpus();
5647 rcu_read_lock();
5648
5649 retval = -ESRCH;
5650 p = find_process_by_pid(pid);
5651 if (!p)
5652 goto out_unlock;
5653
5654 retval = security_task_getscheduler(p);
5655 if (retval)
5656 goto out_unlock;
5657
5658 raw_spin_lock_irqsave(&p->pi_lock, flags);
5659 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5660 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5661
5662out_unlock:
5663 rcu_read_unlock();
5664 put_online_cpus();
5665
5666 return retval;
5667}
5668
5669
5670
5671
5672
5673
5674
5675SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5676 unsigned long __user *, user_mask_ptr)
5677{
5678 int ret;
5679 cpumask_var_t mask;
5680
5681 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
5682 return -EINVAL;
5683 if (len & (sizeof(unsigned long)-1))
5684 return -EINVAL;
5685
5686 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5687 return -ENOMEM;
5688
5689 ret = sched_getaffinity(pid, mask);
5690 if (ret == 0) {
5691 size_t retlen = min_t(size_t, len, cpumask_size());
5692
5693 if (copy_to_user(user_mask_ptr, mask, retlen))
5694 ret = -EFAULT;
5695 else
5696 ret = retlen;
5697 }
5698 free_cpumask_var(mask);
5699
5700 return ret;
5701}
5702
5703
5704
5705
5706
5707
5708
5709SYSCALL_DEFINE0(sched_yield)
5710{
5711 struct rq *rq = this_rq_lock();
5712
5713 schedstat_inc(rq, yld_count);
5714 current->sched_class->yield_task(rq);
5715
5716
5717
5718
5719
5720 __release(rq->lock);
5721 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5722 do_raw_spin_unlock(&rq->lock);
5723 preempt_enable_no_resched();
5724
5725 schedule();
5726
5727 return 0;
5728}
5729
5730static inline int should_resched(void)
5731{
5732 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
5733}
5734
5735static void __cond_resched(void)
5736{
5737 add_preempt_count(PREEMPT_ACTIVE);
5738 __schedule();
5739 sub_preempt_count(PREEMPT_ACTIVE);
5740}
5741
5742int __sched _cond_resched(void)
5743{
5744 if (should_resched()) {
5745 __cond_resched();
5746 return 1;
5747 }
5748 return 0;
5749}
5750EXPORT_SYMBOL(_cond_resched);
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760int __cond_resched_lock(spinlock_t *lock)
5761{
5762 int resched = should_resched();
5763 int ret = 0;
5764
5765 lockdep_assert_held(lock);
5766
5767 if (spin_needbreak(lock) || resched) {
5768 spin_unlock(lock);
5769 if (resched)
5770 __cond_resched();
5771 else
5772 cpu_relax();
5773 ret = 1;
5774 spin_lock(lock);
5775 }
5776 return ret;
5777}
5778EXPORT_SYMBOL(__cond_resched_lock);
5779
5780int __sched __cond_resched_softirq(void)
5781{
5782 BUG_ON(!in_softirq());
5783
5784 if (should_resched()) {
5785 local_bh_enable();
5786 __cond_resched();
5787 local_bh_disable();
5788 return 1;
5789 }
5790 return 0;
5791}
5792EXPORT_SYMBOL(__cond_resched_softirq);
5793
5794
5795
5796
5797
5798
5799
5800void __sched yield(void)
5801{
5802 set_current_state(TASK_RUNNING);
5803 sys_sched_yield();
5804}
5805EXPORT_SYMBOL(yield);
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819bool __sched yield_to(struct task_struct *p, bool preempt)
5820{
5821 struct task_struct *curr = current;
5822 struct rq *rq, *p_rq;
5823 unsigned long flags;
5824 bool yielded = 0;
5825
5826 local_irq_save(flags);
5827 rq = this_rq();
5828
5829again:
5830 p_rq = task_rq(p);
5831 double_rq_lock(rq, p_rq);
5832 while (task_rq(p) != p_rq) {
5833 double_rq_unlock(rq, p_rq);
5834 goto again;
5835 }
5836
5837 if (!curr->sched_class->yield_to_task)
5838 goto out;
5839
5840 if (curr->sched_class != p->sched_class)
5841 goto out;
5842
5843 if (task_running(p_rq, p) || p->state)
5844 goto out;
5845
5846 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5847 if (yielded) {
5848 schedstat_inc(rq, yld_count);
5849
5850
5851
5852
5853 if (preempt && rq != p_rq)
5854 resched_task(p_rq->curr);
5855 }
5856
5857out:
5858 double_rq_unlock(rq, p_rq);
5859 local_irq_restore(flags);
5860
5861 if (yielded)
5862 schedule();
5863
5864 return yielded;
5865}
5866EXPORT_SYMBOL_GPL(yield_to);
5867
5868
5869
5870
5871
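/*
 * io_schedule - mark this task as waiting on I/O and schedule.
 *
 * Accounts the sleep as iowait on the runqueue and in delay accounting,
 * and flushes any pending block plug before sleeping.
 */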
5872void __sched io_schedule(void)
5873{
5874 struct rq *rq = raw_rq();
5875
5876 delayacct_blkio_start();
5877 atomic_inc(&rq->nr_iowait);
5878 blk_flush_plug(current);
5879 current->in_iowait = 1;
5880 schedule();
5881 current->in_iowait = 0;
5882 atomic_dec(&rq->nr_iowait);
5883 delayacct_blkio_end();
5884}
5885EXPORT_SYMBOL(io_schedule);
5886
5887long __sched io_schedule_timeout(long timeout)
5888{
5889 struct rq *rq = raw_rq();
5890 long ret;
5891
5892 delayacct_blkio_start();
5893 atomic_inc(&rq->nr_iowait);
5894 blk_flush_plug(current);
5895 current->in_iowait = 1;
5896 ret = schedule_timeout(timeout);
5897 current->in_iowait = 0;
5898 atomic_dec(&rq->nr_iowait);
5899 delayacct_blkio_end();
5900 return ret;
5901}
5902
5903
5904
5905
5906
5907
5908
5909
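/*
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
 *
 * This syscall returns the maximum rt_priority that can be used
 * by a given scheduling class.
 */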
5910SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5911{
5912 int ret = -EINVAL;
5913
5914 switch (policy) {
5915 case SCHED_FIFO:
5916 case SCHED_RR:
5917 ret = MAX_USER_RT_PRIO-1;
5918 break;
5919 case SCHED_NORMAL:
5920 case SCHED_BATCH:
5921 case SCHED_IDLE:
5922 ret = 0;
5923 break;
5924 }
5925 return ret;
5926}
5927
5928
5929
5930
5931
5932
5933
5934
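/*
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * This syscall returns the minimum rt_priority that can be used
 * by a given scheduling class.
 */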
5935SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5936{
5937 int ret = -EINVAL;
5938
5939 switch (policy) {
5940 case SCHED_FIFO:
5941 case SCHED_RR:
5942 ret = 1;
5943 break;
5944 case SCHED_NORMAL:
5945 case SCHED_BATCH:
5946 case SCHED_IDLE:
5947 ret = 0;
5948 }
5949 return ret;
5950}
5951
5952
5953
5954
5955
5956
5957
5958
5959
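/*
 * sys_sched_rr_get_interval - return the timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * Writes the round-robin timeslice of the given task, as reported by its
 * scheduling class, into the user-space timespec buffer.  Returns 0 on
 * success, -EINVAL for a negative pid, -ESRCH if no such process exists.
 */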
5960SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5961 struct timespec __user *, interval)
5962{
5963 struct task_struct *p;
5964 unsigned int time_slice;
5965 unsigned long flags;
5966 struct rq *rq;
5967 int retval;
5968 struct timespec t;
5969
5970 if (pid < 0)
5971 return -EINVAL;
5972
5973 retval = -ESRCH;
5974 rcu_read_lock();
5975 p = find_process_by_pid(pid);
5976 if (!p)
5977 goto out_unlock;
5978
5979 retval = security_task_getscheduler(p);
5980 if (retval)
5981 goto out_unlock;
5982
5983 rq = task_rq_lock(p, &flags);
5984 time_slice = p->sched_class->get_rr_interval(rq, p);
5985 task_rq_unlock(rq, p, &flags);
5986
5987 rcu_read_unlock();
5988 jiffies_to_timespec(time_slice, &t);
5989 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5990 return retval;
5991
5992out_unlock:
5993 rcu_read_unlock();
5994 return retval;
5995}
5996
5997static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5998
5999void sched_show_task(struct task_struct *p)
6000{
6001 unsigned long free = 0;
6002 unsigned state;
6003
6004 state = p->state ? __ffs(p->state) + 1 : 0;
6005 printk(KERN_INFO "%-15.15s %c", p->comm,
6006 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
6007#if BITS_PER_LONG == 32
6008 if (state == TASK_RUNNING)
6009 printk(KERN_CONT " running ");
6010 else
6011 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
6012#else
6013 if (state == TASK_RUNNING)
6014 printk(KERN_CONT " running task ");
6015 else
6016 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
6017#endif
6018#ifdef CONFIG_DEBUG_STACK_USAGE
6019 free = stack_not_used(p);
6020#endif
6021 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6022 task_pid_nr(p), task_pid_nr(p->real_parent),
6023 (unsigned long)task_thread_info(p)->flags);
6024
6025 show_stack(p, NULL);
6026}
6027
6028void show_state_filter(unsigned long state_filter)
6029{
6030 struct task_struct *g, *p;
6031
6032#if BITS_PER_LONG == 32
6033 printk(KERN_INFO
6034 "  task                PC stack   pid father\n");
6035#else
6036 printk(KERN_INFO
6037 "  task                        PC stack   pid father\n");
6038#endif
6039 rcu_read_lock();
6040 do_each_thread(g, p) {
6041
6042
6043
6044
6045 touch_nmi_watchdog();
6046 if (!state_filter || (p->state & state_filter))
6047 sched_show_task(p);
6048 } while_each_thread(g, p);
6049
6050 touch_all_softlockup_watchdogs();
6051
6052#ifdef CONFIG_SCHED_DEBUG
6053 sysrq_sched_debug_show();
6054#endif
6055 rcu_read_unlock();
6056
6057
6058
6059 if (!state_filter)
6060 debug_show_all_locks();
6061}
6062
6063void __cpuinit init_idle_bootup_task(struct task_struct *idle)
6064{
6065 idle->sched_class = &idle_sched_class;
6066}
6067
6068
6069
6070
6071
6072
6073
6074
6075
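/*
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */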
6076void __cpuinit init_idle(struct task_struct *idle, int cpu)
6077{
6078 struct rq *rq = cpu_rq(cpu);
6079 unsigned long flags;
6080
6081 raw_spin_lock_irqsave(&rq->lock, flags);
6082
6083 __sched_fork(idle);
6084 idle->state = TASK_RUNNING;
6085 idle->se.exec_start = sched_clock();
6086
6087 do_set_cpus_allowed(idle, cpumask_of(cpu));
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098 rcu_read_lock();
6099 __set_task_cpu(idle, cpu);
6100 rcu_read_unlock();
6101
6102 rq->curr = rq->idle = idle;
6103#if defined(CONFIG_SMP)
6104 idle->on_cpu = 1;
6105#endif
6106 raw_spin_unlock_irqrestore(&rq->lock, flags);
6107
6108
6109 task_thread_info(idle)->preempt_count = 0;
6110
6111
6112
6113
6114 idle->sched_class = &idle_sched_class;
6115 ftrace_graph_init_idle_task(idle, cpu);
6116#if defined(CONFIG_SMP)
6117 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
6118#endif
6119}
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130static int get_update_sysctl_factor(void)
6131{
6132 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6133 unsigned int factor;
6134
6135 switch (sysctl_sched_tunable_scaling) {
6136 case SCHED_TUNABLESCALING_NONE:
6137 factor = 1;
6138 break;
6139 case SCHED_TUNABLESCALING_LINEAR:
6140 factor = cpus;
6141 break;
6142 case SCHED_TUNABLESCALING_LOG:
6143 default:
6144 factor = 1 + ilog2(cpus);
6145 break;
6146 }
6147
6148 return factor;
6149}
6150
6151static void update_sysctl(void)
6152{
6153 unsigned int factor = get_update_sysctl_factor();
6154
6155#define SET_SYSCTL(name) \
6156 (sysctl_##name = (factor) * normalized_sysctl_##name)
6157 SET_SYSCTL(sched_min_granularity);
6158 SET_SYSCTL(sched_latency);
6159 SET_SYSCTL(sched_wakeup_granularity);
6160#undef SET_SYSCTL
6161}
6162
6163static inline void sched_init_granularity(void)
6164{
6165 update_sysctl();
6166}
6167
6168#ifdef CONFIG_SMP
6169void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6170{
6171 if (p->sched_class && p->sched_class->set_cpus_allowed)
6172 p->sched_class->set_cpus_allowed(p, new_mask);
6173
6174 cpumask_copy(&p->cpus_allowed, new_mask);
6175 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6176}
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
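/*
 * set_cpus_allowed_ptr - change a task's CPU affinity mask.
 * @p: the task whose affinity is changed
 * @new_mask: the new allowed CPU mask
 *
 * If the task is runnable and its current CPU is no longer in the new
 * mask, the per-CPU stopper thread pushes it onto an allowed, active CPU;
 * otherwise the new mask simply takes effect the next time the task runs.
 * Returns 0 on success, -EINVAL if the mask contains no active CPU or the
 * task is bound to its CPU (PF_THREAD_BOUND).
 */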
6201int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6202{
6203 unsigned long flags;
6204 struct rq *rq;
6205 unsigned int dest_cpu;
6206 int ret = 0;
6207
6208 rq = task_rq_lock(p, &flags);
6209
6210 if (cpumask_equal(&p->cpus_allowed, new_mask))
6211 goto out;
6212
6213 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
6214 ret = -EINVAL;
6215 goto out;
6216 }
6217
6218 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
6219 ret = -EINVAL;
6220 goto out;
6221 }
6222
6223 do_set_cpus_allowed(p, new_mask);
6224
6225
6226 if (cpumask_test_cpu(task_cpu(p), new_mask))
6227 goto out;
6228
6229 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
6230 if (p->on_rq) {
6231 struct migration_arg arg = { p, dest_cpu };
6232
6233 task_rq_unlock(rq, p, &flags);
6234 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
6235 tlb_migrate_finish(p->mm);
6236 return 0;
6237 }
6238out:
6239 task_rq_unlock(rq, p, &flags);
6240
6241 return ret;
6242}
6243EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
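/*
 * Move a (non-current) task off this CPU onto the destination CPU,
 * either because it may no longer run here (set_cpus_allowed() moved it
 * away, or the CPU is going down) or because we are rebalancing it.
 *
 * Both runqueues are locked via double_rq_lock().  Returns non-zero if
 * the migration succeeded (or the task had already left @src_cpu).
 */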
6256static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6257{
6258 struct rq *rq_dest, *rq_src;
6259 int ret = 0;
6260
6261 if (unlikely(!cpu_active(dest_cpu)))
6262 return ret;
6263
6264 rq_src = cpu_rq(src_cpu);
6265 rq_dest = cpu_rq(dest_cpu);
6266
6267 raw_spin_lock(&p->pi_lock);
6268 double_rq_lock(rq_src, rq_dest);
6269
6270 if (task_cpu(p) != src_cpu)
6271 goto done;
6272
6273 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
6274 goto fail;
6275
6276
6277
6278
6279
6280 if (p->on_rq) {
6281 deactivate_task(rq_src, p, 0);
6282 set_task_cpu(p, dest_cpu);
6283 activate_task(rq_dest, p, 0);
6284 check_preempt_curr(rq_dest, p, 0);
6285 }
6286done:
6287 ret = 1;
6288fail:
6289 double_rq_unlock(rq_src, rq_dest);
6290 raw_spin_unlock(&p->pi_lock);
6291 return ret;
6292}
6293
6294
6295
6296
6297
6298
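/*
 * migration_cpu_stop - executed by a high-priority stopper thread; performs
 * the migration by bumping the task off its CPU and pushing it onto the
 * destination runqueue.
 */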
6299static int migration_cpu_stop(void *data)
6300{
6301 struct migration_arg *arg = data;
6302
6303
6304
6305
6306
6307 local_irq_disable();
6308 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
6309 local_irq_enable();
6310 return 0;
6311}
6312
6313#ifdef CONFIG_HOTPLUG_CPU
6314
6315
6316
6317
6318
6319void idle_task_exit(void)
6320{
6321 struct mm_struct *mm = current->active_mm;
6322
6323 BUG_ON(cpu_online(smp_processor_id()));
6324
6325 if (mm != &init_mm)
6326 switch_mm(mm, &init_mm, current);
6327 mmdrop(mm);
6328}
6329
6330
6331
6332
6333
6334
6335
6336
6337static void migrate_nr_uninterruptible(struct rq *rq_src)
6338{
6339 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
6340
6341 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6342 rq_src->nr_uninterruptible = 0;
6343}
6344
6345
6346
6347
6348static void calc_global_load_remove(struct rq *rq)
6349{
6350 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6351 rq->calc_load_active = 0;
6352}
6353
6354#ifdef CONFIG_CFS_BANDWIDTH
6355static void unthrottle_offline_cfs_rqs(struct rq *rq)
6356{
6357 struct cfs_rq *cfs_rq;
6358
6359 for_each_leaf_cfs_rq(rq, cfs_rq) {
6360 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6361
6362 if (!cfs_rq->runtime_enabled)
6363 continue;
6364
6365
6366
6367
6368
6369 cfs_rq->runtime_remaining = cfs_b->quota;
6370 if (cfs_rq_throttled(cfs_rq))
6371 unthrottle_cfs_rq(cfs_rq);
6372 }
6373}
6374#else
6375static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6376#endif
6377
6378
6379
6380
6381
6382
6383
6384
6385
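/*
 * Migrate all runnable tasks off @dead_cpu's runqueue to fallback CPUs.
 * Sleeping tasks are migrated later, when they wake up.  Called with
 * rq->lock held; the per-CPU stop task is temporarily hidden so that
 * pick_next_task() never selects it.
 */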
6386static void migrate_tasks(unsigned int dead_cpu)
6387{
6388 struct rq *rq = cpu_rq(dead_cpu);
6389 struct task_struct *next, *stop = rq->stop;
6390 int dest_cpu;
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401 rq->stop = NULL;
6402
6403
6404 unthrottle_offline_cfs_rqs(rq);
6405
6406 for ( ; ; ) {
6407
6408
6409
6410
6411 if (rq->nr_running == 1)
6412 break;
6413
6414 next = pick_next_task(rq);
6415 BUG_ON(!next);
6416 next->sched_class->put_prev_task(rq, next);
6417
6418
6419 dest_cpu = select_fallback_rq(dead_cpu, next);
6420 raw_spin_unlock(&rq->lock);
6421
6422 __migrate_task(next, dead_cpu, dest_cpu);
6423
6424 raw_spin_lock(&rq->lock);
6425 }
6426
6427 rq->stop = stop;
6428}
6429
6430#endif
6431
6432#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6433
6434static struct ctl_table sd_ctl_dir[] = {
6435 {
6436 .procname = "sched_domain",
6437 .mode = 0555,
6438 },
6439 {}
6440};
6441
6442static struct ctl_table sd_ctl_root[] = {
6443 {
6444 .procname = "kernel",
6445 .mode = 0555,
6446 .child = sd_ctl_dir,
6447 },
6448 {}
6449};
6450
6451static struct ctl_table *sd_alloc_ctl_entry(int n)
6452{
6453 struct ctl_table *entry =
6454 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6455
6456 return entry;
6457}
6458
6459static void sd_free_ctl_entry(struct ctl_table **tablep)
6460{
6461 struct ctl_table *entry;
6462
6463
6464
6465
6466
6467
6468
6469 for (entry = *tablep; entry->mode; entry++) {
6470 if (entry->child)
6471 sd_free_ctl_entry(&entry->child);
6472 if (entry->proc_handler == NULL)
6473 kfree(entry->procname);
6474 }
6475
6476 kfree(*tablep);
6477 *tablep = NULL;
6478}
6479
6480static void
6481set_table_entry(struct ctl_table *entry,
6482 const char *procname, void *data, int maxlen,
6483 mode_t mode, proc_handler *proc_handler)
6484{
6485 entry->procname = procname;
6486 entry->data = data;
6487 entry->maxlen = maxlen;
6488 entry->mode = mode;
6489 entry->proc_handler = proc_handler;
6490}
6491
6492static struct ctl_table *
6493sd_alloc_ctl_domain_table(struct sched_domain *sd)
6494{
6495 struct ctl_table *table = sd_alloc_ctl_entry(13);
6496
6497 if (table == NULL)
6498 return NULL;
6499
6500 set_table_entry(&table[0], "min_interval", &sd->min_interval,
6501 sizeof(long), 0644, proc_doulongvec_minmax);
6502 set_table_entry(&table[1], "max_interval", &sd->max_interval,
6503 sizeof(long), 0644, proc_doulongvec_minmax);
6504 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6505 sizeof(int), 0644, proc_dointvec_minmax);
6506 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6507 sizeof(int), 0644, proc_dointvec_minmax);
6508 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6509 sizeof(int), 0644, proc_dointvec_minmax);
6510 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6511 sizeof(int), 0644, proc_dointvec_minmax);
6512 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6513 sizeof(int), 0644, proc_dointvec_minmax);
6514 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6515 sizeof(int), 0644, proc_dointvec_minmax);
6516 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6517 sizeof(int), 0644, proc_dointvec_minmax);
6518 set_table_entry(&table[9], "cache_nice_tries",
6519 &sd->cache_nice_tries,
6520 sizeof(int), 0644, proc_dointvec_minmax);
6521 set_table_entry(&table[10], "flags", &sd->flags,
6522 sizeof(int), 0644, proc_dointvec_minmax);
6523 set_table_entry(&table[11], "name", sd->name,
6524 CORENAME_MAX_SIZE, 0444, proc_dostring);
6525
6526
6527 return table;
6528}
6529
6530static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6531{
6532 struct ctl_table *entry, *table;
6533 struct sched_domain *sd;
6534 int domain_num = 0, i;
6535 char buf[32];
6536
6537 for_each_domain(cpu, sd)
6538 domain_num++;
6539 entry = table = sd_alloc_ctl_entry(domain_num + 1);
6540 if (table == NULL)
6541 return NULL;
6542
6543 i = 0;
6544 for_each_domain(cpu, sd) {
6545 snprintf(buf, 32, "domain%d", i);
6546 entry->procname = kstrdup(buf, GFP_KERNEL);
6547 entry->mode = 0555;
6548 entry->child = sd_alloc_ctl_domain_table(sd);
6549 entry++;
6550 i++;
6551 }
6552 return table;
6553}
6554
6555static struct ctl_table_header *sd_sysctl_header;
6556static void register_sched_domain_sysctl(void)
6557{
6558 int i, cpu_num = num_possible_cpus();
6559 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6560 char buf[32];
6561
6562 WARN_ON(sd_ctl_dir[0].child);
6563 sd_ctl_dir[0].child = entry;
6564
6565 if (entry == NULL)
6566 return;
6567
6568 for_each_possible_cpu(i) {
6569 snprintf(buf, 32, "cpu%d", i);
6570 entry->procname = kstrdup(buf, GFP_KERNEL);
6571 entry->mode = 0555;
6572 entry->child = sd_alloc_ctl_cpu_table(i);
6573 entry++;
6574 }
6575
6576 WARN_ON(sd_sysctl_header);
6577 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6578}
6579
6580
6581static void unregister_sched_domain_sysctl(void)
6582{
6583 if (sd_sysctl_header)
6584 unregister_sysctl_table(sd_sysctl_header);
6585 sd_sysctl_header = NULL;
6586 if (sd_ctl_dir[0].child)
6587 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6588}
6589#else
6590static void register_sched_domain_sysctl(void)
6591{
6592}
6593static void unregister_sched_domain_sysctl(void)
6594{
6595}
6596#endif
6597
6598static void set_rq_online(struct rq *rq)
6599{
6600 if (!rq->online) {
6601 const struct sched_class *class;
6602
6603 cpumask_set_cpu(rq->cpu, rq->rd->online);
6604 rq->online = 1;
6605
6606 for_each_class(class) {
6607 if (class->rq_online)
6608 class->rq_online(rq);
6609 }
6610 }
6611}
6612
6613static void set_rq_offline(struct rq *rq)
6614{
6615 if (rq->online) {
6616 const struct sched_class *class;
6617
6618 for_each_class(class) {
6619 if (class->rq_offline)
6620 class->rq_offline(rq);
6621 }
6622
6623 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6624 rq->online = 0;
6625 }
6626}
6627
6628
6629
6630
6631
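/*
 * migration_call - CPU hotplug callback: prepares the runqueue of a CPU
 * coming up, marks it online/offline in its root domain, and on CPU_DYING
 * pushes all remaining tasks off the dying CPU and folds its load
 * accounting into a surviving runqueue.
 */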
6632static int __cpuinit
6633migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6634{
6635 int cpu = (long)hcpu;
6636 unsigned long flags;
6637 struct rq *rq = cpu_rq(cpu);
6638
6639 switch (action & ~CPU_TASKS_FROZEN) {
6640
6641 case CPU_UP_PREPARE:
6642 rq->calc_load_update = calc_load_update;
6643 break;
6644
6645 case CPU_ONLINE:
6646
6647 raw_spin_lock_irqsave(&rq->lock, flags);
6648 if (rq->rd) {
6649 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6650
6651 set_rq_online(rq);
6652 }
6653 raw_spin_unlock_irqrestore(&rq->lock, flags);
6654 break;
6655
6656#ifdef CONFIG_HOTPLUG_CPU
6657 case CPU_DYING:
6658 sched_ttwu_pending();
6659
6660 raw_spin_lock_irqsave(&rq->lock, flags);
6661 if (rq->rd) {
6662 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6663 set_rq_offline(rq);
6664 }
6665 migrate_tasks(cpu);
6666 BUG_ON(rq->nr_running != 1);
6667 raw_spin_unlock_irqrestore(&rq->lock, flags);
6668
6669 migrate_nr_uninterruptible(rq);
6670 calc_global_load_remove(rq);
6671 break;
6672#endif
6673 }
6674
6675 update_max_interval();
6676
6677 return NOTIFY_OK;
6678}
6679
6680
6681
6682
6683
6684
6685static struct notifier_block __cpuinitdata migration_notifier = {
6686 .notifier_call = migration_call,
6687 .priority = CPU_PRI_MIGRATION,
6688};
6689
6690static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6691 unsigned long action, void *hcpu)
6692{
6693 switch (action & ~CPU_TASKS_FROZEN) {
6694 case CPU_ONLINE:
6695 case CPU_DOWN_FAILED:
6696 set_cpu_active((long)hcpu, true);
6697 return NOTIFY_OK;
6698 default:
6699 return NOTIFY_DONE;
6700 }
6701}
6702
6703static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6704 unsigned long action, void *hcpu)
6705{
6706 switch (action & ~CPU_TASKS_FROZEN) {
6707 case CPU_DOWN_PREPARE:
6708 set_cpu_active((long)hcpu, false);
6709 return NOTIFY_OK;
6710 default:
6711 return NOTIFY_DONE;
6712 }
6713}
6714
6715static int __init migration_init(void)
6716{
6717 void *cpu = (void *)(long)smp_processor_id();
6718 int err;
6719
6720
6721 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6722 BUG_ON(err == NOTIFY_BAD);
6723 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6724 register_cpu_notifier(&migration_notifier);
6725
6726
6727 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6728 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6729
6730 return 0;
6731}
6732early_initcall(migration_init);
6733#endif
6734
6735#ifdef CONFIG_SMP
6736
6737static cpumask_var_t sched_domains_tmpmask;
6738
6739#ifdef CONFIG_SCHED_DEBUG
6740
6741static __read_mostly int sched_domain_debug_enabled;
6742
6743static int __init sched_domain_debug_setup(char *str)
6744{
6745 sched_domain_debug_enabled = 1;
6746
6747 return 0;
6748}
6749early_param("sched_debug", sched_domain_debug_setup);
6750
6751static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6752 struct cpumask *groupmask)
6753{
6754 struct sched_group *group = sd->groups;
6755 char str[256];
6756
6757 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6758 cpumask_clear(groupmask);
6759
6760 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6761
6762 if (!(sd->flags & SD_LOAD_BALANCE)) {
6763 printk("does not load-balance\n");
6764 if (sd->parent)
6765 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6766 " has parent");
6767 return -1;
6768 }
6769
6770 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6771
6772 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6773 printk(KERN_ERR "ERROR: domain->span does not contain "
6774 "CPU%d\n", cpu);
6775 }
6776 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6777 printk(KERN_ERR "ERROR: domain->groups does not contain"
6778 " CPU%d\n", cpu);
6779 }
6780
6781 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6782 do {
6783 if (!group) {
6784 printk("\n");
6785 printk(KERN_ERR "ERROR: group is NULL\n");
6786 break;
6787 }
6788
6789 if (!group->sgp->power) {
6790 printk(KERN_CONT "\n");
6791 printk(KERN_ERR "ERROR: domain->cpu_power not "
6792 "set\n");
6793 break;
6794 }
6795
6796 if (!cpumask_weight(sched_group_cpus(group))) {
6797 printk(KERN_CONT "\n");
6798 printk(KERN_ERR "ERROR: empty group\n");
6799 break;
6800 }
6801
6802 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6803 printk(KERN_CONT "\n");
6804 printk(KERN_ERR "ERROR: repeated CPUs\n");
6805 break;
6806 }
6807
6808 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6809
6810 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6811
6812 printk(KERN_CONT " %s", str);
6813 if (group->sgp->power != SCHED_POWER_SCALE) {
6814 printk(KERN_CONT " (cpu_power = %d)",
6815 group->sgp->power);
6816 }
6817
6818 group = group->next;
6819 } while (group != sd->groups);
6820 printk(KERN_CONT "\n");
6821
6822 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6823 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6824
6825 if (sd->parent &&
6826 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6827 printk(KERN_ERR "ERROR: parent span is not a superset "
6828 "of domain->span\n");
6829 return 0;
6830}
6831
6832static void sched_domain_debug(struct sched_domain *sd, int cpu)
6833{
6834 int level = 0;
6835
6836 if (!sched_domain_debug_enabled)
6837 return;
6838
6839 if (!sd) {
6840 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6841 return;
6842 }
6843
6844 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6845
6846 for (;;) {
6847 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6848 break;
6849 level++;
6850 sd = sd->parent;
6851 if (!sd)
6852 break;
6853 }
6854}
6855#else
6856# define sched_domain_debug(sd, cpu) do { } while (0)
6857#endif
6858
6859static int sd_degenerate(struct sched_domain *sd)
6860{
6861 if (cpumask_weight(sched_domain_span(sd)) == 1)
6862 return 1;
6863
6864
6865 if (sd->flags & (SD_LOAD_BALANCE |
6866 SD_BALANCE_NEWIDLE |
6867 SD_BALANCE_FORK |
6868 SD_BALANCE_EXEC |
6869 SD_SHARE_CPUPOWER |
6870 SD_SHARE_PKG_RESOURCES)) {
6871 if (sd->groups != sd->groups->next)
6872 return 0;
6873 }
6874
6875
6876 if (sd->flags & (SD_WAKE_AFFINE))
6877 return 0;
6878
6879 return 1;
6880}
6881
6882static int
6883sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6884{
6885 unsigned long cflags = sd->flags, pflags = parent->flags;
6886
6887 if (sd_degenerate(parent))
6888 return 1;
6889
6890 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6891 return 0;
6892
6893
6894 if (parent->groups == parent->groups->next) {
6895 pflags &= ~(SD_LOAD_BALANCE |
6896 SD_BALANCE_NEWIDLE |
6897 SD_BALANCE_FORK |
6898 SD_BALANCE_EXEC |
6899 SD_SHARE_CPUPOWER |
6900 SD_SHARE_PKG_RESOURCES);
6901 if (nr_node_ids == 1)
6902 pflags &= ~SD_SERIALIZE;
6903 }
6904 if (~cflags & pflags)
6905 return 0;
6906
6907 return 1;
6908}
6909
6910static void free_rootdomain(struct rcu_head *rcu)
6911{
6912 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6913
6914 cpupri_cleanup(&rd->cpupri);
6915 free_cpumask_var(rd->rto_mask);
6916 free_cpumask_var(rd->online);
6917 free_cpumask_var(rd->span);
6918 kfree(rd);
6919}
6920
6921static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6922{
6923 struct root_domain *old_rd = NULL;
6924 unsigned long flags;
6925
6926 raw_spin_lock_irqsave(&rq->lock, flags);
6927
6928 if (rq->rd) {
6929 old_rd = rq->rd;
6930
6931 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6932 set_rq_offline(rq);
6933
6934 cpumask_clear_cpu(rq->cpu, old_rd->span);
6935
6936
6937
6938
6939
6940
6941 if (!atomic_dec_and_test(&old_rd->refcount))
6942 old_rd = NULL;
6943 }
6944
6945 atomic_inc(&rd->refcount);
6946 rq->rd = rd;
6947
6948 cpumask_set_cpu(rq->cpu, rd->span);
6949 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
6950 set_rq_online(rq);
6951
6952 raw_spin_unlock_irqrestore(&rq->lock, flags);
6953
6954 if (old_rd)
6955 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6956}
6957
6958static int init_rootdomain(struct root_domain *rd)
6959{
6960 memset(rd, 0, sizeof(*rd));
6961
6962 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6963 goto out;
6964 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6965 goto free_span;
6966 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6967 goto free_online;
6968
6969 if (cpupri_init(&rd->cpupri) != 0)
6970 goto free_rto_mask;
6971 return 0;
6972
6973free_rto_mask:
6974 free_cpumask_var(rd->rto_mask);
6975free_online:
6976 free_cpumask_var(rd->online);
6977free_span:
6978 free_cpumask_var(rd->span);
6979out:
6980 return -ENOMEM;
6981}
6982
6983static void init_defrootdomain(void)
6984{
6985 init_rootdomain(&def_root_domain);
6986
6987 atomic_set(&def_root_domain.refcount, 1);
6988}
6989
6990static struct root_domain *alloc_rootdomain(void)
6991{
6992 struct root_domain *rd;
6993
6994 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6995 if (!rd)
6996 return NULL;
6997
6998 if (init_rootdomain(rd) != 0) {
6999 kfree(rd);
7000 return NULL;
7001 }
7002
7003 return rd;
7004}
7005
7006static void free_sched_groups(struct sched_group *sg, int free_sgp)
7007{
7008 struct sched_group *tmp, *first;
7009
7010 if (!sg)
7011 return;
7012
7013 first = sg;
7014 do {
7015 tmp = sg->next;
7016
7017 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
7018 kfree(sg->sgp);
7019
7020 kfree(sg);
7021 sg = tmp;
7022 } while (sg != first);
7023}
7024
7025static void free_sched_domain(struct rcu_head *rcu)
7026{
7027 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
7028
7029
7030
7031
7032
7033 if (sd->flags & SD_OVERLAP) {
7034 free_sched_groups(sd->groups, 1);
7035 } else if (atomic_dec_and_test(&sd->groups->ref)) {
7036 kfree(sd->groups->sgp);
7037 kfree(sd->groups);
7038 }
7039 kfree(sd);
7040}
7041
7042static void destroy_sched_domain(struct sched_domain *sd, int cpu)
7043{
7044 call_rcu(&sd->rcu, free_sched_domain);
7045}
7046
7047static void destroy_sched_domains(struct sched_domain *sd, int cpu)
7048{
7049 for (; sd; sd = sd->parent)
7050 destroy_sched_domain(sd, cpu);
7051}
7052
7053
7054
7055
7056
7057static void
7058cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7059{
7060 struct rq *rq = cpu_rq(cpu);
7061 struct sched_domain *tmp;
7062
7063
7064 for (tmp = sd; tmp; ) {
7065 struct sched_domain *parent = tmp->parent;
7066 if (!parent)
7067 break;
7068
7069 if (sd_parent_degenerate(tmp, parent)) {
7070 tmp->parent = parent->parent;
7071 if (parent->parent)
7072 parent->parent->child = tmp;
7073 destroy_sched_domain(parent, cpu);
7074 } else
7075 tmp = tmp->parent;
7076 }
7077
7078 if (sd && sd_degenerate(sd)) {
7079 tmp = sd;
7080 sd = sd->parent;
7081 destroy_sched_domain(tmp, cpu);
7082 if (sd)
7083 sd->child = NULL;
7084 }
7085
7086 sched_domain_debug(sd, cpu);
7087
7088 rq_attach_root(rq, rd);
7089 tmp = rq->sd;
7090 rcu_assign_pointer(rq->sd, sd);
7091 destroy_sched_domains(tmp, cpu);
7092}
7093
7094
7095static cpumask_var_t cpu_isolated_map;
7096
7097
7098static int __init isolated_cpu_setup(char *str)
7099{
7100 alloc_bootmem_cpumask_var(&cpu_isolated_map);
7101 cpulist_parse(str, cpu_isolated_map);
7102 return 1;
7103}
7104
7105__setup("isolcpus=", isolated_cpu_setup);
7106
7107#ifdef CONFIG_NUMA
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119static int find_next_best_node(int node, nodemask_t *used_nodes)
7120{
7121 int i, n, val, min_val, best_node = -1;
7122
7123 min_val = INT_MAX;
7124
7125 for (i = 0; i < nr_node_ids; i++) {
7126
7127 n = (node + i) % nr_node_ids;
7128
7129 if (!nr_cpus_node(n))
7130 continue;
7131
7132
7133 if (node_isset(n, *used_nodes))
7134 continue;
7135
7136
7137 val = node_distance(node, n);
7138
7139 if (val < min_val) {
7140 min_val = val;
7141 best_node = n;
7142 }
7143 }
7144
7145 if (best_node != -1)
7146 node_set(best_node, *used_nodes);
7147 return best_node;
7148}
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159static void sched_domain_node_span(int node, struct cpumask *span)
7160{
7161 nodemask_t used_nodes;
7162 int i;
7163
7164 cpumask_clear(span);
7165 nodes_clear(used_nodes);
7166
7167 cpumask_or(span, span, cpumask_of_node(node));
7168 node_set(node, used_nodes);
7169
7170 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
7171 int next_node = find_next_best_node(node, &used_nodes);
7172 if (next_node < 0)
7173 break;
7174 cpumask_or(span, span, cpumask_of_node(next_node));
7175 }
7176}
7177
7178static const struct cpumask *cpu_node_mask(int cpu)
7179{
7180 lockdep_assert_held(&sched_domains_mutex);
7181
7182 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
7183
7184 return sched_domains_tmpmask;
7185}
7186
7187static const struct cpumask *cpu_allnodes_mask(int cpu)
7188{
7189 return cpu_possible_mask;
7190}
7191#endif
7192
7193static const struct cpumask *cpu_cpu_mask(int cpu)
7194{
7195 return cpumask_of_node(cpu_to_node(cpu));
7196}
7197
7198int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7199
7200struct sd_data {
7201 struct sched_domain **__percpu sd;
7202 struct sched_group **__percpu sg;
7203 struct sched_group_power **__percpu sgp;
7204};
7205
7206struct s_data {
7207 struct sched_domain ** __percpu sd;
7208 struct root_domain *rd;
7209};
7210
7211enum s_alloc {
7212 sa_rootdomain,
7213 sa_sd,
7214 sa_sd_storage,
7215 sa_none,
7216};
7217
7218struct sched_domain_topology_level;
7219
7220typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
7221typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
7222
7223#define SDTL_OVERLAP 0x01
7224
7225struct sched_domain_topology_level {
7226 sched_domain_init_f init;
7227 sched_domain_mask_f mask;
7228 int flags;
7229 struct sd_data data;
7230};
7231
7232static int
7233build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7234{
7235 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
7236 const struct cpumask *span = sched_domain_span(sd);
7237 struct cpumask *covered = sched_domains_tmpmask;
7238 struct sd_data *sdd = sd->private;
7239 struct sched_domain *child;
7240 int i;
7241
7242 cpumask_clear(covered);
7243
7244 for_each_cpu(i, span) {
7245 struct cpumask *sg_span;
7246
7247 if (cpumask_test_cpu(i, covered))
7248 continue;
7249
7250 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7251 GFP_KERNEL, cpu_to_node(i));
7252
7253 if (!sg)
7254 goto fail;
7255
7256 sg_span = sched_group_cpus(sg);
7257
7258 child = *per_cpu_ptr(sdd->sd, i);
7259 if (child->child) {
7260 child = child->child;
7261 cpumask_copy(sg_span, sched_domain_span(child));
7262 } else
7263 cpumask_set_cpu(i, sg_span);
7264
7265 cpumask_or(covered, covered, sg_span);
7266
7267 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
7268 atomic_inc(&sg->sgp->ref);
7269
7270 if (cpumask_test_cpu(cpu, sg_span))
7271 groups = sg;
7272
7273 if (!first)
7274 first = sg;
7275 if (last)
7276 last->next = sg;
7277 last = sg;
7278 last->next = first;
7279 }
7280 sd->groups = groups;
7281
7282 return 0;
7283
7284fail:
7285 free_sched_groups(first, 0);
7286
7287 return -ENOMEM;
7288}
7289
7290static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
7291{
7292 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
7293 struct sched_domain *child = sd->child;
7294
7295 if (child)
7296 cpu = cpumask_first(sched_domain_span(child));
7297
7298 if (sg) {
7299 *sg = *per_cpu_ptr(sdd->sg, cpu);
7300 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7301 atomic_set(&(*sg)->sgp->ref, 1);
7302 }
7303
7304 return cpu;
7305}
7306
7307
7308
7309
7310
7311
7312
7313
7314static int
7315build_sched_groups(struct sched_domain *sd, int cpu)
7316{
7317 struct sched_group *first = NULL, *last = NULL;
7318 struct sd_data *sdd = sd->private;
7319 const struct cpumask *span = sched_domain_span(sd);
7320 struct cpumask *covered;
7321 int i;
7322
7323 get_group(cpu, sdd, &sd->groups);
7324 atomic_inc(&sd->groups->ref);
7325
7326 if (cpu != cpumask_first(sched_domain_span(sd)))
7327 return 0;
7328
7329 lockdep_assert_held(&sched_domains_mutex);
7330 covered = sched_domains_tmpmask;
7331
7332 cpumask_clear(covered);
7333
7334 for_each_cpu(i, span) {
7335 struct sched_group *sg;
7336 int group = get_group(i, sdd, &sg);
7337 int j;
7338
7339 if (cpumask_test_cpu(i, covered))
7340 continue;
7341
7342 cpumask_clear(sched_group_cpus(sg));
7343 sg->sgp->power = 0;
7344
7345 for_each_cpu(j, span) {
7346 if (get_group(j, sdd, NULL) != group)
7347 continue;
7348
7349 cpumask_set_cpu(j, covered);
7350 cpumask_set_cpu(j, sched_group_cpus(sg));
7351 }
7352
7353 if (!first)
7354 first = sg;
7355 if (last)
7356 last->next = sg;
7357 last = sg;
7358 }
7359 last->next = first;
7360
7361 return 0;
7362}
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7375{
7376 struct sched_group *sg = sd->groups;
7377
7378 WARN_ON(!sd || !sg);
7379
7380 do {
7381 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7382 sg = sg->next;
7383 } while (sg != sd->groups);
7384
7385 if (cpu != group_first_cpu(sg))
7386 return;
7387
7388 update_group_power(sd, cpu);
7389}
7390
7391
7392
7393
7394
7395
7396#ifdef CONFIG_SCHED_DEBUG
7397# define SD_INIT_NAME(sd, type) sd->name = #type
7398#else
7399# define SD_INIT_NAME(sd, type) do { } while (0)
7400#endif
7401
7402#define SD_INIT_FUNC(type) \
7403static noinline struct sched_domain * \
7404sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7405{ \
7406 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7407 *sd = SD_##type##_INIT; \
7408 SD_INIT_NAME(sd, type); \
7409 sd->private = &tl->data; \
7410 return sd; \
7411}
7412
7413SD_INIT_FUNC(CPU)
7414#ifdef CONFIG_NUMA
7415 SD_INIT_FUNC(ALLNODES)
7416 SD_INIT_FUNC(NODE)
7417#endif
7418#ifdef CONFIG_SCHED_SMT
7419 SD_INIT_FUNC(SIBLING)
7420#endif
7421#ifdef CONFIG_SCHED_MC
7422 SD_INIT_FUNC(MC)
7423#endif
7424#ifdef CONFIG_SCHED_BOOK
7425 SD_INIT_FUNC(BOOK)
7426#endif
7427
7428static int default_relax_domain_level = -1;
7429int sched_domain_level_max;
7430
7431static int __init setup_relax_domain_level(char *str)
7432{
7433 unsigned long val;
7434
7435 val = simple_strtoul(str, NULL, 0);
7436 if (val < sched_domain_level_max)
7437 default_relax_domain_level = val;
7438
7439 return 1;
7440}
7441__setup("relax_domain_level=", setup_relax_domain_level);
7442
7443static void set_domain_attribute(struct sched_domain *sd,
7444 struct sched_domain_attr *attr)
7445{
7446 int request;
7447
7448 if (!attr || attr->relax_domain_level < 0) {
7449 if (default_relax_domain_level < 0)
7450 return;
7451 else
7452 request = default_relax_domain_level;
7453 } else
7454 request = attr->relax_domain_level;
7455 if (request < sd->level) {
7456
7457 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7458 } else {
7459
7460 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7461 }
7462}
7463
7464static void __sdt_free(const struct cpumask *cpu_map);
7465static int __sdt_alloc(const struct cpumask *cpu_map);
7466
7467static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7468 const struct cpumask *cpu_map)
7469{
7470 switch (what) {
7471 case sa_rootdomain:
7472 if (!atomic_read(&d->rd->refcount))
7473 free_rootdomain(&d->rd->rcu);
7474 case sa_sd:
7475 free_percpu(d->sd);
7476 case sa_sd_storage:
7477 __sdt_free(cpu_map);
7478 case sa_none:
7479 break;
7480 }
7481}
7482
7483static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7484 const struct cpumask *cpu_map)
7485{
7486 memset(d, 0, sizeof(*d));
7487
7488 if (__sdt_alloc(cpu_map))
7489 return sa_sd_storage;
7490 d->sd = alloc_percpu(struct sched_domain *);
7491 if (!d->sd)
7492 return sa_sd_storage;
7493 d->rd = alloc_rootdomain();
7494 if (!d->rd)
7495 return sa_sd;
7496 return sa_rootdomain;
7497}
7498
7499
7500
7501
7502
7503
7504static void claim_allocations(int cpu, struct sched_domain *sd)
7505{
7506 struct sd_data *sdd = sd->private;
7507
7508 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7509 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7510
7511 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
7512 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7513
7514 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
7515 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
7516}
7517
7518#ifdef CONFIG_SCHED_SMT
7519static const struct cpumask *cpu_smt_mask(int cpu)
7520{
7521 return topology_thread_cpumask(cpu);
7522}
7523#endif
7524
7525
7526
7527
7528static struct sched_domain_topology_level default_topology[] = {
7529#ifdef CONFIG_SCHED_SMT
7530 { sd_init_SIBLING, cpu_smt_mask, },
7531#endif
7532#ifdef CONFIG_SCHED_MC
7533 { sd_init_MC, cpu_coregroup_mask, },
7534#endif
7535#ifdef CONFIG_SCHED_BOOK
7536 { sd_init_BOOK, cpu_book_mask, },
7537#endif
7538 { sd_init_CPU, cpu_cpu_mask, },
7539#ifdef CONFIG_NUMA
7540 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
7541 { sd_init_ALLNODES, cpu_allnodes_mask, },
7542#endif
7543 { NULL, },
7544};
7545
7546static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7547
7548static int __sdt_alloc(const struct cpumask *cpu_map)
7549{
7550 struct sched_domain_topology_level *tl;
7551 int j;
7552
7553 for (tl = sched_domain_topology; tl->init; tl++) {
7554 struct sd_data *sdd = &tl->data;
7555
7556 sdd->sd = alloc_percpu(struct sched_domain *);
7557 if (!sdd->sd)
7558 return -ENOMEM;
7559
7560 sdd->sg = alloc_percpu(struct sched_group *);
7561 if (!sdd->sg)
7562 return -ENOMEM;
7563
7564 sdd->sgp = alloc_percpu(struct sched_group_power *);
7565 if (!sdd->sgp)
7566 return -ENOMEM;
7567
7568 for_each_cpu(j, cpu_map) {
7569 struct sched_domain *sd;
7570 struct sched_group *sg;
7571 struct sched_group_power *sgp;
7572
7573 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7574 GFP_KERNEL, cpu_to_node(j));
7575 if (!sd)
7576 return -ENOMEM;
7577
7578 *per_cpu_ptr(sdd->sd, j) = sd;
7579
7580 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7581 GFP_KERNEL, cpu_to_node(j));
7582 if (!sg)
7583 return -ENOMEM;
7584
7585 *per_cpu_ptr(sdd->sg, j) = sg;
7586
7587 sgp = kzalloc_node(sizeof(struct sched_group_power),
7588 GFP_KERNEL, cpu_to_node(j));
7589 if (!sgp)
7590 return -ENOMEM;
7591
7592 *per_cpu_ptr(sdd->sgp, j) = sgp;
7593 }
7594 }
7595
7596 return 0;
7597}
7598
7599static void __sdt_free(const struct cpumask *cpu_map)
7600{
7601 struct sched_domain_topology_level *tl;
7602 int j;
7603
7604 for (tl = sched_domain_topology; tl->init; tl++) {
7605 struct sd_data *sdd = &tl->data;
7606
7607 for_each_cpu(j, cpu_map) {
7608 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7609 if (sd && (sd->flags & SD_OVERLAP))
7610 free_sched_groups(sd->groups, 0);
7611 kfree(*per_cpu_ptr(sdd->sd, j));
7612 kfree(*per_cpu_ptr(sdd->sg, j));
7613 kfree(*per_cpu_ptr(sdd->sgp, j));
7614 }
7615 free_percpu(sdd->sd);
7616 free_percpu(sdd->sg);
7617 free_percpu(sdd->sgp);
7618 }
7619}
7620
7621struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7622 struct s_data *d, const struct cpumask *cpu_map,
7623 struct sched_domain_attr *attr, struct sched_domain *child,
7624 int cpu)
7625{
7626 struct sched_domain *sd = tl->init(tl, cpu);
7627 if (!sd)
7628 return child;
7629
7630 set_domain_attribute(sd, attr);
7631 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7632 if (child) {
7633 sd->level = child->level + 1;
7634 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7635 child->parent = sd;
7636 }
7637 sd->child = child;
7638
7639 return sd;
7640}
7641
7642
7643
7644
7645
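/*
 * Build sched domains for the given set of CPUs and attach them to the
 * individual CPUs.
 */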
7646static int build_sched_domains(const struct cpumask *cpu_map,
7647 struct sched_domain_attr *attr)
7648{
7649 enum s_alloc alloc_state = sa_none;
7650 struct sched_domain *sd;
7651 struct s_data d;
7652 int i, ret = -ENOMEM;
7653
7654 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7655 if (alloc_state != sa_rootdomain)
7656 goto error;
7657
7658
7659 for_each_cpu(i, cpu_map) {
7660 struct sched_domain_topology_level *tl;
7661
7662 sd = NULL;
7663 for (tl = sched_domain_topology; tl->init; tl++) {
7664 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7665 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7666 sd->flags |= SD_OVERLAP;
7667 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7668 break;
7669 }
7670
7671 while (sd->child)
7672 sd = sd->child;
7673
7674 *per_cpu_ptr(d.sd, i) = sd;
7675 }
7676
7677
7678 for_each_cpu(i, cpu_map) {
7679 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7680 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7681 if (sd->flags & SD_OVERLAP) {
7682 if (build_overlap_sched_groups(sd, i))
7683 goto error;
7684 } else {
7685 if (build_sched_groups(sd, i))
7686 goto error;
7687 }
7688 }
7689 }
7690
7691
7692 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7693 if (!cpumask_test_cpu(i, cpu_map))
7694 continue;
7695
7696 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7697 claim_allocations(i, sd);
7698 init_sched_groups_power(i, sd);
7699 }
7700 }
7701
7702
7703 rcu_read_lock();
7704 for_each_cpu(i, cpu_map) {
7705 sd = *per_cpu_ptr(d.sd, i);
7706 cpu_attach_domain(sd, d.rd, i);
7707 }
7708 rcu_read_unlock();
7709
7710 ret = 0;
7711error:
7712 __free_domain_allocs(&d, alloc_state, cpu_map);
7713 return ret;
7714}
7715
7716static cpumask_var_t *doms_cur;
7717static int ndoms_cur;
7718static struct sched_domain_attr *dattr_cur;
7719
7720
7721
7722
7723
7724
7725
7726static cpumask_var_t fallback_doms;
7727
7728
7729
7730
7731
7732
7733int __attribute__((weak)) arch_update_cpu_topology(void)
7734{
7735 return 0;
7736}
7737
7738cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7739{
7740 int i;
7741 cpumask_var_t *doms;
7742
7743 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7744 if (!doms)
7745 return NULL;
7746 for (i = 0; i < ndoms; i++) {
7747 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7748 free_sched_domains(doms, i);
7749 return NULL;
7750 }
7751 }
7752 return doms;
7753}
7754
7755void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7756{
7757 unsigned int i;
7758 for (i = 0; i < ndoms; i++)
7759 free_cpumask_var(doms[i]);
7760 kfree(doms);
7761}
7762
7763
7764
7765
7766
7767
7768static int init_sched_domains(const struct cpumask *cpu_map)
7769{
7770 int err;
7771
7772 arch_update_cpu_topology();
7773 ndoms_cur = 1;
7774 doms_cur = alloc_sched_domains(ndoms_cur);
7775 if (!doms_cur)
7776 doms_cur = &fallback_doms;
7777 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7778 dattr_cur = NULL;
7779 err = build_sched_domains(doms_cur[0], NULL);
7780 register_sched_domain_sysctl();
7781
7782 return err;
7783}
7784
7785
7786
7787
7788
7789static void detach_destroy_domains(const struct cpumask *cpu_map)
7790{
7791 int i;
7792
7793 rcu_read_lock();
7794 for_each_cpu(i, cpu_map)
7795 cpu_attach_domain(NULL, &def_root_domain, i);
7796 rcu_read_unlock();
7797}
7798
7799
7800static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7801 struct sched_domain_attr *new, int idx_new)
7802{
7803 struct sched_domain_attr tmp;
7804
7805
7806 if (!new && !cur)
7807 return 1;
7808
7809 tmp = SD_ATTR_INIT;
7810 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7811 new ? (new + idx_new) : &tmp,
7812 sizeof(struct sched_domain_attr));
7813}
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
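/*
 * Partition the sched domains as specified by the 'ndoms_new' cpumasks in
 * doms_new[].  This is compared against the current partitioning in
 * doms_cur[]: domains that disappeared are detached and destroyed, new
 * ones are built.  Passing doms_new == NULL falls back to a single domain
 * spanning all active, non-isolated CPUs.  Must be called with the
 * hotplug lock held.
 */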
7841void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7842 struct sched_domain_attr *dattr_new)
7843{
7844 int i, j, n;
7845 int new_topology;
7846
7847 mutex_lock(&sched_domains_mutex);
7848
7849
7850 unregister_sched_domain_sysctl();
7851
7852
7853 new_topology = arch_update_cpu_topology();
7854
7855 n = doms_new ? ndoms_new : 0;
7856
7857
7858 for (i = 0; i < ndoms_cur; i++) {
7859 for (j = 0; j < n && !new_topology; j++) {
7860 if (cpumask_equal(doms_cur[i], doms_new[j])
7861 && dattrs_equal(dattr_cur, i, dattr_new, j))
7862 goto match1;
7863 }
7864
7865 detach_destroy_domains(doms_cur[i]);
7866match1:
7867 ;
7868 }
7869
7870 if (doms_new == NULL) {
7871 ndoms_cur = 0;
7872 doms_new = &fallback_doms;
7873 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7874 WARN_ON_ONCE(dattr_new);
7875 }
7876
7877
7878 for (i = 0; i < ndoms_new; i++) {
7879 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7880 if (cpumask_equal(doms_new[i], doms_cur[j])
7881 && dattrs_equal(dattr_new, i, dattr_cur, j))
7882 goto match2;
7883 }
7884
7885 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7886match2:
7887 ;
7888 }
7889
7890
7891 if (doms_cur != &fallback_doms)
7892 free_sched_domains(doms_cur, ndoms_cur);
7893 kfree(dattr_cur);
7894 doms_cur = doms_new;
7895 dattr_cur = dattr_new;
7896 ndoms_cur = ndoms_new;
7897
7898 register_sched_domain_sysctl();
7899
7900 mutex_unlock(&sched_domains_mutex);
7901}
7902
7903#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7904static void reinit_sched_domains(void)
7905{
7906 get_online_cpus();
7907
7908
7909 partition_sched_domains(0, NULL, NULL);
7910
7911 rebuild_sched_domains();
7912 put_online_cpus();
7913}
7914
7915static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7916{
7917 unsigned int level = 0;
7918
7919 if (sscanf(buf, "%u", &level) != 1)
7920 return -EINVAL;
7921
7922
7923
7924
7925
7926
7927
7928
7929 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7930 return -EINVAL;
7931
7932 if (smt)
7933 sched_smt_power_savings = level;
7934 else
7935 sched_mc_power_savings = level;
7936
7937 reinit_sched_domains();
7938
7939 return count;
7940}
7941
7942#ifdef CONFIG_SCHED_MC
7943static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7944 struct sysdev_class_attribute *attr,
7945 char *page)
7946{
7947 return sprintf(page, "%u\n", sched_mc_power_savings);
7948}
7949static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7950 struct sysdev_class_attribute *attr,
7951 const char *buf, size_t count)
7952{
7953 return sched_power_savings_store(buf, count, 0);
7954}
7955static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7956 sched_mc_power_savings_show,
7957 sched_mc_power_savings_store);
7958#endif
7959
7960#ifdef CONFIG_SCHED_SMT
7961static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7962 struct sysdev_class_attribute *attr,
7963 char *page)
7964{
7965 return sprintf(page, "%u\n", sched_smt_power_savings);
7966}
7967static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7968 struct sysdev_class_attribute *attr,
7969 const char *buf, size_t count)
7970{
7971 return sched_power_savings_store(buf, count, 1);
7972}
7973static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7974 sched_smt_power_savings_show,
7975 sched_smt_power_savings_store);
7976#endif
7977
7978int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7979{
7980 int err = 0;
7981
7982#ifdef CONFIG_SCHED_SMT
7983 if (smt_capable())
7984 err = sysfs_create_file(&cls->kset.kobj,
7985 &attr_sched_smt_power_savings.attr);
7986#endif
7987#ifdef CONFIG_SCHED_MC
7988 if (!err && mc_capable())
7989 err = sysfs_create_file(&cls->kset.kobj,
7990 &attr_sched_mc_power_savings.attr);
7991#endif
7992 return err;
7993}
7994#endif
7995
7996
7997
7998
7999
8000
8001static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
8002 void *hcpu)
8003{
8004 switch (action & ~CPU_TASKS_FROZEN) {
8005 case CPU_ONLINE:
8006 case CPU_DOWN_FAILED:
8007 cpuset_update_active_cpus();
8008 return NOTIFY_OK;
8009 default:
8010 return NOTIFY_DONE;
8011 }
8012}
8013
8014static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8015 void *hcpu)
8016{
8017 switch (action & ~CPU_TASKS_FROZEN) {
8018 case CPU_DOWN_PREPARE:
8019 cpuset_update_active_cpus();
8020 return NOTIFY_OK;
8021 default:
8022 return NOTIFY_DONE;
8023 }
8024}
8025
8026static int update_runtime(struct notifier_block *nfb,
8027 unsigned long action, void *hcpu)
8028{
8029 int cpu = (int)(long)hcpu;
8030
8031 switch (action) {
8032 case CPU_DOWN_PREPARE:
8033 case CPU_DOWN_PREPARE_FROZEN:
8034 disable_runtime(cpu_rq(cpu));
8035 return NOTIFY_OK;
8036
8037 case CPU_DOWN_FAILED:
8038 case CPU_DOWN_FAILED_FROZEN:
8039 case CPU_ONLINE:
8040 case CPU_ONLINE_FROZEN:
8041 enable_runtime(cpu_rq(cpu));
8042 return NOTIFY_OK;
8043
8044 default:
8045 return NOTIFY_DONE;
8046 }
8047}
8048
8049void __init sched_init_smp(void)
8050{
8051 cpumask_var_t non_isolated_cpus;
8052
8053 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
8054 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8055
8056 get_online_cpus();
8057 mutex_lock(&sched_domains_mutex);
8058 init_sched_domains(cpu_active_mask);
8059 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
8060 if (cpumask_empty(non_isolated_cpus))
8061 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8062 mutex_unlock(&sched_domains_mutex);
8063 put_online_cpus();
8064
8065 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
8066 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
8067
8068
8069 hotcpu_notifier(update_runtime, 0);
8070
8071 init_hrtick();
8072
8073
8074 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8075 BUG();
8076 sched_init_granularity();
8077 free_cpumask_var(non_isolated_cpus);
8078
8079 init_sched_rt_class();
8080}
8081#else
8082void __init sched_init_smp(void)
8083{
8084 sched_init_granularity();
8085}
8086#endif
8087
8088const_debug unsigned int sysctl_timer_migration = 1;
8089
8090int in_sched_functions(unsigned long addr)
8091{
8092 return in_lock_functions(addr) ||
8093 (addr >= (unsigned long)__sched_text_start
8094 && addr < (unsigned long)__sched_text_end);
8095}
8096
8097static void init_cfs_rq(struct cfs_rq *cfs_rq)
8098{
8099 cfs_rq->tasks_timeline = RB_ROOT;
8100 INIT_LIST_HEAD(&cfs_rq->tasks);
8101 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8102#ifndef CONFIG_64BIT
8103 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8104#endif
8105}
8106
8107static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8108{
8109 struct rt_prio_array *array;
8110 int i;
8111
8112 array = &rt_rq->active;
8113 for (i = 0; i < MAX_RT_PRIO; i++) {
8114 INIT_LIST_HEAD(array->queue + i);
8115 __clear_bit(i, array->bitmap);
8116 }
8117
8118 __set_bit(MAX_RT_PRIO, array->bitmap);
8119
8120#if defined CONFIG_SMP
8121 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8122 rt_rq->highest_prio.next = MAX_RT_PRIO;
8123 rt_rq->rt_nr_migratory = 0;
8124 rt_rq->overloaded = 0;
8125 plist_head_init(&rt_rq->pushable_tasks);
8126#endif
8127
8128 rt_rq->rt_time = 0;
8129 rt_rq->rt_throttled = 0;
8130 rt_rq->rt_runtime = 0;
8131 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8132}
8133
8134#ifdef CONFIG_FAIR_GROUP_SCHED
8135static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8136 struct sched_entity *se, int cpu,
8137 struct sched_entity *parent)
8138{
8139 struct rq *rq = cpu_rq(cpu);
8140
8141 cfs_rq->tg = tg;
8142 cfs_rq->rq = rq;
8143#ifdef CONFIG_SMP
8144
8145 cfs_rq->load_stamp = 1;
8146#endif
8147 init_cfs_rq_runtime(cfs_rq);
8148
8149 tg->cfs_rq[cpu] = cfs_rq;
8150 tg->se[cpu] = se;
8151
8152
8153 if (!se)
8154 return;
8155
8156 if (!parent)
8157 se->cfs_rq = &rq->cfs;
8158 else
8159 se->cfs_rq = parent->my_q;
8160
8161 se->my_q = cfs_rq;
8162 update_load_set(&se->load, 0);
8163 se->parent = parent;
8164}
8165#endif
8166
8167#ifdef CONFIG_RT_GROUP_SCHED
8168static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8169 struct sched_rt_entity *rt_se, int cpu,
8170 struct sched_rt_entity *parent)
8171{
8172 struct rq *rq = cpu_rq(cpu);
8173
8174 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8175 rt_rq->rt_nr_boosted = 0;
8176 rt_rq->rq = rq;
8177 rt_rq->tg = tg;
8178
8179 tg->rt_rq[cpu] = rt_rq;
8180 tg->rt_se[cpu] = rt_se;
8181
8182 if (!rt_se)
8183 return;
8184
8185 if (!parent)
8186 rt_se->rt_rq = &rq->rt;
8187 else
8188 rt_se->rt_rq = parent->my_q;
8189
8190 rt_se->my_q = rt_rq;
8191 rt_se->parent = parent;
8192 INIT_LIST_HEAD(&rt_se->run_list);
8193}
8194#endif
8195
8196void __init sched_init(void)
8197{
8198 int i, j;
8199 unsigned long alloc_size = 0, ptr;
8200
8201#ifdef CONFIG_FAIR_GROUP_SCHED
8202 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8203#endif
8204#ifdef CONFIG_RT_GROUP_SCHED
8205 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8206#endif
8207#ifdef CONFIG_CPUMASK_OFFSTACK
8208 alloc_size += num_possible_cpus() * cpumask_size();
8209#endif
8210 if (alloc_size) {
8211 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8212
8213#ifdef CONFIG_FAIR_GROUP_SCHED
8214 root_task_group.se = (struct sched_entity **)ptr;
8215 ptr += nr_cpu_ids * sizeof(void **);
8216
8217 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8218 ptr += nr_cpu_ids * sizeof(void **);
8219
8220#endif
8221#ifdef CONFIG_RT_GROUP_SCHED
8222 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8223 ptr += nr_cpu_ids * sizeof(void **);
8224
8225 root_task_group.rt_rq = (struct rt_rq **)ptr;
8226 ptr += nr_cpu_ids * sizeof(void **);
8227
8228#endif
8229#ifdef CONFIG_CPUMASK_OFFSTACK
8230 for_each_possible_cpu(i) {
8231 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8232 ptr += cpumask_size();
8233 }
8234#endif
8235 }
8236
8237#ifdef CONFIG_SMP
8238 init_defrootdomain();
8239#endif
8240
8241 init_rt_bandwidth(&def_rt_bandwidth,
8242 global_rt_period(), global_rt_runtime());
8243
8244#ifdef CONFIG_RT_GROUP_SCHED
8245 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8246 global_rt_period(), global_rt_runtime());
8247#endif
8248
8249#ifdef CONFIG_CGROUP_SCHED
8250 list_add(&root_task_group.list, &task_groups);
8251 INIT_LIST_HEAD(&root_task_group.children);
8252 autogroup_init(&init_task);
8253#endif
8254
8255 for_each_possible_cpu(i) {
8256 struct rq *rq;
8257
8258 rq = cpu_rq(i);
8259 raw_spin_lock_init(&rq->lock);
8260 rq->nr_running = 0;
8261 rq->calc_load_active = 0;
8262 rq->calc_load_update = jiffies + LOAD_FREQ;
8263 init_cfs_rq(&rq->cfs);
8264 init_rt_rq(&rq->rt, rq);
8265#ifdef CONFIG_FAIR_GROUP_SCHED
8266 root_task_group.shares = root_task_group_load;
8267 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8288 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8289#endif
8290
8291 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8292#ifdef CONFIG_RT_GROUP_SCHED
8293 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8294 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8295#endif
8296
8297 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8298 rq->cpu_load[j] = 0;
8299
8300 rq->last_load_update_tick = jiffies;
8301
8302#ifdef CONFIG_SMP
8303 rq->sd = NULL;
8304 rq->rd = NULL;
8305 rq->cpu_power = SCHED_POWER_SCALE;
8306 rq->post_schedule = 0;
8307 rq->active_balance = 0;
8308 rq->next_balance = jiffies;
8309 rq->push_cpu = 0;
8310 rq->cpu = i;
8311 rq->online = 0;
8312 rq->idle_stamp = 0;
8313 rq->avg_idle = 2*sysctl_sched_migration_cost;
8314 rq_attach_root(rq, &def_root_domain);
8315#ifdef CONFIG_NO_HZ
8316 rq->nohz_balance_kick = 0;
8317#endif
8318#endif
8319 init_rq_hrtick(rq);
8320 atomic_set(&rq->nr_iowait, 0);
8321 }
8322
8323 set_load_weight(&init_task);
8324
8325#ifdef CONFIG_PREEMPT_NOTIFIERS
8326 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8327#endif
8328
8329#ifdef CONFIG_SMP
8330 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8331#endif
8332
8333#ifdef CONFIG_RT_MUTEXES
8334 plist_head_init(&init_task.pi_waiters);
8335#endif
8336
8337
8338
8339
8340 atomic_inc(&init_mm.mm_count);
8341 enter_lazy_tlb(&init_mm, current);
8342
8343
8344
8345
8346
8347
8348
8349 init_idle(current, smp_processor_id());
8350
8351 calc_load_update = jiffies + LOAD_FREQ;
8352
8353
8354
8355
8356 current->sched_class = &fair_sched_class;
8357
8358#ifdef CONFIG_SMP
8359 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8360#ifdef CONFIG_NO_HZ
8361 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8362 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8363 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8364 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8365 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8366#endif
8367
8368 if (cpu_isolated_map == NULL)
8369 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8370#endif
8371
8372 scheduler_running = 1;
8373}
8374
8375#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8376static inline int preempt_count_equals(int preempt_offset)
8377{
8378 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8379
8380 return (nested == preempt_offset);
8381}
8382
8383void __might_sleep(const char *file, int line, int preempt_offset)
8384{
8385 static unsigned long prev_jiffy;
8386
8387 rcu_sleep_check();
8388 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8389 system_state != SYSTEM_RUNNING || oops_in_progress)
8390 return;
8391 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8392 return;
8393 prev_jiffy = jiffies;
8394
8395 printk(KERN_ERR
8396 "BUG: sleeping function called from invalid context at %s:%d\n",
8397 file, line);
8398 printk(KERN_ERR
8399 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8400 in_atomic(), irqs_disabled(),
8401 current->pid, current->comm);
8402
8403 debug_show_held_locks(current);
8404 if (irqs_disabled())
8405 print_irqtrace_events(current);
8406 dump_stack();
8407}
8408EXPORT_SYMBOL(__might_sleep);
8409#endif
8410
8411#ifdef CONFIG_MAGIC_SYSRQ
8412static void normalize_task(struct rq *rq, struct task_struct *p)
8413{
8414 const struct sched_class *prev_class = p->sched_class;
8415 int old_prio = p->prio;
8416 int on_rq;
8417
8418 on_rq = p->on_rq;
8419 if (on_rq)
8420 deactivate_task(rq, p, 0);
8421 __setscheduler(rq, p, SCHED_NORMAL, 0);
8422 if (on_rq) {
8423 activate_task(rq, p, 0);
8424 resched_task(rq->curr);
8425 }
8426
8427 check_class_changed(rq, p, prev_class, old_prio);
8428}
8429
8430void normalize_rt_tasks(void)
8431{
8432 struct task_struct *g, *p;
8433 unsigned long flags;
8434 struct rq *rq;
8435
8436 read_lock_irqsave(&tasklist_lock, flags);
8437 do_each_thread(g, p) {
 /*
  * Only normalize user tasks:
  */
8441 if (!p->mm)
8442 continue;
8443
8444 p->se.exec_start = 0;
8445#ifdef CONFIG_SCHEDSTATS
8446 p->se.statistics.wait_start = 0;
8447 p->se.statistics.sleep_start = 0;
8448 p->se.statistics.block_start = 0;
8449#endif
8450
8451 if (!rt_task(p)) {
 /*
  * Renice negative nice level userspace
  * tasks back to 0:
  */
8456 if (TASK_NICE(p) < 0 && p->mm)
8457 set_user_nice(p, 0);
8458 continue;
8459 }
8460
8461 raw_spin_lock(&p->pi_lock);
8462 rq = __task_rq_lock(p);
8463
8464 normalize_task(rq, p);
8465
8466 __task_rq_unlock(rq);
8467 raw_spin_unlock(&p->pi_lock);
8468 } while_each_thread(g, p);
8469
8470 read_unlock_irqrestore(&tasklist_lock, flags);
8471}
8472
8473#endif
8474
8475#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for the IA64 MCA handling, or kdb.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
8492struct task_struct *curr_task(int cpu)
8493{
8494 return cpu_curr(cpu);
8495}
8496
8497#endif
8498
8499#ifdef CONFIG_IA64
/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a cpu in a non-blocking manner. It must be
 * called with all CPUs synchronized and interrupts disabled; the caller must
 * save the original value of the current task (see curr_task() above) and
 * restore that value before re-enabling interrupts and re-scheduling the
 * task.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
8515void set_curr_task(int cpu, struct task_struct *p)
8516{
8517 cpu_curr(cpu) = p;
8518}
8519
8520#endif
8521
8522#ifdef CONFIG_FAIR_GROUP_SCHED
8523static void free_fair_sched_group(struct task_group *tg)
8524{
8525 int i;
8526
8527 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8528
8529 for_each_possible_cpu(i) {
8530 if (tg->cfs_rq)
8531 kfree(tg->cfs_rq[i]);
8532 if (tg->se)
8533 kfree(tg->se[i]);
8534 }
8535
8536 kfree(tg->cfs_rq);
8537 kfree(tg->se);
8538}
8539
8540static
8541int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8542{
8543 struct cfs_rq *cfs_rq;
8544 struct sched_entity *se;
8545 int i;
8546
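 /* per-cpu arrays of pointers: sizeof(cfs_rq) and sizeof(se) are pointer sizes here */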
8547 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8548 if (!tg->cfs_rq)
8549 goto err;
8550 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8551 if (!tg->se)
8552 goto err;
8553
8554 tg->shares = NICE_0_LOAD;
8555
8556 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8557
8558 for_each_possible_cpu(i) {
8559 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8560 GFP_KERNEL, cpu_to_node(i));
8561 if (!cfs_rq)
8562 goto err;
8563
8564 se = kzalloc_node(sizeof(struct sched_entity),
8565 GFP_KERNEL, cpu_to_node(i));
8566 if (!se)
8567 goto err_free_rq;
8568
8569 init_cfs_rq(cfs_rq);
8570 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8571 }
8572
8573 return 1;
8574
8575err_free_rq:
8576 kfree(cfs_rq);
8577err:
8578 return 0;
8579}
8580
8581static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8582{
8583 struct rq *rq = cpu_rq(cpu);
8584 unsigned long flags;
8585
 /*
  * Only empty task groups can be destroyed; so we can speculatively
  * check on_list without danger of it being re-added.
  */
8590 if (!tg->cfs_rq[cpu]->on_list)
8591 return;
8592
8593 raw_spin_lock_irqsave(&rq->lock, flags);
8594 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8595 raw_spin_unlock_irqrestore(&rq->lock, flags);
8596}
8597#else
8598static inline void free_fair_sched_group(struct task_group *tg)
8599{
8600}
8601
8602static inline
8603int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8604{
8605 return 1;
8606}
8607
8608static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8609{
8610}
8611#endif
8612
8613#ifdef CONFIG_RT_GROUP_SCHED
8614static void free_rt_sched_group(struct task_group *tg)
8615{
8616 int i;
8617
8618 if (tg->rt_se)
8619 destroy_rt_bandwidth(&tg->rt_bandwidth);
8620
8621 for_each_possible_cpu(i) {
8622 if (tg->rt_rq)
8623 kfree(tg->rt_rq[i]);
8624 if (tg->rt_se)
8625 kfree(tg->rt_se[i]);
8626 }
8627
8628 kfree(tg->rt_rq);
8629 kfree(tg->rt_se);
8630}
8631
8632static
8633int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8634{
8635 struct rt_rq *rt_rq;
8636 struct sched_rt_entity *rt_se;
8637 int i;
8638
8639 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8640 if (!tg->rt_rq)
8641 goto err;
8642 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8643 if (!tg->rt_se)
8644 goto err;
8645
8646 init_rt_bandwidth(&tg->rt_bandwidth,
8647 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8648
8649 for_each_possible_cpu(i) {
8650 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8651 GFP_KERNEL, cpu_to_node(i));
8652 if (!rt_rq)
8653 goto err;
8654
8655 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8656 GFP_KERNEL, cpu_to_node(i));
8657 if (!rt_se)
8658 goto err_free_rq;
8659
8660 init_rt_rq(rt_rq, cpu_rq(i));
8661 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8662 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8663 }
8664
8665 return 1;
8666
8667err_free_rq:
8668 kfree(rt_rq);
8669err:
8670 return 0;
8671}
8672#else
8673static inline void free_rt_sched_group(struct task_group *tg)
8674{
8675}
8676
8677static inline
8678int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8679{
8680 return 1;
8681}
8682#endif
8683
8684#ifdef CONFIG_CGROUP_SCHED
8685static void free_sched_group(struct task_group *tg)
8686{
8687 free_fair_sched_group(tg);
8688 free_rt_sched_group(tg);
8689 autogroup_free(tg);
8690 kfree(tg);
8691}
8692
/* allocate runqueue etc for a new task group */
8694struct task_group *sched_create_group(struct task_group *parent)
8695{
8696 struct task_group *tg;
8697 unsigned long flags;
8698
8699 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8700 if (!tg)
8701 return ERR_PTR(-ENOMEM);
8702
8703 if (!alloc_fair_sched_group(tg, parent))
8704 goto err;
8705
8706 if (!alloc_rt_sched_group(tg, parent))
8707 goto err;
8708
8709 spin_lock_irqsave(&task_group_lock, flags);
8710 list_add_rcu(&tg->list, &task_groups);
8711
8712 WARN_ON(!parent); /* root should already exist */
8713
8714 tg->parent = parent;
8715 INIT_LIST_HEAD(&tg->children);
8716 list_add_rcu(&tg->siblings, &parent->children);
8717 spin_unlock_irqrestore(&task_group_lock, flags);
8718
8719 return tg;
8720
8721err:
8722 free_sched_group(tg);
8723 return ERR_PTR(-ENOMEM);
8724}
8725
/* rcu callback to free various structures associated with a task group */
8727static void free_sched_group_rcu(struct rcu_head *rhp)
8728{
 /* now it should be safe to free those cfs_rqs */
8730 free_sched_group(container_of(rhp, struct task_group, rcu));
8731}
8732
/* destroy runqueue etc associated with a task group */
8734void sched_destroy_group(struct task_group *tg)
8735{
8736 unsigned long flags;
8737 int i;
8738
 /* end participation in shares distribution */
8740 for_each_possible_cpu(i)
8741 unregister_fair_sched_group(tg, i);
8742
8743 spin_lock_irqsave(&task_group_lock, flags);
8744 list_del_rcu(&tg->list);
8745 list_del_rcu(&tg->siblings);
8746 spin_unlock_irqrestore(&task_group_lock, flags);
8747
 /* wait for possible concurrent references to cfs_rqs to complete */
8749 call_rcu(&tg->rcu, free_sched_group_rcu);
8750}
8751
/* change task's runqueue when it moves between groups.
 *	The caller of this function should have put the task in its new group
 *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
 *	reflect its new group.
 */
8757void sched_move_task(struct task_struct *tsk)
8758{
8759 int on_rq, running;
8760 unsigned long flags;
8761 struct rq *rq;
8762
8763 rq = task_rq_lock(tsk, &flags);
8764
8765 running = task_current(rq, tsk);
8766 on_rq = tsk->on_rq;
8767
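 /* dequeue the task, and stop tracking it as current, while its group changes */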
8768 if (on_rq)
8769 dequeue_task(rq, tsk, 0);
8770 if (unlikely(running))
8771 tsk->sched_class->put_prev_task(rq, tsk);
8772
8773#ifdef CONFIG_FAIR_GROUP_SCHED
8774 if (tsk->sched_class->task_move_group)
8775 tsk->sched_class->task_move_group(tsk, on_rq);
8776 else
8777#endif
8778 set_task_rq(tsk, task_cpu(tsk));
8779
8780 if (unlikely(running))
8781 tsk->sched_class->set_curr_task(rq);
8782 if (on_rq)
8783 enqueue_task(rq, tsk, 0);
8784
8785 task_rq_unlock(rq, tsk, &flags);
8786}
8787#endif
8788
8789#ifdef CONFIG_FAIR_GROUP_SCHED
8790static DEFINE_MUTEX(shares_mutex);
8791
8792int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8793{
8794 int i;
8795 unsigned long flags;
8796
 /*
  * We can't change the weight of the root cgroup.
  */
8800 if (!tg->se[0])
8801 return -EINVAL;
8802
8803 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8804
8805 mutex_lock(&shares_mutex);
8806 if (tg->shares == shares)
8807 goto done;
8808
8809 tg->shares = shares;
8810 for_each_possible_cpu(i) {
8811 struct rq *rq = cpu_rq(i);
8812 struct sched_entity *se;
8813
8814 se = tg->se[i];
8815
8816 raw_spin_lock_irqsave(&rq->lock, flags);
8817 for_each_sched_entity(se)
8818 update_cfs_shares(group_cfs_rq(se));
8819 raw_spin_unlock_irqrestore(&rq->lock, flags);
8820 }
8821
8822done:
8823 mutex_unlock(&shares_mutex);
8824 return 0;
8825}
8826
8827unsigned long sched_group_shares(struct task_group *tg)
8828{
8829 return tg->shares;
8830}
8831#endif
8832
8833#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8834static unsigned long to_ratio(u64 period, u64 runtime)
8835{
8836 if (runtime == RUNTIME_INF)
8837 return 1ULL << 20;
8838
8839 return div64_u64(runtime << 20, period);
8840}
8841#endif
8842
8843#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
8847static DEFINE_MUTEX(rt_constraints_mutex);
8848
/* Must be called with tasklist_lock held */
8850static inline int tg_has_rt_tasks(struct task_group *tg)
8851{
8852 struct task_struct *g, *p;
8853
8854 do_each_thread(g, p) {
8855 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8856 return 1;
8857 } while_each_thread(g, p);
8858
8859 return 0;
8860}
8861
8862struct rt_schedulable_data {
8863 struct task_group *tg;
8864 u64 rt_period;
8865 u64 rt_runtime;
8866};
8867
8868static int tg_rt_schedulable(struct task_group *tg, void *data)
8869{
8870 struct rt_schedulable_data *d = data;
8871 struct task_group *child;
8872 unsigned long total, sum = 0;
8873 u64 period, runtime;
8874
8875 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8876 runtime = tg->rt_bandwidth.rt_runtime;
8877
8878 if (tg == d->tg) {
8879 period = d->rt_period;
8880 runtime = d->rt_runtime;
8881 }
8882
 /*
  * Cannot have more runtime than the period.
  */
8886 if (runtime > period && runtime != RUNTIME_INF)
8887 return -EINVAL;
8888
 /*
  * Ensure we don't starve existing RT tasks.
  */
8892 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8893 return -EBUSY;
8894
8895 total = to_ratio(period, runtime);
8896
 /*
  * Nobody can have more than the global setting allows.
  */
8900 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8901 return -EINVAL;
8902
 /*
  * The sum of our children's runtime should not exceed our own.
  */
8906 list_for_each_entry_rcu(child, &tg->children, siblings) {
8907 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8908 runtime = child->rt_bandwidth.rt_runtime;
8909
8910 if (child == d->tg) {
8911 period = d->rt_period;
8912 runtime = d->rt_runtime;
8913 }
8914
8915 sum += to_ratio(period, runtime);
8916 }
8917
8918 if (sum > total)
8919 return -EINVAL;
8920
8921 return 0;
8922}
8923
8924static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8925{
8926 int ret;
8927
8928 struct rt_schedulable_data data = {
8929 .tg = tg,
8930 .rt_period = period,
8931 .rt_runtime = runtime,
8932 };
8933
8934 rcu_read_lock();
8935 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8936 rcu_read_unlock();
8937
8938 return ret;
8939}
8940
8941static int tg_set_rt_bandwidth(struct task_group *tg,
8942 u64 rt_period, u64 rt_runtime)
8943{
8944 int i, err = 0;
8945
8946 mutex_lock(&rt_constraints_mutex);
8947 read_lock(&tasklist_lock);
8948 err = __rt_schedulable(tg, rt_period, rt_runtime);
8949 if (err)
8950 goto unlock;
8951
8952 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8953 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8954 tg->rt_bandwidth.rt_runtime = rt_runtime;
8955
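 /* push the new runtime value down to each per-cpu rt_rq */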
8956 for_each_possible_cpu(i) {
8957 struct rt_rq *rt_rq = tg->rt_rq[i];
8958
8959 raw_spin_lock(&rt_rq->rt_runtime_lock);
8960 rt_rq->rt_runtime = rt_runtime;
8961 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8962 }
8963 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8964unlock:
8965 read_unlock(&tasklist_lock);
8966 mutex_unlock(&rt_constraints_mutex);
8967
8968 return err;
8969}
8970
8971int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8972{
8973 u64 rt_runtime, rt_period;
8974
8975 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8976 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8977 if (rt_runtime_us < 0)
8978 rt_runtime = RUNTIME_INF;
8979
8980 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8981}
8982
8983long sched_group_rt_runtime(struct task_group *tg)
8984{
8985 u64 rt_runtime_us;
8986
8987 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8988 return -1;
8989
8990 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8991 do_div(rt_runtime_us, NSEC_PER_USEC);
8992 return rt_runtime_us;
8993}
8994
8995int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8996{
8997 u64 rt_runtime, rt_period;
8998
8999 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
9000 rt_runtime = tg->rt_bandwidth.rt_runtime;
9001
9002 if (rt_period == 0)
9003 return -EINVAL;
9004
9005 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
9006}
9007
9008long sched_group_rt_period(struct task_group *tg)
9009{
9010 u64 rt_period_us;
9011
9012 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
9013 do_div(rt_period_us, NSEC_PER_USEC);
9014 return rt_period_us;
9015}
9016
9017static int sched_rt_global_constraints(void)
9018{
9019 u64 runtime, period;
9020 int ret = 0;
9021
9022 if (sysctl_sched_rt_period <= 0)
9023 return -EINVAL;
9024
9025 runtime = global_rt_runtime();
9026 period = global_rt_period();
9027
 /*
  * Sanity check on the sysctl variables.
  */
9031 if (runtime > period && runtime != RUNTIME_INF)
9032 return -EINVAL;
9033
9034 mutex_lock(&rt_constraints_mutex);
9035 read_lock(&tasklist_lock);
9036 ret = __rt_schedulable(NULL, 0, 0);
9037 read_unlock(&tasklist_lock);
9038 mutex_unlock(&rt_constraints_mutex);
9039
9040 return ret;
9041}
9042
9043int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
9044{
 /* Don't accept realtime tasks when there is no way for them to run */
9046 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
9047 return 0;
9048
9049 return 1;
9050}
9051
9052#else
9053static int sched_rt_global_constraints(void)
9054{
9055 unsigned long flags;
9056 int i;
9057
9058 if (sysctl_sched_rt_period <= 0)
9059 return -EINVAL;
9060
 /*
  * There's always some RT tasks in the root group
  * -- migration, kstopmachine etc.
  */
9065 if (sysctl_sched_rt_runtime == 0)
9066 return -EBUSY;
9067
9068 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9069 for_each_possible_cpu(i) {
9070 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
9071
9072 raw_spin_lock(&rt_rq->rt_runtime_lock);
9073 rt_rq->rt_runtime = global_rt_runtime();
9074 raw_spin_unlock(&rt_rq->rt_runtime_lock);
9075 }
9076 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
9077
9078 return 0;
9079}
9080#endif
9081
9082int sched_rt_handler(struct ctl_table *table, int write,
9083 void __user *buffer, size_t *lenp,
9084 loff_t *ppos)
9085{
9086 int ret;
9087 int old_period, old_runtime;
9088 static DEFINE_MUTEX(mutex);
9089
9090 mutex_lock(&mutex);
9091 old_period = sysctl_sched_rt_period;
9092 old_runtime = sysctl_sched_rt_runtime;
9093
9094 ret = proc_dointvec(table, write, buffer, lenp, ppos);
9095
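 /* on a successful write, validate the new values and roll back if they are not schedulable */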
9096 if (!ret && write) {
9097 ret = sched_rt_global_constraints();
9098 if (ret) {
9099 sysctl_sched_rt_period = old_period;
9100 sysctl_sched_rt_runtime = old_runtime;
9101 } else {
9102 def_rt_bandwidth.rt_runtime = global_rt_runtime();
9103 def_rt_bandwidth.rt_period =
9104 ns_to_ktime(global_rt_period());
9105 }
9106 }
9107 mutex_unlock(&mutex);
9108
9109 return ret;
9110}
9111
9112#ifdef CONFIG_CGROUP_SCHED
9113
/* return corresponding task_group object of a cgroup */
9115static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
9116{
9117 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
9118 struct task_group, css);
9119}
9120
9121static struct cgroup_subsys_state *
9122cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9123{
9124 struct task_group *tg, *parent;
9125
9126 if (!cgrp->parent) {
 /* This is early initialization for the top cgroup */
9128 return &root_task_group.css;
9129 }
9130
9131 parent = cgroup_tg(cgrp->parent);
9132 tg = sched_create_group(parent);
9133 if (IS_ERR(tg))
9134 return ERR_PTR(-ENOMEM);
9135
9136 return &tg->css;
9137}
9138
9139static void
9140cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9141{
9142 struct task_group *tg = cgroup_tg(cgrp);
9143
9144 sched_destroy_group(tg);
9145}
9146
9147static int
9148cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9149{
9150#ifdef CONFIG_RT_GROUP_SCHED
9151 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
9152 return -EINVAL;
9153#else
 /* We don't support RT-tasks being in separate groups */
9155 if (tsk->sched_class != &fair_sched_class)
9156 return -EINVAL;
9157#endif
9158 return 0;
9159}
9160
9161static void
9162cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9163{
9164 sched_move_task(tsk);
9165}
9166
9167static void
9168cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9169 struct cgroup *old_cgrp, struct task_struct *task)
9170{
 /*
  * cgroup_exit() is called in the copy_process() failure path.
  * Ignore this case since the task hasn't run yet; this avoids
  * trying to poke a half-freed task state from generic code.
  */
9176 if (!(task->flags & PF_EXITING))
9177 return;
9178
9179 sched_move_task(task);
9180}
9181
9182#ifdef CONFIG_FAIR_GROUP_SCHED
9183static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9184 u64 shareval)
9185{
9186 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
9187}
9188
9189static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9190{
9191 struct task_group *tg = cgroup_tg(cgrp);
9192
9193 return (u64) scale_load_down(tg->shares);
9194}
9195
9196#ifdef CONFIG_CFS_BANDWIDTH
9197static DEFINE_MUTEX(cfs_constraints_mutex);
9198
9199const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
9200const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9201
9202static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9203
9204static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9205{
9206 int i, ret = 0, runtime_enabled;
9207 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9208
9209 if (tg == &root_task_group)
9210 return -EINVAL;
9211
 /*
  * Ensure we have at least some amount of bandwidth every period. This
  * is to prevent reaching a state of large arrears when throttled via
  * entity_tick() resulting in prolonged exit starvation.
  */
9217 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9218 return -EINVAL;
9219
 /*
  * Likewise, bound things on the other side by preventing insane quota
  * periods. This also allows us to normalize in computing quota
  * feasibility.
  */
9225 if (period > max_cfs_quota_period)
9226 return -EINVAL;
9227
9228 mutex_lock(&cfs_constraints_mutex);
9229 ret = __cfs_schedulable(tg, period, quota);
9230 if (ret)
9231 goto out_unlock;
9232
9233 runtime_enabled = quota != RUNTIME_INF;
9234 raw_spin_lock_irq(&cfs_b->lock);
9235 cfs_b->period = ns_to_ktime(period);
9236 cfs_b->quota = quota;
9237
9238 __refill_cfs_bandwidth_runtime(cfs_b);
 /* restart the period timer (if active) to handle new period expiry */
9240 if (runtime_enabled && cfs_b->timer_active) {
 /* force a reprogram */
9242 cfs_b->timer_active = 0;
9243 __start_cfs_bandwidth(cfs_b);
9244 }
9245 raw_spin_unlock_irq(&cfs_b->lock);
9246
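 /* propagate the new bandwidth state to each per-cpu cfs_rq, unthrottling where needed */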
9247 for_each_possible_cpu(i) {
9248 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9249 struct rq *rq = rq_of(cfs_rq);
9250
9251 raw_spin_lock_irq(&rq->lock);
9252 cfs_rq->runtime_enabled = runtime_enabled;
9253 cfs_rq->runtime_remaining = 0;
9254
9255 if (cfs_rq_throttled(cfs_rq))
9256 unthrottle_cfs_rq(cfs_rq);
9257 raw_spin_unlock_irq(&rq->lock);
9258 }
9259out_unlock:
9260 mutex_unlock(&cfs_constraints_mutex);
9261
9262 return ret;
9263}
9264
9265int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9266{
9267 u64 quota, period;
9268
9269 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9270 if (cfs_quota_us < 0)
9271 quota = RUNTIME_INF;
9272 else
9273 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9274
9275 return tg_set_cfs_bandwidth(tg, period, quota);
9276}
9277
9278long tg_get_cfs_quota(struct task_group *tg)
9279{
9280 u64 quota_us;
9281
9282 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
9283 return -1;
9284
9285 quota_us = tg_cfs_bandwidth(tg)->quota;
9286 do_div(quota_us, NSEC_PER_USEC);
9287
9288 return quota_us;
9289}
9290
9291int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9292{
9293 u64 quota, period;
9294
9295 period = (u64)cfs_period_us * NSEC_PER_USEC;
9296 quota = tg_cfs_bandwidth(tg)->quota;
9297
9298 if (period <= 0)
9299 return -EINVAL;
9300
9301 return tg_set_cfs_bandwidth(tg, period, quota);
9302}
9303
9304long tg_get_cfs_period(struct task_group *tg)
9305{
9306 u64 cfs_period_us;
9307
9308 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9309 do_div(cfs_period_us, NSEC_PER_USEC);
9310
9311 return cfs_period_us;
9312}
9313
9314static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
9315{
9316 return tg_get_cfs_quota(cgroup_tg(cgrp));
9317}
9318
9319static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
9320 s64 cfs_quota_us)
9321{
9322 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
9323}
9324
9325static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
9326{
9327 return tg_get_cfs_period(cgroup_tg(cgrp));
9328}
9329
9330static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9331 u64 cfs_period_us)
9332{
9333 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
9334}
9335
9336struct cfs_schedulable_data {
9337 struct task_group *tg;
9338 u64 period, quota;
9339};
9340
/*
 * normalize group quota/period to be quota/max_period
 * note: units are usecs
 */
9345static u64 normalize_cfs_quota(struct task_group *tg,
9346 struct cfs_schedulable_data *d)
9347{
9348 u64 quota, period;
9349
9350 if (tg == d->tg) {
9351 period = d->period;
9352 quota = d->quota;
9353 } else {
9354 period = tg_get_cfs_period(tg);
9355 quota = tg_get_cfs_quota(tg);
9356 }
9357
 /* note: these should typically be equivalent */
9359 if (quota == RUNTIME_INF || quota == -1)
9360 return RUNTIME_INF;
9361
9362 return to_ratio(period, quota);
9363}
9364
9365static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9366{
9367 struct cfs_schedulable_data *d = data;
9368 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9369 s64 quota = 0, parent_quota = -1;
9370
9371 if (!tg->parent) {
9372 quota = RUNTIME_INF;
9373 } else {
9374 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
9375
9376 quota = normalize_cfs_quota(tg, d);
9377 parent_quota = parent_b->hierarchal_quota;
9378
 /*
  * Ensure max(child_quota) <= parent_quota, inherit when no
  * limit is set:
  */
9383 if (quota == RUNTIME_INF)
9384 quota = parent_quota;
9385 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9386 return -EINVAL;
9387 }
9388 cfs_b->hierarchal_quota = quota;
9389
9390 return 0;
9391}
9392
9393static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9394{
9395 int ret;
9396 struct cfs_schedulable_data data = {
9397 .tg = tg,
9398 .period = period,
9399 .quota = quota,
9400 };
9401
9402 if (quota != RUNTIME_INF) {
9403 do_div(data.period, NSEC_PER_USEC);
9404 do_div(data.quota, NSEC_PER_USEC);
9405 }
9406
9407 rcu_read_lock();
9408 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9409 rcu_read_unlock();
9410
9411 return ret;
9412}
9413
9414static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9415 struct cgroup_map_cb *cb)
9416{
9417 struct task_group *tg = cgroup_tg(cgrp);
9418 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9419
9420 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9421 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
9422 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
9423
9424 return 0;
9425}
9426#endif
9427#endif
9428
9429#ifdef CONFIG_RT_GROUP_SCHED
9430static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9431 s64 val)
9432{
9433 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9434}
9435
9436static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9437{
9438 return sched_group_rt_runtime(cgroup_tg(cgrp));
9439}
9440
9441static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9442 u64 rt_period_us)
9443{
9444 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9445}
9446
9447static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9448{
9449 return sched_group_rt_period(cgroup_tg(cgrp));
9450}
9451#endif
9452
9453static struct cftype cpu_files[] = {
9454#ifdef CONFIG_FAIR_GROUP_SCHED
9455 {
9456 .name = "shares",
9457 .read_u64 = cpu_shares_read_u64,
9458 .write_u64 = cpu_shares_write_u64,
9459 },
9460#endif
9461#ifdef CONFIG_CFS_BANDWIDTH
9462 {
9463 .name = "cfs_quota_us",
9464 .read_s64 = cpu_cfs_quota_read_s64,
9465 .write_s64 = cpu_cfs_quota_write_s64,
9466 },
9467 {
9468 .name = "cfs_period_us",
9469 .read_u64 = cpu_cfs_period_read_u64,
9470 .write_u64 = cpu_cfs_period_write_u64,
9471 },
9472 {
9473 .name = "stat",
9474 .read_map = cpu_stats_show,
9475 },
9476#endif
9477#ifdef CONFIG_RT_GROUP_SCHED
9478 {
9479 .name = "rt_runtime_us",
9480 .read_s64 = cpu_rt_runtime_read,
9481 .write_s64 = cpu_rt_runtime_write,
9482 },
9483 {
9484 .name = "rt_period_us",
9485 .read_u64 = cpu_rt_period_read_uint,
9486 .write_u64 = cpu_rt_period_write_uint,
9487 },
9488#endif
9489};
9490
9491static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
9492{
9493 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
9494}
9495
9496struct cgroup_subsys cpu_cgroup_subsys = {
9497 .name = "cpu",
9498 .create = cpu_cgroup_create,
9499 .destroy = cpu_cgroup_destroy,
9500 .can_attach_task = cpu_cgroup_can_attach_task,
9501 .attach_task = cpu_cgroup_attach_task,
9502 .exit = cpu_cgroup_exit,
9503 .populate = cpu_cgroup_populate,
9504 .subsys_id = cpu_cgroup_subsys_id,
9505 .early_init = 1,
9506};
9507
9508#endif
9509
9510#ifdef CONFIG_CGROUP_CPUACCT
9511
/*
 * CPU accounting code for task groups.
 */

/* track cpu usage of a group of tasks and its child groups */
9520struct cpuacct {
9521 struct cgroup_subsys_state css;
9522
9523 u64 __percpu *cpuusage;
9524 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9525 struct cpuacct *parent;
9526};
9527
9528struct cgroup_subsys cpuacct_subsys;
9529
/* return cpu accounting group corresponding to this cgroup */
9531static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9532{
9533 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9534 struct cpuacct, css);
9535}
9536
/* return cpu accounting group to which this task belongs */
9538static inline struct cpuacct *task_ca(struct task_struct *tsk)
9539{
9540 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9541 struct cpuacct, css);
9542}
9543
/* create a new cpu accounting group */
9545static struct cgroup_subsys_state *cpuacct_create(
9546 struct cgroup_subsys *ss, struct cgroup *cgrp)
9547{
9548 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9549 int i;
9550
9551 if (!ca)
9552 goto out;
9553
9554 ca->cpuusage = alloc_percpu(u64);
9555 if (!ca->cpuusage)
9556 goto out_free_ca;
9557
9558 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9559 if (percpu_counter_init(&ca->cpustat[i], 0))
9560 goto out_free_counters;
9561
9562 if (cgrp->parent)
9563 ca->parent = cgroup_ca(cgrp->parent);
9564
9565 return &ca->css;
9566
9567out_free_counters:
9568 while (--i >= 0)
9569 percpu_counter_destroy(&ca->cpustat[i]);
9570 free_percpu(ca->cpuusage);
9571out_free_ca:
9572 kfree(ca);
9573out:
9574 return ERR_PTR(-ENOMEM);
9575}
9576
/* destroy an existing cpu accounting group */
9578static void
9579cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9580{
9581 struct cpuacct *ca = cgroup_ca(cgrp);
9582 int i;
9583
9584 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9585 percpu_counter_destroy(&ca->cpustat[i]);
9586 free_percpu(ca->cpuusage);
9587 kfree(ca);
9588}
9589
9590static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9591{
9592 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9593 u64 data;
9594
9595#ifndef CONFIG_64BIT
 /*
  * Take rq->lock to make the 64-bit read coherent on 32-bit platforms.
  */
9599 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
9600 data = *cpuusage;
9601 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
9602#else
9603 data = *cpuusage;
9604#endif
9605
9606 return data;
9607}
9608
9609static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9610{
9611 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9612
9613#ifndef CONFIG_64BIT
 /*
  * Take rq->lock to make the 64-bit write coherent on 32-bit platforms.
  */
9617 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
9618 *cpuusage = val;
9619 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
9620#else
9621 *cpuusage = val;
9622#endif
9623}
9624
/* return total cpu usage (in nanoseconds) of a group */
9626static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9627{
9628 struct cpuacct *ca = cgroup_ca(cgrp);
9629 u64 totalcpuusage = 0;
9630 int i;
9631
9632 for_each_present_cpu(i)
9633 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9634
9635 return totalcpuusage;
9636}
9637
9638static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9639 u64 reset)
9640{
9641 struct cpuacct *ca = cgroup_ca(cgrp);
9642 int err = 0;
9643 int i;
9644
9645 if (reset) {
9646 err = -EINVAL;
9647 goto out;
9648 }
9649
9650 for_each_present_cpu(i)
9651 cpuacct_cpuusage_write(ca, i, 0);
9652
9653out:
9654 return err;
9655}
9656
9657static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9658 struct seq_file *m)
9659{
9660 struct cpuacct *ca = cgroup_ca(cgroup);
9661 u64 percpu;
9662 int i;
9663
9664 for_each_present_cpu(i) {
9665 percpu = cpuacct_cpuusage_read(ca, i);
9666 seq_printf(m, "%llu ", (unsigned long long) percpu);
9667 }
9668 seq_printf(m, "\n");
9669 return 0;
9670}
9671
9672static const char *cpuacct_stat_desc[] = {
9673 [CPUACCT_STAT_USER] = "user",
9674 [CPUACCT_STAT_SYSTEM] = "system",
9675};
9676
9677static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9678 struct cgroup_map_cb *cb)
9679{
9680 struct cpuacct *ca = cgroup_ca(cgrp);
9681 int i;
9682
9683 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
9684 s64 val = percpu_counter_read(&ca->cpustat[i]);
9685 val = cputime64_to_clock_t(val);
9686 cb->fill(cb, cpuacct_stat_desc[i], val);
9687 }
9688 return 0;
9689}
9690
9691static struct cftype files[] = {
9692 {
9693 .name = "usage",
9694 .read_u64 = cpuusage_read,
9695 .write_u64 = cpuusage_write,
9696 },
9697 {
9698 .name = "usage_percpu",
9699 .read_seq_string = cpuacct_percpu_seq_read,
9700 },
9701 {
9702 .name = "stat",
9703 .read_map = cpuacct_stats_show,
9704 },
9705};
9706
9707static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9708{
9709 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9710}
9711
/*
 * charge this task's execution time to its accounting group.
 *
 * called with rq->lock held.
 */
9717static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9718{
9719 struct cpuacct *ca;
9720 int cpu;
9721
9722 if (unlikely(!cpuacct_subsys.active))
9723 return;
9724
9725 cpu = task_cpu(tsk);
9726
9727 rcu_read_lock();
9728
9729 ca = task_ca(tsk);
9730
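 /* account the time to this group and every one of its ancestors */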
9731 for (; ca; ca = ca->parent) {
9732 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9733 *cpuusage += cputime;
9734 }
9735
9736 rcu_read_unlock();
9737}
9738
/*
 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
 * in cputime_t units. As a result, cpuacct_update_stats calls
 * percpu_counter_add with values large enough to always overflow the
 * per cpu batch limit, causing bad SMP scalability.
 *
 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
 * as we do with it enabled.
 */
9749#ifdef CONFIG_SMP
9750#define CPUACCT_BATCH \
9751 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9752#else
9753#define CPUACCT_BATCH 0
9754#endif
9755
/*
 * Charge the system/user time to the task's accounting group.
 */
9759static void cpuacct_update_stats(struct task_struct *tsk,
9760 enum cpuacct_stat_index idx, cputime_t val)
9761{
9762 struct cpuacct *ca;
9763 int batch = CPUACCT_BATCH;
9764
9765 if (unlikely(!cpuacct_subsys.active))
9766 return;
9767
9768 rcu_read_lock();
9769 ca = task_ca(tsk);
9770
9771 do {
9772 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9773 ca = ca->parent;
9774 } while (ca);
9775 rcu_read_unlock();
9776}
9777
9778struct cgroup_subsys cpuacct_subsys = {
9779 .name = "cpuacct",
9780 .create = cpuacct_create,
9781 .destroy = cpuacct_destroy,
9782 .populate = cpuacct_populate,
9783 .subsys_id = cpuacct_subsys_id,
9784};
9785#endif
9786