/*
 *  kernel/sched.c
 *
 *  Core scheduler code: per-CPU runqueues, the CFS and RT scheduling
 *  class glue, task wakeup and SMP load balancing.
 */
27#include <linux/mm.h>
28#include <linux/module.h>
29#include <linux/nmi.h>
30#include <linux/init.h>
31#include <linux/uaccess.h>
32#include <linux/highmem.h>
33#include <linux/smp_lock.h>
34#include <asm/mmu_context.h>
35#include <linux/interrupt.h>
36#include <linux/capability.h>
37#include <linux/completion.h>
38#include <linux/kernel_stat.h>
39#include <linux/debug_locks.h>
40#include <linux/security.h>
41#include <linux/notifier.h>
42#include <linux/profile.h>
43#include <linux/freezer.h>
44#include <linux/vmalloc.h>
45#include <linux/blkdev.h>
46#include <linux/delay.h>
47#include <linux/pid_namespace.h>
48#include <linux/smp.h>
49#include <linux/threads.h>
50#include <linux/timer.h>
51#include <linux/rcupdate.h>
52#include <linux/cpu.h>
53#include <linux/cpuset.h>
54#include <linux/percpu.h>
55#include <linux/kthread.h>
56#include <linux/seq_file.h>
57#include <linux/sysctl.h>
58#include <linux/syscalls.h>
59#include <linux/times.h>
60#include <linux/tsacct_kern.h>
61#include <linux/kprobes.h>
62#include <linux/delayacct.h>
63#include <linux/reciprocal_div.h>
64#include <linux/unistd.h>
65#include <linux/pagemap.h>
66
67#include <asm/tlb.h>
68#include <asm/irq_regs.h>

/*
 * Scheduler clock - returns current time in nanosec units.
 * This is default implementation.
 * Architectures and sub-architectures can override this.
 */
75unsigned long long __attribute__((weak)) sched_clock(void)
76{
77 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
78}

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
85#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
86#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
87#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
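/*
 * Example (assuming the common MAX_RT_PRIO == 100, MAX_PRIO == 140 setup):
 * NICE_TO_PRIO(0) == 120 and PRIO_TO_NICE(139) == 19.
 */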

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
94#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
95#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
96#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

/*
 * Some helpers for converting nanosecond timing to jiffy resolution
 */
101#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
102#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ))
103
104#define NICE_0_LOAD SCHED_LOAD_SCALE
105#define NICE_0_SHIFT SCHED_LOAD_SHIFT
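/*
 * NICE_0_LOAD is the load weight of a nice-0 SCHED_NORMAL task (1024 with
 * the usual SCHED_LOAD_SHIFT of 10); every other entry in prio_to_weight[]
 * below is expressed relative to it.
 */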

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
113#define DEF_TIMESLICE (100 * HZ / 1000)
114
115#ifdef CONFIG_SMP
/*
 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 * Since cpu_power is a 'constant', we can use a reciprocal divide.
 */
120static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
121{
122 return reciprocal_divide(load, sg->reciprocal_cpu_power);
123}

/*
 * Each time a sched group cpu_power is changed,
 * we must compute its reciprocal value
 */
129static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
130{
131 sg->__cpu_power += val;
132 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
133}
134#endif
135
136static inline int rt_policy(int policy)
137{
138 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
139 return 1;
140 return 0;
141}
142
143static inline int task_has_rt_policy(struct task_struct *p)
144{
145 return rt_policy(p->policy);
146}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
151struct rt_prio_array {
152 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
153 struct list_head queue[MAX_RT_PRIO];
154};
155
156#ifdef CONFIG_FAIR_GROUP_SCHED
157
158#include <linux/cgroup.h>
159
160struct cfs_rq;
161
162
163struct task_group {
164#ifdef CONFIG_FAIR_CGROUP_SCHED
165 struct cgroup_subsys_state css;
166#endif
167
168 struct sched_entity **se;
169
170 struct cfs_rq **cfs_rq;
171 unsigned long shares;
172
173 spinlock_t lock;
174 struct rcu_head rcu;
175};
176
177
178static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
179
180static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
181
182static struct sched_entity *init_sched_entity_p[NR_CPUS];
183static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
184
185
186
187
188struct task_group init_task_group = {
189 .se = init_sched_entity_p,
190 .cfs_rq = init_cfs_rq_p,
191};
192
193#ifdef CONFIG_FAIR_USER_SCHED
194# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD
195#else
196# define INIT_TASK_GRP_LOAD NICE_0_LOAD
197#endif
198
199static int init_task_group_load = INIT_TASK_GRP_LOAD;
200
201
202static inline struct task_group *task_group(struct task_struct *p)
203{
204 struct task_group *tg;
205
206#ifdef CONFIG_FAIR_USER_SCHED
207 tg = p->user->tg;
208#elif defined(CONFIG_FAIR_CGROUP_SCHED)
209 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
210 struct task_group, css);
211#else
212 tg = &init_task_group;
213#endif
214 return tg;
215}
216
217
218static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu)
219{
220 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
221 p->se.parent = task_group(p)->se[cpu];
222}
223
224#else
225
226static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { }
227
228#endif

/* CFS-related fields in a runqueue */
231struct cfs_rq {
232 struct load_weight load;
233 unsigned long nr_running;
234
235 u64 exec_clock;
236 u64 min_vruntime;
237
238 struct rb_root tasks_timeline;
239 struct rb_node *rb_leftmost;
240 struct rb_node *rb_load_balance_curr;
	/* 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
244 struct sched_entity *curr;
245
246 unsigned long nr_spread_over;
247
248#ifdef CONFIG_FAIR_GROUP_SCHED
249 struct rq *rq;
250
251
252
253
254
255
256
257
258
259 struct list_head leaf_cfs_rq_list;
260 struct task_group *tg;
261#endif
262};

/* Real-Time classes' related field in a runqueue: */
265struct rt_rq {
266 struct rt_prio_array active;
267 int rt_load_balance_idx;
268 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
269};

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
278struct rq {
279
280 spinlock_t lock;
281
282
283
284
285
286 unsigned long nr_running;
287 #define CPU_LOAD_IDX_MAX 5
288 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
289 unsigned char idle_at_tick;
290#ifdef CONFIG_NO_HZ
291 unsigned char in_nohz_recently;
292#endif
293
294 struct load_weight load;
295 unsigned long nr_load_updates;
296 u64 nr_switches;
297
298 struct cfs_rq cfs;
299#ifdef CONFIG_FAIR_GROUP_SCHED
300
301 struct list_head leaf_cfs_rq_list;
302#endif
303 struct rt_rq rt;

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
311 unsigned long nr_uninterruptible;
312
313 struct task_struct *curr, *idle;
314 unsigned long next_balance;
315 struct mm_struct *prev_mm;
316
317 u64 clock, prev_clock_raw;
318 s64 clock_max_delta;
319
320 unsigned int clock_warps, clock_overflows;
321 u64 idle_clock;
322 unsigned int clock_deep_idle_events;
323 u64 tick_timestamp;
324
325 atomic_t nr_iowait;
326
327#ifdef CONFIG_SMP
328 struct sched_domain *sd;
329
330
331 int active_balance;
332 int push_cpu;
333
334 int cpu;
335
336 struct task_struct *migration_thread;
337 struct list_head migration_queue;
338#endif
339
340#ifdef CONFIG_SCHEDSTATS
341
342 struct sched_info rq_sched_info;
343
344
345 unsigned int yld_exp_empty;
346 unsigned int yld_act_empty;
347 unsigned int yld_both_empty;
348 unsigned int yld_count;
349
350
351 unsigned int sched_switch;
352 unsigned int sched_count;
353 unsigned int sched_goidle;
354
355
356 unsigned int ttwu_count;
357 unsigned int ttwu_local;
358
359
360 unsigned int bkl_count;
361#endif
362 struct lock_class_key rq_lock_key;
363};
364
365static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
366static DEFINE_MUTEX(sched_hotcpu_mutex);
367
368static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
369{
370 rq->curr->sched_class->check_preempt_curr(rq, p);
371}
372
373static inline int cpu_of(struct rq *rq)
374{
375#ifdef CONFIG_SMP
376 return rq->cpu;
377#else
378 return 0;
379#endif
380}

/*
 * Update the per-runqueue clock, as finegrained as the platform can give
 * us, but without assuming monotonicity, etc.:
 */
386static void __update_rq_clock(struct rq *rq)
387{
388 u64 prev_raw = rq->prev_clock_raw;
389 u64 now = sched_clock();
390 s64 delta = now - prev_raw;
391 u64 clock = rq->clock;
392
393#ifdef CONFIG_SCHED_DEBUG
394 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
395#endif
396
397
398
399 if (unlikely(delta < 0)) {
400 clock++;
401 rq->clock_warps++;
402 } else {
403
404
405
406 if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) {
407 if (clock < rq->tick_timestamp + TICK_NSEC)
408 clock = rq->tick_timestamp + TICK_NSEC;
409 else
410 clock++;
411 rq->clock_overflows++;
412 } else {
413 if (unlikely(delta > rq->clock_max_delta))
414 rq->clock_max_delta = delta;
415 clock += delta;
416 }
417 }
418
419 rq->prev_clock_raw = now;
420 rq->clock = clock;
421}
422
423static void update_rq_clock(struct rq *rq)
424{
425 if (likely(smp_processor_id() == cpu_of(rq)))
426 __update_rq_clock(rq);
427}

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
436#define for_each_domain(cpu, __sd) \
437 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
438
439#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
440#define this_rq() (&__get_cpu_var(runqueues))
441#define task_rq(p) cpu_rq(task_cpu(p))
442#define cpu_curr(cpu) (cpu_rq(cpu)->curr)

/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
447#ifdef CONFIG_SCHED_DEBUG
448# define const_debug __read_mostly
449#else
450# define const_debug static const
451#endif

/*
 * Debugging: various feature bits
 */
456enum {
457 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
458 SCHED_FEAT_WAKEUP_PREEMPT = 2,
459 SCHED_FEAT_START_DEBIT = 4,
460 SCHED_FEAT_TREE_AVG = 8,
461 SCHED_FEAT_APPROX_AVG = 16,
462};
463
464const_debug unsigned int sysctl_sched_features =
465 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
466 SCHED_FEAT_WAKEUP_PREEMPT * 1 |
467 SCHED_FEAT_START_DEBIT * 1 |
468 SCHED_FEAT_TREE_AVG * 0 |
469 SCHED_FEAT_APPROX_AVG * 0;
470
471#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
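/*
 * The scheduling classes test these bits as booleans, e.g.:
 *
 *	if (sched_feat(WAKEUP_PREEMPT))
 *		...
 */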

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
477const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
 * clock constructed from sched_clock():
 */
483unsigned long long cpu_clock(int cpu)
484{
485 unsigned long long now;
486 unsigned long flags;
487 struct rq *rq;
488
489 local_irq_save(flags);
490 rq = cpu_rq(cpu);
	/*
	 * Only update the runqueue clock once the runqueue has been
	 * fully set up (some code may call cpu_clock() very early):
	 */
495 if (rq->idle)
496 update_rq_clock(rq);
497 now = rq->clock;
498 local_irq_restore(flags);
499
500 return now;
501}
502EXPORT_SYMBOL_GPL(cpu_clock);
503
504#ifndef prepare_arch_switch
505# define prepare_arch_switch(next) do { } while (0)
506#endif
507#ifndef finish_arch_switch
508# define finish_arch_switch(prev) do { } while (0)
509#endif
510
511static inline int task_current(struct rq *rq, struct task_struct *p)
512{
513 return rq->curr == p;
514}
515
516#ifndef __ARCH_WANT_UNLOCKED_CTXSW
517static inline int task_running(struct rq *rq, struct task_struct *p)
518{
519 return task_current(rq, p);
520}
521
522static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
523{
524}
525
526static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
527{
528#ifdef CONFIG_DEBUG_SPINLOCK
529
530 rq->lock.owner = current;
531#endif
532
533
534
535
536
537 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
538
539 spin_unlock_irq(&rq->lock);
540}
541
542#else
543static inline int task_running(struct rq *rq, struct task_struct *p)
544{
545#ifdef CONFIG_SMP
546 return p->oncpu;
547#else
548 return task_current(rq, p);
549#endif
550}
551
552static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
553{
554#ifdef CONFIG_SMP
555
556
557
558
559
560 next->oncpu = 1;
561#endif
562#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
563 spin_unlock_irq(&rq->lock);
564#else
565 spin_unlock(&rq->lock);
566#endif
567}
568
569static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
570{
571#ifdef CONFIG_SMP
572
573
574
575
576
577 smp_wmb();
578 prev->oncpu = 0;
579#endif
580#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
581 local_irq_enable();
582#endif
583}
584#endif

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
590static inline struct rq *__task_rq_lock(struct task_struct *p)
591 __acquires(rq->lock)
592{
593 for (;;) {
594 struct rq *rq = task_rq(p);
595 spin_lock(&rq->lock);
596 if (likely(rq == task_rq(p)))
597 return rq;
598 spin_unlock(&rq->lock);
599 }
600}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
607static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
608 __acquires(rq->lock)
609{
610 struct rq *rq;
611
612 for (;;) {
613 local_irq_save(*flags);
614 rq = task_rq(p);
615 spin_lock(&rq->lock);
616 if (likely(rq == task_rq(p)))
617 return rq;
618 spin_unlock_irqrestore(&rq->lock, *flags);
619 }
620}
621
622static void __task_rq_unlock(struct rq *rq)
623 __releases(rq->lock)
624{
625 spin_unlock(&rq->lock);
626}
627
628static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
629 __releases(rq->lock)
630{
631 spin_unlock_irqrestore(&rq->lock, *flags);
632}
633
634
635
636
637static struct rq *this_rq_lock(void)
638 __acquires(rq->lock)
639{
640 struct rq *rq;
641
642 local_irq_disable();
643 rq = this_rq();
644 spin_lock(&rq->lock);
645
646 return rq;
647}

/*
 * We are going deep-idle (irqs are disabled):
 */
652void sched_clock_idle_sleep_event(void)
653{
654 struct rq *rq = cpu_rq(smp_processor_id());
655
656 spin_lock(&rq->lock);
657 __update_rq_clock(rq);
658 spin_unlock(&rq->lock);
659 rq->clock_deep_idle_events++;
660}
661EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);

/*
 * We just idled delta nanoseconds (called with irqs disabled):
 */
666void sched_clock_idle_wakeup_event(u64 delta_ns)
667{
668 struct rq *rq = cpu_rq(smp_processor_id());
669 u64 now = sched_clock();
670
671 touch_softlockup_watchdog();
672 rq->idle_clock += delta_ns;
673
674
675
676
677
678
679 spin_lock(&rq->lock);
680 rq->prev_clock_raw = now;
681 rq->clock += delta_ns;
682 spin_unlock(&rq->lock);
683}
684EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
693#ifdef CONFIG_SMP
694
695#ifndef tsk_is_polling
696#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
697#endif
698
699static void resched_task(struct task_struct *p)
700{
701 int cpu;
702
703 assert_spin_locked(&task_rq(p)->lock);
704
705 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
706 return;
707
708 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
709
710 cpu = task_cpu(p);
711 if (cpu == smp_processor_id())
712 return;
713
714
715 smp_mb();
716 if (!tsk_is_polling(p))
717 smp_send_reschedule(cpu);
718}
719
720static void resched_cpu(int cpu)
721{
722 struct rq *rq = cpu_rq(cpu);
723 unsigned long flags;
724
725 if (!spin_trylock_irqsave(&rq->lock, flags))
726 return;
727 resched_task(cpu_curr(cpu));
728 spin_unlock_irqrestore(&rq->lock, flags);
729}
730#else
731static inline void resched_task(struct task_struct *p)
732{
733 assert_spin_locked(&task_rq(p)->lock);
734 set_tsk_need_resched(p);
735}
736#endif
737
738#if BITS_PER_LONG == 32
739# define WMULT_CONST (~0UL)
740#else
741# define WMULT_CONST (1UL << 32)
742#endif
743
744#define WMULT_SHIFT 32

/*
 * Shift right and round:
 */
749#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
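/*
 * SRR(x, y) shifts x right by y bits but rounds to nearest instead of
 * truncating: e.g. SRR(3, 1) == 2, while (3 >> 1) == 1.
 */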
750
751static unsigned long
752calc_delta_mine(unsigned long delta_exec, unsigned long weight,
753 struct load_weight *lw)
754{
755 u64 tmp;
756
757 if (unlikely(!lw->inv_weight))
758 lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
759
760 tmp = (u64)delta_exec * weight;
761
762
763
764 if (unlikely(tmp > WMULT_CONST))
765 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
766 WMULT_SHIFT/2);
767 else
768 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
769
770 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
771}
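/*
 * I.e. calc_delta_mine() approximates delta_exec * weight / lw->weight
 * as (delta_exec * weight * lw->inv_weight) >> 32, with inv_weight
 * precomputed as roughly 2^32 / lw->weight; the nested SRR() steps only
 * guard against overflowing the 64-bit intermediate product.
 */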
772
773static inline unsigned long
774calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
775{
776 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
777}
778
779static inline void update_load_add(struct load_weight *lw, unsigned long inc)
780{
781 lw->weight += inc;
782}
783
784static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
785{
786 lw->weight -= dec;
787}
788
789
790
791
792
793
794
795
796
797
798#define WEIGHT_IDLEPRIO 2
799#define WMULT_IDLEPRIO (1 << 31)

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
813static const int prio_to_weight[40] = {
814 88761, 71755, 56483, 46273, 36291,
815 29154, 23254, 18705, 14949, 11916,
816 9548, 7620, 6100, 4904, 3906,
817 3121, 2501, 1991, 1586, 1277,
818 1024, 820, 655, 526, 423,
819 335, 272, 215, 172, 137,
820 110, 87, 70, 56, 45,
821 36, 29, 23, 18, 15,
822};

/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
831static const u32 prio_to_wmult[40] = {
832 48388, 59856, 76040, 92818, 118348,
833 147320, 184698, 229616, 287308, 360437,
834 449829, 563644, 704093, 875809, 1099582,
835 1376151, 1717300, 2157191, 2708050, 3363326,
836 4194304, 5237765, 6557202, 8165337, 10153587,
837 12820798, 15790321, 19976592, 24970740, 31350126,
838 39045157, 49367440, 61356676, 76695844, 95443717,
839 119304647, 148102320, 186737708, 238609294, 286331153,
840};
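/*
 * Quick consistency check of the two tables: prio_to_wmult[20] (nice 0)
 * is 4194304 == 2^32 / 1024 == 2^32 / prio_to_weight[20].
 */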
841
842static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
843
844
845
846
847
848
849struct rq_iterator {
850 void *arg;
851 struct task_struct *(*start)(void *);
852 struct task_struct *(*next)(void *);
853};
854
855#ifdef CONFIG_SMP
856static unsigned long
857balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
858 unsigned long max_load_move, struct sched_domain *sd,
859 enum cpu_idle_type idle, int *all_pinned,
860 int *this_best_prio, struct rq_iterator *iterator);
861
862static int
863iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
864 struct sched_domain *sd, enum cpu_idle_type idle,
865 struct rq_iterator *iterator);
866#endif
867
868#ifdef CONFIG_CGROUP_CPUACCT
869static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
870#else
871static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
872#endif
873
874#include "sched_stats.h"
875#include "sched_idletask.c"
876#include "sched_fair.c"
877#include "sched_rt.c"
878#ifdef CONFIG_SCHED_DEBUG
879# include "sched_debug.c"
880#endif
881
882#define sched_class_highest (&rt_sched_class)
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899static inline void inc_load(struct rq *rq, const struct task_struct *p)
900{
901 update_load_add(&rq->load, p->se.load.weight);
902}
903
904static inline void dec_load(struct rq *rq, const struct task_struct *p)
905{
906 update_load_sub(&rq->load, p->se.load.weight);
907}
908
909static void inc_nr_running(struct task_struct *p, struct rq *rq)
910{
911 rq->nr_running++;
912 inc_load(rq, p);
913}
914
915static void dec_nr_running(struct task_struct *p, struct rq *rq)
916{
917 rq->nr_running--;
918 dec_load(rq, p);
919}
920
921static void set_load_weight(struct task_struct *p)
922{
923 if (task_has_rt_policy(p)) {
924 p->se.load.weight = prio_to_weight[0] * 2;
925 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
926 return;
927 }
928
929
930
931
932 if (p->policy == SCHED_IDLE) {
933 p->se.load.weight = WEIGHT_IDLEPRIO;
934 p->se.load.inv_weight = WMULT_IDLEPRIO;
935 return;
936 }
937
938 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
939 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
940}
941
942static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
943{
944 sched_info_queued(p);
945 p->sched_class->enqueue_task(rq, p, wakeup);
946 p->se.on_rq = 1;
947}
948
949static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
950{
951 p->sched_class->dequeue_task(rq, p, sleep);
952 p->se.on_rq = 0;
953}
954
955
956
957
958static inline int __normal_prio(struct task_struct *p)
959{
960 return p->static_prio;
961}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
970static inline int normal_prio(struct task_struct *p)
971{
972 int prio;
973
974 if (task_has_rt_policy(p))
975 prio = MAX_RT_PRIO-1 - p->rt_priority;
976 else
977 prio = __normal_prio(p);
978 return prio;
979}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
988static int effective_prio(struct task_struct *p)
989{
990 p->normal_prio = normal_prio(p);
991
992
993
994
995
996 if (!rt_prio(p->prio))
997 return p->normal_prio;
998 return p->prio;
999}

/*
 * activate_task - move a task to the runqueue.
 */
1004static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1005{
1006 if (p->state == TASK_UNINTERRUPTIBLE)
1007 rq->nr_uninterruptible--;
1008
1009 enqueue_task(rq, p, wakeup);
1010 inc_nr_running(p, rq);
1011}

/*
 * deactivate_task - remove a task from the runqueue.
 */
1016static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1017{
1018 if (p->state == TASK_UNINTERRUPTIBLE)
1019 rq->nr_uninterruptible++;
1020
1021 dequeue_task(rq, p, sleep);
1022 dec_nr_running(p, rq);
1023}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 */
1029inline int task_curr(const struct task_struct *p)
1030{
1031 return cpu_curr(task_cpu(p)) == p;
1032}
1033
1034
1035unsigned long weighted_cpuload(const int cpu)
1036{
1037 return cpu_rq(cpu)->load.weight;
1038}
1039
1040static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1041{
1042 set_task_cfs_rq(p, cpu);
1043#ifdef CONFIG_SMP
1044
1045
1046
1047
1048
1049 smp_wmb();
1050 task_thread_info(p)->cpu = cpu;
1051#endif
1052}
1053
1054#ifdef CONFIG_SMP

/*
 * Is this task likely cache-hot:
 */
1059static inline int
1060task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1061{
1062 s64 delta;
1063
1064 if (p->sched_class != &fair_sched_class)
1065 return 0;
1066
1067 if (sysctl_sched_migration_cost == -1)
1068 return 1;
1069 if (sysctl_sched_migration_cost == 0)
1070 return 0;
1071
1072 delta = now - p->se.exec_start;
1073
1074 return delta < (s64)sysctl_sched_migration_cost;
1075}
1076
1077
1078void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1079{
1080 int old_cpu = task_cpu(p);
1081 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1082 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1083 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1084 u64 clock_offset;
1085
1086 clock_offset = old_rq->clock - new_rq->clock;
1087
1088#ifdef CONFIG_SCHEDSTATS
1089 if (p->se.wait_start)
1090 p->se.wait_start -= clock_offset;
1091 if (p->se.sleep_start)
1092 p->se.sleep_start -= clock_offset;
1093 if (p->se.block_start)
1094 p->se.block_start -= clock_offset;
1095 if (old_cpu != new_cpu) {
1096 schedstat_inc(p, se.nr_migrations);
1097 if (task_hot(p, old_rq->clock, NULL))
1098 schedstat_inc(p, se.nr_forced2_migrations);
1099 }
1100#endif
1101 p->se.vruntime -= old_cfsrq->min_vruntime -
1102 new_cfsrq->min_vruntime;
1103
1104 __set_task_cpu(p, new_cpu);
1105}
1106
1107struct migration_req {
1108 struct list_head list;
1109
1110 struct task_struct *task;
1111 int dest_cpu;
1112
1113 struct completion done;
1114};
1115
1116
1117
1118
1119
1120static int
1121migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1122{
1123 struct rq *rq = task_rq(p);
1124
1125
1126
1127
1128
1129 if (!p->se.on_rq && !task_running(rq, p)) {
1130 set_task_cpu(p, dest_cpu);
1131 return 0;
1132 }
1133
1134 init_completion(&req->done);
1135 req->task = p;
1136 req->dest_cpu = dest_cpu;
1137 list_add(&req->list, &rq->migration_queue);
1138
1139 return 1;
1140}

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a long time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
1151void wait_task_inactive(struct task_struct *p)
1152{
1153 unsigned long flags;
1154 int running, on_rq;
1155 struct rq *rq;
1156
1157 for (;;) {
1158
1159
1160
1161
1162
1163
1164 rq = task_rq(p);
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177 while (task_running(rq, p))
1178 cpu_relax();
1179
1180
1181
1182
1183
1184
1185 rq = task_rq_lock(p, &flags);
1186 running = task_running(rq, p);
1187 on_rq = p->se.on_rq;
1188 task_rq_unlock(rq, &flags);
1189
1190
1191
1192
1193
1194
1195
1196 if (unlikely(running)) {
1197 cpu_relax();
1198 continue;
1199 }
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210 if (unlikely(on_rq)) {
1211 schedule_timeout_uninterruptible(1);
1212 continue;
1213 }
1214
1215
1216
1217
1218
1219
1220 break;
1221 }
1222}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
1237void kick_process(struct task_struct *p)
1238{
1239 int cpu;
1240
1241 preempt_disable();
1242 cpu = task_cpu(p);
1243 if ((cpu != smp_processor_id()) && task_curr(p))
1244 smp_send_reschedule(cpu);
1245 preempt_enable();
1246}

/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
1255static unsigned long source_load(int cpu, int type)
1256{
1257 struct rq *rq = cpu_rq(cpu);
1258 unsigned long total = weighted_cpuload(cpu);
1259
1260 if (type == 0)
1261 return total;
1262
1263 return min(rq->cpu_load[type-1], total);
1264}

/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 */
1270static unsigned long target_load(int cpu, int type)
1271{
1272 struct rq *rq = cpu_rq(cpu);
1273 unsigned long total = weighted_cpuload(cpu);
1274
1275 if (type == 0)
1276 return total;
1277
1278 return max(rq->cpu_load[type-1], total);
1279}
1280
1281
1282
1283
1284static inline unsigned long cpu_avg_load_per_task(int cpu)
1285{
1286 struct rq *rq = cpu_rq(cpu);
1287 unsigned long total = weighted_cpuload(cpu);
1288 unsigned long n = rq->nr_running;
1289
1290 return n ? total / n : SCHED_LOAD_SCALE;
1291}

/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 */
1297static struct sched_group *
1298find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1299{
1300 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1301 unsigned long min_load = ULONG_MAX, this_load = 0;
1302 int load_idx = sd->forkexec_idx;
1303 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1304
1305 do {
1306 unsigned long load, avg_load;
1307 int local_group;
1308 int i;
1309
1310
1311 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1312 continue;
1313
1314 local_group = cpu_isset(this_cpu, group->cpumask);
1315
1316
1317 avg_load = 0;
1318
1319 for_each_cpu_mask(i, group->cpumask) {
1320
1321 if (local_group)
1322 load = source_load(i, load_idx);
1323 else
1324 load = target_load(i, load_idx);
1325
1326 avg_load += load;
1327 }
1328
1329
1330 avg_load = sg_div_cpu_power(group,
1331 avg_load * SCHED_LOAD_SCALE);
1332
1333 if (local_group) {
1334 this_load = avg_load;
1335 this = group;
1336 } else if (avg_load < min_load) {
1337 min_load = avg_load;
1338 idlest = group;
1339 }
1340 } while (group = group->next, group != sd->groups);
1341
1342 if (!idlest || 100*this_load < imbalance*min_load)
1343 return NULL;
1344 return idlest;
1345}
1346
1347
1348
1349
1350static int
1351find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1352{
1353 cpumask_t tmp;
1354 unsigned long load, min_load = ULONG_MAX;
1355 int idlest = -1;
1356 int i;
1357
1358
1359 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1360
1361 for_each_cpu_mask(i, tmp) {
1362 load = weighted_cpuload(i);
1363
1364 if (load < min_load || (load == min_load && i == this_cpu)) {
1365 min_load = load;
1366 idlest = i;
1367 }
1368 }
1369
1370 return idlest;
1371}

/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
 *
 * Balance, ie. select the least loaded group.
 *
 * Returns the target CPU number, or the same CPU if no balancing is needed.
 *
 * preempt must be disabled.
 */
1384static int sched_balance_self(int cpu, int flag)
1385{
1386 struct task_struct *t = current;
1387 struct sched_domain *tmp, *sd = NULL;
1388
1389 for_each_domain(cpu, tmp) {
1390
1391
1392
1393 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1394 break;
1395 if (tmp->flags & flag)
1396 sd = tmp;
1397 }
1398
1399 while (sd) {
1400 cpumask_t span;
1401 struct sched_group *group;
1402 int new_cpu, weight;
1403
1404 if (!(sd->flags & flag)) {
1405 sd = sd->child;
1406 continue;
1407 }
1408
1409 span = sd->span;
1410 group = find_idlest_group(sd, t, cpu);
1411 if (!group) {
1412 sd = sd->child;
1413 continue;
1414 }
1415
1416 new_cpu = find_idlest_cpu(group, t, cpu);
1417 if (new_cpu == -1 || new_cpu == cpu) {
1418
1419 sd = sd->child;
1420 continue;
1421 }
1422
1423
1424 cpu = new_cpu;
1425 sd = NULL;
1426 weight = cpus_weight(span);
1427 for_each_domain(cpu, tmp) {
1428 if (weight <= cpus_weight(tmp->span))
1429 break;
1430 if (tmp->flags & flag)
1431 sd = tmp;
1432 }
1433
1434 }
1435
1436 return cpu;
1437}
1438
1439#endif

/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available.  The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 *
 * Returns the CPU we should wake onto.
 */
1449#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1450static int wake_idle(int cpu, struct task_struct *p)
1451{
1452 cpumask_t tmp;
1453 struct sched_domain *sd;
1454 int i;
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1466 return cpu;
1467
1468 for_each_domain(cpu, sd) {
1469 if (sd->flags & SD_WAKE_IDLE) {
1470 cpus_and(tmp, sd->span, p->cpus_allowed);
1471 for_each_cpu_mask(i, tmp) {
1472 if (idle_cpu(i)) {
1473 if (i != task_cpu(p)) {
1474 schedstat_inc(p,
1475 se.nr_wakeups_idle);
1476 }
1477 return i;
1478 }
1479 }
1480 } else {
1481 break;
1482 }
1483 }
1484 return cpu;
1485}
1486#else
1487static inline int wake_idle(int cpu, struct task_struct *p)
1488{
1489 return cpu;
1490}
1491#endif

/***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * returns failure only if the task is already active.
 */
1507static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1508{
1509 int cpu, orig_cpu, this_cpu, success = 0;
1510 unsigned long flags;
1511 long old_state;
1512 struct rq *rq;
1513#ifdef CONFIG_SMP
1514 struct sched_domain *sd, *this_sd = NULL;
1515 unsigned long load, this_load;
1516 int new_cpu;
1517#endif
1518
1519 rq = task_rq_lock(p, &flags);
1520 old_state = p->state;
1521 if (!(old_state & state))
1522 goto out;
1523
1524 if (p->se.on_rq)
1525 goto out_running;
1526
1527 cpu = task_cpu(p);
1528 orig_cpu = cpu;
1529 this_cpu = smp_processor_id();
1530
1531#ifdef CONFIG_SMP
1532 if (unlikely(task_running(rq, p)))
1533 goto out_activate;
1534
1535 new_cpu = cpu;
1536
1537 schedstat_inc(rq, ttwu_count);
1538 if (cpu == this_cpu) {
1539 schedstat_inc(rq, ttwu_local);
1540 goto out_set_cpu;
1541 }
1542
1543 for_each_domain(this_cpu, sd) {
1544 if (cpu_isset(cpu, sd->span)) {
1545 schedstat_inc(sd, ttwu_wake_remote);
1546 this_sd = sd;
1547 break;
1548 }
1549 }
1550
1551 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1552 goto out_set_cpu;
1553
1554
1555
1556
1557 if (this_sd) {
1558 int idx = this_sd->wake_idx;
1559 unsigned int imbalance;
1560
1561 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1562
1563 load = source_load(cpu, idx);
1564 this_load = target_load(this_cpu, idx);
1565
1566 new_cpu = this_cpu;
1567
1568 if (this_sd->flags & SD_WAKE_AFFINE) {
1569 unsigned long tl = this_load;
1570 unsigned long tl_per_task;
1571
1572
1573
1574
1575 if (sync && !task_hot(p, rq->clock, this_sd))
1576 goto out_set_cpu;
1577
1578 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1579 tl_per_task = cpu_avg_load_per_task(this_cpu);
1580
1581
1582
1583
1584
1585
1586 if (sync)
1587 tl -= current->se.load.weight;
1588
1589 if ((tl <= load &&
1590 tl + target_load(cpu, idx) <= tl_per_task) ||
1591 100*(tl + p->se.load.weight) <= imbalance*load) {
1592
1593
1594
1595
1596
1597 schedstat_inc(this_sd, ttwu_move_affine);
1598 schedstat_inc(p, se.nr_wakeups_affine);
1599 goto out_set_cpu;
1600 }
1601 }
1602
1603
1604
1605
1606
1607 if (this_sd->flags & SD_WAKE_BALANCE) {
1608 if (imbalance*this_load <= 100*load) {
1609 schedstat_inc(this_sd, ttwu_move_balance);
1610 schedstat_inc(p, se.nr_wakeups_passive);
1611 goto out_set_cpu;
1612 }
1613 }
1614 }
1615
1616 new_cpu = cpu;
1617out_set_cpu:
1618 new_cpu = wake_idle(new_cpu, p);
1619 if (new_cpu != cpu) {
1620 set_task_cpu(p, new_cpu);
1621 task_rq_unlock(rq, &flags);
1622
1623 rq = task_rq_lock(p, &flags);
1624 old_state = p->state;
1625 if (!(old_state & state))
1626 goto out;
1627 if (p->se.on_rq)
1628 goto out_running;
1629
1630 this_cpu = smp_processor_id();
1631 cpu = task_cpu(p);
1632 }
1633
1634out_activate:
1635#endif
1636 schedstat_inc(p, se.nr_wakeups);
1637 if (sync)
1638 schedstat_inc(p, se.nr_wakeups_sync);
1639 if (orig_cpu != cpu)
1640 schedstat_inc(p, se.nr_wakeups_migrate);
1641 if (cpu == this_cpu)
1642 schedstat_inc(p, se.nr_wakeups_local);
1643 else
1644 schedstat_inc(p, se.nr_wakeups_remote);
1645 update_rq_clock(rq);
1646 activate_task(rq, p, 1);
1647 check_preempt_curr(rq, p);
1648 success = 1;
1649
1650out_running:
1651 p->state = TASK_RUNNING;
1652out:
1653 task_rq_unlock(rq, &flags);
1654
1655 return success;
1656}
1657
1658int fastcall wake_up_process(struct task_struct *p)
1659{
1660 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1661 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1662}
1663EXPORT_SYMBOL(wake_up_process);
1664
1665int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1666{
1667 return try_to_wake_up(p, state, 0);
1668}

/*
 * Perform scheduler related setup for a newly forked task p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup used by init_idle() too:
 */
1676static void __sched_fork(struct task_struct *p)
1677{
1678 p->se.exec_start = 0;
1679 p->se.sum_exec_runtime = 0;
1680 p->se.prev_sum_exec_runtime = 0;
1681
1682#ifdef CONFIG_SCHEDSTATS
1683 p->se.wait_start = 0;
1684 p->se.sum_sleep_runtime = 0;
1685 p->se.sleep_start = 0;
1686 p->se.block_start = 0;
1687 p->se.sleep_max = 0;
1688 p->se.block_max = 0;
1689 p->se.exec_max = 0;
1690 p->se.slice_max = 0;
1691 p->se.wait_max = 0;
1692#endif
1693
1694 INIT_LIST_HEAD(&p->run_list);
1695 p->se.on_rq = 0;
1696
1697#ifdef CONFIG_PREEMPT_NOTIFIERS
1698 INIT_HLIST_HEAD(&p->preempt_notifiers);
1699#endif
1700
1701
1702
1703
1704
1705
1706
1707 p->state = TASK_RUNNING;
1708}

/*
 * fork()/clone()-time setup:
 */
1713void sched_fork(struct task_struct *p, int clone_flags)
1714{
1715 int cpu = get_cpu();
1716
1717 __sched_fork(p);
1718
1719#ifdef CONFIG_SMP
1720 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1721#endif
1722 set_task_cpu(p, cpu);
1723
1724
1725
1726
1727 p->prio = current->normal_prio;
1728 if (!rt_prio(p->prio))
1729 p->sched_class = &fair_sched_class;
1730
1731#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1732 if (likely(sched_info_on()))
1733 memset(&p->sched_info, 0, sizeof(p->sched_info));
1734#endif
1735#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1736 p->oncpu = 0;
1737#endif
1738#ifdef CONFIG_PREEMPT
1739
1740 task_thread_info(p)->preempt_count = 1;
1741#endif
1742 put_cpu();
1743}

/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
1752void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1753{
1754 unsigned long flags;
1755 struct rq *rq;
1756
1757 rq = task_rq_lock(p, &flags);
1758 BUG_ON(p->state != TASK_RUNNING);
1759 update_rq_clock(rq);
1760
1761 p->prio = effective_prio(p);
1762
1763 if (!p->sched_class->task_new || !current->se.on_rq) {
1764 activate_task(rq, p, 0);
1765 } else {
1766
1767
1768
1769
1770 p->sched_class->task_new(rq, p);
1771 inc_nr_running(p, rq);
1772 }
1773 check_preempt_curr(rq, p);
1774 task_rq_unlock(rq, &flags);
1775}
1776
1777#ifdef CONFIG_PREEMPT_NOTIFIERS
1778
1779
1780
1781
1782
1783void preempt_notifier_register(struct preempt_notifier *notifier)
1784{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
1786}
1787EXPORT_SYMBOL_GPL(preempt_notifier_register);
1788
1789
1790
1791
1792
1793
1794
1795void preempt_notifier_unregister(struct preempt_notifier *notifier)
1796{
	hlist_del(&notifier->link);
1798}
1799EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1800
1801static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1802{
1803 struct preempt_notifier *notifier;
1804 struct hlist_node *node;
1805
1806 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1807 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1808}
1809
1810static void
1811fire_sched_out_preempt_notifiers(struct task_struct *curr,
1812 struct task_struct *next)
1813{
1814 struct preempt_notifier *notifier;
1815 struct hlist_node *node;
1816
1817 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1818 notifier->ops->sched_out(notifier, next);
1819}
1820
1821#else
1822
1823static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1824{
1825}
1826
1827static void
1828fire_sched_out_preempt_notifiers(struct task_struct *curr,
1829 struct task_struct *next)
1830{
1831}
1832
1833#endif

/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
1848static inline void
1849prepare_task_switch(struct rq *rq, struct task_struct *prev,
1850 struct task_struct *next)
1851{
1852 fire_sched_out_preempt_notifiers(prev, next);
1853 prepare_lock_switch(rq, next);
1854 prepare_arch_switch(next);
1855}
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1873 __releases(rq->lock)
1874{
1875 struct mm_struct *mm = rq->prev_mm;
1876 long prev_state;
1877
1878 rq->prev_mm = NULL;
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891 prev_state = prev->state;
1892 finish_arch_switch(prev);
1893 finish_lock_switch(rq, prev);
1894 fire_sched_in_preempt_notifiers(current);
1895 if (mm)
1896 mmdrop(mm);
1897 if (unlikely(prev_state == TASK_DEAD)) {
1898
1899
1900
1901
1902 kprobe_flush_task(prev);
1903 put_task_struct(prev);
1904 }
1905}

/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */
1911asmlinkage void schedule_tail(struct task_struct *prev)
1912 __releases(rq->lock)
1913{
1914 struct rq *rq = this_rq();
1915
1916 finish_task_switch(rq, prev);
1917#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1918
1919 preempt_enable();
1920#endif
1921 if (current->set_child_tid)
1922 put_user(task_pid_vnr(current), current->set_child_tid);
1923}

/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
1929static inline void
1930context_switch(struct rq *rq, struct task_struct *prev,
1931 struct task_struct *next)
1932{
1933 struct mm_struct *mm, *oldmm;
1934
1935 prepare_task_switch(rq, prev, next);
1936 mm = next->mm;
1937 oldmm = prev->active_mm;
1938
1939
1940
1941
1942
1943 arch_enter_lazy_cpu_mode();
1944
1945 if (unlikely(!mm)) {
1946 next->active_mm = oldmm;
1947 atomic_inc(&oldmm->mm_count);
1948 enter_lazy_tlb(oldmm, next);
1949 } else
1950 switch_mm(oldmm, mm, next);
1951
1952 if (unlikely(!prev->mm)) {
1953 prev->active_mm = NULL;
1954 rq->prev_mm = oldmm;
1955 }
1956
1957
1958
1959
1960
1961
1962#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1963 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1964#endif
1965
1966
1967 switch_to(prev, next, prev);
1968
1969 barrier();
1970
1971
1972
1973
1974
1975 finish_task_switch(this_rq(), prev);
1976}

/*
 * nr_running, nr_uninterruptible and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, current number of uninterruptible-sleeping threads, total
 * number of context switches performed since bootup.
 */
1985unsigned long nr_running(void)
1986{
1987 unsigned long i, sum = 0;
1988
1989 for_each_online_cpu(i)
1990 sum += cpu_rq(i)->nr_running;
1991
1992 return sum;
1993}
1994
1995unsigned long nr_uninterruptible(void)
1996{
1997 unsigned long i, sum = 0;
1998
1999 for_each_possible_cpu(i)
2000 sum += cpu_rq(i)->nr_uninterruptible;
2001
2002
2003
2004
2005
2006 if (unlikely((long)sum < 0))
2007 sum = 0;
2008
2009 return sum;
2010}
2011
2012unsigned long long nr_context_switches(void)
2013{
2014 int i;
2015 unsigned long long sum = 0;
2016
2017 for_each_possible_cpu(i)
2018 sum += cpu_rq(i)->nr_switches;
2019
2020 return sum;
2021}
2022
2023unsigned long nr_iowait(void)
2024{
2025 unsigned long i, sum = 0;
2026
2027 for_each_possible_cpu(i)
2028 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2029
2030 return sum;
2031}
2032
2033unsigned long nr_active(void)
2034{
2035 unsigned long i, running = 0, uninterruptible = 0;
2036
2037 for_each_online_cpu(i) {
2038 running += cpu_rq(i)->nr_running;
2039 uninterruptible += cpu_rq(i)->nr_uninterruptible;
2040 }
2041
2042 if (unlikely((long)uninterruptible < 0))
2043 uninterruptible = 0;
2044
2045 return running + uninterruptible;
2046}

/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 */
2052static void update_cpu_load(struct rq *this_rq)
2053{
2054 unsigned long this_load = this_rq->load.weight;
2055 int i, scale;
2056
2057 this_rq->nr_load_updates++;
2058
2059
2060 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2061 unsigned long old_load, new_load;
2062
2063
2064
2065 old_load = this_rq->cpu_load[i];
2066 new_load = this_load;
2067
2068
2069
2070
2071
2072 if (new_load > old_load)
2073 new_load += scale-1;
2074 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2075 }
2076}
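/*
 * The loop above maintains a set of exponentially decaying load averages:
 * cpu_load[i] = (cpu_load[i] * (2^i - 1) + this_load) / 2^i, so higher
 * indices react progressively more slowly to load changes.
 */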
2077
2078#ifdef CONFIG_SMP
2079
2080
2081
2082
2083
2084
2085
2086static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2087 __acquires(rq1->lock)
2088 __acquires(rq2->lock)
2089{
2090 BUG_ON(!irqs_disabled());
2091 if (rq1 == rq2) {
2092 spin_lock(&rq1->lock);
2093 __acquire(rq2->lock);
2094 } else {
2095 if (rq1 < rq2) {
2096 spin_lock(&rq1->lock);
2097 spin_lock(&rq2->lock);
2098 } else {
2099 spin_lock(&rq2->lock);
2100 spin_lock(&rq1->lock);
2101 }
2102 }
2103 update_rq_clock(rq1);
2104 update_rq_clock(rq2);
2105}
2106
2107
2108
2109
2110
2111
2112
2113static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2114 __releases(rq1->lock)
2115 __releases(rq2->lock)
2116{
2117 spin_unlock(&rq1->lock);
2118 if (rq1 != rq2)
2119 spin_unlock(&rq2->lock);
2120 else
2121 __release(rq2->lock);
2122}

/*
 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 */
2127static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2128 __releases(this_rq->lock)
2129 __acquires(busiest->lock)
2130 __acquires(this_rq->lock)
2131{
2132 if (unlikely(!irqs_disabled())) {
2133
2134 spin_unlock(&this_rq->lock);
2135 BUG_ON(1);
2136 }
2137 if (unlikely(!spin_trylock(&busiest->lock))) {
2138 if (busiest < this_rq) {
2139 spin_unlock(&this_rq->lock);
2140 spin_lock(&busiest->lock);
2141 spin_lock(&this_rq->lock);
2142 } else
2143 spin_lock(&busiest->lock);
2144 }
2145}

/*
 * If dest_cpu is allowed for this process, migrate the task to it.
 * The migration is carried out by the per-CPU migration thread; we
 * wait for the migration to complete before returning.
 */
2153static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2154{
2155 struct migration_req req;
2156 unsigned long flags;
2157 struct rq *rq;
2158
2159 rq = task_rq_lock(p, &flags);
2160 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2161 || unlikely(cpu_is_offline(dest_cpu)))
2162 goto out;
2163
2164
2165 if (migrate_task(p, dest_cpu, &req)) {
2166
2167 struct task_struct *mt = rq->migration_thread;
2168
2169 get_task_struct(mt);
2170 task_rq_unlock(rq, &flags);
2171 wake_up_process(mt);
2172 put_task_struct(mt);
2173 wait_for_completion(&req.done);
2174
2175 return;
2176 }
2177out:
2178 task_rq_unlock(rq, &flags);
2179}

/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
2185void sched_exec(void)
2186{
2187 int new_cpu, this_cpu = get_cpu();
2188 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2189 put_cpu();
2190 if (new_cpu != this_cpu)
2191 sched_migrate_task(current, new_cpu);
2192}

/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
2198static void pull_task(struct rq *src_rq, struct task_struct *p,
2199 struct rq *this_rq, int this_cpu)
2200{
2201 deactivate_task(src_rq, p, 0);
2202 set_task_cpu(p, this_cpu);
2203 activate_task(this_rq, p, 0);
2204
2205
2206
2207
2208 check_preempt_curr(this_rq, p);
2209}

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
2214static
2215int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2216 struct sched_domain *sd, enum cpu_idle_type idle,
2217 int *all_pinned)
2218{
	/*
	 * We do not migrate tasks that are:
	 * 1) running (obviously), or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) are cache-hot on their current CPU.
	 */
2225 if (!cpu_isset(this_cpu, p->cpus_allowed)) {
2226 schedstat_inc(p, se.nr_failed_migrations_affine);
2227 return 0;
2228 }
2229 *all_pinned = 0;
2230
2231 if (task_running(rq, p)) {
2232 schedstat_inc(p, se.nr_failed_migrations_running);
2233 return 0;
2234 }
2235
2236
2237
2238
2239
2240
2241
2242 if (!task_hot(p, rq->clock, sd) ||
2243 sd->nr_balance_failed > sd->cache_nice_tries) {
2244#ifdef CONFIG_SCHEDSTATS
2245 if (task_hot(p, rq->clock, sd)) {
2246 schedstat_inc(sd, lb_hot_gained[idle]);
2247 schedstat_inc(p, se.nr_forced_migrations);
2248 }
2249#endif
2250 return 1;
2251 }
2252
2253 if (task_hot(p, rq->clock, sd)) {
2254 schedstat_inc(p, se.nr_failed_migrations_hot);
2255 return 0;
2256 }
2257 return 1;
2258}
2259
2260static unsigned long
2261balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2262 unsigned long max_load_move, struct sched_domain *sd,
2263 enum cpu_idle_type idle, int *all_pinned,
2264 int *this_best_prio, struct rq_iterator *iterator)
2265{
2266 int loops = 0, pulled = 0, pinned = 0, skip_for_load;
2267 struct task_struct *p;
2268 long rem_load_move = max_load_move;
2269
2270 if (max_load_move == 0)
2271 goto out;
2272
2273 pinned = 1;
2274
2275
2276
2277
2278 p = iterator->start(iterator->arg);
2279next:
2280 if (!p || loops++ > sysctl_sched_nr_migrate)
2281 goto out;
2282
2283
2284
2285
2286
2287 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2288 SCHED_LOAD_SCALE_FUZZ;
2289 if ((skip_for_load && p->prio >= *this_best_prio) ||
2290 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2291 p = iterator->next(iterator->arg);
2292 goto next;
2293 }
2294
2295 pull_task(busiest, p, this_rq, this_cpu);
2296 pulled++;
2297 rem_load_move -= p->se.load.weight;
2298
2299
2300
2301
2302 if (rem_load_move > 0) {
2303 if (p->prio < *this_best_prio)
2304 *this_best_prio = p->prio;
2305 p = iterator->next(iterator->arg);
2306 goto next;
2307 }
2308out:
2309
2310
2311
2312
2313
2314 schedstat_add(sd, lb_gained[idle], pulled);
2315
2316 if (all_pinned)
2317 *all_pinned = pinned;
2318
2319 return max_load_move - rem_load_move;
2320}

/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
2329static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2330 unsigned long max_load_move,
2331 struct sched_domain *sd, enum cpu_idle_type idle,
2332 int *all_pinned)
2333{
2334 const struct sched_class *class = sched_class_highest;
2335 unsigned long total_load_moved = 0;
2336 int this_best_prio = this_rq->curr->prio;
2337
2338 do {
2339 total_load_moved +=
2340 class->load_balance(this_rq, this_cpu, busiest,
2341 max_load_move - total_load_moved,
2342 sd, idle, all_pinned, &this_best_prio);
2343 class = class->next;
2344 } while (class && max_load_move > total_load_moved);
2345
2346 return total_load_moved > 0;
2347}
2348
2349static int
2350iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2351 struct sched_domain *sd, enum cpu_idle_type idle,
2352 struct rq_iterator *iterator)
2353{
2354 struct task_struct *p = iterator->start(iterator->arg);
2355 int pinned = 0;
2356
2357 while (p) {
2358 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2359 pull_task(busiest, p, this_rq, this_cpu);
2360
2361
2362
2363
2364
2365 schedstat_inc(sd, lb_gained[idle]);
2366
2367 return 1;
2368 }
2369 p = iterator->next(iterator->arg);
2370 }
2371
2372 return 0;
2373}
2374
2375
2376
2377
2378
2379
2380
2381
2382static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2383 struct sched_domain *sd, enum cpu_idle_type idle)
2384{
2385 const struct sched_class *class;
2386
2387 for (class = sched_class_highest; class; class = class->next)
2388 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
2389 return 1;
2390
2391 return 0;
2392}

/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It calculates and returns the amount of weighted load which
 * should be moved to restore balance via the imbalance parameter.
 */
2399static struct sched_group *
2400find_busiest_group(struct sched_domain *sd, int this_cpu,
2401 unsigned long *imbalance, enum cpu_idle_type idle,
2402 int *sd_idle, cpumask_t *cpus, int *balance)
2403{
2404 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2405 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2406 unsigned long max_pull;
2407 unsigned long busiest_load_per_task, busiest_nr_running;
2408 unsigned long this_load_per_task, this_nr_running;
2409 int load_idx, group_imb = 0;
2410#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2411 int power_savings_balance = 1;
2412 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2413 unsigned long min_nr_running = ULONG_MAX;
2414 struct sched_group *group_min = NULL, *group_leader = NULL;
2415#endif
2416
2417 max_load = this_load = total_load = total_pwr = 0;
2418 busiest_load_per_task = busiest_nr_running = 0;
2419 this_load_per_task = this_nr_running = 0;
2420 if (idle == CPU_NOT_IDLE)
2421 load_idx = sd->busy_idx;
2422 else if (idle == CPU_NEWLY_IDLE)
2423 load_idx = sd->newidle_idx;
2424 else
2425 load_idx = sd->idle_idx;
2426
2427 do {
2428 unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
2429 int local_group;
2430 int i;
2431 int __group_imb = 0;
2432 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2433 unsigned long sum_nr_running, sum_weighted_load;
2434
2435 local_group = cpu_isset(this_cpu, group->cpumask);
2436
2437 if (local_group)
2438 balance_cpu = first_cpu(group->cpumask);
2439
2440
2441 sum_weighted_load = sum_nr_running = avg_load = 0;
2442 max_cpu_load = 0;
2443 min_cpu_load = ~0UL;
2444
2445 for_each_cpu_mask(i, group->cpumask) {
2446 struct rq *rq;
2447
2448 if (!cpu_isset(i, *cpus))
2449 continue;
2450
2451 rq = cpu_rq(i);
2452
2453 if (*sd_idle && rq->nr_running)
2454 *sd_idle = 0;
2455
2456
2457 if (local_group) {
2458 if (idle_cpu(i) && !first_idle_cpu) {
2459 first_idle_cpu = 1;
2460 balance_cpu = i;
2461 }
2462
2463 load = target_load(i, load_idx);
2464 } else {
2465 load = source_load(i, load_idx);
2466 if (load > max_cpu_load)
2467 max_cpu_load = load;
2468 if (min_cpu_load > load)
2469 min_cpu_load = load;
2470 }
2471
2472 avg_load += load;
2473 sum_nr_running += rq->nr_running;
2474 sum_weighted_load += weighted_cpuload(i);
2475 }
2476
2477
2478
2479
2480
2481
2482
2483 if (idle != CPU_NEWLY_IDLE && local_group &&
2484 balance_cpu != this_cpu && balance) {
2485 *balance = 0;
2486 goto ret;
2487 }
2488
2489 total_load += avg_load;
2490 total_pwr += group->__cpu_power;
2491
2492
2493 avg_load = sg_div_cpu_power(group,
2494 avg_load * SCHED_LOAD_SCALE);
2495
2496 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
2497 __group_imb = 1;
2498
2499 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2500
2501 if (local_group) {
2502 this_load = avg_load;
2503 this = group;
2504 this_nr_running = sum_nr_running;
2505 this_load_per_task = sum_weighted_load;
2506 } else if (avg_load > max_load &&
2507 (sum_nr_running > group_capacity || __group_imb)) {
2508 max_load = avg_load;
2509 busiest = group;
2510 busiest_nr_running = sum_nr_running;
2511 busiest_load_per_task = sum_weighted_load;
2512 group_imb = __group_imb;
2513 }
2514
2515#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2516
2517
2518
2519
2520 if (idle == CPU_NOT_IDLE ||
2521 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2522 goto group_next;
2523
2524
2525
2526
2527
2528 if (local_group && (this_nr_running >= group_capacity ||
2529 !this_nr_running))
2530 power_savings_balance = 0;
2531
2532
2533
2534
2535
2536 if (!power_savings_balance || sum_nr_running >= group_capacity
2537 || !sum_nr_running)
2538 goto group_next;
2539
2540
2541
2542
2543
2544
2545 if ((sum_nr_running < min_nr_running) ||
2546 (sum_nr_running == min_nr_running &&
2547 first_cpu(group->cpumask) <
2548 first_cpu(group_min->cpumask))) {
2549 group_min = group;
2550 min_nr_running = sum_nr_running;
2551 min_load_per_task = sum_weighted_load /
2552 sum_nr_running;
2553 }
2554
2555
2556
2557
2558
2559
2560 if (sum_nr_running <= group_capacity - 1) {
2561 if (sum_nr_running > leader_nr_running ||
2562 (sum_nr_running == leader_nr_running &&
2563 first_cpu(group->cpumask) >
2564 first_cpu(group_leader->cpumask))) {
2565 group_leader = group;
2566 leader_nr_running = sum_nr_running;
2567 }
2568 }
2569group_next:
2570#endif
2571 group = group->next;
2572 } while (group != sd->groups);
2573
2574 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2575 goto out_balanced;
2576
2577 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2578
2579 if (this_load >= avg_load ||
2580 100*max_load <= sd->imbalance_pct*this_load)
2581 goto out_balanced;
2582
2583 busiest_load_per_task /= busiest_nr_running;
2584 if (group_imb)
2585 busiest_load_per_task = min(busiest_load_per_task, avg_load);
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598 if (max_load <= busiest_load_per_task)
2599 goto out_balanced;
2600
2601
2602
2603
2604
2605
2606 if (max_load < avg_load) {
2607 *imbalance = 0;
2608 goto small_imbalance;
2609 }
2610
2611
2612 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2613
2614
2615 *imbalance = min(max_pull * busiest->__cpu_power,
2616 (avg_load - this_load) * this->__cpu_power)
2617 / SCHED_LOAD_SCALE;
2618
2619
2620
2621
2622
2623
2624
2625 if (*imbalance < busiest_load_per_task) {
2626 unsigned long tmp, pwr_now, pwr_move;
2627 unsigned int imbn;
2628
2629small_imbalance:
2630 pwr_move = pwr_now = 0;
2631 imbn = 2;
2632 if (this_nr_running) {
2633 this_load_per_task /= this_nr_running;
2634 if (busiest_load_per_task > this_load_per_task)
2635 imbn = 1;
2636 } else
2637 this_load_per_task = SCHED_LOAD_SCALE;
2638
2639 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2640 busiest_load_per_task * imbn) {
2641 *imbalance = busiest_load_per_task;
2642 return busiest;
2643 }
2644
2645
2646
2647
2648
2649
2650
2651 pwr_now += busiest->__cpu_power *
2652 min(busiest_load_per_task, max_load);
2653 pwr_now += this->__cpu_power *
2654 min(this_load_per_task, this_load);
2655 pwr_now /= SCHED_LOAD_SCALE;
2656
2657
2658 tmp = sg_div_cpu_power(busiest,
2659 busiest_load_per_task * SCHED_LOAD_SCALE);
2660 if (max_load > tmp)
2661 pwr_move += busiest->__cpu_power *
2662 min(busiest_load_per_task, max_load - tmp);
2663
2664
2665 if (max_load * busiest->__cpu_power <
2666 busiest_load_per_task * SCHED_LOAD_SCALE)
2667 tmp = sg_div_cpu_power(this,
2668 max_load * busiest->__cpu_power);
2669 else
2670 tmp = sg_div_cpu_power(this,
2671 busiest_load_per_task * SCHED_LOAD_SCALE);
2672 pwr_move += this->__cpu_power *
2673 min(this_load_per_task, this_load + tmp);
2674 pwr_move /= SCHED_LOAD_SCALE;
2675
2676
2677 if (pwr_move > pwr_now)
2678 *imbalance = busiest_load_per_task;
2679 }
2680
2681 return busiest;
2682
2683out_balanced:
2684#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2685 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2686 goto ret;
2687
2688 if (this == group_leader && group_leader != group_min) {
2689 *imbalance = min_load_per_task;
2690 return group_min;
2691 }
2692#endif
2693ret:
2694 *imbalance = 0;
2695 return NULL;
2696}

/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
2701static struct rq *
2702find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2703 unsigned long imbalance, cpumask_t *cpus)
2704{
2705 struct rq *busiest = NULL, *rq;
2706 unsigned long max_load = 0;
2707 int i;
2708
2709 for_each_cpu_mask(i, group->cpumask) {
2710 unsigned long wl;
2711
2712 if (!cpu_isset(i, *cpus))
2713 continue;
2714
2715 rq = cpu_rq(i);
2716 wl = weighted_cpuload(i);
2717
2718 if (rq->nr_running == 1 && wl > imbalance)
2719 continue;
2720
2721 if (wl > max_load) {
2722 max_load = wl;
2723 busiest = rq;
2724 }
2725 }
2726
2727 return busiest;
2728}
2729
2730
2731
2732
2733
2734#define MAX_PINNED_INTERVAL 512

/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
2740static int load_balance(int this_cpu, struct rq *this_rq,
2741 struct sched_domain *sd, enum cpu_idle_type idle,
2742 int *balance)
2743{
2744 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2745 struct sched_group *group;
2746 unsigned long imbalance;
2747 struct rq *busiest;
2748 cpumask_t cpus = CPU_MASK_ALL;
2749 unsigned long flags;
2750
2751
2752
2753
2754
2755
2756
2757 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2758 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2759 sd_idle = 1;
2760
2761 schedstat_inc(sd, lb_count[idle]);
2762
2763redo:
2764 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2765 &cpus, balance);
2766
2767 if (*balance == 0)
2768 goto out_balanced;
2769
2770 if (!group) {
2771 schedstat_inc(sd, lb_nobusyg[idle]);
2772 goto out_balanced;
2773 }
2774
2775 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2776 if (!busiest) {
2777 schedstat_inc(sd, lb_nobusyq[idle]);
2778 goto out_balanced;
2779 }
2780
2781 BUG_ON(busiest == this_rq);
2782
2783 schedstat_add(sd, lb_imbalance[idle], imbalance);
2784
2785 ld_moved = 0;
2786 if (busiest->nr_running > 1) {
2787
2788
2789
2790
2791
2792
2793 local_irq_save(flags);
2794 double_rq_lock(this_rq, busiest);
2795 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2796 imbalance, sd, idle, &all_pinned);
2797 double_rq_unlock(this_rq, busiest);
2798 local_irq_restore(flags);
2799
2800
2801
2802
2803 if (ld_moved && this_cpu != smp_processor_id())
2804 resched_cpu(this_cpu);
2805
2806
2807 if (unlikely(all_pinned)) {
2808 cpu_clear(cpu_of(busiest), cpus);
2809 if (!cpus_empty(cpus))
2810 goto redo;
2811 goto out_balanced;
2812 }
2813 }
2814
2815 if (!ld_moved) {
2816 schedstat_inc(sd, lb_failed[idle]);
2817 sd->nr_balance_failed++;
2818
2819 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2820
2821 spin_lock_irqsave(&busiest->lock, flags);
2822
2823
2824
2825
2826 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2827 spin_unlock_irqrestore(&busiest->lock, flags);
2828 all_pinned = 1;
2829 goto out_one_pinned;
2830 }
2831
2832 if (!busiest->active_balance) {
2833 busiest->active_balance = 1;
2834 busiest->push_cpu = this_cpu;
2835 active_balance = 1;
2836 }
2837 spin_unlock_irqrestore(&busiest->lock, flags);
2838 if (active_balance)
2839 wake_up_process(busiest->migration_thread);
2840
2841
2842
2843
2844
2845 sd->nr_balance_failed = sd->cache_nice_tries+1;
2846 }
2847 } else
2848 sd->nr_balance_failed = 0;
2849
2850 if (likely(!active_balance)) {
2851
2852 sd->balance_interval = sd->min_interval;
2853 } else {
2854
2855
2856
2857
2858
2859
2860 if (sd->balance_interval < sd->max_interval)
2861 sd->balance_interval *= 2;
2862 }
2863
2864 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2865 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2866 return -1;
2867 return ld_moved;
2868
2869out_balanced:
2870 schedstat_inc(sd, lb_balanced[idle]);
2871
2872 sd->nr_balance_failed = 0;
2873
2874out_one_pinned:
2875
2876 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2877 (sd->balance_interval < sd->max_interval))
2878 sd->balance_interval *= 2;
2879
2880 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2881 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2882 return -1;
2883 return 0;
2884}

/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
 * Called from schedule when this_rq is about to become idle
 * (CPU_NEWLY_IDLE). this_rq is locked.
 */
2893static int
2894load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2895{
2896 struct sched_group *group;
2897 struct rq *busiest = NULL;
2898 unsigned long imbalance;
2899 int ld_moved = 0;
2900 int sd_idle = 0;
2901 int all_pinned = 0;
2902 cpumask_t cpus = CPU_MASK_ALL;
2903
2904
2905
2906
2907
2908
2909
2910 if (sd->flags & SD_SHARE_CPUPOWER &&
2911 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2912 sd_idle = 1;
2913
2914 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
2915redo:
2916 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2917 &sd_idle, &cpus, NULL);
2918 if (!group) {
2919 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2920 goto out_balanced;
2921 }
2922
2923 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2924 &cpus);
2925 if (!busiest) {
2926 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2927 goto out_balanced;
2928 }
2929
2930 BUG_ON(busiest == this_rq);
2931
2932 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2933
2934 ld_moved = 0;
2935 if (busiest->nr_running > 1) {
2936
2937 double_lock_balance(this_rq, busiest);
2938
2939 update_rq_clock(busiest);
2940 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2941 imbalance, sd, CPU_NEWLY_IDLE,
2942 &all_pinned);
2943 spin_unlock(&busiest->lock);
2944
2945 if (unlikely(all_pinned)) {
2946 cpu_clear(cpu_of(busiest), cpus);
2947 if (!cpus_empty(cpus))
2948 goto redo;
2949 }
2950 }
2951
2952 if (!ld_moved) {
2953 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2954 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2955 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2956 return -1;
2957 } else
2958 sd->nr_balance_failed = 0;
2959
2960 return ld_moved;
2961
2962out_balanced:
2963 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2964 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2965 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2966 return -1;
2967 sd->nr_balance_failed = 0;
2968
2969 return 0;
2970}
2971
2972
2973
2974
2975
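/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. It attempts to pull tasks from other CPUs.
 */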
2976static void idle_balance(int this_cpu, struct rq *this_rq)
2977{
2978 struct sched_domain *sd;
2979 int pulled_task = -1;
2980 unsigned long next_balance = jiffies + HZ;
2981
2982 for_each_domain(this_cpu, sd) {
2983 unsigned long interval;
2984
2985 if (!(sd->flags & SD_LOAD_BALANCE))
2986 continue;
2987
2988 if (sd->flags & SD_BALANCE_NEWIDLE)
2989
2990 pulled_task = load_balance_newidle(this_cpu,
2991 this_rq, sd);
2992
2993 interval = msecs_to_jiffies(sd->balance_interval);
2994 if (time_after(next_balance, sd->last_balance + interval))
2995 next_balance = sd->last_balance + interval;
2996 if (pulled_task)
2997 break;
2998 }
2999 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3000
3001
3002
3003
3004 this_rq->next_balance = next_balance;
3005 }
3006}
3007
3008
3009
3010
3011
3012
3013
3014
3015
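/*
 * active_load_balance is run by migration threads. It pushes a running
 * task off the busiest CPU onto the CPU that requested active balancing
 * (busiest_rq->push_cpu), as set up by a failed load_balance() pass.
 *
 * Called with busiest_rq locked.
 */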
3016static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3017{
3018 int target_cpu = busiest_rq->push_cpu;
3019 struct sched_domain *sd;
3020 struct rq *target_rq;
3021
3022
3023 if (busiest_rq->nr_running <= 1)
3024 return;
3025
3026 target_rq = cpu_rq(target_cpu);
3027
3028
3029
3030
3031
3032
3033 BUG_ON(busiest_rq == target_rq);
3034
3035
3036 double_lock_balance(busiest_rq, target_rq);
3037 update_rq_clock(busiest_rq);
3038 update_rq_clock(target_rq);
3039
3040
3041 for_each_domain(target_cpu, sd) {
3042 if ((sd->flags & SD_LOAD_BALANCE) &&
3043 cpu_isset(busiest_cpu, sd->span))
3044 break;
3045 }
3046
3047 if (likely(sd)) {
3048 schedstat_inc(sd, alb_count);
3049
3050 if (move_one_task(target_rq, target_cpu, busiest_rq,
3051 sd, CPU_IDLE))
3052 schedstat_inc(sd, alb_pushed);
3053 else
3054 schedstat_inc(sd, alb_failed);
3055 }
3056 spin_unlock(&target_rq->lock);
3057}
3058
3059#ifdef CONFIG_NO_HZ
3060static struct {
3061 atomic_t load_balancer;
3062 cpumask_t cpu_mask;
3063} nohz ____cacheline_aligned = {
3064 .load_balancer = ATOMIC_INIT(-1),
3065 .cpu_mask = CPU_MASK_NONE,
3066};
3067
3068
/*
 * In CONFIG_NO_HZ mode, one of the idle, tickless CPUs is nominated as
 * the idle load balancer (ilb) owner: it keeps its tick running and does
 * the periodic idle load balancing on behalf of all the other idle CPUs
 * in nohz.cpu_mask. If every online CPU goes tickless-idle, the owner is
 * dropped and idle load balancing stops entirely until a CPU gets busy
 * again.
 *
 * Called from the tick-stop/restart path:
 *   stop_tick != 0 - this CPU is entering tickless idle
 *   stop_tick == 0 - this CPU is leaving tickless idle
 *
 * Returns 1 if this CPU is (or just became) the ilb owner and should
 * therefore keep its tick alive, 0 otherwise.
 */
3088int select_nohz_load_balancer(int stop_tick)
3089{
3090 int cpu = smp_processor_id();
3091
3092 if (stop_tick) {
3093 cpu_set(cpu, nohz.cpu_mask);
3094 cpu_rq(cpu)->in_nohz_recently = 1;
3095
3096
3097
3098
3099 if (cpu_is_offline(cpu) &&
3100 atomic_read(&nohz.load_balancer) == cpu) {
3101 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3102 BUG();
3103 return 0;
3104 }
3105
3106
3107 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3108 if (atomic_read(&nohz.load_balancer) == cpu)
3109 atomic_set(&nohz.load_balancer, -1);
3110 return 0;
3111 }
3112
3113 if (atomic_read(&nohz.load_balancer) == -1) {
3114
3115 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3116 return 1;
3117 } else if (atomic_read(&nohz.load_balancer) == cpu)
3118 return 1;
3119 } else {
3120 if (!cpu_isset(cpu, nohz.cpu_mask))
3121 return 0;
3122
3123 cpu_clear(cpu, nohz.cpu_mask);
3124
3125 if (atomic_read(&nohz.load_balancer) == cpu)
3126 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3127 BUG();
3128 }
3129 return 0;
3130}
3131#endif
3132
3133static DEFINE_SPINLOCK(balancing);
3134
3135
3136
3137
3138
3139
3140
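/*
 * Walk the scheduling-domain hierarchy of @cpu, check each domain to see
 * whether it is due to be balanced, and initiate a load_balance() pass
 * if so. rq->next_balance is updated to the earliest next balance point
 * found across the domains.
 */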
3141static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3142{
3143 int balance = 1;
3144 struct rq *rq = cpu_rq(cpu);
3145 unsigned long interval;
3146 struct sched_domain *sd;
3147
3148 unsigned long next_balance = jiffies + 60*HZ;
3149 int update_next_balance = 0;
3150
3151 for_each_domain(cpu, sd) {
3152 if (!(sd->flags & SD_LOAD_BALANCE))
3153 continue;
3154
3155 interval = sd->balance_interval;
3156 if (idle != CPU_IDLE)
3157 interval *= sd->busy_factor;
3158
3159
3160 interval = msecs_to_jiffies(interval);
3161 if (unlikely(!interval))
3162 interval = 1;
3163 if (interval > HZ*NR_CPUS/10)
3164 interval = HZ*NR_CPUS/10;
3165
3166
3167 if (sd->flags & SD_SERIALIZE) {
3168 if (!spin_trylock(&balancing))
3169 goto out;
3170 }
3171
3172 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3173 if (load_balance(cpu, rq, sd, idle, &balance)) {
3174
3175
3176
3177
3178
3179 idle = CPU_NOT_IDLE;
3180 }
3181 sd->last_balance = jiffies;
3182 }
3183 if (sd->flags & SD_SERIALIZE)
3184 spin_unlock(&balancing);
3185out:
3186 if (time_after(next_balance, sd->last_balance + interval)) {
3187 next_balance = sd->last_balance + interval;
3188 update_next_balance = 1;
3189 }
3190
3191
3192
3193
3194
3195
3196 if (!balance)
3197 break;
3198 }
3199
3200
3201
3202
3203
3204
3205 if (likely(update_next_balance))
3206 rq->next_balance = next_balance;
3207}
3208
3209
3210
3211
3212
3213
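/*
 * run_rebalance_domains is the SCHED_SOFTIRQ handler, raised from
 * trigger_load_balance() in the scheduler tick. In CONFIG_NO_HZ mode the
 * idle load balancer additionally rebalances on behalf of all the other
 * idle, tickless CPUs.
 */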
3214static void run_rebalance_domains(struct softirq_action *h)
3215{
3216 int this_cpu = smp_processor_id();
3217 struct rq *this_rq = cpu_rq(this_cpu);
3218 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3219 CPU_IDLE : CPU_NOT_IDLE;
3220
3221 rebalance_domains(this_cpu, idle);
3222
3223#ifdef CONFIG_NO_HZ
3224
3225
3226
3227
3228
3229 if (this_rq->idle_at_tick &&
3230 atomic_read(&nohz.load_balancer) == this_cpu) {
3231 cpumask_t cpus = nohz.cpu_mask;
3232 struct rq *rq;
3233 int balance_cpu;
3234
3235 cpu_clear(this_cpu, cpus);
3236 for_each_cpu_mask(balance_cpu, cpus) {
3237
3238
3239
3240
3241
3242 if (need_resched())
3243 break;
3244
3245 rebalance_domains(balance_cpu, CPU_IDLE);
3246
3247 rq = cpu_rq(balance_cpu);
3248 if (time_after(this_rq->next_balance, rq->next_balance))
3249 this_rq->next_balance = rq->next_balance;
3250 }
3251 }
3252#endif
3253}
3254
3255
3256
3257
3258
3259
3260
3261
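/*
 * Raise the SCHED_SOFTIRQ when this runqueue's periodic balance is due.
 * In CONFIG_NO_HZ mode this is also where a CPU that has just become
 * busy gives up idle-load-balancer ownership (kicking another idle CPU
 * to take over), and where an idle CPU that is already covered by the
 * idle load balancer skips raising the softirq itself.
 */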
3262static inline void trigger_load_balance(struct rq *rq, int cpu)
3263{
3264#ifdef CONFIG_NO_HZ
3265
3266
3267
3268
3269
3270 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3271 rq->in_nohz_recently = 0;
3272
3273 if (atomic_read(&nohz.load_balancer) == cpu) {
3274 cpu_clear(cpu, nohz.cpu_mask);
3275 atomic_set(&nohz.load_balancer, -1);
3276 }
3277
3278 if (atomic_read(&nohz.load_balancer) == -1) {
3279
3280
3281
3282
3283
3284
3285
3286
3287 int ilb = first_cpu(nohz.cpu_mask);
3288
3289 if (ilb != NR_CPUS)
3290 resched_cpu(ilb);
3291 }
3292 }
3293
3294
3295
3296
3297
3298 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3299 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3300 resched_cpu(cpu);
3301 return;
3302 }
3303
3304
3305
3306
3307
3308 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3309 cpu_isset(cpu, nohz.cpu_mask))
3310 return;
3311#endif
3312 if (time_after_eq(jiffies, rq->next_balance))
3313 raise_softirq(SCHED_SOFTIRQ);
3314}
3315
3316#else
3317
3318
3319
3320
3321static inline void idle_balance(int cpu, struct rq *rq)
3322{
3323}
3324
3325#endif
3326
3327DEFINE_PER_CPU(struct kernel_stat, kstat);
3328
3329EXPORT_PER_CPU_SYMBOL(kstat);
3330
3331
3332
3333
3334
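/*
 * Return the accumulated execution time of @p in nanoseconds, including
 * the not-yet-banked part of the current slice if the task is running.
 */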
3335unsigned long long task_sched_runtime(struct task_struct *p)
3336{
3337 unsigned long flags;
3338 u64 ns, delta_exec;
3339 struct rq *rq;
3340
3341 rq = task_rq_lock(p, &flags);
3342 ns = p->se.sum_exec_runtime;
3343 if (task_current(rq, p)) {
3344 update_rq_clock(rq);
3345 delta_exec = rq->clock - p->se.exec_start;
3346 if ((s64)delta_exec > 0)
3347 ns += delta_exec;
3348 }
3349 task_rq_unlock(rq, &flags);
3350
3351 return ns;
3352}
3353
3354
3355
3356
3357
3358
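/*
 * Account user CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in user space since the last update
 */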
3359void account_user_time(struct task_struct *p, cputime_t cputime)
3360{
3361 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3362 cputime64_t tmp;
3363
3364 p->utime = cputime_add(p->utime, cputime);
3365
3366
3367 tmp = cputime_to_cputime64(cputime);
3368 if (TASK_NICE(p) > 0)
3369 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3370 else
3371 cpustat->user = cputime64_add(cpustat->user, tmp);
3372}
3373
3374
3375
3376
3377
3378
3379static void account_guest_time(struct task_struct *p, cputime_t cputime)
3380{
3381 cputime64_t tmp;
3382 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3383
3384 tmp = cputime_to_cputime64(cputime);
3385
3386 p->utime = cputime_add(p->utime, cputime);
3387 p->gtime = cputime_add(p->gtime, cputime);
3388
3389 cpustat->user = cputime64_add(cpustat->user, tmp);
3390 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3391}
3392
3393
3394
3395
3396
3397
3398void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
3399{
3400 p->utimescaled = cputime_add(p->utimescaled, cputime);
3401}
3402
3403
3404
3405
3406
3407
3408
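/*
 * Account system CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the CPU time spent in kernel space since the last update
 */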
3409void account_system_time(struct task_struct *p, int hardirq_offset,
3410 cputime_t cputime)
3411{
3412 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3413 struct rq *rq = this_rq();
3414 cputime64_t tmp;
3415
3416 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
3417 return account_guest_time(p, cputime);
3418
3419 p->stime = cputime_add(p->stime, cputime);
3420
3421
3422 tmp = cputime_to_cputime64(cputime);
3423 if (hardirq_count() - hardirq_offset)
3424 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3425 else if (softirq_count())
3426 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3427 else if (p != rq->idle)
3428 cpustat->system = cputime64_add(cpustat->system, tmp);
3429 else if (atomic_read(&rq->nr_iowait) > 0)
3430 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3431 else
3432 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3433
3434 acct_update_integrals(p);
3435}
3436
3437
3438
3439
3440
3441
3442
3443void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
3444{
3445 p->stimescaled = cputime_add(p->stimescaled, cputime);
3446}
3447
3448
3449
3450
3451
3452
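/*
 * Account for involuntary wait time.
 * @p: the process from which the CPU time has been stolen
 * @steal: the CPU time spent in involuntary wait
 */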
3453void account_steal_time(struct task_struct *p, cputime_t steal)
3454{
3455 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3456 cputime64_t tmp = cputime_to_cputime64(steal);
3457 struct rq *rq = this_rq();
3458
3459 if (p == rq->idle) {
3460 p->stime = cputime_add(p->stime, steal);
3461 if (atomic_read(&rq->nr_iowait) > 0)
3462 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3463 else
3464 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3465 } else
3466 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3467}
3468
3469
3470
3471
3472
3473
3474
3475
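/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */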
3476void scheduler_tick(void)
3477{
3478 int cpu = smp_processor_id();
3479 struct rq *rq = cpu_rq(cpu);
3480 struct task_struct *curr = rq->curr;
3481 u64 next_tick = rq->tick_timestamp + TICK_NSEC;
3482
3483 spin_lock(&rq->lock);
3484 __update_rq_clock(rq);
3485
3486
3487
3488 if (unlikely(rq->clock < next_tick))
3489 rq->clock = next_tick;
3490 rq->tick_timestamp = rq->clock;
3491 update_cpu_load(rq);
3492 if (curr != rq->idle)
3493 curr->sched_class->task_tick(rq, curr);
3494 spin_unlock(&rq->lock);
3495
3496#ifdef CONFIG_SMP
3497 rq->idle_at_tick = idle_cpu(cpu);
3498 trigger_load_balance(rq, cpu);
3499#endif
3500}
3501
3502#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3503
3504void fastcall add_preempt_count(int val)
3505{
3506
3507
3508
3509 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3510 return;
3511 preempt_count() += val;
3512
3513
3514
3515 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3516 PREEMPT_MASK - 10);
3517}
3518EXPORT_SYMBOL(add_preempt_count);
3519
3520void fastcall sub_preempt_count(int val)
3521{
3522
3523
3524
3525 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3526 return;
3527
3528
3529
3530 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3531 !(preempt_count() & PREEMPT_MASK)))
3532 return;
3533
3534 preempt_count() -= val;
3535}
3536EXPORT_SYMBOL(sub_preempt_count);
3537
3538#endif
3539
3540
3541
3542
3543static noinline void __schedule_bug(struct task_struct *prev)
3544{
3545 struct pt_regs *regs = get_irq_regs();
3546
3547 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3548 prev->comm, prev->pid, preempt_count());
3549
3550 debug_show_held_locks(prev);
3551 if (irqs_disabled())
3552 print_irqtrace_events(prev);
3553
3554 if (regs)
3555 show_regs(regs);
3556 else
3557 dump_stack();
3558}
3559
3560
3561
3562
3563static inline void schedule_debug(struct task_struct *prev)
3564{
3565
3566
3567
3568
3569
3570 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3571 __schedule_bug(prev);
3572
3573 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3574
3575 schedstat_inc(this_rq(), sched_count);
3576#ifdef CONFIG_SCHEDSTATS
3577 if (unlikely(prev->lock_depth >= 0)) {
3578 schedstat_inc(this_rq(), bkl_count);
3579 schedstat_inc(prev, sched_info.bkl_count);
3580 }
3581#endif
3582}
3583
3584
3585
3586
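/*
 * Pick up the highest-prio task:
 */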
3587static inline struct task_struct *
3588pick_next_task(struct rq *rq, struct task_struct *prev)
3589{
3590 const struct sched_class *class;
3591 struct task_struct *p;
3592
3593
3594
3595
3596
3597 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3598 p = fair_sched_class.pick_next_task(rq);
3599 if (likely(p))
3600 return p;
3601 }
3602
3603 class = sched_class_highest;
3604 for ( ; ; ) {
3605 p = class->pick_next_task(rq);
3606 if (p)
3607 return p;
3608
3609
3610
3611
3612 class = class->next;
3613 }
3614}
3615
3616
3617
3618
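/*
 * schedule() is the main scheduler function.
 */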
3619asmlinkage void __sched schedule(void)
3620{
3621 struct task_struct *prev, *next;
3622 long *switch_count;
3623 struct rq *rq;
3624 int cpu;
3625
3626need_resched:
3627 preempt_disable();
3628 cpu = smp_processor_id();
3629 rq = cpu_rq(cpu);
3630 rcu_qsctr_inc(cpu);
3631 prev = rq->curr;
3632 switch_count = &prev->nivcsw;
3633
3634 release_kernel_lock(prev);
3635need_resched_nonpreemptible:
3636
3637 schedule_debug(prev);
3638
3639
3640
3641
3642 local_irq_disable();
3643 __update_rq_clock(rq);
3644 spin_lock(&rq->lock);
3645 clear_tsk_need_resched(prev);
3646
3647 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3648 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3649 unlikely(signal_pending(prev)))) {
3650 prev->state = TASK_RUNNING;
3651 } else {
3652 deactivate_task(rq, prev, 1);
3653 }
3654 switch_count = &prev->nvcsw;
3655 }
3656
3657 if (unlikely(!rq->nr_running))
3658 idle_balance(cpu, rq);
3659
3660 prev->sched_class->put_prev_task(rq, prev);
3661 next = pick_next_task(rq, prev);
3662
3663 sched_info_switch(prev, next);
3664
3665 if (likely(prev != next)) {
3666 rq->nr_switches++;
3667 rq->curr = next;
3668 ++*switch_count;
3669
3670 context_switch(rq, prev, next);
3671 } else
3672 spin_unlock_irq(&rq->lock);
3673
3674 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3675 cpu = smp_processor_id();
3676 rq = cpu_rq(cpu);
3677 goto need_resched_nonpreemptible;
3678 }
3679 preempt_enable_no_resched();
3680 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3681 goto need_resched;
3682}
3683EXPORT_SYMBOL(schedule);
3684
3685#ifdef CONFIG_PREEMPT
3686
3687
3688
3689
3690
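/*
 * This is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */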
3691asmlinkage void __sched preempt_schedule(void)
3692{
3693 struct thread_info *ti = current_thread_info();
3694#ifdef CONFIG_PREEMPT_BKL
3695 struct task_struct *task = current;
3696 int saved_lock_depth;
3697#endif
3698
3699
3700
3701
3702 if (likely(ti->preempt_count || irqs_disabled()))
3703 return;
3704
3705 do {
3706 add_preempt_count(PREEMPT_ACTIVE);
3707
3708
3709
3710
3711
3712
3713#ifdef CONFIG_PREEMPT_BKL
3714 saved_lock_depth = task->lock_depth;
3715 task->lock_depth = -1;
3716#endif
3717 schedule();
3718#ifdef CONFIG_PREEMPT_BKL
3719 task->lock_depth = saved_lock_depth;
3720#endif
3721 sub_preempt_count(PREEMPT_ACTIVE);
3722
3723
3724
3725
3726
3727 barrier();
3728 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
3729}
3730EXPORT_SYMBOL(preempt_schedule);
3731
3732
3733
3734
3735
3736
3737
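/*
 * This is the entry point to schedule() from kernel preemption off of
 * irq context. It is called and returns with irqs disabled, which
 * protects against recursive entry from interrupts.
 */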
3738asmlinkage void __sched preempt_schedule_irq(void)
3739{
3740 struct thread_info *ti = current_thread_info();
3741#ifdef CONFIG_PREEMPT_BKL
3742 struct task_struct *task = current;
3743 int saved_lock_depth;
3744#endif
3745
3746 BUG_ON(ti->preempt_count || !irqs_disabled());
3747
3748 do {
3749 add_preempt_count(PREEMPT_ACTIVE);
3750
3751
3752
3753
3754
3755
3756#ifdef CONFIG_PREEMPT_BKL
3757 saved_lock_depth = task->lock_depth;
3758 task->lock_depth = -1;
3759#endif
3760 local_irq_enable();
3761 schedule();
3762 local_irq_disable();
3763#ifdef CONFIG_PREEMPT_BKL
3764 task->lock_depth = saved_lock_depth;
3765#endif
3766 sub_preempt_count(PREEMPT_ACTIVE);
3767
3768
3769
3770
3771
3772 barrier();
3773 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
3774}
3775
3776#endif
3777
3778int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3779 void *key)
3780{
3781 return try_to_wake_up(curr->private, mode, sync);
3782}
3783EXPORT_SYMBOL(default_wake_function);
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
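/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0)
 * just wake everything up. If it's an exclusive wakeup (nr_exclusive ==
 * small +ve number) then we wake at most that many exclusive tasks, plus
 * the non-exclusive tasks queued before them (exclusive waiters are
 * normally added at the tail of the list).
 */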
3794static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3795 int nr_exclusive, int sync, void *key)
3796{
3797 wait_queue_t *curr, *next;
3798
3799 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3800 unsigned flags = curr->flags;
3801
3802 if (curr->func(curr, mode, sync, key) &&
3803 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3804 break;
3805 }
3806}
3807
3808
3809
3810
3811
3812
3813
3814
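/**
 * __wake_up - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: is directly passed to the wakeup function
 */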
3815void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3816 int nr_exclusive, void *key)
3817{
3818 unsigned long flags;
3819
3820 spin_lock_irqsave(&q->lock, flags);
3821 __wake_up_common(q, mode, nr_exclusive, 0, key);
3822 spin_unlock_irqrestore(&q->lock, flags);
3823}
3824EXPORT_SYMBOL(__wake_up);
3825
3826
3827
3828
3829void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3830{
3831 __wake_up_common(q, mode, 1, 0, NULL);
3832}
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847void fastcall
3848__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3849{
3850 unsigned long flags;
3851 int sync = 1;
3852
3853 if (unlikely(!q))
3854 return;
3855
3856 if (unlikely(!nr_exclusive))
3857 sync = 0;
3858
3859 spin_lock_irqsave(&q->lock, flags);
3860 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3861 spin_unlock_irqrestore(&q->lock, flags);
3862}
3863EXPORT_SYMBOL_GPL(__wake_up_sync);
3864
3865void complete(struct completion *x)
3866{
3867 unsigned long flags;
3868
3869 spin_lock_irqsave(&x->wait.lock, flags);
3870 x->done++;
3871 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3872 1, 0, NULL);
3873 spin_unlock_irqrestore(&x->wait.lock, flags);
3874}
3875EXPORT_SYMBOL(complete);
3876
3877void complete_all(struct completion *x)
3878{
3879 unsigned long flags;
3880
3881 spin_lock_irqsave(&x->wait.lock, flags);
3882 x->done += UINT_MAX/2;
3883 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3884 0, 0, NULL);
3885 spin_unlock_irqrestore(&x->wait.lock, flags);
3886}
3887EXPORT_SYMBOL(complete_all);
3888
3889static inline long __sched
3890do_wait_for_common(struct completion *x, long timeout, int state)
3891{
3892 if (!x->done) {
3893 DECLARE_WAITQUEUE(wait, current);
3894
3895 wait.flags |= WQ_FLAG_EXCLUSIVE;
3896 __add_wait_queue_tail(&x->wait, &wait);
3897 do {
3898 if (state == TASK_INTERRUPTIBLE &&
3899 signal_pending(current)) {
3900 __remove_wait_queue(&x->wait, &wait);
3901 return -ERESTARTSYS;
3902 }
3903 __set_current_state(state);
3904 spin_unlock_irq(&x->wait.lock);
3905 timeout = schedule_timeout(timeout);
3906 spin_lock_irq(&x->wait.lock);
3907 if (!timeout) {
3908 __remove_wait_queue(&x->wait, &wait);
3909 return timeout;
3910 }
3911 } while (!x->done);
3912 __remove_wait_queue(&x->wait, &wait);
3913 }
3914 x->done--;
3915 return timeout;
3916}
3917
3918static long __sched
3919wait_for_common(struct completion *x, long timeout, int state)
3920{
3921 might_sleep();
3922
3923 spin_lock_irq(&x->wait.lock);
3924 timeout = do_wait_for_common(x, timeout, state);
3925 spin_unlock_irq(&x->wait.lock);
3926 return timeout;
3927}
3928
3929void __sched wait_for_completion(struct completion *x)
3930{
3931 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3932}
3933EXPORT_SYMBOL(wait_for_completion);
3934
3935unsigned long __sched
3936wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3937{
3938 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3939}
3940EXPORT_SYMBOL(wait_for_completion_timeout);
3941
3942int __sched wait_for_completion_interruptible(struct completion *x)
3943{
3944 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3945 if (t == -ERESTARTSYS)
3946 return t;
3947 return 0;
3948}
3949EXPORT_SYMBOL(wait_for_completion_interruptible);
3950
3951unsigned long __sched
3952wait_for_completion_interruptible_timeout(struct completion *x,
3953 unsigned long timeout)
3954{
3955 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3956}
3957EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3958
3959static long __sched
3960sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3961{
3962 unsigned long flags;
3963 wait_queue_t wait;
3964
3965 init_waitqueue_entry(&wait, current);
3966
3967 __set_current_state(state);
3968
3969 spin_lock_irqsave(&q->lock, flags);
3970 __add_wait_queue(q, &wait);
3971 spin_unlock(&q->lock);
3972 timeout = schedule_timeout(timeout);
3973 spin_lock_irq(&q->lock);
3974 __remove_wait_queue(q, &wait);
3975 spin_unlock_irqrestore(&q->lock, flags);
3976
3977 return timeout;
3978}
3979
3980void __sched interruptible_sleep_on(wait_queue_head_t *q)
3981{
3982 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3983}
3984EXPORT_SYMBOL(interruptible_sleep_on);
3985
3986long __sched
3987interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3988{
3989 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3990}
3991EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3992
3993void __sched sleep_on(wait_queue_head_t *q)
3994{
3995 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3996}
3997EXPORT_SYMBOL(sleep_on);
3998
3999long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4000{
4001 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4002}
4003EXPORT_SYMBOL(sleep_on_timeout);
4004
4005#ifdef CONFIG_RT_MUTEXES
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
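/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */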
4017void rt_mutex_setprio(struct task_struct *p, int prio)
4018{
4019 unsigned long flags;
4020 int oldprio, on_rq, running;
4021 struct rq *rq;
4022
4023 BUG_ON(prio < 0 || prio > MAX_PRIO);
4024
4025 rq = task_rq_lock(p, &flags);
4026 update_rq_clock(rq);
4027
4028 oldprio = p->prio;
4029 on_rq = p->se.on_rq;
4030 running = task_current(rq, p);
4031 if (on_rq) {
4032 dequeue_task(rq, p, 0);
4033 if (running)
4034 p->sched_class->put_prev_task(rq, p);
4035 }
4036
4037 if (rt_prio(prio))
4038 p->sched_class = &rt_sched_class;
4039 else
4040 p->sched_class = &fair_sched_class;
4041
4042 p->prio = prio;
4043
4044 if (on_rq) {
4045 if (running)
4046 p->sched_class->set_curr_task(rq);
4047 enqueue_task(rq, p, 0);
4048
4049
4050
4051
4052
4053 if (running) {
4054 if (p->prio > oldprio)
4055 resched_task(rq->curr);
4056 } else {
4057 check_preempt_curr(rq, p);
4058 }
4059 }
4060 task_rq_unlock(rq, &flags);
4061}
4062
4063#endif
4064
4065void set_user_nice(struct task_struct *p, long nice)
4066{
4067 int old_prio, delta, on_rq;
4068 unsigned long flags;
4069 struct rq *rq;
4070
4071 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4072 return;
4073
4074
4075
4076
4077 rq = task_rq_lock(p, &flags);
4078 update_rq_clock(rq);
4079
4080
4081
4082
4083
4084
4085 if (task_has_rt_policy(p)) {
4086 p->static_prio = NICE_TO_PRIO(nice);
4087 goto out_unlock;
4088 }
4089 on_rq = p->se.on_rq;
4090 if (on_rq) {
4091 dequeue_task(rq, p, 0);
4092 dec_load(rq, p);
4093 }
4094
4095 p->static_prio = NICE_TO_PRIO(nice);
4096 set_load_weight(p);
4097 old_prio = p->prio;
4098 p->prio = effective_prio(p);
4099 delta = p->prio - old_prio;
4100
4101 if (on_rq) {
4102 enqueue_task(rq, p, 0);
4103 inc_load(rq, p);
4104
4105
4106
4107
4108 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4109 resched_task(rq->curr);
4110 }
4111out_unlock:
4112 task_rq_unlock(rq, &flags);
4113}
4114EXPORT_SYMBOL(set_user_nice);
4115
4116
4117
4118
4119
4120
4121int can_nice(const struct task_struct *p, const int nice)
4122{
4123
4124 int nice_rlim = 20 - nice;
4125
4126 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
4127 capable(CAP_SYS_NICE));
4128}
4129
4130#ifdef __ARCH_WANT_SYS_NICE
4131
4132
4133
4134
4135
4136
4137
4138
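/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */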
4139asmlinkage long sys_nice(int increment)
4140{
4141 long nice, retval;
4142
4143
4144
4145
4146
4147
4148 if (increment < -40)
4149 increment = -40;
4150 if (increment > 40)
4151 increment = 40;
4152
4153 nice = PRIO_TO_NICE(current->static_prio) + increment;
4154 if (nice < -20)
4155 nice = -20;
4156 if (nice > 19)
4157 nice = 19;
4158
4159 if (increment < 0 && !can_nice(current, nice))
4160 return -EPERM;
4161
4162 retval = security_task_setnice(current, nice);
4163 if (retval)
4164 return retval;
4165
4166 set_user_nice(current, nice);
4167 return 0;
4168}
4169
4170#endif
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180int task_prio(const struct task_struct *p)
4181{
4182 return p->prio - MAX_RT_PRIO;
4183}
4184
4185
4186
4187
4188
4189int task_nice(const struct task_struct *p)
4190{
4191 return TASK_NICE(p);
4192}
4193EXPORT_SYMBOL_GPL(task_nice);
4194
4195
4196
4197
4198
4199int idle_cpu(int cpu)
4200{
4201 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4202}
4203
4204
4205
4206
4207
4208struct task_struct *idle_task(int cpu)
4209{
4210 return cpu_rq(cpu)->idle;
4211}
4212
4213
4214
4215
4216
4217static struct task_struct *find_process_by_pid(pid_t pid)
4218{
4219 return pid ? find_task_by_vpid(pid) : current;
4220}
4221
4222
4223static void
4224__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4225{
4226 BUG_ON(p->se.on_rq);
4227
4228 p->policy = policy;
4229 switch (p->policy) {
4230 case SCHED_NORMAL:
4231 case SCHED_BATCH:
4232 case SCHED_IDLE:
4233 p->sched_class = &fair_sched_class;
4234 break;
4235 case SCHED_FIFO:
4236 case SCHED_RR:
4237 p->sched_class = &rt_sched_class;
4238 break;
4239 }
4240
4241 p->rt_priority = prio;
4242 p->normal_prio = normal_prio(p);
4243
4244 p->prio = rt_mutex_getprio(p);
4245 set_load_weight(p);
4246}
4247
4248
4249
4250
4251
4252
4253
4254
4255
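/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * NOTE that the task may be already dead.
 */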
4256int sched_setscheduler(struct task_struct *p, int policy,
4257 struct sched_param *param)
4258{
4259 int retval, oldprio, oldpolicy = -1, on_rq, running;
4260 unsigned long flags;
4261 struct rq *rq;
4262
4263
4264 BUG_ON(in_interrupt());
4265recheck:
4266
4267 if (policy < 0)
4268 policy = oldpolicy = p->policy;
4269 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4270 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4271 policy != SCHED_IDLE)
4272 return -EINVAL;
4273
4274
4275
4276
4277
4278 if (param->sched_priority < 0 ||
4279 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4280 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4281 return -EINVAL;
4282 if (rt_policy(policy) != (param->sched_priority != 0))
4283 return -EINVAL;
4284
4285
4286
4287
4288 if (!capable(CAP_SYS_NICE)) {
4289 if (rt_policy(policy)) {
4290 unsigned long rlim_rtprio;
4291
4292 if (!lock_task_sighand(p, &flags))
4293 return -ESRCH;
4294 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4295 unlock_task_sighand(p, &flags);
4296
4297
4298 if (policy != p->policy && !rlim_rtprio)
4299 return -EPERM;
4300
4301
4302 if (param->sched_priority > p->rt_priority &&
4303 param->sched_priority > rlim_rtprio)
4304 return -EPERM;
4305 }
4306
4307
4308
4309
4310 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4311 return -EPERM;
4312
4313
4314 if ((current->euid != p->euid) &&
4315 (current->euid != p->uid))
4316 return -EPERM;
4317 }
4318
4319 retval = security_task_setscheduler(p, policy, param);
4320 if (retval)
4321 return retval;
4322
4323
4324
4325
4326 spin_lock_irqsave(&p->pi_lock, flags);
4327
4328
4329
4330
4331 rq = __task_rq_lock(p);
4332
4333 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4334 policy = oldpolicy = -1;
4335 __task_rq_unlock(rq);
4336 spin_unlock_irqrestore(&p->pi_lock, flags);
4337 goto recheck;
4338 }
4339 update_rq_clock(rq);
4340 on_rq = p->se.on_rq;
4341 running = task_current(rq, p);
4342 if (on_rq) {
4343 deactivate_task(rq, p, 0);
4344 if (running)
4345 p->sched_class->put_prev_task(rq, p);
4346 }
4347
4348 oldprio = p->prio;
4349 __setscheduler(rq, p, policy, param->sched_priority);
4350
4351 if (on_rq) {
4352 if (running)
4353 p->sched_class->set_curr_task(rq);
4354 activate_task(rq, p, 0);
4355
4356
4357
4358
4359
4360 if (running) {
4361 if (p->prio > oldprio)
4362 resched_task(rq->curr);
4363 } else {
4364 check_preempt_curr(rq, p);
4365 }
4366 }
4367 __task_rq_unlock(rq);
4368 spin_unlock_irqrestore(&p->pi_lock, flags);
4369
4370 rt_mutex_adjust_pi(p);
4371
4372 return 0;
4373}
4374EXPORT_SYMBOL_GPL(sched_setscheduler);
4375
4376static int
4377do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4378{
4379 struct sched_param lparam;
4380 struct task_struct *p;
4381 int retval;
4382
4383 if (!param || pid < 0)
4384 return -EINVAL;
4385 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4386 return -EFAULT;
4387
4388 rcu_read_lock();
4389 retval = -ESRCH;
4390 p = find_process_by_pid(pid);
4391 if (p != NULL)
4392 retval = sched_setscheduler(p, policy, &lparam);
4393 rcu_read_unlock();
4394
4395 return retval;
4396}
4397
4398
4399
4400
4401
4402
4403
4404asmlinkage long
4405sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4406{
4407
4408 if (policy < 0)
4409 return -EINVAL;
4410
4411 return do_sched_setscheduler(pid, policy, param);
4412}
4413
4414
4415
4416
4417
4418
4419asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4420{
4421 return do_sched_setscheduler(pid, -1, param);
4422}
4423
4424
4425
4426
4427
4428asmlinkage long sys_sched_getscheduler(pid_t pid)
4429{
4430 struct task_struct *p;
4431 int retval;
4432
4433 if (pid < 0)
4434 return -EINVAL;
4435
4436 retval = -ESRCH;
4437 read_lock(&tasklist_lock);
4438 p = find_process_by_pid(pid);
4439 if (p) {
4440 retval = security_task_getscheduler(p);
4441 if (!retval)
4442 retval = p->policy;
4443 }
4444 read_unlock(&tasklist_lock);
4445 return retval;
4446}
4447
4448
4449
4450
4451
4452
4453asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4454{
4455 struct sched_param lp;
4456 struct task_struct *p;
4457 int retval;
4458
4459 if (!param || pid < 0)
4460 return -EINVAL;
4461
4462 read_lock(&tasklist_lock);
4463 p = find_process_by_pid(pid);
4464 retval = -ESRCH;
4465 if (!p)
4466 goto out_unlock;
4467
4468 retval = security_task_getscheduler(p);
4469 if (retval)
4470 goto out_unlock;
4471
4472 lp.sched_priority = p->rt_priority;
4473 read_unlock(&tasklist_lock);
4474
4475
4476
4477
4478 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4479
4480 return retval;
4481
4482out_unlock:
4483 read_unlock(&tasklist_lock);
4484 return retval;
4485}
4486
4487long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4488{
4489 cpumask_t cpus_allowed;
4490 struct task_struct *p;
4491 int retval;
4492
4493 mutex_lock(&sched_hotcpu_mutex);
4494 read_lock(&tasklist_lock);
4495
4496 p = find_process_by_pid(pid);
4497 if (!p) {
4498 read_unlock(&tasklist_lock);
4499 mutex_unlock(&sched_hotcpu_mutex);
4500 return -ESRCH;
4501 }
4502
4503
4504
4505
4506
4507
4508 get_task_struct(p);
4509 read_unlock(&tasklist_lock);
4510
4511 retval = -EPERM;
4512 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4513 !capable(CAP_SYS_NICE))
4514 goto out_unlock;
4515
4516 retval = security_task_setscheduler(p, 0, NULL);
4517 if (retval)
4518 goto out_unlock;
4519
4520 cpus_allowed = cpuset_cpus_allowed(p);
4521 cpus_and(new_mask, new_mask, cpus_allowed);
4522 again:
4523 retval = set_cpus_allowed(p, new_mask);
4524
4525 if (!retval) {
4526 cpus_allowed = cpuset_cpus_allowed(p);
4527 if (!cpus_subset(new_mask, cpus_allowed)) {
4528
4529
4530
4531
4532
4533 new_mask = cpus_allowed;
4534 goto again;
4535 }
4536 }
4537out_unlock:
4538 put_task_struct(p);
4539 mutex_unlock(&sched_hotcpu_mutex);
4540 return retval;
4541}
4542
4543static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4544 cpumask_t *new_mask)
4545{
4546 if (len < sizeof(cpumask_t)) {
4547 memset(new_mask, 0, sizeof(cpumask_t));
4548 } else if (len > sizeof(cpumask_t)) {
4549 len = sizeof(cpumask_t);
4550 }
4551 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4552}
4553
4554
4555
4556
4557
4558
4559
4560asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4561 unsigned long __user *user_mask_ptr)
4562{
4563 cpumask_t new_mask;
4564 int retval;
4565
4566 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4567 if (retval)
4568 return retval;
4569
4570 return sched_setaffinity(pid, new_mask);
4571}
4572
4573
4574
4575
4576
4577
4578
4579
4580cpumask_t cpu_present_map __read_mostly;
4581EXPORT_SYMBOL(cpu_present_map);
4582
4583#ifndef CONFIG_SMP
4584cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4585EXPORT_SYMBOL(cpu_online_map);
4586
4587cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4588EXPORT_SYMBOL(cpu_possible_map);
4589#endif
4590
4591long sched_getaffinity(pid_t pid, cpumask_t *mask)
4592{
4593 struct task_struct *p;
4594 int retval;
4595
4596 mutex_lock(&sched_hotcpu_mutex);
4597 read_lock(&tasklist_lock);
4598
4599 retval = -ESRCH;
4600 p = find_process_by_pid(pid);
4601 if (!p)
4602 goto out_unlock;
4603
4604 retval = security_task_getscheduler(p);
4605 if (retval)
4606 goto out_unlock;
4607
4608 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4609
4610out_unlock:
4611 read_unlock(&tasklist_lock);
4612 mutex_unlock(&sched_hotcpu_mutex);
4613
4614 return retval;
4615}
4616
4617
4618
4619
4620
4621
4622
4623asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4624 unsigned long __user *user_mask_ptr)
4625{
4626 int ret;
4627 cpumask_t mask;
4628
4629 if (len < sizeof(cpumask_t))
4630 return -EINVAL;
4631
4632 ret = sched_getaffinity(pid, &mask);
4633 if (ret < 0)
4634 return ret;
4635
4636 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4637 return -EFAULT;
4638
4639 return sizeof(cpumask_t);
4640}
4641
4642
4643
4644
4645
4646
4647
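/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 */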
4648asmlinkage long sys_sched_yield(void)
4649{
4650 struct rq *rq = this_rq_lock();
4651
4652 schedstat_inc(rq, yld_count);
4653 current->sched_class->yield_task(rq);
4654
4655
4656
4657
4658
4659 __release(rq->lock);
4660 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4661 _raw_spin_unlock(&rq->lock);
4662 preempt_enable_no_resched();
4663
4664 schedule();
4665
4666 return 0;
4667}
4668
4669static void __cond_resched(void)
4670{
4671#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4672 __might_sleep(__FILE__, __LINE__);
4673#endif
4674
4675
4676
4677
4678
4679 do {
4680 add_preempt_count(PREEMPT_ACTIVE);
4681 schedule();
4682 sub_preempt_count(PREEMPT_ACTIVE);
4683 } while (need_resched());
4684}
4685
4686int __sched cond_resched(void)
4687{
4688 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4689 system_state == SYSTEM_RUNNING) {
4690 __cond_resched();
4691 return 1;
4692 }
4693 return 0;
4694}
4695EXPORT_SYMBOL(cond_resched);
4696
4697
4698
4699
4700
4701
4702
4703
4704
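/*
 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPT. We do strange
 * low-level operations here to prevent schedule() from being called
 * twice (once via spin_unlock(), once by hand).
 */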
4705int cond_resched_lock(spinlock_t *lock)
4706{
4707 int ret = 0;
4708
4709 if (need_lockbreak(lock)) {
4710 spin_unlock(lock);
4711 cpu_relax();
4712 ret = 1;
4713 spin_lock(lock);
4714 }
4715 if (need_resched() && system_state == SYSTEM_RUNNING) {
4716 spin_release(&lock->dep_map, 1, _THIS_IP_);
4717 _raw_spin_unlock(lock);
4718 preempt_enable_no_resched();
4719 __cond_resched();
4720 ret = 1;
4721 spin_lock(lock);
4722 }
4723 return ret;
4724}
4725EXPORT_SYMBOL(cond_resched_lock);
4726
4727int __sched cond_resched_softirq(void)
4728{
4729 BUG_ON(!in_softirq());
4730
4731 if (need_resched() && system_state == SYSTEM_RUNNING) {
4732 local_bh_enable();
4733 __cond_resched();
4734 local_bh_disable();
4735 return 1;
4736 }
4737 return 0;
4738}
4739EXPORT_SYMBOL(cond_resched_softirq);
4740
4741
4742
4743
4744
4745
4746
4747void __sched yield(void)
4748{
4749 set_current_state(TASK_RUNNING);
4750 sys_sched_yield();
4751}
4752EXPORT_SYMBOL(yield);
4753
4754
4755
4756
4757
4758
4759
4760
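/*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 */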
4761void __sched io_schedule(void)
4762{
4763 struct rq *rq = &__raw_get_cpu_var(runqueues);
4764
4765 delayacct_blkio_start();
4766 atomic_inc(&rq->nr_iowait);
4767 schedule();
4768 atomic_dec(&rq->nr_iowait);
4769 delayacct_blkio_end();
4770}
4771EXPORT_SYMBOL(io_schedule);
4772
4773long __sched io_schedule_timeout(long timeout)
4774{
4775 struct rq *rq = &__raw_get_cpu_var(runqueues);
4776 long ret;
4777
4778 delayacct_blkio_start();
4779 atomic_inc(&rq->nr_iowait);
4780 ret = schedule_timeout(timeout);
4781 atomic_dec(&rq->nr_iowait);
4782 delayacct_blkio_end();
4783 return ret;
4784}
4785
4786
4787
4788
4789
4790
4791
4792
4793asmlinkage long sys_sched_get_priority_max(int policy)
4794{
4795 int ret = -EINVAL;
4796
4797 switch (policy) {
4798 case SCHED_FIFO:
4799 case SCHED_RR:
4800 ret = MAX_USER_RT_PRIO-1;
4801 break;
4802 case SCHED_NORMAL:
4803 case SCHED_BATCH:
4804 case SCHED_IDLE:
4805 ret = 0;
4806 break;
4807 }
4808 return ret;
4809}
4810
4811
4812
4813
4814
4815
4816
4817
4818asmlinkage long sys_sched_get_priority_min(int policy)
4819{
4820 int ret = -EINVAL;
4821
4822 switch (policy) {
4823 case SCHED_FIFO:
4824 case SCHED_RR:
4825 ret = 1;
4826 break;
4827 case SCHED_NORMAL:
4828 case SCHED_BATCH:
4829 case SCHED_IDLE:
4830 ret = 0;
4831 }
4832 return ret;
4833}
4834
4835
4836
4837
4838
4839
4840
4841
4842
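/**
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * This syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 */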
4843asmlinkage
4844long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4845{
4846 struct task_struct *p;
4847 unsigned int time_slice;
4848 int retval;
4849 struct timespec t;
4850
4851 if (pid < 0)
4852 return -EINVAL;
4853
4854 retval = -ESRCH;
4855 read_lock(&tasklist_lock);
4856 p = find_process_by_pid(pid);
4857 if (!p)
4858 goto out_unlock;
4859
4860 retval = security_task_getscheduler(p);
4861 if (retval)
4862 goto out_unlock;
4863
4864
4865
4866
4867
4868 time_slice = 0;
4869 if (p->policy == SCHED_RR) {
4870 time_slice = DEF_TIMESLICE;
4871 } else {
4872 struct sched_entity *se = &p->se;
4873 unsigned long flags;
4874 struct rq *rq;
4875
4876 rq = task_rq_lock(p, &flags);
4877 if (rq->cfs.load.weight)
4878 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
4879 task_rq_unlock(rq, &flags);
4880 }
4881 read_unlock(&tasklist_lock);
4882 jiffies_to_timespec(time_slice, &t);
4883 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4884 return retval;
4885
4886out_unlock:
4887 read_unlock(&tasklist_lock);
4888 return retval;
4889}
4890
4891static const char stat_nam[] = "RSDTtZX";
4892
4893static void show_task(struct task_struct *p)
4894{
4895 unsigned long free = 0;
4896 unsigned state;
4897
4898 state = p->state ? __ffs(p->state) + 1 : 0;
4899 printk(KERN_INFO "%-13.13s %c", p->comm,
4900 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4901#if BITS_PER_LONG == 32
4902 if (state == TASK_RUNNING)
4903 printk(KERN_CONT " running ");
4904 else
4905 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4906#else
4907 if (state == TASK_RUNNING)
4908 printk(KERN_CONT " running task ");
4909 else
4910 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4911#endif
4912#ifdef CONFIG_DEBUG_STACK_USAGE
4913 {
4914 unsigned long *n = end_of_stack(p);
4915 while (!*n)
4916 n++;
4917 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4918 }
4919#endif
4920 printk(KERN_CONT "%5lu %5d %6d\n", free,
4921 task_pid_nr(p), task_pid_nr(p->real_parent));
4922
4923 if (state != TASK_RUNNING)
4924 show_stack(p, NULL);
4925}
4926
4927void show_state_filter(unsigned long state_filter)
4928{
4929 struct task_struct *g, *p;
4930
4931#if BITS_PER_LONG == 32
4932 printk(KERN_INFO
4933 " task PC stack pid father\n");
4934#else
4935 printk(KERN_INFO
4936 " task PC stack pid father\n");
4937#endif
4938 read_lock(&tasklist_lock);
4939 do_each_thread(g, p) {
4940
4941
4942
4943
4944 touch_nmi_watchdog();
4945 if (!state_filter || (p->state & state_filter))
4946 show_task(p);
4947 } while_each_thread(g, p);
4948
4949 touch_all_softlockup_watchdogs();
4950
4951#ifdef CONFIG_SCHED_DEBUG
4952 sysrq_sched_debug_show();
4953#endif
4954 read_unlock(&tasklist_lock);
4955
4956
4957
4958 if (state_filter == -1)
4959 debug_show_all_locks();
4960}
4961
4962void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4963{
4964 idle->sched_class = &idle_sched_class;
4965}
4966
4967
4968
4969
4970
4971
4972
4973
4974
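/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */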
4975void __cpuinit init_idle(struct task_struct *idle, int cpu)
4976{
4977 struct rq *rq = cpu_rq(cpu);
4978 unsigned long flags;
4979
4980 __sched_fork(idle);
4981 idle->se.exec_start = sched_clock();
4982
4983 idle->prio = idle->normal_prio = MAX_PRIO;
4984 idle->cpus_allowed = cpumask_of_cpu(cpu);
4985 __set_task_cpu(idle, cpu);
4986
4987 spin_lock_irqsave(&rq->lock, flags);
4988 rq->curr = rq->idle = idle;
4989#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4990 idle->oncpu = 1;
4991#endif
4992 spin_unlock_irqrestore(&rq->lock, flags);
4993
4994
4995#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4996 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4997#else
4998 task_thread_info(idle)->preempt_count = 0;
4999#endif
5000
5001
5002
5003 idle->sched_class = &idle_sched_class;
5004}
5005
5006
5007
5008
5009
5010
5011
5012
5013cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024static inline void sched_init_granularity(void)
5025{
5026 unsigned int factor = 1 + ilog2(num_online_cpus());
5027 const unsigned long limit = 200000000;
5028
5029 sysctl_sched_min_granularity *= factor;
5030 if (sysctl_sched_min_granularity > limit)
5031 sysctl_sched_min_granularity = limit;
5032
5033 sysctl_sched_latency *= factor;
5034 if (sysctl_sched_latency > limit)
5035 sysctl_sched_latency = limit;
5036
5037 sysctl_sched_wakeup_granularity *= factor;
5038 sysctl_sched_batch_wakeup_granularity *= factor;
5039}
5040
5041#ifdef CONFIG_SMP
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
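/*
 * set_cpus_allowed - change a task's CPU affinity mask
 * @p: the task to modify
 * @new_mask: the new allowed mask
 *
 * If the task is currently running on a CPU outside @new_mask, a
 * migration request is queued, the migration thread is woken and we
 * wait for the migration to complete before returning.
 */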
5067int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5068{
5069 struct migration_req req;
5070 unsigned long flags;
5071 struct rq *rq;
5072 int ret = 0;
5073
5074 rq = task_rq_lock(p, &flags);
5075 if (!cpus_intersects(new_mask, cpu_online_map)) {
5076 ret = -EINVAL;
5077 goto out;
5078 }
5079
5080 p->cpus_allowed = new_mask;
5081
5082 if (cpu_isset(task_cpu(p), new_mask))
5083 goto out;
5084
5085 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
5086
5087 task_rq_unlock(rq, &flags);
5088 wake_up_process(rq->migration_thread);
5089 wait_for_completion(&req.done);
5090 tlb_migrate_finish(p->mm);
5091 return 0;
5092 }
5093out:
5094 task_rq_unlock(rq, &flags);
5095
5096 return ret;
5097}
5098EXPORT_SYMBOL_GPL(set_cpus_allowed);
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
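/*
 * Move a (not currently running) task to a new runqueue, either because
 * it may no longer run on this CPU (affinity change or CPU going down)
 * or to rebalance it.
 *
 * Both runqueues are locked around the move; returns non-zero if the
 * task was successfully migrated.
 */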
5111static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5112{
5113 struct rq *rq_dest, *rq_src;
5114 int ret = 0, on_rq;
5115
5116 if (unlikely(cpu_is_offline(dest_cpu)))
5117 return ret;
5118
5119 rq_src = cpu_rq(src_cpu);
5120 rq_dest = cpu_rq(dest_cpu);
5121
5122 double_rq_lock(rq_src, rq_dest);
5123
5124 if (task_cpu(p) != src_cpu)
5125 goto out;
5126
5127 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5128 goto out;
5129
5130 on_rq = p->se.on_rq;
5131 if (on_rq)
5132 deactivate_task(rq_src, p, 0);
5133
5134 set_task_cpu(p, dest_cpu);
5135 if (on_rq) {
5136 activate_task(rq_dest, p, 0);
5137 check_preempt_curr(rq_dest, p);
5138 }
5139 ret = 1;
5140out:
5141 double_rq_unlock(rq_src, rq_dest);
5142 return ret;
5143}
5144
5145
5146
5147
5148
5149
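/*
 * migration_thread - a high-priority per-CPU kernel thread that performs
 * thread migration by bumping a thread off its CPU and 'pushing' it onto
 * another runqueue. It also runs active load balancing requests.
 */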
5150static int migration_thread(void *data)
5151{
5152 int cpu = (long)data;
5153 struct rq *rq;
5154
5155 rq = cpu_rq(cpu);
5156 BUG_ON(rq->migration_thread != current);
5157
5158 set_current_state(TASK_INTERRUPTIBLE);
5159 while (!kthread_should_stop()) {
5160 struct migration_req *req;
5161 struct list_head *head;
5162
5163 spin_lock_irq(&rq->lock);
5164
5165 if (cpu_is_offline(cpu)) {
5166 spin_unlock_irq(&rq->lock);
5167 goto wait_to_die;
5168 }
5169
5170 if (rq->active_balance) {
5171 active_load_balance(rq, cpu);
5172 rq->active_balance = 0;
5173 }
5174
5175 head = &rq->migration_queue;
5176
5177 if (list_empty(head)) {
5178 spin_unlock_irq(&rq->lock);
5179 schedule();
5180 set_current_state(TASK_INTERRUPTIBLE);
5181 continue;
5182 }
5183 req = list_entry(head->next, struct migration_req, list);
5184 list_del_init(head->next);
5185
5186 spin_unlock(&rq->lock);
5187 __migrate_task(req->task, cpu, req->dest_cpu);
5188 local_irq_enable();
5189
5190 complete(&req->done);
5191 }
5192 __set_current_state(TASK_RUNNING);
5193 return 0;
5194
5195wait_to_die:
5196
5197 set_current_state(TASK_INTERRUPTIBLE);
5198 while (!kthread_should_stop()) {
5199 schedule();
5200 set_current_state(TASK_INTERRUPTIBLE);
5201 }
5202 __set_current_state(TASK_RUNNING);
5203 return 0;
5204}
5205
5206#ifdef CONFIG_HOTPLUG_CPU
5207
5208static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5209{
5210 int ret;
5211
5212 local_irq_disable();
5213 ret = __migrate_task(p, src_cpu, dest_cpu);
5214 local_irq_enable();
5215 return ret;
5216}
5217
5218
5219
5220
5221
5222static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5223{
5224 unsigned long flags;
5225 cpumask_t mask;
5226 struct rq *rq;
5227 int dest_cpu;
5228
5229 do {
5230
5231 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5232 cpus_and(mask, mask, p->cpus_allowed);
5233 dest_cpu = any_online_cpu(mask);
5234
5235
5236 if (dest_cpu == NR_CPUS)
5237 dest_cpu = any_online_cpu(p->cpus_allowed);
5238
5239
5240 if (dest_cpu == NR_CPUS) {
5241 cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p);
5242
5243
5244
5245
5246
5247
5248
5249 rq = task_rq_lock(p, &flags);
5250 p->cpus_allowed = cpus_allowed;
5251 dest_cpu = any_online_cpu(p->cpus_allowed);
5252 task_rq_unlock(rq, &flags);
5253
5254
5255
5256
5257
5258
5259 if (p->mm && printk_ratelimit()) {
5260 printk(KERN_INFO "process %d (%s) no "
5261 "longer affine to cpu%d\n",
5262 task_pid_nr(p), p->comm, dead_cpu);
5263 }
5264 }
5265 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
5266}
5267
5268
5269
5270
5271
5272
5273
5274
5275static void migrate_nr_uninterruptible(struct rq *rq_src)
5276{
5277 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5278 unsigned long flags;
5279
5280 local_irq_save(flags);
5281 double_rq_lock(rq_src, rq_dest);
5282 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5283 rq_src->nr_uninterruptible = 0;
5284 double_rq_unlock(rq_src, rq_dest);
5285 local_irq_restore(flags);
5286}
5287
5288
5289static void migrate_live_tasks(int src_cpu)
5290{
5291 struct task_struct *p, *t;
5292
5293 read_lock(&tasklist_lock);
5294
5295 do_each_thread(t, p) {
5296 if (p == current)
5297 continue;
5298
5299 if (task_cpu(p) == src_cpu)
5300 move_task_off_dead_cpu(src_cpu, p);
5301 } while_each_thread(t, p);
5302
5303 read_unlock(&tasklist_lock);
5304}
5305
5306
5307
5308
5309
5310
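/*
 * Schedule the idle task to be the next runnable task on the current
 * CPU by boosting it to the highest RT priority. Used by the CPU
 * offline code.
 */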
5311void sched_idle_next(void)
5312{
5313 int this_cpu = smp_processor_id();
5314 struct rq *rq = cpu_rq(this_cpu);
5315 struct task_struct *p = rq->idle;
5316 unsigned long flags;
5317
5318
5319 BUG_ON(cpu_online(this_cpu));
5320
5321
5322
5323
5324
5325 spin_lock_irqsave(&rq->lock, flags);
5326
5327 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5328
5329 update_rq_clock(rq);
5330 activate_task(rq, p, 0);
5331
5332 spin_unlock_irqrestore(&rq->lock, flags);
5333}
5334
5335
5336
5337
5338
5339void idle_task_exit(void)
5340{
5341 struct mm_struct *mm = current->active_mm;
5342
5343 BUG_ON(cpu_online(smp_processor_id()));
5344
5345 if (mm != &init_mm)
5346 switch_mm(mm, &init_mm, current);
5347 mmdrop(mm);
5348}
5349
5350
5351static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5352{
5353 struct rq *rq = cpu_rq(dead_cpu);
5354
5355
5356 BUG_ON(!p->exit_state);
5357
5358
5359 BUG_ON(p->state == TASK_DEAD);
5360
5361 get_task_struct(p);
5362
5363
5364
5365
5366
5367
5368 spin_unlock_irq(&rq->lock);
5369 move_task_off_dead_cpu(dead_cpu, p);
5370 spin_lock_irq(&rq->lock);
5371
5372 put_task_struct(p);
5373}
5374
5375
5376static void migrate_dead_tasks(unsigned int dead_cpu)
5377{
5378 struct rq *rq = cpu_rq(dead_cpu);
5379 struct task_struct *next;
5380
5381 for ( ; ; ) {
5382 if (!rq->nr_running)
5383 break;
5384 update_rq_clock(rq);
5385 next = pick_next_task(rq, rq->curr);
5386 if (!next)
5387 break;
5388 migrate_dead(dead_cpu, next);
5389
5390 }
5391}
5392#endif
5393
5394#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5395
5396static struct ctl_table sd_ctl_dir[] = {
5397 {
5398 .procname = "sched_domain",
5399 .mode = 0555,
5400 },
5401 {0, },
5402};
5403
5404static struct ctl_table sd_ctl_root[] = {
5405 {
5406 .ctl_name = CTL_KERN,
5407 .procname = "kernel",
5408 .mode = 0555,
5409 .child = sd_ctl_dir,
5410 },
5411 {0, },
5412};
5413
5414static struct ctl_table *sd_alloc_ctl_entry(int n)
5415{
5416 struct ctl_table *entry =
5417 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5418
5419 return entry;
5420}
5421
5422static void sd_free_ctl_entry(struct ctl_table **tablep)
5423{
5424 struct ctl_table *entry;
5425
5426
5427
5428
5429
5430
5431
5432 for (entry = *tablep; entry->mode; entry++) {
5433 if (entry->child)
5434 sd_free_ctl_entry(&entry->child);
5435 if (entry->proc_handler == NULL)
5436 kfree(entry->procname);
5437 }
5438
5439 kfree(*tablep);
5440 *tablep = NULL;
5441}
5442
5443static void
5444set_table_entry(struct ctl_table *entry,
5445 const char *procname, void *data, int maxlen,
5446 mode_t mode, proc_handler *proc_handler)
5447{
5448 entry->procname = procname;
5449 entry->data = data;
5450 entry->maxlen = maxlen;
5451 entry->mode = mode;
5452 entry->proc_handler = proc_handler;
5453}
5454
5455static struct ctl_table *
5456sd_alloc_ctl_domain_table(struct sched_domain *sd)
5457{
5458 struct ctl_table *table = sd_alloc_ctl_entry(12);
5459
5460 if (table == NULL)
5461 return NULL;
5462
5463 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5464 sizeof(long), 0644, proc_doulongvec_minmax);
5465 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5466 sizeof(long), 0644, proc_doulongvec_minmax);
5467 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5468 sizeof(int), 0644, proc_dointvec_minmax);
5469 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5470 sizeof(int), 0644, proc_dointvec_minmax);
5471 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5472 sizeof(int), 0644, proc_dointvec_minmax);
5473 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5474 sizeof(int), 0644, proc_dointvec_minmax);
5475 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5476 sizeof(int), 0644, proc_dointvec_minmax);
5477 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5478 sizeof(int), 0644, proc_dointvec_minmax);
5479 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5480 sizeof(int), 0644, proc_dointvec_minmax);
5481 set_table_entry(&table[9], "cache_nice_tries",
5482 &sd->cache_nice_tries,
5483 sizeof(int), 0644, proc_dointvec_minmax);
5484 set_table_entry(&table[10], "flags", &sd->flags,
5485 sizeof(int), 0644, proc_dointvec_minmax);
5486
5487
5488 return table;
5489}
5490
5491static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5492{
5493 struct ctl_table *entry, *table;
5494 struct sched_domain *sd;
5495 int domain_num = 0, i;
5496 char buf[32];
5497
5498 for_each_domain(cpu, sd)
5499 domain_num++;
5500 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5501 if (table == NULL)
5502 return NULL;
5503
5504 i = 0;
5505 for_each_domain(cpu, sd) {
5506 snprintf(buf, 32, "domain%d", i);
5507 entry->procname = kstrdup(buf, GFP_KERNEL);
5508 entry->mode = 0555;
5509 entry->child = sd_alloc_ctl_domain_table(sd);
5510 entry++;
5511 i++;
5512 }
5513 return table;
5514}
5515
5516static struct ctl_table_header *sd_sysctl_header;
5517static void register_sched_domain_sysctl(void)
5518{
5519 int i, cpu_num = num_online_cpus();
5520 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5521 char buf[32];
5522
5523 WARN_ON(sd_ctl_dir[0].child);
5524 sd_ctl_dir[0].child = entry;
5525
5526 if (entry == NULL)
5527 return;
5528
5529 for_each_online_cpu(i) {
5530 snprintf(buf, 32, "cpu%d", i);
5531 entry->procname = kstrdup(buf, GFP_KERNEL);
5532 entry->mode = 0555;
5533 entry->child = sd_alloc_ctl_cpu_table(i);
5534 entry++;
5535 }
5536
5537 WARN_ON(sd_sysctl_header);
5538 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5539}
5540
5541
5542static void unregister_sched_domain_sysctl(void)
5543{
5544 if (sd_sysctl_header)
5545 unregister_sysctl_table(sd_sysctl_header);
5546 sd_sysctl_header = NULL;
5547 if (sd_ctl_dir[0].child)
5548 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5549}
5550#else
5551static void register_sched_domain_sysctl(void)
5552{
5553}
5554static void unregister_sched_domain_sysctl(void)
5555{
5556}
5557#endif
5558
5559
5560
5561
5562
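/*
 * migration_call - CPU hotplug callback: create and start the per-CPU
 * migration thread when a CPU comes up, and tear it down (migrating any
 * remaining tasks away) when a CPU goes down.
 */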
5563static int __cpuinit
5564migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5565{
5566 struct task_struct *p;
5567 int cpu = (long)hcpu;
5568 unsigned long flags;
5569 struct rq *rq;
5570
5571 switch (action) {
5572 case CPU_LOCK_ACQUIRE:
5573 mutex_lock(&sched_hotcpu_mutex);
5574 break;
5575
5576 case CPU_UP_PREPARE:
5577 case CPU_UP_PREPARE_FROZEN:
5578 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5579 if (IS_ERR(p))
5580 return NOTIFY_BAD;
5581 kthread_bind(p, cpu);
5582
5583 rq = task_rq_lock(p, &flags);
5584 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5585 task_rq_unlock(rq, &flags);
5586 cpu_rq(cpu)->migration_thread = p;
5587 break;
5588
5589 case CPU_ONLINE:
5590 case CPU_ONLINE_FROZEN:
5591
5592 wake_up_process(cpu_rq(cpu)->migration_thread);
5593 break;
5594
5595#ifdef CONFIG_HOTPLUG_CPU
5596 case CPU_UP_CANCELED:
5597 case CPU_UP_CANCELED_FROZEN:
5598 if (!cpu_rq(cpu)->migration_thread)
5599 break;
5600
5601 kthread_bind(cpu_rq(cpu)->migration_thread,
5602 any_online_cpu(cpu_online_map));
5603 kthread_stop(cpu_rq(cpu)->migration_thread);
5604 cpu_rq(cpu)->migration_thread = NULL;
5605 break;
5606
5607 case CPU_DEAD:
5608 case CPU_DEAD_FROZEN:
5609 cpuset_lock();
5610 migrate_live_tasks(cpu);
5611 rq = cpu_rq(cpu);
5612 kthread_stop(rq->migration_thread);
5613 rq->migration_thread = NULL;
5614
5615 spin_lock_irq(&rq->lock);
5616 update_rq_clock(rq);
5617 deactivate_task(rq, rq->idle, 0);
5618 rq->idle->static_prio = MAX_PRIO;
5619 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5620 rq->idle->sched_class = &idle_sched_class;
5621 migrate_dead_tasks(cpu);
5622 spin_unlock_irq(&rq->lock);
5623 cpuset_unlock();
5624 migrate_nr_uninterruptible(rq);
5625 BUG_ON(rq->nr_running != 0);
5626
5627
5628
5629
5630
5631
5632 spin_lock_irq(&rq->lock);
5633 while (!list_empty(&rq->migration_queue)) {
5634 struct migration_req *req;
5635
5636 req = list_entry(rq->migration_queue.next,
5637 struct migration_req, list);
5638 list_del_init(&req->list);
5639 complete(&req->done);
5640 }
5641 spin_unlock_irq(&rq->lock);
5642 break;
5643#endif
5644 case CPU_LOCK_RELEASE:
5645 mutex_unlock(&sched_hotcpu_mutex);
5646 break;
5647 }
5648 return NOTIFY_OK;
5649}
5650
/*
 * Register at high priority so that task migration (migrate_all_tasks)
 * happens before everything else.
 */
5654static struct notifier_block __cpuinitdata migration_notifier = {
5655 .notifier_call = migration_call,
5656 .priority = 10
5657};
5658
5659void __init migration_init(void)
5660{
5661 void *cpu = (void *)(long)smp_processor_id();
5662 int err;
5663
	/* Start one for the boot CPU: */
5665 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5666 BUG_ON(err == NOTIFY_BAD);
5667 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5668 register_cpu_notifier(&migration_notifier);
5669}
5670#endif
5671
5672#ifdef CONFIG_SMP
5673
/* Number of possible processor ids */
5675int nr_cpu_ids __read_mostly = NR_CPUS;
5676EXPORT_SYMBOL(nr_cpu_ids);
5677
5678#ifdef CONFIG_SCHED_DEBUG
5679
5680static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
5681{
5682 struct sched_group *group = sd->groups;
5683 cpumask_t groupmask;
5684 char str[NR_CPUS];
5685
5686 cpumask_scnprintf(str, NR_CPUS, sd->span);
5687 cpus_clear(groupmask);
5688
5689 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5690
5691 if (!(sd->flags & SD_LOAD_BALANCE)) {
5692 printk("does not load-balance\n");
5693 if (sd->parent)
5694 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5695 " has parent");
5696 return -1;
5697 }
5698
5699 printk(KERN_CONT "span %s\n", str);
5700
5701 if (!cpu_isset(cpu, sd->span)) {
5702 printk(KERN_ERR "ERROR: domain->span does not contain "
5703 "CPU%d\n", cpu);
5704 }
5705 if (!cpu_isset(cpu, group->cpumask)) {
5706 printk(KERN_ERR "ERROR: domain->groups does not contain"
5707 " CPU%d\n", cpu);
5708 }
5709
5710 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5711 do {
5712 if (!group) {
5713 printk("\n");
5714 printk(KERN_ERR "ERROR: group is NULL\n");
5715 break;
5716 }
5717
5718 if (!group->__cpu_power) {
5719 printk(KERN_CONT "\n");
5720 printk(KERN_ERR "ERROR: domain->cpu_power not "
5721 "set\n");
5722 break;
5723 }
5724
5725 if (!cpus_weight(group->cpumask)) {
5726 printk(KERN_CONT "\n");
5727 printk(KERN_ERR "ERROR: empty group\n");
5728 break;
5729 }
5730
5731 if (cpus_intersects(groupmask, group->cpumask)) {
5732 printk(KERN_CONT "\n");
5733 printk(KERN_ERR "ERROR: repeated CPUs\n");
5734 break;
5735 }
5736
5737 cpus_or(groupmask, groupmask, group->cpumask);
5738
5739 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5740 printk(KERN_CONT " %s", str);
5741
5742 group = group->next;
5743 } while (group != sd->groups);
5744 printk(KERN_CONT "\n");
5745
5746 if (!cpus_equal(sd->span, groupmask))
5747 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5748
5749 if (sd->parent && !cpus_subset(groupmask, sd->parent->span))
5750 printk(KERN_ERR "ERROR: parent span is not a superset "
5751 "of domain->span\n");
5752 return 0;
5753}
5754
5755static void sched_domain_debug(struct sched_domain *sd, int cpu)
5756{
5757 int level = 0;
5758
5759 if (!sd) {
5760 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5761 return;
5762 }
5763
5764 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5765
5766 for (;;) {
5767 if (sched_domain_debug_one(sd, cpu, level))
5768 break;
5769 level++;
5770 sd = sd->parent;
5771 if (!sd)
5772 break;
5773 }
5774}
5775#else
5776# define sched_domain_debug(sd, cpu) do { } while (0)
5777#endif
5778
5779static int sd_degenerate(struct sched_domain *sd)
5780{
5781 if (cpus_weight(sd->span) == 1)
5782 return 1;
5783
	/* Following flags need at least 2 groups */
5785 if (sd->flags & (SD_LOAD_BALANCE |
5786 SD_BALANCE_NEWIDLE |
5787 SD_BALANCE_FORK |
5788 SD_BALANCE_EXEC |
5789 SD_SHARE_CPUPOWER |
5790 SD_SHARE_PKG_RESOURCES)) {
5791 if (sd->groups != sd->groups->next)
5792 return 0;
5793 }
5794
	/* Following flags don't use groups */
5796 if (sd->flags & (SD_WAKE_IDLE |
5797 SD_WAKE_AFFINE |
5798 SD_WAKE_BALANCE))
5799 return 0;
5800
5801 return 1;
5802}
5803
5804static int
5805sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5806{
5807 unsigned long cflags = sd->flags, pflags = parent->flags;
5808
5809 if (sd_degenerate(parent))
5810 return 1;
5811
5812 if (!cpus_equal(sd->span, parent->span))
5813 return 0;
5814
	/* Does parent contain flags not in child? */
	/* WAKE_BALANCE is a subset of WAKE_AFFINE */
5817 if (cflags & SD_WAKE_AFFINE)
5818 pflags &= ~SD_WAKE_BALANCE;
	/* Flags needing groups don't count if only 1 group in parent */
5820 if (parent->groups == parent->groups->next) {
5821 pflags &= ~(SD_LOAD_BALANCE |
5822 SD_BALANCE_NEWIDLE |
5823 SD_BALANCE_FORK |
5824 SD_BALANCE_EXEC |
5825 SD_SHARE_CPUPOWER |
5826 SD_SHARE_PKG_RESOURCES);
5827 }
5828 if (~cflags & pflags)
5829 return 0;
5830
5831 return 1;
5832}
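
/*
 * Worked example (illustrative only): on a single-socket, non-SMT machine
 * the lowest domain spans exactly one CPU, so sd_degenerate() returns 1 and
 * cpu_attach_domain() below splices it out.  Similarly, when a parent spans
 * the same CPUs as its child and is left with a single group, every flag
 * that needs at least two groups is cleared from pflags, (~cflags & pflags)
 * evaluates to 0, and sd_parent_degenerate() reports the parent as redundant.
 */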
5833
/*
 * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
 * hold the hotplug lock.
 */
5838static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5839{
5840 struct rq *rq = cpu_rq(cpu);
5841 struct sched_domain *tmp;
5842
	/* Remove the sched domains which do not contribute to scheduling. */
5844 for (tmp = sd; tmp; tmp = tmp->parent) {
5845 struct sched_domain *parent = tmp->parent;
5846 if (!parent)
5847 break;
5848 if (sd_parent_degenerate(tmp, parent)) {
5849 tmp->parent = parent->parent;
5850 if (parent->parent)
5851 parent->parent->child = tmp;
5852 }
5853 }
5854
5855 if (sd && sd_degenerate(sd)) {
5856 sd = sd->parent;
5857 if (sd)
5858 sd->child = NULL;
5859 }
5860
5861 sched_domain_debug(sd, cpu);
5862
5863 rcu_assign_pointer(rq->sd, sd);
5864}
5865
/* cpus with isolated domains */
5867static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5868
/* Setup the mask of cpus configured for isolated domains */
5870static int __init isolated_cpu_setup(char *str)
5871{
5872 int ints[NR_CPUS], i;
5873
5874 str = get_options(str, ARRAY_SIZE(ints), ints);
5875 cpus_clear(cpu_isolated_map);
5876 for (i = 1; i <= ints[0]; i++)
5877 if (ints[i] < NR_CPUS)
5878 cpu_set(ints[i], cpu_isolated_map);
5879 return 1;
5880}
5881
5882__setup("isolcpus=", isolated_cpu_setup);
5883
/*
 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
 * to a function which identifies what group (along with sched group) a CPU
 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
 * (due to the fact that we keep track of groups covered with a cpumask_t).
 *
 * init_sched_build_groups will build a circular linked list of the groups
 * covered by the given span, and will set each group's ->cpumask correctly,
 * and ->__cpu_power to 0.
 */
5894static void
5895init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5896 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5897 struct sched_group **sg))
5898{
5899 struct sched_group *first = NULL, *last = NULL;
5900 cpumask_t covered = CPU_MASK_NONE;
5901 int i;
5902
5903 for_each_cpu_mask(i, span) {
5904 struct sched_group *sg;
5905 int group = group_fn(i, cpu_map, &sg);
5906 int j;
5907
5908 if (cpu_isset(i, covered))
5909 continue;
5910
5911 sg->cpumask = CPU_MASK_NONE;
5912 sg->__cpu_power = 0;
5913
5914 for_each_cpu_mask(j, span) {
5915 if (group_fn(j, cpu_map, NULL) != group)
5916 continue;
5917
5918 cpu_set(j, covered);
5919 cpu_set(j, sg->cpumask);
5920 }
5921 if (!first)
5922 first = sg;
5923 if (last)
5924 last->next = sg;
5925 last = sg;
5926 }
5927 last->next = first;
5928}
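
/*
 * Example (illustrative): for a span of CPUs {0,1,2,3} where group_fn() maps
 * {0,1} to one group and {2,3} to another, the loop above yields a circular
 * list sg(0,1) -> sg(2,3) -> sg(0,1), with each group's cpumask filled in
 * and its __cpu_power zeroed, ready for init_sched_groups_power().
 */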
5929
5930#define SD_NODES_PER_DOMAIN 16
5931
5932#ifdef CONFIG_NUMA
5933
/**
 * find_next_best_node - find the next node to include in a sched_domain
 * @node: node whose sched_domain we're building
 * @used_nodes: nodes already in the sched_domain
 *
 * Find the next node to include in a given scheduling domain.  Simply
 * finds the closest node not already in the @used_nodes map.
 */
5944static int find_next_best_node(int node, unsigned long *used_nodes)
5945{
5946 int i, n, val, min_val, best_node = 0;
5947
5948 min_val = INT_MAX;
5949
5950 for (i = 0; i < MAX_NUMNODES; i++) {
		/* Start at @node */
5952 n = (node + i) % MAX_NUMNODES;
5953
5954 if (!nr_cpus_node(n))
5955 continue;
5956
		/* Skip already used nodes */
5958 if (test_bit(n, used_nodes))
5959 continue;
5960
		/* Simple min distance search */
5962 val = node_distance(node, n);
5963
5964 if (val < min_val) {
5965 min_val = val;
5966 best_node = n;
5967 }
5968 }
5969
5970 set_bit(best_node, used_nodes);
5971 return best_node;
5972}
5973
5974
/**
 * sched_domain_node_span - get a cpumask for a node's sched_domain
 * @node: node whose cpumask we're constructing
 *
 * Given a node, construct a good cpumask for its sched_domain to span.  It
 * should be one that prevents unnecessary balancing, but also spreads tasks
 * out optimally.
 */
5983static cpumask_t sched_domain_node_span(int node)
5984{
5985 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5986 cpumask_t span, nodemask;
5987 int i;
5988
5989 cpus_clear(span);
5990 bitmap_zero(used_nodes, MAX_NUMNODES);
5991
5992 nodemask = node_to_cpumask(node);
5993 cpus_or(span, span, nodemask);
5994 set_bit(node, used_nodes);
5995
5996 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5997 int next_node = find_next_best_node(node, used_nodes);
5998
5999 nodemask = node_to_cpumask(next_node);
6000 cpus_or(span, span, nodemask);
6001 }
6002
6003 return span;
6004}
6005#endif
6006
6007int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6008
/*
 * SMT sched-domains:
 */
6012#ifdef CONFIG_SCHED_SMT
6013static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6014static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6015
6016static int
6017cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6018{
6019 if (sg)
6020 *sg = &per_cpu(sched_group_cpus, cpu);
6021 return cpu;
6022}
6023#endif
6024
/*
 * multi-core sched-domains:
 */
6028#ifdef CONFIG_SCHED_MC
6029static DEFINE_PER_CPU(struct sched_domain, core_domains);
6030static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6031#endif
6032
6033#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6034static int
6035cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6036{
6037 int group;
6038 cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
6039 cpus_and(mask, mask, *cpu_map);
6040 group = first_cpu(mask);
6041 if (sg)
6042 *sg = &per_cpu(sched_group_core, group);
6043 return group;
6044}
6045#elif defined(CONFIG_SCHED_MC)
6046static int
6047cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6048{
6049 if (sg)
6050 *sg = &per_cpu(sched_group_core, cpu);
6051 return cpu;
6052}
6053#endif
6054
6055static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6056static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6057
6058static int
6059cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6060{
6061 int group;
6062#ifdef CONFIG_SCHED_MC
6063 cpumask_t mask = cpu_coregroup_map(cpu);
6064 cpus_and(mask, mask, *cpu_map);
6065 group = first_cpu(mask);
6066#elif defined(CONFIG_SCHED_SMT)
6067 cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
6068 cpus_and(mask, mask, *cpu_map);
6069 group = first_cpu(mask);
6070#else
6071 group = cpu;
6072#endif
6073 if (sg)
6074 *sg = &per_cpu(sched_group_phys, group);
6075 return group;
6076}
6077
6078#ifdef CONFIG_NUMA
/*
 * init_sched_build_groups can't handle what we want to do with node
 * groups, so roll our own. Now each node has its own list of groups which
 * gets dynamically allocated.
 */
6084static DEFINE_PER_CPU(struct sched_domain, node_domains);
6085static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6086
6087static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6088static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6089
6090static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6091 struct sched_group **sg)
6092{
6093 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6094 int group;
6095
6096 cpus_and(nodemask, nodemask, *cpu_map);
6097 group = first_cpu(nodemask);
6098
6099 if (sg)
6100 *sg = &per_cpu(sched_group_allnodes, group);
6101 return group;
6102}
6103
6104static void init_numa_sched_groups_power(struct sched_group *group_head)
6105{
6106 struct sched_group *sg = group_head;
6107 int j;
6108
6109 if (!sg)
6110 return;
6111 do {
6112 for_each_cpu_mask(j, sg->cpumask) {
6113 struct sched_domain *sd;
6114
6115 sd = &per_cpu(phys_domains, j);
6116 if (j != first_cpu(sd->groups->cpumask)) {
				/*
				 * Only add "power" once for each
				 * physical package.
				 */
6121 continue;
6122 }
6123
6124 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
6125 }
6126 sg = sg->next;
6127 } while (sg != group_head);
6128}
6129#endif
6130
6131#ifdef CONFIG_NUMA
/* Free memory allocated for various sched_group structures */
6133static void free_sched_groups(const cpumask_t *cpu_map)
6134{
6135 int cpu, i;
6136
6137 for_each_cpu_mask(cpu, *cpu_map) {
6138 struct sched_group **sched_group_nodes
6139 = sched_group_nodes_bycpu[cpu];
6140
6141 if (!sched_group_nodes)
6142 continue;
6143
6144 for (i = 0; i < MAX_NUMNODES; i++) {
6145 cpumask_t nodemask = node_to_cpumask(i);
6146 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6147
6148 cpus_and(nodemask, nodemask, *cpu_map);
6149 if (cpus_empty(nodemask))
6150 continue;
6151
6152 if (sg == NULL)
6153 continue;
6154 sg = sg->next;
6155next_sg:
6156 oldsg = sg;
6157 sg = sg->next;
6158 kfree(oldsg);
6159 if (oldsg != sched_group_nodes[i])
6160 goto next_sg;
6161 }
6162 kfree(sched_group_nodes);
6163 sched_group_nodes_bycpu[cpu] = NULL;
6164 }
6165}
6166#else
6167static void free_sched_groups(const cpumask_t *cpu_map)
6168{
6169}
6170#endif
6171
/*
 * Initialize sched groups cpu_power.
 *
 * cpu_power indicates the capacity of a sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_power for all the groups in a sched domain will be same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * the group having more cpu_power will pick up more load compared to the
 * group having less cpu_power.
 *
 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
 * the maximum number of tasks a group can handle in the presence of other
 * idle or lightly loaded groups in the same sched domain.
 */
6186static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6187{
6188 struct sched_domain *child;
6189 struct sched_group *group;
6190
6191 WARN_ON(!sd || !sd->groups);
6192
6193 if (cpu != first_cpu(sd->groups->cpumask))
6194 return;
6195
6196 child = sd->child;
6197
6198 sd->groups->__cpu_power = 0;
6199
	/*
	 * For perf policy, if the groups in the child domain share resources
	 * (for example cores sharing some portions of the cache hierarchy
	 * or SMT), then set this domain's group cpu_power such that each
	 * group can handle only one task, when there are other idle groups
	 * in the same sched domain.
	 */
6207 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
6208 (child->flags &
6209 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
6210 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
6211 return;
6212 }
6213
	/*
	 * Add cpu_power of each child group to this group's cpu_power.
	 */
6217 group = child->groups;
6218 do {
6219 sg_inc_cpu_power(sd->groups, group->__cpu_power);
6220 group = group->next;
6221 } while (group != child->groups);
6222}
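
/*
 * Illustration (assuming the default SCHED_LOAD_SCALE of 1024): the lowest
 * domain has no child, so each of its groups gets exactly one
 * SCHED_LOAD_SCALE unit.  The same happens when the child's groups share
 * CPU power or package resources (SMT siblings, cores sharing a cache) and
 * power-savings balancing is off.  Otherwise a group's power is the sum of
 * its child groups' __cpu_power, e.g. two child groups of 1024 each yield
 * 2048.
 */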
6223
/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus.
 */
6228static int build_sched_domains(const cpumask_t *cpu_map)
6229{
6230 int i;
6231#ifdef CONFIG_NUMA
6232 struct sched_group **sched_group_nodes = NULL;
6233 int sd_allnodes = 0;
6234
	/*
	 * Allocate the per-node list of sched groups.
	 */
6238 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
6239 GFP_KERNEL);
6240 if (!sched_group_nodes) {
6241 printk(KERN_WARNING "Can not alloc sched group node list\n");
6242 return -ENOMEM;
6243 }
6244 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6245#endif
6246
	/*
	 * Set up domains for cpus specified by the cpu_map.
	 */
6250 for_each_cpu_mask(i, *cpu_map) {
6251 struct sched_domain *sd = NULL, *p;
6252 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6253
6254 cpus_and(nodemask, nodemask, *cpu_map);
6255
6256#ifdef CONFIG_NUMA
6257 if (cpus_weight(*cpu_map) >
6258 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6259 sd = &per_cpu(allnodes_domains, i);
6260 *sd = SD_ALLNODES_INIT;
6261 sd->span = *cpu_map;
6262 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6263 p = sd;
6264 sd_allnodes = 1;
6265 } else
6266 p = NULL;
6267
6268 sd = &per_cpu(node_domains, i);
6269 *sd = SD_NODE_INIT;
6270 sd->span = sched_domain_node_span(cpu_to_node(i));
6271 sd->parent = p;
6272 if (p)
6273 p->child = sd;
6274 cpus_and(sd->span, sd->span, *cpu_map);
6275#endif
6276
6277 p = sd;
6278 sd = &per_cpu(phys_domains, i);
6279 *sd = SD_CPU_INIT;
6280 sd->span = nodemask;
6281 sd->parent = p;
6282 if (p)
6283 p->child = sd;
6284 cpu_to_phys_group(i, cpu_map, &sd->groups);
6285
6286#ifdef CONFIG_SCHED_MC
6287 p = sd;
6288 sd = &per_cpu(core_domains, i);
6289 *sd = SD_MC_INIT;
6290 sd->span = cpu_coregroup_map(i);
6291 cpus_and(sd->span, sd->span, *cpu_map);
6292 sd->parent = p;
6293 p->child = sd;
6294 cpu_to_core_group(i, cpu_map, &sd->groups);
6295#endif
6296
6297#ifdef CONFIG_SCHED_SMT
6298 p = sd;
6299 sd = &per_cpu(cpu_domains, i);
6300 *sd = SD_SIBLING_INIT;
6301 sd->span = per_cpu(cpu_sibling_map, i);
6302 cpus_and(sd->span, sd->span, *cpu_map);
6303 sd->parent = p;
6304 p->child = sd;
6305 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6306#endif
6307 }
6308
6309#ifdef CONFIG_SCHED_SMT
	/* Set up CPU (sibling) groups */
6311 for_each_cpu_mask(i, *cpu_map) {
6312 cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i);
6313 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
6314 if (i != first_cpu(this_sibling_map))
6315 continue;
6316
6317 init_sched_build_groups(this_sibling_map, cpu_map,
6318 &cpu_to_cpu_group);
6319 }
6320#endif
6321
6322#ifdef CONFIG_SCHED_MC
	/* Set up multi-core groups */
6324 for_each_cpu_mask(i, *cpu_map) {
6325 cpumask_t this_core_map = cpu_coregroup_map(i);
6326 cpus_and(this_core_map, this_core_map, *cpu_map);
6327 if (i != first_cpu(this_core_map))
6328 continue;
6329 init_sched_build_groups(this_core_map, cpu_map,
6330 &cpu_to_core_group);
6331 }
6332#endif
6333
	/* Set up physical groups */
6335 for (i = 0; i < MAX_NUMNODES; i++) {
6336 cpumask_t nodemask = node_to_cpumask(i);
6337
6338 cpus_and(nodemask, nodemask, *cpu_map);
6339 if (cpus_empty(nodemask))
6340 continue;
6341
6342 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6343 }
6344
6345#ifdef CONFIG_NUMA
	/* Set up node groups */
6347 if (sd_allnodes)
6348 init_sched_build_groups(*cpu_map, cpu_map,
6349 &cpu_to_allnodes_group);
6350
6351 for (i = 0; i < MAX_NUMNODES; i++) {
6352
6353 struct sched_group *sg, *prev;
6354 cpumask_t nodemask = node_to_cpumask(i);
6355 cpumask_t domainspan;
6356 cpumask_t covered = CPU_MASK_NONE;
6357 int j;
6358
6359 cpus_and(nodemask, nodemask, *cpu_map);
6360 if (cpus_empty(nodemask)) {
6361 sched_group_nodes[i] = NULL;
6362 continue;
6363 }
6364
6365 domainspan = sched_domain_node_span(i);
6366 cpus_and(domainspan, domainspan, *cpu_map);
6367
6368 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6369 if (!sg) {
6370 printk(KERN_WARNING "Can not alloc domain group for "
6371 "node %d\n", i);
6372 goto error;
6373 }
6374 sched_group_nodes[i] = sg;
6375 for_each_cpu_mask(j, nodemask) {
6376 struct sched_domain *sd;
6377
6378 sd = &per_cpu(node_domains, j);
6379 sd->groups = sg;
6380 }
6381 sg->__cpu_power = 0;
6382 sg->cpumask = nodemask;
6383 sg->next = sg;
6384 cpus_or(covered, covered, nodemask);
6385 prev = sg;
6386
6387 for (j = 0; j < MAX_NUMNODES; j++) {
6388 cpumask_t tmp, notcovered;
6389 int n = (i + j) % MAX_NUMNODES;
6390
6391 cpus_complement(notcovered, covered);
6392 cpus_and(tmp, notcovered, *cpu_map);
6393 cpus_and(tmp, tmp, domainspan);
6394 if (cpus_empty(tmp))
6395 break;
6396
6397 nodemask = node_to_cpumask(n);
6398 cpus_and(tmp, tmp, nodemask);
6399 if (cpus_empty(tmp))
6400 continue;
6401
6402 sg = kmalloc_node(sizeof(struct sched_group),
6403 GFP_KERNEL, i);
6404 if (!sg) {
6405 printk(KERN_WARNING
6406 "Can not alloc domain group for node %d\n", j);
6407 goto error;
6408 }
6409 sg->__cpu_power = 0;
6410 sg->cpumask = tmp;
6411 sg->next = prev->next;
6412 cpus_or(covered, covered, tmp);
6413 prev->next = sg;
6414 prev = sg;
6415 }
6416 }
6417#endif
6418
	/* Calculate CPU power for physical packages and nodes */
6420#ifdef CONFIG_SCHED_SMT
6421 for_each_cpu_mask(i, *cpu_map) {
6422 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6423
6424 init_sched_groups_power(i, sd);
6425 }
6426#endif
6427#ifdef CONFIG_SCHED_MC
6428 for_each_cpu_mask(i, *cpu_map) {
6429 struct sched_domain *sd = &per_cpu(core_domains, i);
6430
6431 init_sched_groups_power(i, sd);
6432 }
6433#endif
6434
6435 for_each_cpu_mask(i, *cpu_map) {
6436 struct sched_domain *sd = &per_cpu(phys_domains, i);
6437
6438 init_sched_groups_power(i, sd);
6439 }
6440
6441#ifdef CONFIG_NUMA
6442 for (i = 0; i < MAX_NUMNODES; i++)
6443 init_numa_sched_groups_power(sched_group_nodes[i]);
6444
6445 if (sd_allnodes) {
6446 struct sched_group *sg;
6447
6448 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6449 init_numa_sched_groups_power(sg);
6450 }
6451#endif
6452
	/* Attach the domains */
6454 for_each_cpu_mask(i, *cpu_map) {
6455 struct sched_domain *sd;
6456#ifdef CONFIG_SCHED_SMT
6457 sd = &per_cpu(cpu_domains, i);
6458#elif defined(CONFIG_SCHED_MC)
6459 sd = &per_cpu(core_domains, i);
6460#else
6461 sd = &per_cpu(phys_domains, i);
6462#endif
6463 cpu_attach_domain(sd, i);
6464 }
6465
6466 return 0;
6467
6468#ifdef CONFIG_NUMA
6469error:
6470 free_sched_groups(cpu_map);
6471 return -ENOMEM;
6472#endif
6473}
6474
6475static cpumask_t *doms_cur;
6476static int ndoms_cur;
6477
/*
 * Special case: If a kmalloc() of a doms_cur partition (array of
 * cpumask_t) fails, then fall back to a single sched domain,
 * as determined by the single cpumask_t fallback_doms.
 */
6483static cpumask_t fallback_doms;
6484
/*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
 * For now this just excludes isolated cpus, but could be used to
 * exclude other special cases in the future.
 */
6490static int arch_init_sched_domains(const cpumask_t *cpu_map)
6491{
6492 int err;
6493
6494 ndoms_cur = 1;
6495 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6496 if (!doms_cur)
6497 doms_cur = &fallback_doms;
6498 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
6499 err = build_sched_domains(doms_cur);
6500 register_sched_domain_sysctl();
6501
6502 return err;
6503}
6504
6505static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6506{
6507 free_sched_groups(cpu_map);
6508}
6509
/*
 * Detach sched domains from a group of cpus specified in cpu_map.
 * These cpus will now be attached to the NULL domain.
 */
6514static void detach_destroy_domains(const cpumask_t *cpu_map)
6515{
6516 int i;
6517
6518 unregister_sched_domain_sysctl();
6519
6520 for_each_cpu_mask(i, *cpu_map)
6521 cpu_attach_domain(NULL, i);
6522 synchronize_sched();
6523 arch_destroy_sched_domains(cpu_map);
6524}
6525
/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks.  This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap).  We should set up one
 * sched domain for each mask.  CPUs not in any of the cpumasks will
 * not be load balanced.  If the same cpumask appears both in the
 * current 'doms_cur' domains and in the new 'doms_new', we can leave
 * it as it is.
 *
 * The passed-in 'doms_new' should be kmalloc'd.  This routine takes
 * ownership of it and will kfree it when done with it.  If the caller
 * failed the kmalloc call, then it can pass in doms_new == NULL,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'.
 *
 * Call with the hotplug lock held.
 */
6547void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6548{
6549 int i, j;
6550
	/* always unregister in case we don't destroy any domains */
6552 unregister_sched_domain_sysctl();
6553
6554 if (doms_new == NULL) {
6555 ndoms_new = 1;
6556 doms_new = &fallback_doms;
6557 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
6558 }
6559
	/* Destroy deleted domains */
6561 for (i = 0; i < ndoms_cur; i++) {
6562 for (j = 0; j < ndoms_new; j++) {
6563 if (cpus_equal(doms_cur[i], doms_new[j]))
6564 goto match1;
6565 }
		/* no match - a current sched domain not in new doms_new[] */
6567 detach_destroy_domains(doms_cur + i);
6568match1:
6569 ;
6570 }
6571
	/* Build new domains */
6573 for (i = 0; i < ndoms_new; i++) {
6574 for (j = 0; j < ndoms_cur; j++) {
6575 if (cpus_equal(doms_new[i], doms_cur[j]))
6576 goto match2;
6577 }
		/* no match - add a new doms_new */
6579 build_sched_domains(doms_new + i);
6580match2:
6581 ;
6582 }
6583
	/* Remember the new sched domains */
6585 if (doms_cur != &fallback_doms)
6586 kfree(doms_cur);
6587 doms_cur = doms_new;
6588 ndoms_cur = ndoms_new;
6589
6590 register_sched_domain_sysctl();
6591}
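
/*
 * Usage sketch (hypothetical caller): a cpuset-style caller wanting two
 * disjoint balancing partitions would kmalloc() an array of two cpumask_t's,
 * fill them in and call partition_sched_domains(2, doms).  Masks already
 * present in doms_cur are left alone, removed ones are detached, new ones
 * are built, and this function takes ownership of (and later kfree()s) the
 * passed-in array.
 */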
6592
6593#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6594static int arch_reinit_sched_domains(void)
6595{
6596 int err;
6597
6598 mutex_lock(&sched_hotcpu_mutex);
6599 detach_destroy_domains(&cpu_online_map);
6600 err = arch_init_sched_domains(&cpu_online_map);
6601 mutex_unlock(&sched_hotcpu_mutex);
6602
6603 return err;
6604}
6605
6606static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6607{
6608 int ret;
6609
6610 if (buf[0] != '0' && buf[0] != '1')
6611 return -EINVAL;
6612
6613 if (smt)
6614 sched_smt_power_savings = (buf[0] == '1');
6615 else
6616 sched_mc_power_savings = (buf[0] == '1');
6617
6618 ret = arch_reinit_sched_domains();
6619
6620 return ret ? ret : count;
6621}
6622
6623#ifdef CONFIG_SCHED_MC
6624static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6625{
6626 return sprintf(page, "%u\n", sched_mc_power_savings);
6627}
6628static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6629 const char *buf, size_t count)
6630{
6631 return sched_power_savings_store(buf, count, 0);
6632}
6633static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6634 sched_mc_power_savings_store);
6635#endif
6636
6637#ifdef CONFIG_SCHED_SMT
6638static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6639{
6640 return sprintf(page, "%u\n", sched_smt_power_savings);
6641}
6642static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6643 const char *buf, size_t count)
6644{
6645 return sched_power_savings_store(buf, count, 1);
6646}
6647static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6648 sched_smt_power_savings_store);
6649#endif
6650
6651int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6652{
6653 int err = 0;
6654
6655#ifdef CONFIG_SCHED_SMT
6656 if (smt_capable())
6657 err = sysfs_create_file(&cls->kset.kobj,
6658 &attr_sched_smt_power_savings.attr);
6659#endif
6660#ifdef CONFIG_SCHED_MC
6661 if (!err && mc_capable())
6662 err = sysfs_create_file(&cls->kset.kobj,
6663 &attr_sched_mc_power_savings.attr);
6664#endif
6665 return err;
6666}
6667#endif
6668
/*
 * Force a reinitialization of the sched domains hierarchy.  The domains
 * and groups cannot be updated in place without racing with the balancing
 * code, so we temporarily attach all running cpus to the NULL domain
 * which will prevent rebalancing while the sched domains are recalculated.
 */
6675static int update_sched_domains(struct notifier_block *nfb,
6676 unsigned long action, void *hcpu)
6677{
6678 switch (action) {
6679 case CPU_UP_PREPARE:
6680 case CPU_UP_PREPARE_FROZEN:
6681 case CPU_DOWN_PREPARE:
6682 case CPU_DOWN_PREPARE_FROZEN:
6683 detach_destroy_domains(&cpu_online_map);
6684 return NOTIFY_OK;
6685
6686 case CPU_UP_CANCELED:
6687 case CPU_UP_CANCELED_FROZEN:
6688 case CPU_DOWN_FAILED:
6689 case CPU_DOWN_FAILED_FROZEN:
6690 case CPU_ONLINE:
6691 case CPU_ONLINE_FROZEN:
6692 case CPU_DEAD:
6693 case CPU_DEAD_FROZEN:
		/*
		 * Fall through and re-initialise the domains.
		 */
6697 break;
6698 default:
6699 return NOTIFY_DONE;
6700 }
6701
	/* The hotplug lock is already held by cpu_up/cpu_down */
6703 arch_init_sched_domains(&cpu_online_map);
6704
6705 return NOTIFY_OK;
6706}
6707
6708void __init sched_init_smp(void)
6709{
6710 cpumask_t non_isolated_cpus;
6711
6712 mutex_lock(&sched_hotcpu_mutex);
6713 arch_init_sched_domains(&cpu_online_map);
6714 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6715 if (cpus_empty(non_isolated_cpus))
6716 cpu_set(smp_processor_id(), non_isolated_cpus);
6717 mutex_unlock(&sched_hotcpu_mutex);
	/* XXX: Theoretical race here - CPU may be hotplugged now */
6719 hotcpu_notifier(update_sched_domains, 0);
6720
	/* Move init over to a non-isolated CPU */
6722 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6723 BUG();
6724 sched_init_granularity();
6725}
6726#else
6727void __init sched_init_smp(void)
6728{
6729 sched_init_granularity();
6730}
6731#endif
6732
6733int in_sched_functions(unsigned long addr)
6734{
6735 return in_lock_functions(addr) ||
6736 (addr >= (unsigned long)__sched_text_start
6737 && addr < (unsigned long)__sched_text_end);
6738}
6739
6740static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6741{
6742 cfs_rq->tasks_timeline = RB_ROOT;
6743#ifdef CONFIG_FAIR_GROUP_SCHED
6744 cfs_rq->rq = rq;
6745#endif
6746 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6747}
6748
6749void __init sched_init(void)
6750{
6751 int highest_cpu = 0;
6752 int i, j;
6753
6754 for_each_possible_cpu(i) {
6755 struct rt_prio_array *array;
6756 struct rq *rq;
6757
6758 rq = cpu_rq(i);
6759 spin_lock_init(&rq->lock);
6760 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6761 rq->nr_running = 0;
6762 rq->clock = 1;
6763 init_cfs_rq(&rq->cfs, rq);
6764#ifdef CONFIG_FAIR_GROUP_SCHED
6765 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6766 {
6767 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
6768 struct sched_entity *se =
6769 &per_cpu(init_sched_entity, i);
6770
6771 init_cfs_rq_p[i] = cfs_rq;
6772 init_cfs_rq(cfs_rq, rq);
6773 cfs_rq->tg = &init_task_group;
6774 list_add(&cfs_rq->leaf_cfs_rq_list,
6775 &rq->leaf_cfs_rq_list);
6776
6777 init_sched_entity_p[i] = se;
6778 se->cfs_rq = &rq->cfs;
6779 se->my_q = cfs_rq;
6780 se->load.weight = init_task_group_load;
6781 se->load.inv_weight =
6782 div64_64(1ULL<<32, init_task_group_load);
6783 se->parent = NULL;
6784 }
6785 init_task_group.shares = init_task_group_load;
6786 spin_lock_init(&init_task_group.lock);
6787#endif
6788
6789 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6790 rq->cpu_load[j] = 0;
6791#ifdef CONFIG_SMP
6792 rq->sd = NULL;
6793 rq->active_balance = 0;
6794 rq->next_balance = jiffies;
6795 rq->push_cpu = 0;
6796 rq->cpu = i;
6797 rq->migration_thread = NULL;
6798 INIT_LIST_HEAD(&rq->migration_queue);
6799#endif
6800 atomic_set(&rq->nr_iowait, 0);
6801
6802 array = &rq->rt.active;
6803 for (j = 0; j < MAX_RT_PRIO; j++) {
6804 INIT_LIST_HEAD(array->queue + j);
6805 __clear_bit(j, array->bitmap);
6806 }
6807 highest_cpu = i;
6808
6809 __set_bit(MAX_RT_PRIO, array->bitmap);
6810 }
6811
6812 set_load_weight(&init_task);
6813
6814#ifdef CONFIG_PREEMPT_NOTIFIERS
6815 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6816#endif
6817
6818#ifdef CONFIG_SMP
6819 nr_cpu_ids = highest_cpu + 1;
6820 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6821#endif
6822
6823#ifdef CONFIG_RT_MUTEXES
6824 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6825#endif
6826
	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
6830 atomic_inc(&init_mm.mm_count);
6831 enter_lazy_tlb(&init_mm, current);
6832
6833
	/*
	 * Make us the idle thread.  Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
6839 init_idle(current, smp_processor_id());
6840
	/*
	 * During early bootup we pretend to be a normal task:
	 */
6843 current->sched_class = &fair_sched_class;
6844}
6845
6846#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6847void __might_sleep(char *file, int line)
6848{
6849#ifdef in_atomic
6850 static unsigned long prev_jiffy;
6851
6852 if ((in_atomic() || irqs_disabled()) &&
6853 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6854 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6855 return;
6856 prev_jiffy = jiffies;
6857 printk(KERN_ERR "BUG: sleeping function called from invalid"
6858 " context at %s:%d\n", file, line);
6859 printk("in_atomic():%d, irqs_disabled():%d\n",
6860 in_atomic(), irqs_disabled());
6861 debug_show_held_locks(current);
6862 if (irqs_disabled())
6863 print_irqtrace_events(current);
6864 dump_stack();
6865 }
6866#endif
6867}
6868EXPORT_SYMBOL(__might_sleep);
6869#endif
6870
6871#ifdef CONFIG_MAGIC_SYSRQ
6872static void normalize_task(struct rq *rq, struct task_struct *p)
6873{
6874 int on_rq;
6875 update_rq_clock(rq);
6876 on_rq = p->se.on_rq;
6877 if (on_rq)
6878 deactivate_task(rq, p, 0);
6879 __setscheduler(rq, p, SCHED_NORMAL, 0);
6880 if (on_rq) {
6881 activate_task(rq, p, 0);
6882 resched_task(rq->curr);
6883 }
6884}
6885
6886void normalize_rt_tasks(void)
6887{
6888 struct task_struct *g, *p;
6889 unsigned long flags;
6890 struct rq *rq;
6891
6892 read_lock_irq(&tasklist_lock);
6893 do_each_thread(g, p) {
		/*
		 * Only normalize user tasks:
		 */
6897 if (!p->mm)
6898 continue;
6899
6900 p->se.exec_start = 0;
6901#ifdef CONFIG_SCHEDSTATS
6902 p->se.wait_start = 0;
6903 p->se.sleep_start = 0;
6904 p->se.block_start = 0;
6905#endif
6906 task_rq(p)->clock = 0;
6907
6908 if (!rt_task(p)) {
			/*
			 * Renice negative nice level userspace
			 * tasks back to 0:
			 */
6913 if (TASK_NICE(p) < 0 && p->mm)
6914 set_user_nice(p, 0);
6915 continue;
6916 }
6917
6918 spin_lock_irqsave(&p->pi_lock, flags);
6919 rq = __task_rq_lock(p);
6920
6921 normalize_task(rq, p);
6922
6923 __task_rq_unlock(rq);
6924 spin_unlock_irqrestore(&p->pi_lock, flags);
6925 } while_each_thread(g, p);
6926
6927 read_unlock_irq(&tasklist_lock);
6928}
6929
6930#endif
6931
6932#ifdef CONFIG_IA64
/*
 * These functions are only useful for the IA64 MCA handling.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
6949struct task_struct *curr_task(int cpu)
6950{
6951 return cpu_curr(cpu);
6952}
6953
/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack.  It allows the architecture to switch the
 * notion of the current task on a cpu in a non-blocking manner.  This function
 * must be called with all CPU's synchronized, and interrupts disabled, and
 * the caller must save the original value of the current task (see
 * curr_task() above) and restore that value before reenabling interrupts and
 * re-starting the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
6969void set_curr_task(int cpu, struct task_struct *p)
6970{
6971 cpu_curr(cpu) = p;
6972}
6973
6974#endif
6975
6976#ifdef CONFIG_FAIR_GROUP_SCHED
6977
/* allocate runqueue etc for a new task group */
6979struct task_group *sched_create_group(void)
6980{
6981 struct task_group *tg;
6982 struct cfs_rq *cfs_rq;
6983 struct sched_entity *se;
6984 struct rq *rq;
6985 int i;
6986
6987 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
6988 if (!tg)
6989 return ERR_PTR(-ENOMEM);
6990
6991 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
6992 if (!tg->cfs_rq)
6993 goto err;
6994 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
6995 if (!tg->se)
6996 goto err;
6997
6998 for_each_possible_cpu(i) {
6999 rq = cpu_rq(i);
7000
7001 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
7002 cpu_to_node(i));
7003 if (!cfs_rq)
7004 goto err;
7005
7006 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
7007 cpu_to_node(i));
7008 if (!se)
7009 goto err;
7010
7011 memset(cfs_rq, 0, sizeof(struct cfs_rq));
7012 memset(se, 0, sizeof(struct sched_entity));
7013
7014 tg->cfs_rq[i] = cfs_rq;
7015 init_cfs_rq(cfs_rq, rq);
7016 cfs_rq->tg = tg;
7017
7018 tg->se[i] = se;
7019 se->cfs_rq = &rq->cfs;
7020 se->my_q = cfs_rq;
7021 se->load.weight = NICE_0_LOAD;
7022 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
7023 se->parent = NULL;
7024 }
7025
7026 for_each_possible_cpu(i) {
7027 rq = cpu_rq(i);
7028 cfs_rq = tg->cfs_rq[i];
7029 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7030 }
7031
7032 tg->shares = NICE_0_LOAD;
7033 spin_lock_init(&tg->lock);
7034
7035 return tg;
7036
7037err:
7038 for_each_possible_cpu(i) {
7039 if (tg->cfs_rq)
7040 kfree(tg->cfs_rq[i]);
7041 if (tg->se)
7042 kfree(tg->se[i]);
7043 }
7044 kfree(tg->cfs_rq);
7045 kfree(tg->se);
7046 kfree(tg);
7047
7048 return ERR_PTR(-ENOMEM);
7049}
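
/*
 * Typical lifecycle (sketch, not taken from any particular caller): a group
 * is created with sched_create_group(), its weight adjusted with
 * sched_group_set_shares(), tasks are moved into it via sched_move_task()
 * once their group linkage has been updated, and the group is finally torn
 * down with sched_destroy_group(), which defers the actual freeing to an
 * RCU callback.
 */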
7050
/* rcu callback to free various structures associated with a task group */
7052static void free_sched_group(struct rcu_head *rhp)
7053{
7054 struct task_group *tg = container_of(rhp, struct task_group, rcu);
7055 struct cfs_rq *cfs_rq;
7056 struct sched_entity *se;
7057 int i;
7058
	/* now it should be safe to free those cfs_rqs */
7060 for_each_possible_cpu(i) {
7061 cfs_rq = tg->cfs_rq[i];
7062 kfree(cfs_rq);
7063
7064 se = tg->se[i];
7065 kfree(se);
7066 }
7067
7068 kfree(tg->cfs_rq);
7069 kfree(tg->se);
7070 kfree(tg);
7071}
7072
/* Destroy runqueue etc associated with a task group */
7074void sched_destroy_group(struct task_group *tg)
7075{
7076 struct cfs_rq *cfs_rq = NULL;
7077 int i;
7078
7079 for_each_possible_cpu(i) {
7080 cfs_rq = tg->cfs_rq[i];
7081 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7082 }
7083
7084 BUG_ON(!cfs_rq);
7085
	/* wait for possible concurrent references to cfs_rqs to complete */
7087 call_rcu(&tg->rcu, free_sched_group);
7088}
7089
/*
 * Change a task's runqueue when it moves between groups.
 *
 * The caller of this function should have put the task in its new group by
 * now.  This function just updates the task's per-cpu cfs_rq linkage to
 * reflect its new group.
 */
7095void sched_move_task(struct task_struct *tsk)
7096{
7097 int on_rq, running;
7098 unsigned long flags;
7099 struct rq *rq;
7100
7101 rq = task_rq_lock(tsk, &flags);
7102
7103 if (tsk->sched_class != &fair_sched_class) {
7104 set_task_cfs_rq(tsk, task_cpu(tsk));
7105 goto done;
7106 }
7107
7108 update_rq_clock(rq);
7109
7110 running = task_current(rq, tsk);
7111 on_rq = tsk->se.on_rq;
7112
7113 if (on_rq) {
7114 dequeue_task(rq, tsk, 0);
7115 if (unlikely(running))
7116 tsk->sched_class->put_prev_task(rq, tsk);
7117 }
7118
7119 set_task_cfs_rq(tsk, task_cpu(tsk));
7120
7121 if (on_rq) {
7122 if (unlikely(running))
7123 tsk->sched_class->set_curr_task(rq);
7124 enqueue_task(rq, tsk, 0);
7125 }
7126
7127done:
7128 task_rq_unlock(rq, &flags);
7129}
7130
7131static void set_se_shares(struct sched_entity *se, unsigned long shares)
7132{
7133 struct cfs_rq *cfs_rq = se->cfs_rq;
7134 struct rq *rq = cfs_rq->rq;
7135 int on_rq;
7136
7137 spin_lock_irq(&rq->lock);
7138
7139 on_rq = se->on_rq;
7140 if (on_rq)
7141 dequeue_entity(cfs_rq, se, 0);
7142
7143 se->load.weight = shares;
7144 se->load.inv_weight = div64_64((1ULL<<32), shares);
7145
7146 if (on_rq)
7147 enqueue_entity(cfs_rq, se, 0);
7148
7149 spin_unlock_irq(&rq->lock);
7150}
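
/*
 * Note on the arithmetic above: inv_weight is the fixed-point reciprocal
 * 2^32 / weight, so e.g. shares == 1024 (NICE_0_LOAD) gives
 * inv_weight == 4194304; later divisions by the weight can then be done
 * with a multiply and a shift instead of a 64-bit divide.
 */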
7151
7152int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7153{
7154 int i;
7155
	/*
	 * A weight of 0 or 1 can cause arithmetics problems.
	 * (The default weight is 1024 - so there's no practical
	 *  limitation from this.)
	 */
7161 if (shares < 2)
7162 shares = 2;
7163
7164 spin_lock(&tg->lock);
7165 if (tg->shares == shares)
7166 goto done;
7167
7168 tg->shares = shares;
7169 for_each_possible_cpu(i)
7170 set_se_shares(tg->se[i], shares);
7171
7172done:
7173 spin_unlock(&tg->lock);
7174 return 0;
7175}
7176
7177unsigned long sched_group_shares(struct task_group *tg)
7178{
7179 return tg->shares;
7180}
7181
7182#endif
7183
7184#ifdef CONFIG_FAIR_CGROUP_SCHED
7185
/* return corresponding task_group object of a cgroup */
7187static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7188{
7189 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7190 struct task_group, css);
7191}
7192
7193static struct cgroup_subsys_state *
7194cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7195{
7196 struct task_group *tg;
7197
7198 if (!cgrp->parent) {
		/* This is early initialization for the top cgroup */
7200 init_task_group.css.cgroup = cgrp;
7201 return &init_task_group.css;
7202 }
7203
	/* we support only 1-level deep hierarchical scheduler today */
7205 if (cgrp->parent->parent)
7206 return ERR_PTR(-EINVAL);
7207
7208 tg = sched_create_group();
7209 if (IS_ERR(tg))
7210 return ERR_PTR(-ENOMEM);
7211
	/* Bind the cgroup to the task_group object we just created */
7213 tg->css.cgroup = cgrp;
7214
7215 return &tg->css;
7216}
7217
7218static void
7219cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7220{
7221 struct task_group *tg = cgroup_tg(cgrp);
7222
7223 sched_destroy_group(tg);
7224}
7225
7226static int
7227cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7228 struct task_struct *tsk)
7229{
	/* We don't support RT-tasks being in separate groups */
7231 if (tsk->sched_class != &fair_sched_class)
7232 return -EINVAL;
7233
7234 return 0;
7235}
7236
7237static void
7238cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7239 struct cgroup *old_cont, struct task_struct *tsk)
7240{
7241 sched_move_task(tsk);
7242}
7243
7244static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7245 u64 shareval)
7246{
7247 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
7248}
7249
7250static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7251{
7252 struct task_group *tg = cgroup_tg(cgrp);
7253
7254 return (u64) tg->shares;
7255}
7256
7257static struct cftype cpu_files[] = {
7258 {
7259 .name = "shares",
7260 .read_uint = cpu_shares_read_uint,
7261 .write_uint = cpu_shares_write_uint,
7262 },
7263};
7264
7265static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7266{
7267 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7268}
7269
7270struct cgroup_subsys cpu_cgroup_subsys = {
7271 .name = "cpu",
7272 .create = cpu_cgroup_create,
7273 .destroy = cpu_cgroup_destroy,
7274 .can_attach = cpu_cgroup_can_attach,
7275 .attach = cpu_cgroup_attach,
7276 .populate = cpu_cgroup_populate,
7277 .subsys_id = cpu_cgroup_subsys_id,
7278 .early_init = 1,
7279};
7280
7281#endif
7282
7283#ifdef CONFIG_CGROUP_CPUACCT
7284
/*
 * CPU accounting code for task groups.
 */

/* track cpu usage of a group of tasks */
7293struct cpuacct {
7294 struct cgroup_subsys_state css;
	/* cpuusage holds pointer to a u64-type object on every cpu */
7296 u64 *cpuusage;
7297};
7298
7299struct cgroup_subsys cpuacct_subsys;
7300
/* return cpu accounting group corresponding to this cgroup */
7302static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
7303{
7304 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
7305 struct cpuacct, css);
7306}
7307
/* return cpu accounting group to which this task belongs */
7309static inline struct cpuacct *task_ca(struct task_struct *tsk)
7310{
7311 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
7312 struct cpuacct, css);
7313}
7314
/* create a new cpu accounting group */
7316static struct cgroup_subsys_state *cpuacct_create(
7317 struct cgroup_subsys *ss, struct cgroup *cont)
7318{
7319 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
7320
7321 if (!ca)
7322 return ERR_PTR(-ENOMEM);
7323
7324 ca->cpuusage = alloc_percpu(u64);
7325 if (!ca->cpuusage) {
7326 kfree(ca);
7327 return ERR_PTR(-ENOMEM);
7328 }
7329
7330 return &ca->css;
7331}
7332
/* destroy an existing cpu accounting group */
7334static void
7335cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
7336{
7337 struct cpuacct *ca = cgroup_ca(cont);
7338
7339 free_percpu(ca->cpuusage);
7340 kfree(ca);
7341}
7342
/* return total cpu usage (in nanoseconds) of a group */
7344static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
7345{
7346 struct cpuacct *ca = cgroup_ca(cont);
7347 u64 totalcpuusage = 0;
7348 int i;
7349
7350 for_each_possible_cpu(i) {
7351 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
7352
		/*
		 * Take rq->lock to make the 64-bit addition safe on
		 * 32-bit platforms.
		 */
7357 spin_lock_irq(&cpu_rq(i)->lock);
7358 totalcpuusage += *cpuusage;
7359 spin_unlock_irq(&cpu_rq(i)->lock);
7360 }
7361
7362 return totalcpuusage;
7363}
7364
7365static struct cftype files[] = {
7366 {
7367 .name = "usage",
7368 .read_uint = cpuusage_read,
7369 },
7370};
7371
7372static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7373{
7374 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
7375}
7376
7377
/*
 * Charge this task's execution time to its accounting group.
 *
 * Called with rq->lock held.
 */
7382static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
7383{
7384 struct cpuacct *ca;
7385
7386 if (!cpuacct_subsys.active)
7387 return;
7388
7389 ca = task_ca(tsk);
7390 if (ca) {
7391 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
7392
7393 *cpuusage += cputime;
7394 }
7395}
7396
7397struct cgroup_subsys cpuacct_subsys = {
7398 .name = "cpuacct",
7399 .create = cpuacct_create,
7400 .destroy = cpuacct_destroy,
7401 .populate = cpuacct_populate,
7402 .subsys_id = cpuacct_subsys_id,
7403};
7404#endif
7405