1/*
2 *  kernel/sched/core.c
3 *
4 *  Core kernel scheduler code and related syscalls
5 *
6 *  Copyright (C) 1991-2002  Linus Torvalds
7 */
8#include "sched.h"
9
10#include <linux/nospec.h>
11
12#include <linux/kcov.h>
13
14#include <asm/switch_to.h>
15#include <asm/tlb.h>
16
17#include "../workqueue_internal.h"
18#include "../smpboot.h"
19
20#include "pelt.h"
21
22#define CREATE_TRACE_POINTS
23#include <trace/events/sched.h>
24
25DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
26
27#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
28
29
30
31
32
33
34
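/*
 * Build the default scheduler-feature bitmask with an x-macro: every
 * SCHED_FEAT(name, enabled) entry in features.h expands to
 * "(1UL << __SCHED_FEAT_name) * enabled |", so the #include below
 * turns into one long OR expression terminated by the trailing 0.
 */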
35#define SCHED_FEAT(name, enabled) \
36 (1UL << __SCHED_FEAT_##name) * enabled |
37const_debug unsigned int sysctl_sched_features =
38#include "features.h"
39 0;
40#undef SCHED_FEAT
41#endif
42
43
44
45
46
47const_debug unsigned int sysctl_sched_nr_migrate = 32;
48
49
50
51
52
53unsigned int sysctl_sched_rt_period = 1000000;
54
55__read_mostly int scheduler_running;
56
57
58
59
60
61int sysctl_sched_rt_runtime = 950000;
62
63
64
65
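/*
 * __task_rq_lock - lock the runqueue @p currently resides on.
 *
 * Caller must hold p->pi_lock.  The loop re-checks task_rq(p) after
 * taking rq->lock because the task may have been migrated meanwhile;
 * on success the rq is returned locked and pinned in @rf.
 */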
66struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
67 __acquires(rq->lock)
68{
69 struct rq *rq;
70
71 lockdep_assert_held(&p->pi_lock);
72
73 for (;;) {
74 rq = task_rq(p);
75 raw_spin_lock(&rq->lock);
76 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
77 rq_pin_lock(rq, rf);
78 return rq;
79 }
80 raw_spin_unlock(&rq->lock);
81
82 while (unlikely(task_on_rq_migrating(p)))
83 cpu_relax();
84 }
85}
86
87
88
89
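/*
 * task_rq_lock - lock p->pi_lock and the runqueue @p resides on.
 *
 * Like __task_rq_lock() but takes p->pi_lock (IRQ-safe) first, retrying
 * until the task is observed stable on one runqueue.
 */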
90struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
91 __acquires(p->pi_lock)
92 __acquires(rq->lock)
93{
94 struct rq *rq;
95
96 for (;;) {
97 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
98 rq = task_rq(p);
99 raw_spin_lock(&rq->lock);
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
117 rq_pin_lock(rq, rf);
118 return rq;
119 }
120 raw_spin_unlock(&rq->lock);
121 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
122
123 while (unlikely(task_on_rq_migrating(p)))
124 cpu_relax();
125 }
126}
127
128
129
130
131
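/*
 * update_rq_clock_task() folds @delta into rq->clock_task, first
 * subtracting time accounted to hardirq/softirq context and to paravirt
 * steal time, so that task runtime excludes those periods.  The removed
 * portion is fed into the irq load average when NONTASK_CAPACITY is set.
 */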
132static void update_rq_clock_task(struct rq *rq, s64 delta)
133{
134
135
136
137
138#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
139 s64 steal = 0, irq_delta = 0;
140#endif
141#ifdef CONFIG_IRQ_TIME_ACCOUNTING
142 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159 if (irq_delta > delta)
160 irq_delta = delta;
161
162 rq->prev_irq_time += irq_delta;
163 delta -= irq_delta;
164#endif
165#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
166 if (static_key_false((¶virt_steal_rq_enabled))) {
167 steal = paravirt_steal_clock(cpu_of(rq));
168 steal -= rq->prev_steal_time_rq;
169
170 if (unlikely(steal > delta))
171 steal = delta;
172
173 rq->prev_steal_time_rq += steal;
174 delta -= steal;
175 }
176#endif
177
178 rq->clock_task += delta;
179
180#ifdef HAVE_SCHED_AVG_IRQ
181 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
182 update_irq_load_avg(rq, irq_delta + steal);
183#endif
184}
185
186void update_rq_clock(struct rq *rq)
187{
188 s64 delta;
189
190 lockdep_assert_held(&rq->lock);
191
192 if (rq->clock_update_flags & RQCF_ACT_SKIP)
193 return;
194
195#ifdef CONFIG_SCHED_DEBUG
196 if (sched_feat(WARN_DOUBLE_CLOCK))
197 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
198 rq->clock_update_flags |= RQCF_UPDATED;
199#endif
200
201 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
202 if (delta < 0)
203 return;
204 rq->clock += delta;
205 update_rq_clock_task(rq, delta);
206}
207
208
209#ifdef CONFIG_SCHED_HRTICK
210
211
212
213
214static void hrtick_clear(struct rq *rq)
215{
216 if (hrtimer_active(&rq->hrtick_timer))
217 hrtimer_cancel(&rq->hrtick_timer);
218}
219
220
221
222
223
224static enum hrtimer_restart hrtick(struct hrtimer *timer)
225{
226 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
227 struct rq_flags rf;
228
229 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
230
231 rq_lock(rq, &rf);
232 update_rq_clock(rq);
233 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
234 rq_unlock(rq, &rf);
235
236 return HRTIMER_NORESTART;
237}
238
239#ifdef CONFIG_SMP
240
241static void __hrtick_restart(struct rq *rq)
242{
243 struct hrtimer *timer = &rq->hrtick_timer;
244
245 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
246}
247
248
249
250
251static void __hrtick_start(void *arg)
252{
253 struct rq *rq = arg;
254 struct rq_flags rf;
255
256 rq_lock(rq, &rf);
257 __hrtick_restart(rq);
258 rq->hrtick_csd_pending = 0;
259 rq_unlock(rq, &rf);
260}
261
262
263
264
265
266
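/*
 * hrtick_start - arm the high-resolution preemption tick for @rq,
 * clamping the delay to a 10us minimum.  For a remote runqueue the
 * timer is armed on its own CPU via an async single-CPU function call
 * so the hrtimer stays pinned there.
 */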
267void hrtick_start(struct rq *rq, u64 delay)
268{
269 struct hrtimer *timer = &rq->hrtick_timer;
270 ktime_t time;
271 s64 delta;
272
273
274
275
276
277 delta = max_t(s64, delay, 10000LL);
278 time = ktime_add_ns(timer->base->get_time(), delta);
279
280 hrtimer_set_expires(timer, time);
281
282 if (rq == this_rq()) {
283 __hrtick_restart(rq);
284 } else if (!rq->hrtick_csd_pending) {
285 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
286 rq->hrtick_csd_pending = 1;
287 }
288}
289
290#else
291
292
293
294
295
296void hrtick_start(struct rq *rq, u64 delay)
297{
298
299
300
301
302 delay = max_t(u64, delay, 10000LL);
303 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
304 HRTIMER_MODE_REL_PINNED);
305}
306#endif
307
308static void hrtick_rq_init(struct rq *rq)
309{
310#ifdef CONFIG_SMP
311 rq->hrtick_csd_pending = 0;
312
313 rq->hrtick_csd.flags = 0;
314 rq->hrtick_csd.func = __hrtick_start;
315 rq->hrtick_csd.info = rq;
316#endif
317
318 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
319 rq->hrtick_timer.function = hrtick;
320}
321#else
322static inline void hrtick_clear(struct rq *rq)
323{
324}
325
326static inline void hrtick_rq_init(struct rq *rq)
327{
328}
329#endif
330
331
332
333
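/*
 * fetch_or(ptr, mask) - atomically OR @mask into *@ptr and return the
 * previous value, implemented as a cmpxchg() retry loop; used below on
 * thread_info::flags.
 */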
334#define fetch_or(ptr, mask) \
335 ({ \
336 typeof(ptr) _ptr = (ptr); \
337 typeof(mask) _mask = (mask); \
338 typeof(*_ptr) _old, _val = *_ptr; \
339 \
340 for (;;) { \
341 _old = cmpxchg(_ptr, _val, _val | _mask); \
342 if (_old == _val) \
343 break; \
344 _val = _old; \
345 } \
346 _old; \
347})
348
349#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
350
351
352
353
354
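/*
 * set_nr_and_not_polling - atomically set TIF_NEED_RESCHED and return
 * true if the target CPU still needs a reschedule IPI, i.e. it was not
 * polling TIF_POLLING_NRFLAG in its idle loop.
 */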
355static bool set_nr_and_not_polling(struct task_struct *p)
356{
357 struct thread_info *ti = task_thread_info(p);
358 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
359}
360
361
362
363
364
365
366
367static bool set_nr_if_polling(struct task_struct *p)
368{
369 struct thread_info *ti = task_thread_info(p);
370 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
371
372 for (;;) {
373 if (!(val & _TIF_POLLING_NRFLAG))
374 return false;
375 if (val & _TIF_NEED_RESCHED)
376 return true;
377 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
378 if (old == val)
379 break;
380 val = old;
381 }
382 return true;
383}
384
385#else
386static bool set_nr_and_not_polling(struct task_struct *p)
387{
388 set_tsk_need_resched(p);
389 return true;
390}
391
392#ifdef CONFIG_SMP
393static bool set_nr_if_polling(struct task_struct *p)
394{
395 return false;
396}
397#endif
398#endif
399
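/*
 * wake_q_add() appends @task to a wake_q list for deferred wakeup.  The
 * cmpxchg() on wake_q.next atomically claims the task, so a task already
 * on some wake_q is left alone; a task reference is taken here and
 * dropped again by wake_up_q().
 */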
400void wake_q_add(struct wake_q_head *head, struct task_struct *task)
401{
402 struct wake_q_node *node = &task->wake_q;
403
404
405
406
407
408
409
410
411
412 if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
413 return;
414
415 get_task_struct(task);
416
417
418
419
420 *head->lastp = node;
421 head->lastp = &node->next;
422}
423
424void wake_up_q(struct wake_q_head *head)
425{
426 struct wake_q_node *node = head->first;
427
428 while (node != WAKE_Q_TAIL) {
429 struct task_struct *task;
430
431 task = container_of(node, struct task_struct, wake_q);
432 BUG_ON(!task);
433
434 node = node->next;
435 task->wake_q.next = NULL;
436
437
438
439
440
441 wake_up_process(task);
442 put_task_struct(task);
443 }
444}
445
446
447
448
449
450
451
452
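/*
 * resched_curr - mark the current task of @rq as needing rescheduling.
 * On the local CPU this only sets TIF_NEED_RESCHED and the preempt
 * hint; a remote CPU is sent an IPI unless it is polling its thread
 * flags and will notice the bit by itself.
 */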
453void resched_curr(struct rq *rq)
454{
455 struct task_struct *curr = rq->curr;
456 int cpu;
457
458 lockdep_assert_held(&rq->lock);
459
460 if (test_tsk_need_resched(curr))
461 return;
462
463 cpu = cpu_of(rq);
464
465 if (cpu == smp_processor_id()) {
466 set_tsk_need_resched(curr);
467 set_preempt_need_resched();
468 return;
469 }
470
471 if (set_nr_and_not_polling(curr))
472 smp_send_reschedule(cpu);
473 else
474 trace_sched_wake_idle_without_ipi(cpu);
475}
476
477void resched_cpu(int cpu)
478{
479 struct rq *rq = cpu_rq(cpu);
480 unsigned long flags;
481
482 raw_spin_lock_irqsave(&rq->lock, flags);
483 if (cpu_online(cpu) || cpu == smp_processor_id())
484 resched_curr(rq);
485 raw_spin_unlock_irqrestore(&rq->lock, flags);
486}
487
488#ifdef CONFIG_SMP
489#ifdef CONFIG_NO_HZ_COMMON
490
491
492
493
494
495
496
497
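/*
 * get_nohz_timer_target() picks a CPU to run a deferrable timer on:
 * the current CPU if it is busy and a timer-housekeeping CPU, otherwise
 * the nearest non-idle housekeeping CPU found in the sched domains,
 * falling back to any housekeeping CPU.
 */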
498int get_nohz_timer_target(void)
499{
500 int i, cpu = smp_processor_id();
501 struct sched_domain *sd;
502
503 if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
504 return cpu;
505
506 rcu_read_lock();
507 for_each_domain(cpu, sd) {
508 for_each_cpu(i, sched_domain_span(sd)) {
509 if (cpu == i)
510 continue;
511
512 if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
513 cpu = i;
514 goto unlock;
515 }
516 }
517 }
518
519 if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
520 cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
521unlock:
522 rcu_read_unlock();
523 return cpu;
524}
525
526
527
528
529
530
531
532
533
534
535
536static void wake_up_idle_cpu(int cpu)
537{
538 struct rq *rq = cpu_rq(cpu);
539
540 if (cpu == smp_processor_id())
541 return;
542
543 if (set_nr_and_not_polling(rq->idle))
544 smp_send_reschedule(cpu);
545 else
546 trace_sched_wake_idle_without_ipi(cpu);
547}
548
549static bool wake_up_full_nohz_cpu(int cpu)
550{
551
552
553
554
555
556
557 if (cpu_is_offline(cpu))
558 return true;
559 if (tick_nohz_full_cpu(cpu)) {
560 if (cpu != smp_processor_id() ||
561 tick_nohz_tick_stopped())
562 tick_nohz_full_kick_cpu(cpu);
563 return true;
564 }
565
566 return false;
567}
568
569
570
571
572
573
574void wake_up_nohz_cpu(int cpu)
575{
576 if (!wake_up_full_nohz_cpu(cpu))
577 wake_up_idle_cpu(cpu);
578}
579
580static inline bool got_nohz_idle_kick(void)
581{
582 int cpu = smp_processor_id();
583
584 if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
585 return false;
586
587 if (idle_cpu(cpu) && !need_resched())
588 return true;
589
590
591
592
593
594 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
595 return false;
596}
597
598#else
599
600static inline bool got_nohz_idle_kick(void)
601{
602 return false;
603}
604
605#endif
606
607#ifdef CONFIG_NO_HZ_FULL
608bool sched_can_stop_tick(struct rq *rq)
609{
610 int fifo_nr_running;
611
612
613 if (rq->dl.dl_nr_running)
614 return false;
615
616
617
618
619
620 if (rq->rt.rr_nr_running) {
621 if (rq->rt.rr_nr_running == 1)
622 return true;
623 else
624 return false;
625 }
626
627
628
629
630
631 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
632 if (fifo_nr_running)
633 return true;
634
635
636
637
638
639
640 if (rq->nr_running > 1)
641 return false;
642
643 return true;
644}
645#endif
646#endif
647
648#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
649 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
650
651
652
653
654
655
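/*
 * walk_tg_tree_from - walk the task_group tree rooted at @from, calling
 * @down on each group on the way down and @up on the way back up.  This
 * is an iterative depth-first traversal (gotos instead of recursion);
 * the children lists are RCU-protected, so it must run under
 * rcu_read_lock() or an equivalent guarantee.
 */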
656int walk_tg_tree_from(struct task_group *from,
657 tg_visitor down, tg_visitor up, void *data)
658{
659 struct task_group *parent, *child;
660 int ret;
661
662 parent = from;
663
664down:
665 ret = (*down)(parent, data);
666 if (ret)
667 goto out;
668 list_for_each_entry_rcu(child, &parent->children, siblings) {
669 parent = child;
670 goto down;
671
672up:
673 continue;
674 }
675 ret = (*up)(parent, data);
676 if (ret || parent == from)
677 goto out;
678
679 child = parent;
680 parent = parent->parent;
681 if (parent)
682 goto up;
683out:
684 return ret;
685}
686
687int tg_nop(struct task_group *tg, void *data)
688{
689 return 0;
690}
691#endif
692
693static void set_load_weight(struct task_struct *p, bool update_load)
694{
695 int prio = p->static_prio - MAX_RT_PRIO;
696 struct load_weight *load = &p->se.load;
697
698
699
700
701 if (idle_policy(p->policy)) {
702 load->weight = scale_load(WEIGHT_IDLEPRIO);
703 load->inv_weight = WMULT_IDLEPRIO;
704 return;
705 }
706
707
708
709
710
711 if (update_load && p->sched_class == &fair_sched_class) {
712 reweight_task(p, prio);
713 } else {
714 load->weight = scale_load(sched_prio_to_weight[prio]);
715 load->inv_weight = sched_prio_to_wmult[prio];
716 }
717}
718
719static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
720{
721 if (!(flags & ENQUEUE_NOCLOCK))
722 update_rq_clock(rq);
723
724 if (!(flags & ENQUEUE_RESTORE))
725 sched_info_queued(rq, p);
726
727 p->sched_class->enqueue_task(rq, p, flags);
728}
729
730static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
731{
732 if (!(flags & DEQUEUE_NOCLOCK))
733 update_rq_clock(rq);
734
735 if (!(flags & DEQUEUE_SAVE))
736 sched_info_dequeued(rq, p);
737
738 p->sched_class->dequeue_task(rq, p, flags);
739}
740
741void activate_task(struct rq *rq, struct task_struct *p, int flags)
742{
743 if (task_contributes_to_load(p))
744 rq->nr_uninterruptible--;
745
746 enqueue_task(rq, p, flags);
747}
748
749void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
750{
751 if (task_contributes_to_load(p))
752 rq->nr_uninterruptible++;
753
754 dequeue_task(rq, p, flags);
755}
756
757
758
759
760static inline int __normal_prio(struct task_struct *p)
761{
762 return p->static_prio;
763}
764
765
766
767
768
769
770
771
772static inline int normal_prio(struct task_struct *p)
773{
774 int prio;
775
776 if (task_has_dl_policy(p))
777 prio = MAX_DL_PRIO-1;
778 else if (task_has_rt_policy(p))
779 prio = MAX_RT_PRIO-1 - p->rt_priority;
780 else
781 prio = __normal_prio(p);
782 return prio;
783}
784
785
786
787
788
789
790
791
792static int effective_prio(struct task_struct *p)
793{
794 p->normal_prio = normal_prio(p);
795
796
797
798
799
800 if (!rt_prio(p->prio))
801 return p->normal_prio;
802 return p->prio;
803}
804
805
806
807
808
809
810
811inline int task_curr(const struct task_struct *p)
812{
813 return cpu_curr(task_cpu(p)) == p;
814}
815
816
817
818
819
820
821
822
823static inline void check_class_changed(struct rq *rq, struct task_struct *p,
824 const struct sched_class *prev_class,
825 int oldprio)
826{
827 if (prev_class != p->sched_class) {
828 if (prev_class->switched_from)
829 prev_class->switched_from(rq, p);
830
831 p->sched_class->switched_to(rq, p);
832 } else if (oldprio != p->prio || dl_task(p))
833 p->sched_class->prio_changed(rq, p, oldprio);
834}
835
836void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
837{
838 const struct sched_class *class;
839
840 if (p->sched_class == rq->curr->sched_class) {
841 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
842 } else {
843 for_each_class(class) {
844 if (class == rq->curr->sched_class)
845 break;
846 if (class == p->sched_class) {
847 resched_curr(rq);
848 break;
849 }
850 }
851 }
852
853
854
855
856
857 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
858 rq_clock_skip_update(rq);
859}
860
861#ifdef CONFIG_SMP
862
863static inline bool is_per_cpu_kthread(struct task_struct *p)
864{
865 if (!(p->flags & PF_KTHREAD))
866 return false;
867
868 if (p->nr_cpus_allowed != 1)
869 return false;
870
871 return true;
872}
873
874
875
876
877
878static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
879{
880 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
881 return false;
882
883 if (is_per_cpu_kthread(p))
884 return cpu_online(cpu);
885
886 return cpu_active(cpu);
887}
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
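/*
 * move_queued_task - move a queued task to a new runqueue.  The task is
 * marked TASK_ON_RQ_MIGRATING and dequeued, the old rq lock is dropped,
 * and the task is enqueued on @new_cpu's runqueue, which is returned
 * locked.
 */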
908static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
909 struct task_struct *p, int new_cpu)
910{
911 lockdep_assert_held(&rq->lock);
912
913 p->on_rq = TASK_ON_RQ_MIGRATING;
914 dequeue_task(rq, p, DEQUEUE_NOCLOCK);
915 set_task_cpu(p, new_cpu);
916 rq_unlock(rq, rf);
917
918 rq = cpu_rq(new_cpu);
919
920 rq_lock(rq, rf);
921 BUG_ON(task_cpu(p) != new_cpu);
922 enqueue_task(rq, p, 0);
923 p->on_rq = TASK_ON_RQ_QUEUED;
924 check_preempt_curr(rq, p, 0);
925
926 return rq;
927}
928
929struct migration_arg {
930 struct task_struct *task;
931 int dest_cpu;
932};
933
934
935
936
937
938
939
940
941
942
943static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
944 struct task_struct *p, int dest_cpu)
945{
946
947 if (!is_cpu_allowed(p, dest_cpu))
948 return rq;
949
950 update_rq_clock(rq);
951 rq = move_queued_task(rq, rf, p, dest_cpu);
952
953 return rq;
954}
955
956
957
958
959
960
961static int migration_cpu_stop(void *data)
962{
963 struct migration_arg *arg = data;
964 struct task_struct *p = arg->task;
965 struct rq *rq = this_rq();
966 struct rq_flags rf;
967
968
969
970
971
972 local_irq_disable();
973
974
975
976
977
978 sched_ttwu_pending();
979
980 raw_spin_lock(&p->pi_lock);
981 rq_lock(rq, &rf);
982
983
984
985
986
987 if (task_rq(p) == rq) {
988 if (task_on_rq_queued(p))
989 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
990 else
991 p->wake_cpu = arg->dest_cpu;
992 }
993 rq_unlock(rq, &rf);
994 raw_spin_unlock(&p->pi_lock);
995
996 local_irq_enable();
997 return 0;
998}
999
1000
1001
1002
1003
1004void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1005{
1006 cpumask_copy(&p->cpus_allowed, new_mask);
1007 p->nr_cpus_allowed = cpumask_weight(new_mask);
1008}
1009
1010void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1011{
1012 struct rq *rq = task_rq(p);
1013 bool queued, running;
1014
1015 lockdep_assert_held(&p->pi_lock);
1016
1017 queued = task_on_rq_queued(p);
1018 running = task_current(rq, p);
1019
1020 if (queued) {
1021
1022
1023
1024
1025 lockdep_assert_held(&rq->lock);
1026 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
1027 }
1028 if (running)
1029 put_prev_task(rq, p);
1030
1031 p->sched_class->set_cpus_allowed(p, new_mask);
1032
1033 if (queued)
1034 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
1035 if (running)
1036 set_curr_task(rq, p);
1037}
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
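/*
 * __set_cpus_allowed_ptr - change a task's CPU affinity.  If the task is
 * currently running (or waking) on a CPU outside the new mask it is
 * migrated via the CPU stopper; if it is merely queued it is moved
 * directly.  @check enforces the PF_NO_SETAFFINITY restriction for
 * user-initiated changes.
 */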
1048static int __set_cpus_allowed_ptr(struct task_struct *p,
1049 const struct cpumask *new_mask, bool check)
1050{
1051 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1052 unsigned int dest_cpu;
1053 struct rq_flags rf;
1054 struct rq *rq;
1055 int ret = 0;
1056
1057 rq = task_rq_lock(p, &rf);
1058 update_rq_clock(rq);
1059
1060 if (p->flags & PF_KTHREAD) {
1061
1062
1063
1064 cpu_valid_mask = cpu_online_mask;
1065 }
1066
1067
1068
1069
1070
1071 if (check && (p->flags & PF_NO_SETAFFINITY)) {
1072 ret = -EINVAL;
1073 goto out;
1074 }
1075
1076 if (cpumask_equal(&p->cpus_allowed, new_mask))
1077 goto out;
1078
1079 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
1080 ret = -EINVAL;
1081 goto out;
1082 }
1083
1084 do_set_cpus_allowed(p, new_mask);
1085
1086 if (p->flags & PF_KTHREAD) {
1087
1088
1089
1090
1091 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
1092 !cpumask_intersects(new_mask, cpu_active_mask) &&
1093 p->nr_cpus_allowed != 1);
1094 }
1095
1096
1097 if (cpumask_test_cpu(task_cpu(p), new_mask))
1098 goto out;
1099
1100 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
1101 if (task_running(rq, p) || p->state == TASK_WAKING) {
1102 struct migration_arg arg = { p, dest_cpu };
1103
1104 task_rq_unlock(rq, p, &rf);
1105 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1106 tlb_migrate_finish(p->mm);
1107 return 0;
1108 } else if (task_on_rq_queued(p)) {
1109
1110
1111
1112
1113 rq = move_queued_task(rq, &rf, p, dest_cpu);
1114 }
1115out:
1116 task_rq_unlock(rq, p, &rf);
1117
1118 return ret;
1119}
1120
1121int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1122{
1123 return __set_cpus_allowed_ptr(p, new_mask, false);
1124}
1125EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1126
1127void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1128{
1129#ifdef CONFIG_SCHED_DEBUG
1130
1131
1132
1133
1134 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1135 !p->on_rq);
1136
1137
1138
1139
1140
1141
1142 WARN_ON_ONCE(p->state == TASK_RUNNING &&
1143 p->sched_class == &fair_sched_class &&
1144 (p->on_rq && !task_on_rq_migrating(p)));
1145
1146#ifdef CONFIG_LOCKDEP
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1158 lockdep_is_held(&task_rq(p)->lock)));
1159#endif
1160
1161
1162
1163 WARN_ON_ONCE(!cpu_online(new_cpu));
1164#endif
1165
1166 trace_sched_migrate_task(p, new_cpu);
1167
1168 if (task_cpu(p) != new_cpu) {
1169 if (p->sched_class->migrate_task_rq)
1170 p->sched_class->migrate_task_rq(p, new_cpu);
1171 p->se.nr_migrations++;
1172 rseq_migrate(p);
1173 perf_event_task_migrate(p);
1174 }
1175
1176 __set_task_cpu(p, new_cpu);
1177}
1178
1179#ifdef CONFIG_NUMA_BALANCING
1180static void __migrate_swap_task(struct task_struct *p, int cpu)
1181{
1182 if (task_on_rq_queued(p)) {
1183 struct rq *src_rq, *dst_rq;
1184 struct rq_flags srf, drf;
1185
1186 src_rq = task_rq(p);
1187 dst_rq = cpu_rq(cpu);
1188
1189 rq_pin_lock(src_rq, &srf);
1190 rq_pin_lock(dst_rq, &drf);
1191
1192 p->on_rq = TASK_ON_RQ_MIGRATING;
1193 deactivate_task(src_rq, p, 0);
1194 set_task_cpu(p, cpu);
1195 activate_task(dst_rq, p, 0);
1196 p->on_rq = TASK_ON_RQ_QUEUED;
1197 check_preempt_curr(dst_rq, p, 0);
1198
1199 rq_unpin_lock(dst_rq, &drf);
1200 rq_unpin_lock(src_rq, &srf);
1201
1202 } else {
1203
1204
1205
1206
1207
1208 p->wake_cpu = cpu;
1209 }
1210}
1211
1212struct migration_swap_arg {
1213 struct task_struct *src_task, *dst_task;
1214 int src_cpu, dst_cpu;
1215};
1216
1217static int migrate_swap_stop(void *data)
1218{
1219 struct migration_swap_arg *arg = data;
1220 struct rq *src_rq, *dst_rq;
1221 int ret = -EAGAIN;
1222
1223 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1224 return -EAGAIN;
1225
1226 src_rq = cpu_rq(arg->src_cpu);
1227 dst_rq = cpu_rq(arg->dst_cpu);
1228
1229 double_raw_lock(&arg->src_task->pi_lock,
1230 &arg->dst_task->pi_lock);
1231 double_rq_lock(src_rq, dst_rq);
1232
1233 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1234 goto unlock;
1235
1236 if (task_cpu(arg->src_task) != arg->src_cpu)
1237 goto unlock;
1238
1239 if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
1240 goto unlock;
1241
1242 if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
1243 goto unlock;
1244
1245 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1246 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1247
1248 ret = 0;
1249
1250unlock:
1251 double_rq_unlock(src_rq, dst_rq);
1252 raw_spin_unlock(&arg->dst_task->pi_lock);
1253 raw_spin_unlock(&arg->src_task->pi_lock);
1254
1255 return ret;
1256}
1257
1258
1259
1260
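/*
 * migrate_swap - cross-migrate @cur and @p between @curr_cpu and
 * @target_cpu in one stop_two_cpus() operation; used by the NUMA
 * balancer to exchange the placement of two tasks atomically.
 */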
1261int migrate_swap(struct task_struct *cur, struct task_struct *p,
1262 int target_cpu, int curr_cpu)
1263{
1264 struct migration_swap_arg arg;
1265 int ret = -EINVAL;
1266
1267 arg = (struct migration_swap_arg){
1268 .src_task = cur,
1269 .src_cpu = curr_cpu,
1270 .dst_task = p,
1271 .dst_cpu = target_cpu,
1272 };
1273
1274 if (arg.src_cpu == arg.dst_cpu)
1275 goto out;
1276
1277
1278
1279
1280
1281 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1282 goto out;
1283
1284 if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
1285 goto out;
1286
1287 if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
1288 goto out;
1289
1290 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1291 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1292
1293out:
1294 return ret;
1295}
1296#endif
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
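/*
 * wait_task_inactive - wait until @p is no longer running on any CPU.
 * On success the return value is p->nvcsw with the top bit set, so
 * callers can detect a later context switch; if @match_state is
 * non-zero and p->state stops matching it, 0 is returned instead.
 */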
1314unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1315{
1316 int running, queued;
1317 struct rq_flags rf;
1318 unsigned long ncsw;
1319 struct rq *rq;
1320
1321 for (;;) {
1322
1323
1324
1325
1326
1327
1328 rq = task_rq(p);
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341 while (task_running(rq, p)) {
1342 if (match_state && unlikely(p->state != match_state))
1343 return 0;
1344 cpu_relax();
1345 }
1346
1347
1348
1349
1350
1351
1352 rq = task_rq_lock(p, &rf);
1353 trace_sched_wait_task(p);
1354 running = task_running(rq, p);
1355 queued = task_on_rq_queued(p);
1356 ncsw = 0;
1357 if (!match_state || p->state == match_state)
1358 ncsw = p->nvcsw | LONG_MIN;
1359 task_rq_unlock(rq, p, &rf);
1360
1361
1362
1363
1364 if (unlikely(!ncsw))
1365 break;
1366
1367
1368
1369
1370
1371
1372
1373 if (unlikely(running)) {
1374 cpu_relax();
1375 continue;
1376 }
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387 if (unlikely(queued)) {
1388 ktime_t to = NSEC_PER_SEC / HZ;
1389
1390 set_current_state(TASK_UNINTERRUPTIBLE);
1391 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1392 continue;
1393 }
1394
1395
1396
1397
1398
1399
1400 break;
1401 }
1402
1403 return ncsw;
1404}
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419void kick_process(struct task_struct *p)
1420{
1421 int cpu;
1422
1423 preempt_disable();
1424 cpu = task_cpu(p);
1425 if ((cpu != smp_processor_id()) && task_curr(p))
1426 smp_send_reschedule(cpu);
1427 preempt_enable();
1428}
1429EXPORT_SYMBOL_GPL(kick_process);
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
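/*
 * select_fallback_rq - pick a usable CPU for @p when its chosen CPU is
 * not available: try other CPUs on the same NUMA node, then anything in
 * the task's affinity mask, then progressively widen the mask (cpuset
 * fallback, then cpu_possible_mask) before giving up with BUG().
 */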
1453static int select_fallback_rq(int cpu, struct task_struct *p)
1454{
1455 int nid = cpu_to_node(cpu);
1456 const struct cpumask *nodemask = NULL;
1457 enum { cpuset, possible, fail } state = cpuset;
1458 int dest_cpu;
1459
1460
1461
1462
1463
1464
1465 if (nid != -1) {
1466 nodemask = cpumask_of_node(nid);
1467
1468
1469 for_each_cpu(dest_cpu, nodemask) {
1470 if (!cpu_active(dest_cpu))
1471 continue;
1472 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
1473 return dest_cpu;
1474 }
1475 }
1476
1477 for (;;) {
1478
1479 for_each_cpu(dest_cpu, &p->cpus_allowed) {
1480 if (!is_cpu_allowed(p, dest_cpu))
1481 continue;
1482
1483 goto out;
1484 }
1485
1486
1487 switch (state) {
1488 case cpuset:
1489 if (IS_ENABLED(CONFIG_CPUSETS)) {
1490 cpuset_cpus_allowed_fallback(p);
1491 state = possible;
1492 break;
1493 }
1494
1495 case possible:
1496 do_set_cpus_allowed(p, cpu_possible_mask);
1497 state = fail;
1498 break;
1499
1500 case fail:
1501 BUG();
1502 break;
1503 }
1504 }
1505
1506out:
1507 if (state != cpuset) {
1508
1509
1510
1511
1512
1513 if (p->mm && printk_ratelimit()) {
1514 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1515 task_pid_nr(p), p->comm, cpu);
1516 }
1517 }
1518
1519 return dest_cpu;
1520}
1521
1522
1523
1524
1525static inline
1526int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1527{
1528 lockdep_assert_held(&p->pi_lock);
1529
1530 if (p->nr_cpus_allowed > 1)
1531 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1532 else
1533 cpu = cpumask_any(&p->cpus_allowed);
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545 if (unlikely(!is_cpu_allowed(p, cpu)))
1546 cpu = select_fallback_rq(task_cpu(p), p);
1547
1548 return cpu;
1549}
1550
1551static void update_avg(u64 *avg, u64 sample)
1552{
1553 s64 diff = sample - *avg;
1554 *avg += diff >> 3;
1555}
1556
1557void sched_set_stop_task(int cpu, struct task_struct *stop)
1558{
1559 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
1560 struct task_struct *old_stop = cpu_rq(cpu)->stop;
1561
1562 if (stop) {
1563
1564
1565
1566
1567
1568
1569
1570
1571 sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m);
1572
1573 stop->sched_class = &stop_sched_class;
1574 }
1575
1576 cpu_rq(cpu)->stop = stop;
1577
1578 if (old_stop) {
1579
1580
1581
1582
1583 old_stop->sched_class = &rt_sched_class;
1584 }
1585}
1586
1587#else
1588
1589static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1590 const struct cpumask *new_mask, bool check)
1591{
1592 return set_cpus_allowed_ptr(p, new_mask);
1593}
1594
1595#endif
1596
1597static void
1598ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1599{
1600 struct rq *rq;
1601
1602 if (!schedstat_enabled())
1603 return;
1604
1605 rq = this_rq();
1606
1607#ifdef CONFIG_SMP
1608 if (cpu == rq->cpu) {
1609 __schedstat_inc(rq->ttwu_local);
1610 __schedstat_inc(p->se.statistics.nr_wakeups_local);
1611 } else {
1612 struct sched_domain *sd;
1613
1614 __schedstat_inc(p->se.statistics.nr_wakeups_remote);
1615 rcu_read_lock();
1616 for_each_domain(rq->cpu, sd) {
1617 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1618 __schedstat_inc(sd->ttwu_wake_remote);
1619 break;
1620 }
1621 }
1622 rcu_read_unlock();
1623 }
1624
1625 if (wake_flags & WF_MIGRATED)
1626 __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
1627#endif
1628
1629 __schedstat_inc(rq->ttwu_count);
1630 __schedstat_inc(p->se.statistics.nr_wakeups);
1631
1632 if (wake_flags & WF_SYNC)
1633 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
1634}
1635
1636static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1637{
1638 activate_task(rq, p, en_flags);
1639 p->on_rq = TASK_ON_RQ_QUEUED;
1640
1641
1642 if (p->flags & PF_WQ_WORKER)
1643 wq_worker_waking_up(p, cpu_of(rq));
1644}
1645
1646
1647
1648
1649static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
1650 struct rq_flags *rf)
1651{
1652 check_preempt_curr(rq, p, wake_flags);
1653 p->state = TASK_RUNNING;
1654 trace_sched_wakeup(p);
1655
1656#ifdef CONFIG_SMP
1657 if (p->sched_class->task_woken) {
1658
1659
1660
1661
1662 rq_unpin_lock(rq, rf);
1663 p->sched_class->task_woken(rq, p);
1664 rq_repin_lock(rq, rf);
1665 }
1666
1667 if (rq->idle_stamp) {
1668 u64 delta = rq_clock(rq) - rq->idle_stamp;
1669 u64 max = 2*rq->max_idle_balance_cost;
1670
1671 update_avg(&rq->avg_idle, delta);
1672
1673 if (rq->avg_idle > max)
1674 rq->avg_idle = max;
1675
1676 rq->idle_stamp = 0;
1677 }
1678#endif
1679}
1680
1681static void
1682ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1683 struct rq_flags *rf)
1684{
1685 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
1686
1687 lockdep_assert_held(&rq->lock);
1688
1689#ifdef CONFIG_SMP
1690 if (p->sched_contributes_to_load)
1691 rq->nr_uninterruptible--;
1692
1693 if (wake_flags & WF_MIGRATED)
1694 en_flags |= ENQUEUE_MIGRATED;
1695#endif
1696
1697 ttwu_activate(rq, p, en_flags);
1698 ttwu_do_wakeup(rq, p, wake_flags, rf);
1699}
1700
1701
1702
1703
1704
1705
1706
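/*
 * ttwu_remote - if @p is still queued on a runqueue, it only needs its
 * state flipped back to TASK_RUNNING; do that under the rq lock and
 * return 1, otherwise return 0 and let the caller do a full activation.
 */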
1707static int ttwu_remote(struct task_struct *p, int wake_flags)
1708{
1709 struct rq_flags rf;
1710 struct rq *rq;
1711 int ret = 0;
1712
1713 rq = __task_rq_lock(p, &rf);
1714 if (task_on_rq_queued(p)) {
1715
1716 update_rq_clock(rq);
1717 ttwu_do_wakeup(rq, p, wake_flags, &rf);
1718 ret = 1;
1719 }
1720 __task_rq_unlock(rq, &rf);
1721
1722 return ret;
1723}
1724
1725#ifdef CONFIG_SMP
1726void sched_ttwu_pending(void)
1727{
1728 struct rq *rq = this_rq();
1729 struct llist_node *llist = llist_del_all(&rq->wake_list);
1730 struct task_struct *p, *t;
1731 struct rq_flags rf;
1732
1733 if (!llist)
1734 return;
1735
1736 rq_lock_irqsave(rq, &rf);
1737 update_rq_clock(rq);
1738
1739 llist_for_each_entry_safe(p, t, llist, wake_entry)
1740 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
1741
1742 rq_unlock_irqrestore(rq, &rf);
1743}
1744
1745void scheduler_ipi(void)
1746{
1747
1748
1749
1750
1751
1752 preempt_fold_need_resched();
1753
1754 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1755 return;
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770 irq_enter();
1771 sched_ttwu_pending();
1772
1773
1774
1775
1776 if (unlikely(got_nohz_idle_kick())) {
1777 this_rq()->idle_balance = 1;
1778 raise_softirq_irqoff(SCHED_SOFTIRQ);
1779 }
1780 irq_exit();
1781}
1782
1783static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
1784{
1785 struct rq *rq = cpu_rq(cpu);
1786
1787 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
1788
1789 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1790 if (!set_nr_if_polling(rq->idle))
1791 smp_send_reschedule(cpu);
1792 else
1793 trace_sched_wake_idle_without_ipi(cpu);
1794 }
1795}
1796
1797void wake_up_if_idle(int cpu)
1798{
1799 struct rq *rq = cpu_rq(cpu);
1800 struct rq_flags rf;
1801
1802 rcu_read_lock();
1803
1804 if (!is_idle_task(rcu_dereference(rq->curr)))
1805 goto out;
1806
1807 if (set_nr_if_polling(rq->idle)) {
1808 trace_sched_wake_idle_without_ipi(cpu);
1809 } else {
1810 rq_lock_irqsave(rq, &rf);
1811 if (is_idle_task(rq->curr))
1812 smp_send_reschedule(cpu);
1813
1814 rq_unlock_irqrestore(rq, &rf);
1815 }
1816
1817out:
1818 rcu_read_unlock();
1819}
1820
1821bool cpus_share_cache(int this_cpu, int that_cpu)
1822{
1823 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1824}
1825#endif
1826
1827static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1828{
1829 struct rq *rq = cpu_rq(cpu);
1830 struct rq_flags rf;
1831
1832#if defined(CONFIG_SMP)
1833 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1834 sched_clock_cpu(cpu);
1835 ttwu_queue_remote(p, cpu, wake_flags);
1836 return;
1837 }
1838#endif
1839
1840 rq_lock(rq, &rf);
1841 update_rq_clock(rq);
1842 ttwu_do_activate(rq, p, wake_flags, &rf);
1843 rq_unlock(rq, &rf);
1844}
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
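/*
 * try_to_wake_up - wake up a thread if it is in one of the @state sleep
 * states.  Returns 1 if the task's state matched and a wakeup was
 * issued, 0 otherwise.  On SMP the wakeup can race with the task still
 * scheduling out on another CPU, hence the ordering barriers and the
 * wait for p->on_cpu to clear before a new CPU is selected.
 */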
1948static int
1949try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1950{
1951 unsigned long flags;
1952 int cpu, success = 0;
1953
1954
1955
1956
1957
1958
1959
1960 raw_spin_lock_irqsave(&p->pi_lock, flags);
1961 smp_mb__after_spinlock();
1962 if (!(p->state & state))
1963 goto out;
1964
1965 trace_sched_waking(p);
1966
1967
1968 success = 1;
1969 cpu = task_cpu(p);
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991 smp_rmb();
1992 if (p->on_rq && ttwu_remote(p, wake_flags))
1993 goto stat;
1994
1995#ifdef CONFIG_SMP
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015 smp_rmb();
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026 smp_cond_load_acquire(&p->on_cpu, !VAL);
2027
2028 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2029 p->state = TASK_WAKING;
2030
2031 if (p->in_iowait) {
2032 delayacct_blkio_end(p);
2033 atomic_dec(&task_rq(p)->nr_iowait);
2034 }
2035
2036 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
2037 if (task_cpu(p) != cpu) {
2038 wake_flags |= WF_MIGRATED;
2039 set_task_cpu(p, cpu);
2040 }
2041
2042#else
2043
2044 if (p->in_iowait) {
2045 delayacct_blkio_end(p);
2046 atomic_dec(&task_rq(p)->nr_iowait);
2047 }
2048
2049#endif
2050
2051 ttwu_queue(p, cpu, wake_flags);
2052stat:
2053 ttwu_stat(p, cpu, wake_flags);
2054out:
2055 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2056
2057 return success;
2058}
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2070{
2071 struct rq *rq = task_rq(p);
2072
2073 if (WARN_ON_ONCE(rq != this_rq()) ||
2074 WARN_ON_ONCE(p == current))
2075 return;
2076
2077 lockdep_assert_held(&rq->lock);
2078
2079 if (!raw_spin_trylock(&p->pi_lock)) {
2080
2081
2082
2083
2084
2085
2086 rq_unlock(rq, rf);
2087 raw_spin_lock(&p->pi_lock);
2088 rq_relock(rq, rf);
2089 }
2090
2091 if (!(p->state & TASK_NORMAL))
2092 goto out;
2093
2094 trace_sched_waking(p);
2095
2096 if (!task_on_rq_queued(p)) {
2097 if (p->in_iowait) {
2098 delayacct_blkio_end(p);
2099 atomic_dec(&rq->nr_iowait);
2100 }
2101 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
2102 }
2103
2104 ttwu_do_wakeup(rq, p, 0, rf);
2105 ttwu_stat(p, smp_processor_id(), 0);
2106out:
2107 raw_spin_unlock(&p->pi_lock);
2108}
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121int wake_up_process(struct task_struct *p)
2122{
2123 return try_to_wake_up(p, TASK_NORMAL, 0);
2124}
2125EXPORT_SYMBOL(wake_up_process);
2126
2127int wake_up_state(struct task_struct *p, unsigned int state)
2128{
2129 return try_to_wake_up(p, state, 0);
2130}
2131
2132
2133
2134
2135
2136
2137
2138static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2139{
2140 p->on_rq = 0;
2141
2142 p->se.on_rq = 0;
2143 p->se.exec_start = 0;
2144 p->se.sum_exec_runtime = 0;
2145 p->se.prev_sum_exec_runtime = 0;
2146 p->se.nr_migrations = 0;
2147 p->se.vruntime = 0;
2148 INIT_LIST_HEAD(&p->se.group_node);
2149
2150#ifdef CONFIG_FAIR_GROUP_SCHED
2151 p->se.cfs_rq = NULL;
2152#endif
2153
2154#ifdef CONFIG_SCHEDSTATS
2155
2156 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2157#endif
2158
2159 RB_CLEAR_NODE(&p->dl.rb_node);
2160 init_dl_task_timer(&p->dl);
2161 init_dl_inactive_task_timer(&p->dl);
2162 __dl_clear_params(p);
2163
2164 INIT_LIST_HEAD(&p->rt.run_list);
2165 p->rt.timeout = 0;
2166 p->rt.time_slice = sched_rr_timeslice;
2167 p->rt.on_rq = 0;
2168 p->rt.on_list = 0;
2169
2170#ifdef CONFIG_PREEMPT_NOTIFIERS
2171 INIT_HLIST_HEAD(&p->preempt_notifiers);
2172#endif
2173
2174 init_numa_balancing(clone_flags, p);
2175}
2176
2177DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2178
2179#ifdef CONFIG_NUMA_BALANCING
2180
2181void set_numabalancing_state(bool enabled)
2182{
2183 if (enabled)
2184 static_branch_enable(&sched_numa_balancing);
2185 else
2186 static_branch_disable(&sched_numa_balancing);
2187}
2188
2189#ifdef CONFIG_PROC_SYSCTL
2190int sysctl_numa_balancing(struct ctl_table *table, int write,
2191 void __user *buffer, size_t *lenp, loff_t *ppos)
2192{
2193 struct ctl_table t;
2194 int err;
2195 int state = static_branch_likely(&sched_numa_balancing);
2196
2197 if (write && !capable(CAP_SYS_ADMIN))
2198 return -EPERM;
2199
2200 t = *table;
2201 t.data = &state;
2202 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2203 if (err < 0)
2204 return err;
2205 if (write)
2206 set_numabalancing_state(state);
2207 return err;
2208}
2209#endif
2210#endif
2211
2212#ifdef CONFIG_SCHEDSTATS
2213
2214DEFINE_STATIC_KEY_FALSE(sched_schedstats);
2215static bool __initdata __sched_schedstats = false;
2216
2217static void set_schedstats(bool enabled)
2218{
2219 if (enabled)
2220 static_branch_enable(&sched_schedstats);
2221 else
2222 static_branch_disable(&sched_schedstats);
2223}
2224
2225void force_schedstat_enabled(void)
2226{
2227 if (!schedstat_enabled()) {
2228 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
2229 static_branch_enable(&sched_schedstats);
2230 }
2231}
2232
2233static int __init setup_schedstats(char *str)
2234{
2235 int ret = 0;
2236 if (!str)
2237 goto out;
2238
2239
2240
2241
2242
2243
2244 if (!strcmp(str, "enable")) {
2245 __sched_schedstats = true;
2246 ret = 1;
2247 } else if (!strcmp(str, "disable")) {
2248 __sched_schedstats = false;
2249 ret = 1;
2250 }
2251out:
2252 if (!ret)
2253 pr_warn("Unable to parse schedstats=\n");
2254
2255 return ret;
2256}
2257__setup("schedstats=", setup_schedstats);
2258
2259static void __init init_schedstats(void)
2260{
2261 set_schedstats(__sched_schedstats);
2262}
2263
2264#ifdef CONFIG_PROC_SYSCTL
2265int sysctl_schedstats(struct ctl_table *table, int write,
2266 void __user *buffer, size_t *lenp, loff_t *ppos)
2267{
2268 struct ctl_table t;
2269 int err;
2270 int state = static_branch_likely(&sched_schedstats);
2271
2272 if (write && !capable(CAP_SYS_ADMIN))
2273 return -EPERM;
2274
2275 t = *table;
2276 t.data = &state;
2277 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2278 if (err < 0)
2279 return err;
2280 if (write)
2281 set_schedstats(state);
2282 return err;
2283}
2284#endif
2285#else
2286static inline void init_schedstats(void) {}
2287#endif
2288
2289
2290
2291
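/*
 * sched_fork - scheduler setup for a newly forked task: drop any
 * PI-boosted priority inherited from the parent, optionally reset
 * policy and priority when sched_reset_on_fork is set, pick the
 * scheduling class and initialize the load-tracking state.  Returns
 * -EAGAIN if the child would be a deadline task, since deadline
 * bandwidth is not inherited across fork.
 */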
2292int sched_fork(unsigned long clone_flags, struct task_struct *p)
2293{
2294 unsigned long flags;
2295
2296 __sched_fork(clone_flags, p);
2297
2298
2299
2300
2301
2302 p->state = TASK_NEW;
2303
2304
2305
2306
2307 p->prio = current->normal_prio;
2308
2309
2310
2311
2312 if (unlikely(p->sched_reset_on_fork)) {
2313 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2314 p->policy = SCHED_NORMAL;
2315 p->static_prio = NICE_TO_PRIO(0);
2316 p->rt_priority = 0;
2317 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2318 p->static_prio = NICE_TO_PRIO(0);
2319
2320 p->prio = p->normal_prio = __normal_prio(p);
2321 set_load_weight(p, false);
2322
2323
2324
2325
2326
2327 p->sched_reset_on_fork = 0;
2328 }
2329
2330 if (dl_prio(p->prio))
2331 return -EAGAIN;
2332 else if (rt_prio(p->prio))
2333 p->sched_class = &rt_sched_class;
2334 else
2335 p->sched_class = &fair_sched_class;
2336
2337 init_entity_runnable_average(&p->se);
2338
2339
2340
2341
2342
2343
2344
2345
2346 raw_spin_lock_irqsave(&p->pi_lock, flags);
2347
2348
2349
2350
2351 __set_task_cpu(p, smp_processor_id());
2352 if (p->sched_class->task_fork)
2353 p->sched_class->task_fork(p);
2354 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2355
2356#ifdef CONFIG_SCHED_INFO
2357 if (likely(sched_info_on()))
2358 memset(&p->sched_info, 0, sizeof(p->sched_info));
2359#endif
2360#if defined(CONFIG_SMP)
2361 p->on_cpu = 0;
2362#endif
2363 init_task_preempt_count(p);
2364#ifdef CONFIG_SMP
2365 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2366 RB_CLEAR_NODE(&p->pushable_dl_tasks);
2367#endif
2368 return 0;
2369}
2370
2371unsigned long to_ratio(u64 period, u64 runtime)
2372{
2373 if (runtime == RUNTIME_INF)
2374 return BW_UNIT;
2375
2376
2377
2378
2379
2380
2381 if (period == 0)
2382 return 0;
2383
2384 return div64_u64(runtime << BW_SHIFT, period);
2385}
2386
2387
2388
2389
2390
2391
2392
2393
2394void wake_up_new_task(struct task_struct *p)
2395{
2396 struct rq_flags rf;
2397 struct rq *rq;
2398
2399 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
2400 p->state = TASK_RUNNING;
2401#ifdef CONFIG_SMP
2402
2403
2404
2405
2406
2407
2408
2409
2410 p->recent_used_cpu = task_cpu(p);
2411 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2412#endif
2413 rq = __task_rq_lock(p, &rf);
2414 update_rq_clock(rq);
2415 post_init_entity_util_avg(&p->se);
2416
2417 activate_task(rq, p, ENQUEUE_NOCLOCK);
2418 p->on_rq = TASK_ON_RQ_QUEUED;
2419 trace_sched_wakeup_new(p);
2420 check_preempt_curr(rq, p, WF_FORK);
2421#ifdef CONFIG_SMP
2422 if (p->sched_class->task_woken) {
2423
2424
2425
2426
2427 rq_unpin_lock(rq, &rf);
2428 p->sched_class->task_woken(rq, p);
2429 rq_repin_lock(rq, &rf);
2430 }
2431#endif
2432 task_rq_unlock(rq, p, &rf);
2433}
2434
2435#ifdef CONFIG_PREEMPT_NOTIFIERS
2436
2437static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
2438
2439void preempt_notifier_inc(void)
2440{
2441 static_branch_inc(&preempt_notifier_key);
2442}
2443EXPORT_SYMBOL_GPL(preempt_notifier_inc);
2444
2445void preempt_notifier_dec(void)
2446{
2447 static_branch_dec(&preempt_notifier_key);
2448}
2449EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2450
2451
2452
2453
2454
2455void preempt_notifier_register(struct preempt_notifier *notifier)
2456{
2457 if (!static_branch_unlikely(&preempt_notifier_key))
2458 WARN(1, "registering preempt_notifier while notifiers disabled\n");
2459
2460 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2461}
2462EXPORT_SYMBOL_GPL(preempt_notifier_register);
2463
2464
2465
2466
2467
2468
2469
2470void preempt_notifier_unregister(struct preempt_notifier *notifier)
2471{
2472 hlist_del(&notifier->link);
2473}
2474EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2475
2476static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
2477{
2478 struct preempt_notifier *notifier;
2479
2480 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2481 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2482}
2483
2484static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2485{
2486 if (static_branch_unlikely(&preempt_notifier_key))
2487 __fire_sched_in_preempt_notifiers(curr);
2488}
2489
2490static void
2491__fire_sched_out_preempt_notifiers(struct task_struct *curr,
2492 struct task_struct *next)
2493{
2494 struct preempt_notifier *notifier;
2495
2496 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2497 notifier->ops->sched_out(notifier, next);
2498}
2499
2500static __always_inline void
2501fire_sched_out_preempt_notifiers(struct task_struct *curr,
2502 struct task_struct *next)
2503{
2504 if (static_branch_unlikely(&preempt_notifier_key))
2505 __fire_sched_out_preempt_notifiers(curr, next);
2506}
2507
2508#else
2509
2510static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2511{
2512}
2513
2514static inline void
2515fire_sched_out_preempt_notifiers(struct task_struct *curr,
2516 struct task_struct *next)
2517{
2518}
2519
2520#endif
2521
2522static inline void prepare_task(struct task_struct *next)
2523{
2524#ifdef CONFIG_SMP
2525
2526
2527
2528
2529 next->on_cpu = 1;
2530#endif
2531}
2532
2533static inline void finish_task(struct task_struct *prev)
2534{
2535#ifdef CONFIG_SMP
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546 smp_store_release(&prev->on_cpu, 0);
2547#endif
2548}
2549
2550static inline void
2551prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
2552{
2553
2554
2555
2556
2557
2558
2559 rq_unpin_lock(rq, rf);
2560 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2561#ifdef CONFIG_DEBUG_SPINLOCK
2562
2563 rq->lock.owner = next;
2564#endif
2565}
2566
2567static inline void finish_lock_switch(struct rq *rq)
2568{
2569
2570
2571
2572
2573
2574 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
2575 raw_spin_unlock_irq(&rq->lock);
2576}
2577
2578
2579
2580
2581
2582#ifndef prepare_arch_switch
2583# define prepare_arch_switch(next) do { } while (0)
2584#endif
2585
2586#ifndef finish_arch_post_lock_switch
2587# define finish_arch_post_lock_switch() do { } while (0)
2588#endif
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603static inline void
2604prepare_task_switch(struct rq *rq, struct task_struct *prev,
2605 struct task_struct *next)
2606{
2607 kcov_prepare_switch(prev);
2608 sched_info_switch(rq, prev, next);
2609 perf_event_task_sched_out(prev, next);
2610 rseq_preempt(prev);
2611 fire_sched_out_preempt_notifiers(prev, next);
2612 prepare_task(next);
2613 prepare_arch_switch(next);
2614}
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
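/*
 * finish_task_switch - second half of a context switch, running on the
 * new task's stack with the rq lock still held.  It drops the lock,
 * releases the previous task's borrowed mm (if any) and, when the
 * previous task was TASK_DEAD, frees its stack and task_struct.
 * Returns this_rq(), since the caller's pre-switch rq pointer is stale.
 */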
2635static struct rq *finish_task_switch(struct task_struct *prev)
2636 __releases(rq->lock)
2637{
2638 struct rq *rq = this_rq();
2639 struct mm_struct *mm = rq->prev_mm;
2640 long prev_state;
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2654 "corrupted preempt_count: %s/%d/0x%x\n",
2655 current->comm, current->pid, preempt_count()))
2656 preempt_count_set(FORK_PREEMPT_COUNT);
2657
2658 rq->prev_mm = NULL;
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671 prev_state = prev->state;
2672 vtime_task_switch(prev);
2673 perf_event_task_sched_in(prev, current);
2674 finish_task(prev);
2675 finish_lock_switch(rq);
2676 finish_arch_post_lock_switch();
2677 kcov_finish_switch(current);
2678
2679 fire_sched_in_preempt_notifiers(current);
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692 if (mm) {
2693 membarrier_mm_sync_core_before_usermode(mm);
2694 mmdrop(mm);
2695 }
2696 if (unlikely(prev_state == TASK_DEAD)) {
2697 if (prev->sched_class->task_dead)
2698 prev->sched_class->task_dead(prev);
2699
2700
2701
2702
2703
2704 kprobe_flush_task(prev);
2705
2706
2707 put_task_stack(prev);
2708
2709 put_task_struct(prev);
2710 }
2711
2712 tick_nohz_task_switch();
2713 return rq;
2714}
2715
2716#ifdef CONFIG_SMP
2717
2718
2719static void __balance_callback(struct rq *rq)
2720{
2721 struct callback_head *head, *next;
2722 void (*func)(struct rq *rq);
2723 unsigned long flags;
2724
2725 raw_spin_lock_irqsave(&rq->lock, flags);
2726 head = rq->balance_callback;
2727 rq->balance_callback = NULL;
2728 while (head) {
2729 func = (void (*)(struct rq *))head->func;
2730 next = head->next;
2731 head->next = NULL;
2732 head = next;
2733
2734 func(rq);
2735 }
2736 raw_spin_unlock_irqrestore(&rq->lock, flags);
2737}
2738
2739static inline void balance_callback(struct rq *rq)
2740{
2741 if (unlikely(rq->balance_callback))
2742 __balance_callback(rq);
2743}
2744
2745#else
2746
2747static inline void balance_callback(struct rq *rq)
2748{
2749}
2750
2751#endif
2752
2753
2754
2755
2756
2757asmlinkage __visible void schedule_tail(struct task_struct *prev)
2758 __releases(rq->lock)
2759{
2760 struct rq *rq;
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771 rq = finish_task_switch(prev);
2772 balance_callback(rq);
2773 preempt_enable();
2774
2775 if (current->set_child_tid)
2776 put_user(task_pid_vnr(current), current->set_child_tid);
2777
2778 calculate_sigpending();
2779}
2780
2781
2782
2783
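/*
 * context_switch - switch the mm (kernel threads borrow the previous
 * task's active_mm instead) and then the register state and stack to
 * @next; finish_task_switch() completes the job on the new task's stack.
 */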
2784static __always_inline struct rq *
2785context_switch(struct rq *rq, struct task_struct *prev,
2786 struct task_struct *next, struct rq_flags *rf)
2787{
2788 struct mm_struct *mm, *oldmm;
2789
2790 prepare_task_switch(rq, prev, next);
2791
2792 mm = next->mm;
2793 oldmm = prev->active_mm;
2794
2795
2796
2797
2798
2799 arch_start_context_switch(prev);
2800
2801
2802
2803
2804
2805
2806
2807
2808 if (!mm) {
2809 next->active_mm = oldmm;
2810 mmgrab(oldmm);
2811 enter_lazy_tlb(oldmm, next);
2812 } else
2813 switch_mm_irqs_off(oldmm, mm, next);
2814
2815 if (!prev->mm) {
2816 prev->active_mm = NULL;
2817 rq->prev_mm = oldmm;
2818 }
2819
2820 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
2821
2822 prepare_lock_switch(rq, next, rf);
2823
2824
2825 switch_to(prev, next, prev);
2826 barrier();
2827
2828 return finish_task_switch(prev);
2829}
2830
2831
2832
2833
2834
2835
2836
2837unsigned long nr_running(void)
2838{
2839 unsigned long i, sum = 0;
2840
2841 for_each_online_cpu(i)
2842 sum += cpu_rq(i)->nr_running;
2843
2844 return sum;
2845}
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860bool single_task_running(void)
2861{
2862 return raw_rq()->nr_running == 1;
2863}
2864EXPORT_SYMBOL(single_task_running);
2865
2866unsigned long long nr_context_switches(void)
2867{
2868 int i;
2869 unsigned long long sum = 0;
2870
2871 for_each_possible_cpu(i)
2872 sum += cpu_rq(i)->nr_switches;
2873
2874 return sum;
2875}
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907unsigned long nr_iowait(void)
2908{
2909 unsigned long i, sum = 0;
2910
2911 for_each_possible_cpu(i)
2912 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2913
2914 return sum;
2915}
2916
2917
2918
2919
2920
2921
2922
2923
2924unsigned long nr_iowait_cpu(int cpu)
2925{
2926 struct rq *this = cpu_rq(cpu);
2927 return atomic_read(&this->nr_iowait);
2928}
2929
2930void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2931{
2932 struct rq *rq = this_rq();
2933 *nr_waiters = atomic_read(&rq->nr_iowait);
2934 *load = rq->load.weight;
2935}
2936
2937#ifdef CONFIG_SMP
2938
2939
2940
2941
2942
2943void sched_exec(void)
2944{
2945 struct task_struct *p = current;
2946 unsigned long flags;
2947 int dest_cpu;
2948
2949 raw_spin_lock_irqsave(&p->pi_lock, flags);
2950 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2951 if (dest_cpu == smp_processor_id())
2952 goto unlock;
2953
2954 if (likely(cpu_active(dest_cpu))) {
2955 struct migration_arg arg = { p, dest_cpu };
2956
2957 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2958 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2959 return;
2960 }
2961unlock:
2962 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2963}
2964
2965#endif
2966
2967DEFINE_PER_CPU(struct kernel_stat, kstat);
2968DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2969
2970EXPORT_PER_CPU_SYMBOL(kstat);
2971EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2972
2973
2974
2975
2976
2977
2978
2979static inline void prefetch_curr_exec_start(struct task_struct *p)
2980{
2981#ifdef CONFIG_FAIR_GROUP_SCHED
2982 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
2983#else
2984 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
2985#endif
2986 prefetch(curr);
2987 prefetch(&curr->exec_start);
2988}
2989
2990
2991
2992
2993
2994
2995unsigned long long task_sched_runtime(struct task_struct *p)
2996{
2997 struct rq_flags rf;
2998 struct rq *rq;
2999 u64 ns;
3000
3001#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013 if (!p->on_cpu || !task_on_rq_queued(p))
3014 return p->se.sum_exec_runtime;
3015#endif
3016
3017 rq = task_rq_lock(p, &rf);
3018
3019
3020
3021
3022
3023 if (task_current(rq, p) && task_on_rq_queued(p)) {
3024 prefetch_curr_exec_start(p);
3025 update_rq_clock(rq);
3026 p->sched_class->update_curr(rq);
3027 }
3028 ns = p->se.sum_exec_runtime;
3029 task_rq_unlock(rq, p, &rf);
3030
3031 return ns;
3032}
3033
3034
3035
3036
3037
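/*
 * scheduler_tick() runs from the timer interrupt on every CPU: it
 * updates the rq clock, lets the current task's scheduling class
 * account the tick, refreshes CPU load statistics and, on SMP, may
 * trigger load balancing.
 */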
3038void scheduler_tick(void)
3039{
3040 int cpu = smp_processor_id();
3041 struct rq *rq = cpu_rq(cpu);
3042 struct task_struct *curr = rq->curr;
3043 struct rq_flags rf;
3044
3045 sched_clock_tick();
3046
3047 rq_lock(rq, &rf);
3048
3049 update_rq_clock(rq);
3050 curr->sched_class->task_tick(rq, curr, 0);
3051 cpu_load_update_active(rq);
3052 calc_global_load_tick(rq);
3053
3054 rq_unlock(rq, &rf);
3055
3056 perf_event_task_tick();
3057
3058#ifdef CONFIG_SMP
3059 rq->idle_balance = idle_cpu(cpu);
3060 trigger_load_balance(rq);
3061#endif
3062}
3063
3064#ifdef CONFIG_NO_HZ_FULL
3065
3066struct tick_work {
3067 int cpu;
3068 struct delayed_work work;
3069};
3070
3071static struct tick_work __percpu *tick_work_cpu;
3072
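/*
 * sched_tick_remote() - offloaded tick for nohz_full CPUs: a delayed
 * work item, re-queued roughly once per second, that performs the
 * per-task tick housekeeping for CPUs whose regular tick is stopped.
 */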
3073static void sched_tick_remote(struct work_struct *work)
3074{
3075 struct delayed_work *dwork = to_delayed_work(work);
3076 struct tick_work *twork = container_of(dwork, struct tick_work, work);
3077 int cpu = twork->cpu;
3078 struct rq *rq = cpu_rq(cpu);
3079 struct task_struct *curr;
3080 struct rq_flags rf;
3081 u64 delta;
3082
3083
3084
3085
3086
3087
3088
3089
3090 if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
3091 goto out_requeue;
3092
3093 rq_lock_irq(rq, &rf);
3094 curr = rq->curr;
3095 if (is_idle_task(curr))
3096 goto out_unlock;
3097
3098 update_rq_clock(rq);
3099 delta = rq_clock_task(rq) - curr->se.exec_start;
3100
3101
3102
3103
3104
3105 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
3106 curr->sched_class->task_tick(rq, curr, 0);
3107
3108out_unlock:
3109 rq_unlock_irq(rq, &rf);
3110
3111out_requeue:
3112
3113
3114
3115
3116
3117 queue_delayed_work(system_unbound_wq, dwork, HZ);
3118}
3119
3120static void sched_tick_start(int cpu)
3121{
3122 struct tick_work *twork;
3123
3124 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3125 return;
3126
3127 WARN_ON_ONCE(!tick_work_cpu);
3128
3129 twork = per_cpu_ptr(tick_work_cpu, cpu);
3130 twork->cpu = cpu;
3131 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
3132 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
3133}
3134
3135#ifdef CONFIG_HOTPLUG_CPU
3136static void sched_tick_stop(int cpu)
3137{
3138 struct tick_work *twork;
3139
3140 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3141 return;
3142
3143 WARN_ON_ONCE(!tick_work_cpu);
3144
3145 twork = per_cpu_ptr(tick_work_cpu, cpu);
3146 cancel_delayed_work_sync(&twork->work);
3147}
3148#endif
3149
3150int __init sched_tick_offload_init(void)
3151{
3152 tick_work_cpu = alloc_percpu(struct tick_work);
3153 BUG_ON(!tick_work_cpu);
3154
3155 return 0;
3156}
3157
3158#else
3159static inline void sched_tick_start(int cpu) { }
3160static inline void sched_tick_stop(int cpu) { }
3161#endif
3162
3163#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3164 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
3165
3166
3167
3168
3169static inline void preempt_latency_start(int val)
3170{
3171 if (preempt_count() == val) {
3172 unsigned long ip = get_lock_parent_ip();
3173#ifdef CONFIG_DEBUG_PREEMPT
3174 current->preempt_disable_ip = ip;
3175#endif
3176 trace_preempt_off(CALLER_ADDR0, ip);
3177 }
3178}
3179
3180void preempt_count_add(int val)
3181{
3182#ifdef CONFIG_DEBUG_PREEMPT
3183
3184
3185
3186 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3187 return;
3188#endif
3189 __preempt_count_add(val);
3190#ifdef CONFIG_DEBUG_PREEMPT
3191
3192
3193
3194 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3195 PREEMPT_MASK - 10);
3196#endif
3197 preempt_latency_start(val);
3198}
3199EXPORT_SYMBOL(preempt_count_add);
3200NOKPROBE_SYMBOL(preempt_count_add);
3201
3202
3203
3204
3205
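/*
 * If the value passed in equals the current preempt count
 * then we just enabled preemption. Stop timing the latency.
 */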
3206static inline void preempt_latency_stop(int val)
3207{
3208 if (preempt_count() == val)
3209 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
3210}
3211
3212void preempt_count_sub(int val)
3213{
3214#ifdef CONFIG_DEBUG_PREEMPT
3215
3216
3217
3218 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3219 return;
3220
3221
3222
3223 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3224 !(preempt_count() & PREEMPT_MASK)))
3225 return;
3226#endif
3227
3228 preempt_latency_stop(val);
3229 __preempt_count_sub(val);
3230}
3231EXPORT_SYMBOL(preempt_count_sub);
3232NOKPROBE_SYMBOL(preempt_count_sub);
3233
3234#else
3235static inline void preempt_latency_start(int val) { }
3236static inline void preempt_latency_stop(int val) { }
3237#endif
3238
3239static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
3240{
3241#ifdef CONFIG_DEBUG_PREEMPT
3242 return p->preempt_disable_ip;
3243#else
3244 return 0;
3245#endif
3246}
3247
3248
3249
3250
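/*
 * Print a scheduling-while-atomic bug:
 */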
3251static noinline void __schedule_bug(struct task_struct *prev)
3252{
3253
3254 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
3255
3256 if (oops_in_progress)
3257 return;
3258
3259 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3260 prev->comm, prev->pid, preempt_count());
3261
3262 debug_show_held_locks(prev);
3263 print_modules();
3264 if (irqs_disabled())
3265 print_irqtrace_events(prev);
3266 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
3267 && in_atomic_preempt_off()) {
3268 pr_err("Preemption disabled at:");
3269 print_ip_sym(preempt_disable_ip);
3270 pr_cont("\n");
3271 }
3272 if (panic_on_warn)
3273 panic("scheduling while atomic\n");
3274
3275 dump_stack();
3276 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3277}
3278
3279
3280
3281
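/*
 * Various schedule()-time debugging checks and statistics:
 */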
3282static inline void schedule_debug(struct task_struct *prev)
3283{
3284#ifdef CONFIG_SCHED_STACK_END_CHECK
3285 if (task_stack_end_corrupted(prev))
3286 panic("corrupted stack end detected inside scheduler\n");
3287#endif
3288
3289 if (unlikely(in_atomic_preempt_off())) {
3290 __schedule_bug(prev);
3291 preempt_count_set(PREEMPT_DISABLED);
3292 }
3293 rcu_sleep_check();
3294
3295 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3296
3297 schedstat_inc(this_rq()->sched_count);
3298}
3299
3300
3301
3302
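/*
 * Pick up the highest-prio task:
 */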
3303static inline struct task_struct *
3304pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
3305{
3306 const struct sched_class *class;
3307 struct task_struct *p;
3308
3309
3310
3311
3312
3313
3314
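 /*
  * Optimization: we know that if all tasks are in the fair class we can
  * call that function directly, but only if the @prev task wasn't of a
  * higher scheduling class, because otherwise those lose the
  * opportunity to pull in more work from other CPUs.
  */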
3315 if (likely((prev->sched_class == &idle_sched_class ||
3316 prev->sched_class == &fair_sched_class) &&
3317 rq->nr_running == rq->cfs.h_nr_running)) {
3318
3319 p = fair_sched_class.pick_next_task(rq, prev, rf);
3320 if (unlikely(p == RETRY_TASK))
3321 goto again;
3322
3323
3324 if (unlikely(!p))
3325 p = idle_sched_class.pick_next_task(rq, prev, rf);
3326
3327 return p;
3328 }
3329
3330again:
3331 for_each_class(class) {
3332 p = class->pick_next_task(rq, prev, rf);
3333 if (p) {
3334 if (unlikely(p == RETRY_TASK))
3335 goto again;
3336 return p;
3337 }
3338 }
3339
3340
3341 BUG();
3342}
3343
/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths (see the arch-specific entry code, e.g. arch/x86/entry/entry_64.S).
 *
 *      To drive preemption between tasks, the scheduler sets the flag in the
 *      timer interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable() (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *
 *         - in IRQ context, on return from the interrupt handler to a
 *           preemptible context
 *
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *         then at the next:
 *
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 *
 * WARNING: must be called with preemption disabled!
 */
3383static void __sched notrace __schedule(bool preempt)
3384{
3385 struct task_struct *prev, *next;
3386 unsigned long *switch_count;
3387 struct rq_flags rf;
3388 struct rq *rq;
3389 int cpu;
3390
3391 cpu = smp_processor_id();
3392 rq = cpu_rq(cpu);
3393 prev = rq->curr;
3394
3395 schedule_debug(prev);
3396
3397 if (sched_feat(HRTICK))
3398 hrtick_clear(rq);
3399
3400 local_irq_disable();
3401 rcu_note_context_switch(preempt);
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411 rq_lock(rq, &rf);
3412 smp_mb__after_spinlock();
3413
3414
3415 rq->clock_update_flags <<= 1;
3416 update_rq_clock(rq);
3417
3418 switch_count = &prev->nivcsw;
3419 if (!preempt && prev->state) {
3420 if (unlikely(signal_pending_state(prev->state, prev))) {
3421 prev->state = TASK_RUNNING;
3422 } else {
3423 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
3424 prev->on_rq = 0;
3425
3426 if (prev->in_iowait) {
3427 atomic_inc(&rq->nr_iowait);
3428 delayacct_blkio_start();
3429 }
3430
3431
3432
3433
3434
3435
3436 if (prev->flags & PF_WQ_WORKER) {
3437 struct task_struct *to_wakeup;
3438
3439 to_wakeup = wq_worker_sleeping(prev);
3440 if (to_wakeup)
3441 try_to_wake_up_local(to_wakeup, &rf);
3442 }
3443 }
3444 switch_count = &prev->nvcsw;
3445 }
3446
3447 next = pick_next_task(rq, prev, &rf);
3448 clear_tsk_need_resched(prev);
3449 clear_preempt_need_resched();
3450
3451 if (likely(prev != next)) {
3452 rq->nr_switches++;
3453 rq->curr = next;
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468 ++*switch_count;
3469
3470 trace_sched_switch(preempt, prev, next);
3471
3472
3473 rq = context_switch(rq, prev, next, &rf);
3474 } else {
3475 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
3476 rq_unlock_irq(rq, &rf);
3477 }
3478
3479 balance_callback(rq);
3480}
3481
3482void __noreturn do_task_dead(void)
3483{
3484
3485 set_special_state(TASK_DEAD);
3486
3487
3488 current->flags |= PF_NOFREEZE;
3489
3490 __schedule(false);
3491 BUG();
3492
3493
3494 for (;;)
3495 cpu_relax();
3496}
3497
3498static inline void sched_submit_work(struct task_struct *tsk)
3499{
3500 if (!tsk->state || tsk_is_pi_blocked(tsk))
3501 return;
3502
3503
3504
3505
3506 if (blk_needs_flush_plug(tsk))
3507 blk_schedule_flush_plug(tsk);
3508}
3509
3510asmlinkage __visible void __sched schedule(void)
3511{
3512 struct task_struct *tsk = current;
3513
3514 sched_submit_work(tsk);
3515 do {
3516 preempt_disable();
3517 __schedule(false);
3518 sched_preempt_enable_no_resched();
3519 } while (need_resched());
3520}
3521EXPORT_SYMBOL(schedule);
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533void __sched schedule_idle(void)
3534{
3535
3536
3537
3538
3539
3540
3541
3542 WARN_ON_ONCE(current->state);
3543 do {
3544 __schedule(false);
3545 } while (need_resched());
3546}
3547
3548#ifdef CONFIG_CONTEXT_TRACKING
3549asmlinkage __visible void __sched schedule_user(void)
3550{
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561 enum ctx_state prev_state = exception_enter();
3562 schedule();
3563 exception_exit(prev_state);
3564}
3565#endif
3566
3567
3568
3569
3570
3571
3572void __sched schedule_preempt_disabled(void)
3573{
3574 sched_preempt_enable_no_resched();
3575 schedule();
3576 preempt_disable();
3577}
3578
3579static void __sched notrace preempt_schedule_common(void)
3580{
3581 do {
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595 preempt_disable_notrace();
3596 preempt_latency_start(1);
3597 __schedule(true);
3598 preempt_latency_stop(1);
3599 preempt_enable_no_resched_notrace();
3600
3601
3602
3603
3604
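 /*
  * Check again in case we missed a preemption opportunity
  * between schedule and now.
  */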
3605 } while (need_resched());
3606}
3607
3608#ifdef CONFIG_PREEMPT
3609
3610
3611
3612
3613
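/*
 * This is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable().
 */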
3614asmlinkage __visible void __sched notrace preempt_schedule(void)
3615{
3616
3617
3618
3619
3620 if (likely(!preemptible()))
3621 return;
3622
3623 preempt_schedule_common();
3624}
3625NOKPROBE_SYMBOL(preempt_schedule);
3626EXPORT_SYMBOL(preempt_schedule);
3627
/**
 * preempt_schedule_notrace - preempt_schedule called by tracing
 *
 * The tracing infrastructure uses preempt_enable_notrace to prevent
 * recursion and tracing preempt enabling caused by the tracing
 * infrastructure itself. But as tracing can happen in areas coming
 * from userspace or just about to enter userspace, a preempt enable
 * can occur before user_exit() is called. This will cause the scheduler
 * to be called when the system is still in usermode.
 *
 * To prevent this, preempt_enable_notrace() uses this function
 * instead of preempt_schedule() to exit user context if needed before
 * calling the scheduler.
 */
3642asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
3643{
3644 enum ctx_state prev_ctx;
3645
3646 if (likely(!preemptible()))
3647 return;
3648
3649 do {
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663 preempt_disable_notrace();
3664 preempt_latency_start(1);
3665
3666
3667
3668
3669
3670 prev_ctx = exception_enter();
3671 __schedule(true);
3672 exception_exit(prev_ctx);
3673
3674 preempt_latency_stop(1);
3675 preempt_enable_no_resched_notrace();
3676 } while (need_resched());
3677}
3678EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
3679
3680#endif
3681
3682
3683
3684
3685
3686
3687
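/*
 * This is the entry point to schedule() from kernel preemption
 * off of IRQ context.
 * Note that this is called and returns with IRQs disabled, which
 * protects us against recursive calls from IRQ context.
 */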
3688asmlinkage __visible void __sched preempt_schedule_irq(void)
3689{
3690 enum ctx_state prev_state;
3691
3692
3693 BUG_ON(preempt_count() || !irqs_disabled());
3694
3695 prev_state = exception_enter();
3696
3697 do {
3698 preempt_disable();
3699 local_irq_enable();
3700 __schedule(true);
3701 local_irq_disable();
3702 sched_preempt_enable_no_resched();
3703 } while (need_resched());
3704
3705 exception_exit(prev_state);
3706}
3707
3708int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
3709 void *key)
3710{
3711 return try_to_wake_up(curr->private, mode, wake_flags);
3712}
3713EXPORT_SYMBOL(default_wake_function);
3714
3715#ifdef CONFIG_RT_MUTEXES
3716
3717static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
3718{
3719 if (pi_task)
3720 prio = min(prio, pi_task->prio);
3721
3722 return prio;
3723}
3724
3725static inline int rt_effective_prio(struct task_struct *p, int prio)
3726{
3727 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3728
3729 return __rt_effective_prio(pi_task, prio);
3730}
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
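/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task to boost
 * @pi_task: donor task
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance
 * logic. The call site only calls this if the priority of the task changed.
 */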
3743void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
3744{
3745 int prio, oldprio, queued, running, queue_flag =
3746 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
3747 const struct sched_class *prev_class;
3748 struct rq_flags rf;
3749 struct rq *rq;
3750
3751
3752 prio = __rt_effective_prio(pi_task, p->normal_prio);
3753
3754
3755
3756
3757 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
3758 return;
3759
3760 rq = __task_rq_lock(p, &rf);
3761 update_rq_clock(rq);
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772 p->pi_top_task = pi_task;
3773
3774
3775
3776
3777 if (prio == p->prio && !dl_prio(prio))
3778 goto out_unlock;
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792 if (unlikely(p == rq->idle)) {
3793 WARN_ON(p != rq->curr);
3794 WARN_ON(p->pi_blocked_on);
3795 goto out_unlock;
3796 }
3797
3798 trace_sched_pi_setprio(p, pi_task);
3799 oldprio = p->prio;
3800
3801 if (oldprio == prio)
3802 queue_flag &= ~DEQUEUE_MOVE;
3803
3804 prev_class = p->sched_class;
3805 queued = task_on_rq_queued(p);
3806 running = task_current(rq, p);
3807 if (queued)
3808 dequeue_task(rq, p, queue_flag);
3809 if (running)
3810 put_prev_task(rq, p);
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821 if (dl_prio(prio)) {
3822 if (!dl_prio(p->normal_prio) ||
3823 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3824 p->dl.dl_boosted = 1;
3825 queue_flag |= ENQUEUE_REPLENISH;
3826 } else
3827 p->dl.dl_boosted = 0;
3828 p->sched_class = &dl_sched_class;
3829 } else if (rt_prio(prio)) {
3830 if (dl_prio(oldprio))
3831 p->dl.dl_boosted = 0;
3832 if (oldprio < prio)
3833 queue_flag |= ENQUEUE_HEAD;
3834 p->sched_class = &rt_sched_class;
3835 } else {
3836 if (dl_prio(oldprio))
3837 p->dl.dl_boosted = 0;
3838 if (rt_prio(oldprio))
3839 p->rt.timeout = 0;
3840 p->sched_class = &fair_sched_class;
3841 }
3842
3843 p->prio = prio;
3844
3845 if (queued)
3846 enqueue_task(rq, p, queue_flag);
3847 if (running)
3848 set_curr_task(rq, p);
3849
3850 check_class_changed(rq, p, prev_class, oldprio);
3851out_unlock:
3852
3853 preempt_disable();
3854 __task_rq_unlock(rq, &rf);
3855
3856 balance_callback(rq);
3857 preempt_enable();
3858}
3859#else
3860static inline int rt_effective_prio(struct task_struct *p, int prio)
3861{
3862 return prio;
3863}
3864#endif
3865
3866void set_user_nice(struct task_struct *p, long nice)
3867{
3868 bool queued, running;
3869 int old_prio, delta;
3870 struct rq_flags rf;
3871 struct rq *rq;
3872
3873 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3874 return;
3875
3876
3877
3878
3879 rq = task_rq_lock(p, &rf);
3880 update_rq_clock(rq);
3881
3882
3883
3884
3885
3886
3887
3888 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3889 p->static_prio = NICE_TO_PRIO(nice);
3890 goto out_unlock;
3891 }
3892 queued = task_on_rq_queued(p);
3893 running = task_current(rq, p);
3894 if (queued)
3895 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
3896 if (running)
3897 put_prev_task(rq, p);
3898
3899 p->static_prio = NICE_TO_PRIO(nice);
3900 set_load_weight(p, true);
3901 old_prio = p->prio;
3902 p->prio = effective_prio(p);
3903 delta = p->prio - old_prio;
3904
3905 if (queued) {
3906 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
3907
3908
3909
3910
3911 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3912 resched_curr(rq);
3913 }
3914 if (running)
3915 set_curr_task(rq, p);
3916out_unlock:
3917 task_rq_unlock(rq, p, &rf);
3918}
3919EXPORT_SYMBOL(set_user_nice);
3920
3921
3922
3923
3924
3925
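/*
 * can_nice - check if a task can reduce its nice value
 * @p: task
 * @nice: nice value
 */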
3926int can_nice(const struct task_struct *p, const int nice)
3927{
3928
3929 int nice_rlim = nice_to_rlimit(nice);
3930
3931 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3932 capable(CAP_SYS_NICE));
3933}
3934
3935#ifdef __ARCH_WANT_SYS_NICE
3936
3937
3938
3939
3940
3941
3942
3943
3944SYSCALL_DEFINE1(nice, int, increment)
3945{
3946 long nice, retval;
3947
3948
3949
3950
3951
3952
3953 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3954 nice = task_nice(current) + increment;
3955
3956 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3957 if (increment < 0 && !can_nice(current, nice))
3958 return -EPERM;
3959
3960 retval = security_task_setnice(current, nice);
3961 if (retval)
3962 return retval;
3963
3964 set_user_nice(current, nice);
3965 return 0;
3966}
3967
3968#endif
3969
3970
3971
3972
3973
3974
3975
3976
3977
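/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Return: The priority value as seen by users in /proc.
 * RT tasks are offset by -200. Normal tasks are centered
 * around 0, value goes from -16 to +15.
 */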
3978int task_prio(const struct task_struct *p)
3979{
3980 return p->prio - MAX_RT_PRIO;
3981}
3982
3983
3984
3985
3986
3987
3988
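/**
 * idle_cpu - is a given CPU idle currently?
 * @cpu: the processor in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */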
3989int idle_cpu(int cpu)
3990{
3991 struct rq *rq = cpu_rq(cpu);
3992
3993 if (rq->curr != rq->idle)
3994 return 0;
3995
3996 if (rq->nr_running)
3997 return 0;
3998
3999#ifdef CONFIG_SMP
4000 if (!llist_empty(&rq->wake_list))
4001 return 0;
4002#endif
4003
4004 return 1;
4005}
4006
4007
4008
4009
4010
4011
4012
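/**
 * available_idle_cpu - is a given CPU idle for enqueuing work.
 * @cpu: the CPU in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */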
4013int available_idle_cpu(int cpu)
4014{
4015 if (!idle_cpu(cpu))
4016 return 0;
4017
4018 if (vcpu_is_preempted(cpu))
4019 return 0;
4020
4021 return 1;
4022}
4023
4024
4025
4026
4027
4028
4029
4030struct task_struct *idle_task(int cpu)
4031{
4032 return cpu_rq(cpu)->idle;
4033}
4034
4035
4036
4037
4038
4039
4040
4041static struct task_struct *find_process_by_pid(pid_t pid)
4042{
4043 return pid ? find_task_by_vpid(pid) : current;
4044}
4045
4046
4047
4048
4049
4050#define SETPARAM_POLICY -1
4051
4052static void __setscheduler_params(struct task_struct *p,
4053 const struct sched_attr *attr)
4054{
4055 int policy = attr->sched_policy;
4056
4057 if (policy == SETPARAM_POLICY)
4058 policy = p->policy;
4059
4060 p->policy = policy;
4061
4062 if (dl_policy(policy))
4063 __setparam_dl(p, attr);
4064 else if (fair_policy(policy))
4065 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
4066
4067
4068
4069
4070
4071
4072 p->rt_priority = attr->sched_priority;
4073 p->normal_prio = normal_prio(p);
4074 set_load_weight(p, true);
4075}
4076
4077
4078static void __setscheduler(struct rq *rq, struct task_struct *p,
4079 const struct sched_attr *attr, bool keep_boost)
4080{
4081 __setscheduler_params(p, attr);
4082
4083
4084
4085
4086
4087 p->prio = normal_prio(p);
4088 if (keep_boost)
4089 p->prio = rt_effective_prio(p, p->prio);
4090
4091 if (dl_prio(p->prio))
4092 p->sched_class = &dl_sched_class;
4093 else if (rt_prio(p->prio))
4094 p->sched_class = &rt_sched_class;
4095 else
4096 p->sched_class = &fair_sched_class;
4097}
4098
4099
4100
4101
4102static bool check_same_owner(struct task_struct *p)
4103{
4104 const struct cred *cred = current_cred(), *pcred;
4105 bool match;
4106
4107 rcu_read_lock();
4108 pcred = __task_cred(p);
4109 match = (uid_eq(cred->euid, pcred->euid) ||
4110 uid_eq(cred->euid, pcred->uid));
4111 rcu_read_unlock();
4112 return match;
4113}
4114
4115static int __sched_setscheduler(struct task_struct *p,
4116 const struct sched_attr *attr,
4117 bool user, bool pi)
4118{
4119 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4120 MAX_RT_PRIO - 1 - attr->sched_priority;
4121 int retval, oldprio, oldpolicy = -1, queued, running;
4122 int new_effective_prio, policy = attr->sched_policy;
4123 const struct sched_class *prev_class;
4124 struct rq_flags rf;
4125 int reset_on_fork;
4126 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4127 struct rq *rq;
4128
4129
4130 BUG_ON(pi && in_interrupt());
4131recheck:
4132
4133 if (policy < 0) {
4134 reset_on_fork = p->sched_reset_on_fork;
4135 policy = oldpolicy = p->policy;
4136 } else {
4137 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
4138
4139 if (!valid_policy(policy))
4140 return -EINVAL;
4141 }
4142
4143 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
4144 return -EINVAL;
4145
4146
4147
4148
4149
4150
4151 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
4152 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
4153 return -EINVAL;
4154 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
4155 (rt_policy(policy) != (attr->sched_priority != 0)))
4156 return -EINVAL;
4157
4158
4159
4160
4161 if (user && !capable(CAP_SYS_NICE)) {
4162 if (fair_policy(policy)) {
4163 if (attr->sched_nice < task_nice(p) &&
4164 !can_nice(p, attr->sched_nice))
4165 return -EPERM;
4166 }
4167
4168 if (rt_policy(policy)) {
4169 unsigned long rlim_rtprio =
4170 task_rlimit(p, RLIMIT_RTPRIO);
4171
4172
4173 if (policy != p->policy && !rlim_rtprio)
4174 return -EPERM;
4175
4176
4177 if (attr->sched_priority > p->rt_priority &&
4178 attr->sched_priority > rlim_rtprio)
4179 return -EPERM;
4180 }
4181
4182
4183
4184
4185
4186
4187
4188 if (dl_policy(policy))
4189 return -EPERM;
4190
4191
4192
4193
4194
4195 if (idle_policy(p->policy) && !idle_policy(policy)) {
4196 if (!can_nice(p, task_nice(p)))
4197 return -EPERM;
4198 }
4199
4200
4201 if (!check_same_owner(p))
4202 return -EPERM;
4203
4204
4205 if (p->sched_reset_on_fork && !reset_on_fork)
4206 return -EPERM;
4207 }
4208
4209 if (user) {
4210 if (attr->sched_flags & SCHED_FLAG_SUGOV)
4211 return -EINVAL;
4212
4213 retval = security_task_setscheduler(p);
4214 if (retval)
4215 return retval;
4216 }
4217
4218
4219
4220
4221
4222
4223
4224
4225 rq = task_rq_lock(p, &rf);
4226 update_rq_clock(rq);
4227
4228
4229
4230
4231 if (p == rq->stop) {
4232 task_rq_unlock(rq, p, &rf);
4233 return -EINVAL;
4234 }
4235
4236
4237
4238
4239
4240 if (unlikely(policy == p->policy)) {
4241 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
4242 goto change;
4243 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
4244 goto change;
4245 if (dl_policy(policy) && dl_param_changed(p, attr))
4246 goto change;
4247
4248 p->sched_reset_on_fork = reset_on_fork;
4249 task_rq_unlock(rq, p, &rf);
4250 return 0;
4251 }
4252change:
4253
4254 if (user) {
4255#ifdef CONFIG_RT_GROUP_SCHED
4256
4257
4258
4259
4260 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4261 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4262 !task_group_is_autogroup(task_group(p))) {
4263 task_rq_unlock(rq, p, &rf);
4264 return -EPERM;
4265 }
4266#endif
4267#ifdef CONFIG_SMP
4268 if (dl_bandwidth_enabled() && dl_policy(policy) &&
4269 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
4270 cpumask_t *span = rq->rd->span;
4271
4272
4273
4274
4275
4276
4277 if (!cpumask_subset(span, &p->cpus_allowed) ||
4278 rq->rd->dl_bw.bw == 0) {
4279 task_rq_unlock(rq, p, &rf);
4280 return -EPERM;
4281 }
4282 }
4283#endif
4284 }
4285
4286
4287 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4288 policy = oldpolicy = -1;
4289 task_rq_unlock(rq, p, &rf);
4290 goto recheck;
4291 }
4292
4293
4294
4295
4296
4297
4298 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
4299 task_rq_unlock(rq, p, &rf);
4300 return -EBUSY;
4301 }
4302
4303 p->sched_reset_on_fork = reset_on_fork;
4304 oldprio = p->prio;
4305
4306 if (pi) {
4307
4308
4309
4310
4311
4312
4313
4314 new_effective_prio = rt_effective_prio(p, newprio);
4315 if (new_effective_prio == oldprio)
4316 queue_flags &= ~DEQUEUE_MOVE;
4317 }
4318
4319 queued = task_on_rq_queued(p);
4320 running = task_current(rq, p);
4321 if (queued)
4322 dequeue_task(rq, p, queue_flags);
4323 if (running)
4324 put_prev_task(rq, p);
4325
4326 prev_class = p->sched_class;
4327 __setscheduler(rq, p, attr, pi);
4328
4329 if (queued) {
4330
4331
4332
4333
4334 if (oldprio < p->prio)
4335 queue_flags |= ENQUEUE_HEAD;
4336
4337 enqueue_task(rq, p, queue_flags);
4338 }
4339 if (running)
4340 set_curr_task(rq, p);
4341
4342 check_class_changed(rq, p, prev_class, oldprio);
4343
4344
4345 preempt_disable();
4346 task_rq_unlock(rq, p, &rf);
4347
4348 if (pi)
4349 rt_mutex_adjust_pi(p);
4350
4351
4352 balance_callback(rq);
4353 preempt_enable();
4354
4355 return 0;
4356}
4357
4358static int _sched_setscheduler(struct task_struct *p, int policy,
4359 const struct sched_param *param, bool check)
4360{
4361 struct sched_attr attr = {
4362 .sched_policy = policy,
4363 .sched_priority = param->sched_priority,
4364 .sched_nice = PRIO_TO_NICE(p->static_prio),
4365 };
4366
4367
4368 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
4369 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4370 policy &= ~SCHED_RESET_ON_FORK;
4371 attr.sched_policy = policy;
4372 }
4373
4374 return __sched_setscheduler(p, &attr, check, true);
4375}
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386int sched_setscheduler(struct task_struct *p, int policy,
4387 const struct sched_param *param)
4388{
4389 return _sched_setscheduler(p, policy, param, true);
4390}
4391EXPORT_SYMBOL_GPL(sched_setscheduler);
4392
4393int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
4394{
4395 return __sched_setscheduler(p, attr, true, true);
4396}
4397EXPORT_SYMBOL_GPL(sched_setattr);
4398
4399int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
4400{
4401 return __sched_setscheduler(p, attr, false, true);
4402}
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4418 const struct sched_param *param)
4419{
4420 return _sched_setscheduler(p, policy, param, false);
4421}
4422EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4423
4424static int
4425do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4426{
4427 struct sched_param lparam;
4428 struct task_struct *p;
4429 int retval;
4430
4431 if (!param || pid < 0)
4432 return -EINVAL;
4433 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4434 return -EFAULT;
4435
4436 rcu_read_lock();
4437 retval = -ESRCH;
4438 p = find_process_by_pid(pid);
4439 if (p != NULL)
4440 retval = sched_setscheduler(p, policy, &lparam);
4441 rcu_read_unlock();
4442
4443 return retval;
4444}
4445
4446
4447
4448
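/*
 * Mimics kernel/events/core.c perf_copy_attr().
 */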
4449static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
4450{
4451 u32 size;
4452 int ret;
4453
4454 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
4455 return -EFAULT;
4456
4457
4458 memset(attr, 0, sizeof(*attr));
4459
4460 ret = get_user(size, &uattr->size);
4461 if (ret)
4462 return ret;
4463
4464
4465 if (size > PAGE_SIZE)
4466 goto err_size;
4467
4468
4469 if (!size)
4470 size = SCHED_ATTR_SIZE_VER0;
4471
4472 if (size < SCHED_ATTR_SIZE_VER0)
4473 goto err_size;
4474
4475
4476
4477
4478
4479
4480
4481 if (size > sizeof(*attr)) {
4482 unsigned char __user *addr;
4483 unsigned char __user *end;
4484 unsigned char val;
4485
4486 addr = (void __user *)uattr + sizeof(*attr);
4487 end = (void __user *)uattr + size;
4488
4489 for (; addr < end; addr++) {
4490 ret = get_user(val, addr);
4491 if (ret)
4492 return ret;
4493 if (val)
4494 goto err_size;
4495 }
4496 size = sizeof(*attr);
4497 }
4498
4499 ret = copy_from_user(attr, uattr, size);
4500 if (ret)
4501 return -EFAULT;
4502
4503
4504
4505
4506
4507 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
4508
4509 return 0;
4510
4511err_size:
4512 put_user(sizeof(*attr), &uattr->size);
4513 return -E2BIG;
4514}
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
4525{
4526 if (policy < 0)
4527 return -EINVAL;
4528
4529 return do_sched_setscheduler(pid, policy, param);
4530}
4531
4532
4533
4534
4535
4536
4537
4538
4539SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4540{
4541 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
4542}
4543
4544
4545
4546
4547
4548
4549
4550SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4551 unsigned int, flags)
4552{
4553 struct sched_attr attr;
4554 struct task_struct *p;
4555 int retval;
4556
4557 if (!uattr || pid < 0 || flags)
4558 return -EINVAL;
4559
4560 retval = sched_copy_attr(uattr, &attr);
4561 if (retval)
4562 return retval;
4563
4564 if ((int)attr.sched_policy < 0)
4565 return -EINVAL;
4566
4567 rcu_read_lock();
4568 retval = -ESRCH;
4569 p = find_process_by_pid(pid);
4570 if (p != NULL)
4571 retval = sched_setattr(p, &attr);
4572 rcu_read_unlock();
4573
4574 return retval;
4575}
4576
4577
4578
4579
4580
4581
4582
4583
4584SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4585{
4586 struct task_struct *p;
4587 int retval;
4588
4589 if (pid < 0)
4590 return -EINVAL;
4591
4592 retval = -ESRCH;
4593 rcu_read_lock();
4594 p = find_process_by_pid(pid);
4595 if (p) {
4596 retval = security_task_getscheduler(p);
4597 if (!retval)
4598 retval = p->policy
4599 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4600 }
4601 rcu_read_unlock();
4602 return retval;
4603}
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4614{
4615 struct sched_param lp = { .sched_priority = 0 };
4616 struct task_struct *p;
4617 int retval;
4618
4619 if (!param || pid < 0)
4620 return -EINVAL;
4621
4622 rcu_read_lock();
4623 p = find_process_by_pid(pid);
4624 retval = -ESRCH;
4625 if (!p)
4626 goto out_unlock;
4627
4628 retval = security_task_getscheduler(p);
4629 if (retval)
4630 goto out_unlock;
4631
4632 if (task_has_rt_policy(p))
4633 lp.sched_priority = p->rt_priority;
4634 rcu_read_unlock();
4635
4636
4637
4638
4639 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4640
4641 return retval;
4642
4643out_unlock:
4644 rcu_read_unlock();
4645 return retval;
4646}
4647
4648static int sched_read_attr(struct sched_attr __user *uattr,
4649 struct sched_attr *attr,
4650 unsigned int usize)
4651{
4652 int ret;
4653
4654 if (!access_ok(VERIFY_WRITE, uattr, usize))
4655 return -EFAULT;
4656
4657
4658
4659
4660
4661
4662 if (usize < sizeof(*attr)) {
4663 unsigned char *addr;
4664 unsigned char *end;
4665
4666 addr = (void *)attr + usize;
4667 end = (void *)attr + sizeof(*attr);
4668
4669 for (; addr < end; addr++) {
4670 if (*addr)
4671 return -EFBIG;
4672 }
4673
4674 attr->size = usize;
4675 }
4676
4677 ret = copy_to_user(uattr, attr, attr->size);
4678 if (ret)
4679 return -EFAULT;
4680
4681 return 0;
4682}
4683
4684
4685
4686
4687
4688
4689
4690
4691SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4692 unsigned int, size, unsigned int, flags)
4693{
4694 struct sched_attr attr = {
4695 .size = sizeof(struct sched_attr),
4696 };
4697 struct task_struct *p;
4698 int retval;
4699
4700 if (!uattr || pid < 0 || size > PAGE_SIZE ||
4701 size < SCHED_ATTR_SIZE_VER0 || flags)
4702 return -EINVAL;
4703
4704 rcu_read_lock();
4705 p = find_process_by_pid(pid);
4706 retval = -ESRCH;
4707 if (!p)
4708 goto out_unlock;
4709
4710 retval = security_task_getscheduler(p);
4711 if (retval)
4712 goto out_unlock;
4713
4714 attr.sched_policy = p->policy;
4715 if (p->sched_reset_on_fork)
4716 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4717 if (task_has_dl_policy(p))
4718 __getparam_dl(p, &attr);
4719 else if (task_has_rt_policy(p))
4720 attr.sched_priority = p->rt_priority;
4721 else
4722 attr.sched_nice = task_nice(p);
4723
4724 rcu_read_unlock();
4725
4726 retval = sched_read_attr(uattr, &attr, size);
4727 return retval;
4728
4729out_unlock:
4730 rcu_read_unlock();
4731 return retval;
4732}
4733
4734long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4735{
4736 cpumask_var_t cpus_allowed, new_mask;
4737 struct task_struct *p;
4738 int retval;
4739
4740 rcu_read_lock();
4741
4742 p = find_process_by_pid(pid);
4743 if (!p) {
4744 rcu_read_unlock();
4745 return -ESRCH;
4746 }
4747
4748
4749 get_task_struct(p);
4750 rcu_read_unlock();
4751
4752 if (p->flags & PF_NO_SETAFFINITY) {
4753 retval = -EINVAL;
4754 goto out_put_task;
4755 }
4756 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4757 retval = -ENOMEM;
4758 goto out_put_task;
4759 }
4760 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4761 retval = -ENOMEM;
4762 goto out_free_cpus_allowed;
4763 }
4764 retval = -EPERM;
4765 if (!check_same_owner(p)) {
4766 rcu_read_lock();
4767 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4768 rcu_read_unlock();
4769 goto out_free_new_mask;
4770 }
4771 rcu_read_unlock();
4772 }
4773
4774 retval = security_task_setscheduler(p);
4775 if (retval)
4776 goto out_free_new_mask;
4777
4778
4779 cpuset_cpus_allowed(p, cpus_allowed);
4780 cpumask_and(new_mask, in_mask, cpus_allowed);
4781
4782
4783
4784
4785
4786
4787
4788#ifdef CONFIG_SMP
4789 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4790 rcu_read_lock();
4791 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4792 retval = -EBUSY;
4793 rcu_read_unlock();
4794 goto out_free_new_mask;
4795 }
4796 rcu_read_unlock();
4797 }
4798#endif
4799again:
4800 retval = __set_cpus_allowed_ptr(p, new_mask, true);
4801
4802 if (!retval) {
4803 cpuset_cpus_allowed(p, cpus_allowed);
4804 if (!cpumask_subset(new_mask, cpus_allowed)) {
4805
4806
4807
4808
4809
4810 cpumask_copy(new_mask, cpus_allowed);
4811 goto again;
4812 }
4813 }
4814out_free_new_mask:
4815 free_cpumask_var(new_mask);
4816out_free_cpus_allowed:
4817 free_cpumask_var(cpus_allowed);
4818out_put_task:
4819 put_task_struct(p);
4820 return retval;
4821}
4822
4823static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4824 struct cpumask *new_mask)
4825{
4826 if (len < cpumask_size())
4827 cpumask_clear(new_mask);
4828 else if (len > cpumask_size())
4829 len = cpumask_size();
4830
4831 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4832}
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4843 unsigned long __user *, user_mask_ptr)
4844{
4845 cpumask_var_t new_mask;
4846 int retval;
4847
4848 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4849 return -ENOMEM;
4850
4851 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4852 if (retval == 0)
4853 retval = sched_setaffinity(pid, new_mask);
4854 free_cpumask_var(new_mask);
4855 return retval;
4856}
4857
4858long sched_getaffinity(pid_t pid, struct cpumask *mask)
4859{
4860 struct task_struct *p;
4861 unsigned long flags;
4862 int retval;
4863
4864 rcu_read_lock();
4865
4866 retval = -ESRCH;
4867 p = find_process_by_pid(pid);
4868 if (!p)
4869 goto out_unlock;
4870
4871 retval = security_task_getscheduler(p);
4872 if (retval)
4873 goto out_unlock;
4874
4875 raw_spin_lock_irqsave(&p->pi_lock, flags);
4876 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
4877 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4878
4879out_unlock:
4880 rcu_read_unlock();
4881
4882 return retval;
4883}
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4895 unsigned long __user *, user_mask_ptr)
4896{
4897 int ret;
4898 cpumask_var_t mask;
4899
4900 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4901 return -EINVAL;
4902 if (len & (sizeof(unsigned long)-1))
4903 return -EINVAL;
4904
4905 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4906 return -ENOMEM;
4907
4908 ret = sched_getaffinity(pid, mask);
4909 if (ret == 0) {
4910 unsigned int retlen = min(len, cpumask_size());
4911
4912 if (copy_to_user(user_mask_ptr, mask, retlen))
4913 ret = -EFAULT;
4914 else
4915 ret = retlen;
4916 }
4917 free_cpumask_var(mask);
4918
4919 return ret;
4920}
4921
4922
4923
4924
4925
4926
4927
4928
4929
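/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */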
4930static void do_sched_yield(void)
4931{
4932 struct rq_flags rf;
4933 struct rq *rq;
4934
4935 local_irq_disable();
4936 rq = this_rq();
4937 rq_lock(rq, &rf);
4938
4939 schedstat_inc(rq->yld_count);
4940 current->sched_class->yield_task(rq);
4941
4942
4943
4944
4945
4946 preempt_disable();
4947 rq_unlock(rq, &rf);
4948 sched_preempt_enable_no_resched();
4949
4950 schedule();
4951}
4952
4953SYSCALL_DEFINE0(sched_yield)
4954{
4955 do_sched_yield();
4956 return 0;
4957}
4958
4959#ifndef CONFIG_PREEMPT
4960int __sched _cond_resched(void)
4961{
4962 if (should_resched(0)) {
4963 preempt_schedule_common();
4964 return 1;
4965 }
4966 rcu_all_qs();
4967 return 0;
4968}
4969EXPORT_SYMBOL(_cond_resched);
4970#endif
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980int __cond_resched_lock(spinlock_t *lock)
4981{
4982 int resched = should_resched(PREEMPT_LOCK_OFFSET);
4983 int ret = 0;
4984
4985 lockdep_assert_held(lock);
4986
4987 if (spin_needbreak(lock) || resched) {
4988 spin_unlock(lock);
4989 if (resched)
4990 preempt_schedule_common();
4991 else
4992 cpu_relax();
4993 ret = 1;
4994 spin_lock(lock);
4995 }
4996 return ret;
4997}
4998EXPORT_SYMBOL(__cond_resched_lock);
4999
/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run; if removing the yield() call from your code breaks
 * it, it's already broken.
 *
 * Typical broken usage is:
 *
 * while (!event)
 *	yield();
 *
 * where one assumes that yield() will let 'the other' process run and make
 * event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you really want to use yield(), you probably want to use
 * yield_to() instead.
 */
5022void __sched yield(void)
5023{
5024 set_current_state(TASK_RUNNING);
5025 do_sched_yield();
5026}
5027EXPORT_SYMBOL(yield);
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
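/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */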
5044int __sched yield_to(struct task_struct *p, bool preempt)
5045{
5046 struct task_struct *curr = current;
5047 struct rq *rq, *p_rq;
5048 unsigned long flags;
5049 int yielded = 0;
5050
5051 local_irq_save(flags);
5052 rq = this_rq();
5053
5054again:
5055 p_rq = task_rq(p);
5056
5057
5058
5059
5060 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
5061 yielded = -ESRCH;
5062 goto out_irq;
5063 }
5064
5065 double_rq_lock(rq, p_rq);
5066 if (task_rq(p) != p_rq) {
5067 double_rq_unlock(rq, p_rq);
5068 goto again;
5069 }
5070
5071 if (!curr->sched_class->yield_to_task)
5072 goto out_unlock;
5073
5074 if (curr->sched_class != p->sched_class)
5075 goto out_unlock;
5076
5077 if (task_running(p_rq, p) || p->state)
5078 goto out_unlock;
5079
5080 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5081 if (yielded) {
5082 schedstat_inc(rq->yld_count);
5083
5084
5085
5086
5087 if (preempt && rq != p_rq)
5088 resched_curr(p_rq);
5089 }
5090
5091out_unlock:
5092 double_rq_unlock(rq, p_rq);
5093out_irq:
5094 local_irq_restore(flags);
5095
5096 if (yielded > 0)
5097 schedule();
5098
5099 return yielded;
5100}
5101EXPORT_SYMBOL_GPL(yield_to);
5102
5103int io_schedule_prepare(void)
5104{
5105 int old_iowait = current->in_iowait;
5106
5107 current->in_iowait = 1;
5108 blk_schedule_flush_plug(current);
5109
5110 return old_iowait;
5111}
5112
5113void io_schedule_finish(int token)
5114{
5115 current->in_iowait = token;
5116}
5117
5118
5119
5120
5121
5122long __sched io_schedule_timeout(long timeout)
5123{
5124 int token;
5125 long ret;
5126
5127 token = io_schedule_prepare();
5128 ret = schedule_timeout(timeout);
5129 io_schedule_finish(token);
5130
5131 return ret;
5132}
5133EXPORT_SYMBOL(io_schedule_timeout);
5134
5135void io_schedule(void)
5136{
5137 int token;
5138
5139 token = io_schedule_prepare();
5140 schedule();
5141 io_schedule_finish(token);
5142}
5143EXPORT_SYMBOL(io_schedule);
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5154{
5155 int ret = -EINVAL;
5156
5157 switch (policy) {
5158 case SCHED_FIFO:
5159 case SCHED_RR:
5160 ret = MAX_USER_RT_PRIO-1;
5161 break;
5162 case SCHED_DEADLINE:
5163 case SCHED_NORMAL:
5164 case SCHED_BATCH:
5165 case SCHED_IDLE:
5166 ret = 0;
5167 break;
5168 }
5169 return ret;
5170}
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5181{
5182 int ret = -EINVAL;
5183
5184 switch (policy) {
5185 case SCHED_FIFO:
5186 case SCHED_RR:
5187 ret = 1;
5188 break;
5189 case SCHED_DEADLINE:
5190 case SCHED_NORMAL:
5191 case SCHED_BATCH:
5192 case SCHED_IDLE:
5193 ret = 0;
5194 }
5195 return ret;
5196}
5197
5198static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
5199{
5200 struct task_struct *p;
5201 unsigned int time_slice;
5202 struct rq_flags rf;
5203 struct rq *rq;
5204 int retval;
5205
5206 if (pid < 0)
5207 return -EINVAL;
5208
5209 retval = -ESRCH;
5210 rcu_read_lock();
5211 p = find_process_by_pid(pid);
5212 if (!p)
5213 goto out_unlock;
5214
5215 retval = security_task_getscheduler(p);
5216 if (retval)
5217 goto out_unlock;
5218
5219 rq = task_rq_lock(p, &rf);
5220 time_slice = 0;
5221 if (p->sched_class->get_rr_interval)
5222 time_slice = p->sched_class->get_rr_interval(rq, p);
5223 task_rq_unlock(rq, p, &rf);
5224
5225 rcu_read_unlock();
5226 jiffies_to_timespec64(time_slice, t);
5227 return 0;
5228
5229out_unlock:
5230 rcu_read_unlock();
5231 return retval;
5232}
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5246 struct timespec __user *, interval)
5247{
5248 struct timespec64 t;
5249 int retval = sched_rr_get_interval(pid, &t);
5250
5251 if (retval == 0)
5252 retval = put_timespec64(&t, interval);
5253
5254 return retval;
5255}
5256
5257#ifdef CONFIG_COMPAT
5258COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
5259 compat_pid_t, pid,
5260 struct compat_timespec __user *, interval)
5261{
5262 struct timespec64 t;
5263 int retval = sched_rr_get_interval(pid, &t);
5264
5265 if (retval == 0)
5266 retval = compat_put_timespec64(&t, interval);
5267 return retval;
5268}
5269#endif
5270
5271void sched_show_task(struct task_struct *p)
5272{
5273 unsigned long free = 0;
5274 int ppid;
5275
5276 if (!try_get_task_stack(p))
5277 return;
5278
5279 printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
5280
5281 if (p->state == TASK_RUNNING)
5282 printk(KERN_CONT " running task ");
5283#ifdef CONFIG_DEBUG_STACK_USAGE
5284 free = stack_not_used(p);
5285#endif
5286 ppid = 0;
5287 rcu_read_lock();
5288 if (pid_alive(p))
5289 ppid = task_pid_nr(rcu_dereference(p->real_parent));
5290 rcu_read_unlock();
5291 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5292 task_pid_nr(p), ppid,
5293 (unsigned long)task_thread_info(p)->flags);
5294
5295 print_worker_info(KERN_INFO, p);
5296 show_stack(p, NULL);
5297 put_task_stack(p);
5298}
5299EXPORT_SYMBOL_GPL(sched_show_task);
5300
5301static inline bool
5302state_filter_match(unsigned long state_filter, struct task_struct *p)
5303{
5304
5305 if (!state_filter)
5306 return true;
5307
5308
5309 if (!(p->state & state_filter))
5310 return false;
5311
5312
5313
5314
5315
5316 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
5317 return false;
5318
5319 return true;
5320}
5321
5322
5323void show_state_filter(unsigned long state_filter)
5324{
5325 struct task_struct *g, *p;
5326
5327#if BITS_PER_LONG == 32
5328 printk(KERN_INFO
5329 "  task                PC stack   pid father\n");
5330#else
5331 printk(KERN_INFO
5332 "  task                        PC stack   pid father\n");
5333#endif
5334 rcu_read_lock();
5335 for_each_process_thread(g, p) {
5336
5337
5338
5339
5340
5341
5342
5343 touch_nmi_watchdog();
5344 touch_all_softlockup_watchdogs();
5345 if (state_filter_match(state_filter, p))
5346 sched_show_task(p);
5347 }
5348
5349#ifdef CONFIG_SCHED_DEBUG
5350 if (!state_filter)
5351 sysrq_sched_debug_show();
5352#endif
5353 rcu_read_unlock();
5354
5355
5356
5357 if (!state_filter)
5358 debug_show_all_locks();
5359}
5360
5361
5362
5363
5364
5365
5366
5367
5368
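/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: CPU the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */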
5369void init_idle(struct task_struct *idle, int cpu)
5370{
5371 struct rq *rq = cpu_rq(cpu);
5372 unsigned long flags;
5373
5374 raw_spin_lock_irqsave(&idle->pi_lock, flags);
5375 raw_spin_lock(&rq->lock);
5376
5377 __sched_fork(0, idle);
5378 idle->state = TASK_RUNNING;
5379 idle->se.exec_start = sched_clock();
5380 idle->flags |= PF_IDLE;
5381
5382 kasan_unpoison_task_stack(idle);
5383
5384#ifdef CONFIG_SMP
5385
5386
5387
5388
5389
5390
5391 set_cpus_allowed_common(idle, cpumask_of(cpu));
5392#endif
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403 rcu_read_lock();
5404 __set_task_cpu(idle, cpu);
5405 rcu_read_unlock();
5406
5407 rq->curr = rq->idle = idle;
5408 idle->on_rq = TASK_ON_RQ_QUEUED;
5409#ifdef CONFIG_SMP
5410 idle->on_cpu = 1;
5411#endif
5412 raw_spin_unlock(&rq->lock);
5413 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
5414
5415
5416 init_idle_preempt_count(idle, cpu);
5417
5418
5419
5420
5421 idle->sched_class = &idle_sched_class;
5422 ftrace_graph_init_idle_task(idle, cpu);
5423 vtime_init_idle(idle, cpu);
5424#ifdef CONFIG_SMP
5425 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5426#endif
5427}
5428
5429#ifdef CONFIG_SMP
5430
5431int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5432 const struct cpumask *trial)
5433{
5434 int ret = 1;
5435
5436 if (!cpumask_weight(cur))
5437 return ret;
5438
5439 ret = dl_cpuset_cpumask_can_shrink(cur, trial);
5440
5441 return ret;
5442}
5443
5444int task_can_attach(struct task_struct *p,
5445 const struct cpumask *cs_cpus_allowed)
5446{
5447 int ret = 0;
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458 if (p->flags & PF_NO_SETAFFINITY) {
5459 ret = -EINVAL;
5460 goto out;
5461 }
5462
5463 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5464 cs_cpus_allowed))
5465 ret = dl_task_can_attach(p, cs_cpus_allowed);
5466
5467out:
5468 return ret;
5469}
5470
5471bool sched_smp_initialized __read_mostly;
5472
5473#ifdef CONFIG_NUMA_BALANCING
5474
5475int migrate_task_to(struct task_struct *p, int target_cpu)
5476{
5477 struct migration_arg arg = { p, target_cpu };
5478 int curr_cpu = task_cpu(p);
5479
5480 if (curr_cpu == target_cpu)
5481 return 0;
5482
5483 if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
5484 return -EINVAL;
5485
5486
5487
5488 trace_sched_move_numa(p, curr_cpu, target_cpu);
5489 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
5490}
5491
5492
5493
5494
5495
5496void sched_setnuma(struct task_struct *p, int nid)
5497{
5498 bool queued, running;
5499 struct rq_flags rf;
5500 struct rq *rq;
5501
5502 rq = task_rq_lock(p, &rf);
5503 queued = task_on_rq_queued(p);
5504 running = task_current(rq, p);
5505
5506 if (queued)
5507 dequeue_task(rq, p, DEQUEUE_SAVE);
5508 if (running)
5509 put_prev_task(rq, p);
5510
5511 p->numa_preferred_nid = nid;
5512
5513 if (queued)
5514 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
5515 if (running)
5516 set_curr_task(rq, p);
5517 task_rq_unlock(rq, p, &rf);
5518}
5519#endif
5520
5521#ifdef CONFIG_HOTPLUG_CPU
5522
5523
5524
5525
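/*
 * Ensure that the idle task is using init_mm right before its CPU goes
 * offline.
 */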
5526void idle_task_exit(void)
5527{
5528 struct mm_struct *mm = current->active_mm;
5529
5530 BUG_ON(cpu_online(smp_processor_id()));
5531
5532 if (mm != &init_mm) {
5533 switch_mm(mm, &init_mm, current);
5534 current->active_mm = &init_mm;
5535 finish_arch_post_lock_switch();
5536 }
5537 mmdrop(mm);
5538}
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549static void calc_load_migrate(struct rq *rq)
5550{
5551 long delta = calc_load_fold_active(rq, 1);
5552 if (delta)
5553 atomic_long_add(delta, &calc_load_tasks);
5554}
5555
5556static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
5557{
5558}
5559
5560static const struct sched_class fake_sched_class = {
5561 .put_prev_task = put_prev_task_fake,
5562};
5563
5564static struct task_struct fake_task = {
5565
5566
5567
5568 .prio = MAX_PRIO + 1,
5569 .sched_class = &fake_sched_class,
5570};
5571
5572
5573
5574
5575
5576
5577
5578
5579
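/*
 * Migrate all tasks from the rq; sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we're in stop_machine() and
 * there's no concurrency possible: we hold the required locks anyway
 * because of lock validation efforts.
 */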
5580static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
5581{
5582 struct rq *rq = dead_rq;
5583 struct task_struct *next, *stop = rq->stop;
5584 struct rq_flags orf = *rf;
5585 int dest_cpu;
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596 rq->stop = NULL;
5597
5598
5599
5600
5601
5602
5603 update_rq_clock(rq);
5604
5605 for (;;) {
5606
5607
5608
5609
5610 if (rq->nr_running == 1)
5611 break;
5612
5613
5614
5615
5616 next = pick_next_task(rq, &fake_task, rf);
5617 BUG_ON(!next);
5618 put_prev_task(rq, next);
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629 rq_unlock(rq, rf);
5630 raw_spin_lock(&next->pi_lock);
5631 rq_relock(rq, rf);
5632
5633
5634
5635
5636
5637
5638 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
5639 raw_spin_unlock(&next->pi_lock);
5640 continue;
5641 }
5642
5643
5644 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
5645 rq = __migrate_task(rq, rf, next, dest_cpu);
5646 if (rq != dead_rq) {
5647 rq_unlock(rq, rf);
5648 rq = dead_rq;
5649 *rf = orf;
5650 rq_relock(rq, rf);
5651 }
5652 raw_spin_unlock(&next->pi_lock);
5653 }
5654
5655 rq->stop = stop;
5656}
5657#endif
5658
5659void set_rq_online(struct rq *rq)
5660{
5661 if (!rq->online) {
5662 const struct sched_class *class;
5663
5664 cpumask_set_cpu(rq->cpu, rq->rd->online);
5665 rq->online = 1;
5666
5667 for_each_class(class) {
5668 if (class->rq_online)
5669 class->rq_online(rq);
5670 }
5671 }
5672}
5673
5674void set_rq_offline(struct rq *rq)
5675{
5676 if (rq->online) {
5677 const struct sched_class *class;
5678
5679 for_each_class(class) {
5680 if (class->rq_offline)
5681 class->rq_offline(rq);
5682 }
5683
5684 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5685 rq->online = 0;
5686 }
5687}
5688
5689
5690
5691
5692static int num_cpus_frozen;
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702static void cpuset_cpu_active(void)
5703{
5704 if (cpuhp_tasks_frozen) {
5705
5706
5707
5708
5709
5710
5711 partition_sched_domains(1, NULL, NULL);
5712 if (--num_cpus_frozen)
5713 return;
5714
5715
5716
5717
5718
5719 cpuset_force_rebuild();
5720 }
5721 cpuset_update_active_cpus();
5722}
5723
5724static int cpuset_cpu_inactive(unsigned int cpu)
5725{
5726 if (!cpuhp_tasks_frozen) {
5727 if (dl_cpu_busy(cpu))
5728 return -EBUSY;
5729 cpuset_update_active_cpus();
5730 } else {
5731 num_cpus_frozen++;
5732 partition_sched_domains(1, NULL, NULL);
5733 }
5734 return 0;
5735}
5736
5737int sched_cpu_activate(unsigned int cpu)
5738{
5739 struct rq *rq = cpu_rq(cpu);
5740 struct rq_flags rf;
5741
5742#ifdef CONFIG_SCHED_SMT
5743
5744
5745
5746
5747
5748
5749
5750
5751 if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
5752 static_branch_enable_cpuslocked(&sched_smt_present);
5753#endif
5754 set_cpu_active(cpu, true);
5755
5756 if (sched_smp_initialized) {
5757 sched_domains_numa_masks_set(cpu);
5758 cpuset_cpu_active();
5759 }
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770 rq_lock_irqsave(rq, &rf);
5771 if (rq->rd) {
5772 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5773 set_rq_online(rq);
5774 }
5775 rq_unlock_irqrestore(rq, &rf);
5776
5777 update_max_interval();
5778
5779 return 0;
5780}
5781
5782int sched_cpu_deactivate(unsigned int cpu)
5783{
5784 int ret;
5785
5786 set_cpu_active(cpu, false);
5787
5788
5789
5790
5791
5792
5793
5794 synchronize_rcu_mult(call_rcu, call_rcu_sched);
5795
5796 if (!sched_smp_initialized)
5797 return 0;
5798
5799 ret = cpuset_cpu_inactive(cpu);
5800 if (ret) {
5801 set_cpu_active(cpu, true);
5802 return ret;
5803 }
5804 sched_domains_numa_masks_clear(cpu);
5805 return 0;
5806}
5807
5808static void sched_rq_cpu_starting(unsigned int cpu)
5809{
5810 struct rq *rq = cpu_rq(cpu);
5811
5812 rq->calc_load_update = calc_load_update;
5813 update_max_interval();
5814}
5815
5816int sched_cpu_starting(unsigned int cpu)
5817{
5818 sched_rq_cpu_starting(cpu);
5819 sched_tick_start(cpu);
5820 return 0;
5821}
5822
5823#ifdef CONFIG_HOTPLUG_CPU
5824int sched_cpu_dying(unsigned int cpu)
5825{
5826 struct rq *rq = cpu_rq(cpu);
5827 struct rq_flags rf;
5828
5829
5830 sched_ttwu_pending();
5831 sched_tick_stop(cpu);
5832
5833 rq_lock_irqsave(rq, &rf);
5834 if (rq->rd) {
5835 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5836 set_rq_offline(rq);
5837 }
5838 migrate_tasks(rq, &rf);
5839 BUG_ON(rq->nr_running != 1);
5840 rq_unlock_irqrestore(rq, &rf);
5841
5842 calc_load_migrate(rq);
5843 update_max_interval();
5844 nohz_balance_exit_idle(rq);
5845 hrtick_clear(rq);
5846 return 0;
5847}
5848#endif
5849
5850void __init sched_init_smp(void)
5851{
5852 sched_init_numa();
5853
5854
5855
5856
5857
5858
5859 mutex_lock(&sched_domains_mutex);
5860 sched_init_domains(cpu_active_mask);
5861 mutex_unlock(&sched_domains_mutex);
5862
5863
5864 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
5865 BUG();
5866 sched_init_granularity();
5867
5868 init_sched_rt_class();
5869 init_sched_dl_class();
5870
5871 sched_smp_initialized = true;
5872}
5873
5874static int __init migration_init(void)
5875{
5876 sched_rq_cpu_starting(smp_processor_id());
5877 return 0;
5878}
5879early_initcall(migration_init);
5880
5881#else
5882void __init sched_init_smp(void)
5883{
5884 sched_init_granularity();
5885}
5886#endif
5887
5888int in_sched_functions(unsigned long addr)
5889{
5890 return in_lock_functions(addr) ||
5891 (addr >= (unsigned long)__sched_text_start
5892 && addr < (unsigned long)__sched_text_end);
5893}
5894
5895#ifdef CONFIG_CGROUP_SCHED
5896
5897
5898
5899
5900struct task_group root_task_group;
5901LIST_HEAD(task_groups);
5902
5903
5904static struct kmem_cache *task_group_cache __read_mostly;
5905#endif
5906
5907DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
5908DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
5909
5910void __init sched_init(void)
5911{
5912 int i, j;
5913 unsigned long alloc_size = 0, ptr;
5914
5915 wait_bit_init();
5916
5917#ifdef CONFIG_FAIR_GROUP_SCHED
5918 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
5919#endif
5920#ifdef CONFIG_RT_GROUP_SCHED
5921 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
5922#endif
5923 if (alloc_size) {
5924 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
5925
5926#ifdef CONFIG_FAIR_GROUP_SCHED
5927 root_task_group.se = (struct sched_entity **)ptr;
5928 ptr += nr_cpu_ids * sizeof(void **);
5929
5930 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
5931 ptr += nr_cpu_ids * sizeof(void **);
5932
5933#endif
5934#ifdef CONFIG_RT_GROUP_SCHED
5935 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
5936 ptr += nr_cpu_ids * sizeof(void **);
5937
5938 root_task_group.rt_rq = (struct rt_rq **)ptr;
5939 ptr += nr_cpu_ids * sizeof(void **);
5940
5941#endif
5942 }
5943#ifdef CONFIG_CPUMASK_OFFSTACK
5944 for_each_possible_cpu(i) {
5945 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
5946 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
5947 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
5948 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
5949 }
5950#endif
5951
5952 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
5953 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
5954
5955#ifdef CONFIG_SMP
5956 init_defrootdomain();
5957#endif
5958
5959#ifdef CONFIG_RT_GROUP_SCHED
5960 init_rt_bandwidth(&root_task_group.rt_bandwidth,
5961 global_rt_period(), global_rt_runtime());
5962#endif
5963
5964#ifdef CONFIG_CGROUP_SCHED
5965 task_group_cache = KMEM_CACHE(task_group, 0);
5966
5967 list_add(&root_task_group.list, &task_groups);
5968 INIT_LIST_HEAD(&root_task_group.children);
5969 INIT_LIST_HEAD(&root_task_group.siblings);
5970 autogroup_init(&init_task);
5971#endif
5972
5973 for_each_possible_cpu(i) {
5974 struct rq *rq;
5975
5976 rq = cpu_rq(i);
5977 raw_spin_lock_init(&rq->lock);
5978 rq->nr_running = 0;
5979 rq->calc_load_active = 0;
5980 rq->calc_load_update = jiffies + LOAD_FREQ;
5981 init_cfs_rq(&rq->cfs);
5982 init_rt_rq(&rq->rt);
5983 init_dl_rq(&rq->dl);
5984#ifdef CONFIG_FAIR_GROUP_SCHED
5985 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
5986 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
5987 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6008 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6009#endif
6010
6011 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6012#ifdef CONFIG_RT_GROUP_SCHED
6013 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6014#endif
6015
6016 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6017 rq->cpu_load[j] = 0;
6018
6019#ifdef CONFIG_SMP
6020 rq->sd = NULL;
6021 rq->rd = NULL;
6022 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
6023 rq->balance_callback = NULL;
6024 rq->active_balance = 0;
6025 rq->next_balance = jiffies;
6026 rq->push_cpu = 0;
6027 rq->cpu = i;
6028 rq->online = 0;
6029 rq->idle_stamp = 0;
6030 rq->avg_idle = 2*sysctl_sched_migration_cost;
6031 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6032
6033 INIT_LIST_HEAD(&rq->cfs_tasks);
6034
6035 rq_attach_root(rq, &def_root_domain);
6036#ifdef CONFIG_NO_HZ_COMMON
6037 rq->last_load_update_tick = jiffies;
6038 rq->last_blocked_load_update_tick = jiffies;
6039 atomic_set(&rq->nohz_flags, 0);
6040#endif
6041#endif
6042 hrtick_rq_init(rq);
6043 atomic_set(&rq->nr_iowait, 0);
6044 }
6045
6046 set_load_weight(&init_task, false);
6047
6048
6049
6050
6051 mmgrab(&init_mm);
6052 enter_lazy_tlb(&init_mm, current);
6053
6054
6055
6056
6057
6058
6059
6060 init_idle(current, smp_processor_id());
6061
6062 calc_load_update = jiffies + LOAD_FREQ;
6063
6064#ifdef CONFIG_SMP
6065 idle_thread_set_boot_cpu();
6066#endif
6067 init_sched_fair_class();
6068
6069 init_schedstats();
6070
6071 scheduler_running = 1;
6072}
6073
6074#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6075static inline int preempt_count_equals(int preempt_offset)
6076{
6077 int nested = preempt_count() + rcu_preempt_depth();
6078
6079 return (nested == preempt_offset);
6080}
6081
6082void __might_sleep(const char *file, int line, int preempt_offset)
6083{
6084
6085
6086
6087
6088
6089 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
6090 "do not call blocking ops when !TASK_RUNNING; "
6091 "state=%lx set at [<%p>] %pS\n",
6092 current->state,
6093 (void *)current->task_state_change,
6094 (void *)current->task_state_change);
6095
6096 ___might_sleep(file, line, preempt_offset);
6097}
6098EXPORT_SYMBOL(__might_sleep);
6099
6100void ___might_sleep(const char *file, int line, int preempt_offset)
6101{
6102
6103 static unsigned long prev_jiffy;
6104
6105 unsigned long preempt_disable_ip;
6106
6107
6108 rcu_sleep_check();
6109
6110 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6111 !is_idle_task(current)) ||
6112 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
6113 oops_in_progress)
6114 return;
6115
6116 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6117 return;
6118 prev_jiffy = jiffies;
6119
6120
6121 preempt_disable_ip = get_preempt_disable_ip(current);
6122
6123 printk(KERN_ERR
6124 "BUG: sleeping function called from invalid context at %s:%d\n",
6125 file, line);
6126 printk(KERN_ERR
6127 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6128 in_atomic(), irqs_disabled(),
6129 current->pid, current->comm);
6130
6131 if (task_stack_end_corrupted(current))
6132 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
6133
6134 debug_show_held_locks(current);
6135 if (irqs_disabled())
6136 print_irqtrace_events(current);
6137 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
6138 && !preempt_count_equals(preempt_offset)) {
6139 pr_err("Preemption disabled at:");
6140 print_ip_sym(preempt_disable_ip);
6141 pr_cont("\n");
6142 }
6143 dump_stack();
6144 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
6145}
6146EXPORT_SYMBOL(___might_sleep);
6147#endif
6148
6149#ifdef CONFIG_MAGIC_SYSRQ
6150void normalize_rt_tasks(void)
6151{
6152 struct task_struct *g, *p;
6153 struct sched_attr attr = {
6154 .sched_policy = SCHED_NORMAL,
6155 };
6156
6157 read_lock(&tasklist_lock);
6158 for_each_process_thread(g, p) {
6159
6160
6161
6162 if (p->flags & PF_KTHREAD)
6163 continue;
6164
6165 p->se.exec_start = 0;
6166 schedstat_set(p->se.statistics.wait_start, 0);
6167 schedstat_set(p->se.statistics.sleep_start, 0);
6168 schedstat_set(p->se.statistics.block_start, 0);
6169
6170 if (!dl_task(p) && !rt_task(p)) {
6171
6172
6173
6174
6175 if (task_nice(p) < 0)
6176 set_user_nice(p, 0);
6177 continue;
6178 }
6179
6180 __sched_setscheduler(p, &attr, false, false);
6181 }
6182 read_unlock(&tasklist_lock);
6183}
6184
6185#endif
6186
6187#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206struct task_struct *curr_task(int cpu)
6207{
6208 return cpu_curr(cpu);
6209}
6210
6211#endif
6212
6213#ifdef CONFIG_IA64
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229void ia64_set_curr_task(int cpu, struct task_struct *p)
6230{
6231 cpu_curr(cpu) = p;
6232}
6233
6234#endif
6235
6236#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
6238static DEFINE_SPINLOCK(task_group_lock);
6239
6240static void sched_free_group(struct task_group *tg)
6241{
6242 free_fair_sched_group(tg);
6243 free_rt_sched_group(tg);
6244 autogroup_free(tg);
6245 kmem_cache_free(task_group_cache, tg);
6246}
6247
/* Allocate runqueue etc. for a new task group: */
6249struct task_group *sched_create_group(struct task_group *parent)
6250{
6251 struct task_group *tg;
6252
6253 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
6254 if (!tg)
6255 return ERR_PTR(-ENOMEM);
6256
6257 if (!alloc_fair_sched_group(tg, parent))
6258 goto err;
6259
6260 if (!alloc_rt_sched_group(tg, parent))
6261 goto err;
6262
6263 return tg;
6264
6265err:
6266 sched_free_group(tg);
6267 return ERR_PTR(-ENOMEM);
6268}
6269
6270void sched_online_group(struct task_group *tg, struct task_group *parent)
6271{
6272 unsigned long flags;
6273
6274 spin_lock_irqsave(&task_group_lock, flags);
6275 list_add_rcu(&tg->list, &task_groups);
6276
 /* Root should already exist: */
6278 WARN_ON(!parent);
6279
6280 tg->parent = parent;
6281 INIT_LIST_HEAD(&tg->children);
6282 list_add_rcu(&tg->siblings, &parent->children);
6283 spin_unlock_irqrestore(&task_group_lock, flags);
6284
6285 online_fair_sched_group(tg);
6286}
6287
/* RCU callback to free the various structures associated with a task group: */
6289static void sched_free_group_rcu(struct rcu_head *rhp)
6290{
 /* Now it should be safe to free those cfs_rqs: */
6292 sched_free_group(container_of(rhp, struct task_group, rcu));
6293}
6294
6295void sched_destroy_group(struct task_group *tg)
6296{
 /* Wait for possible concurrent references to cfs_rqs to complete: */
6298 call_rcu(&tg->rcu, sched_free_group_rcu);
6299}
6300
6301void sched_offline_group(struct task_group *tg)
6302{
6303 unsigned long flags;
6304
 /* End participation in shares distribution: */
6306 unregister_fair_sched_group(tg);
6307
6308 spin_lock_irqsave(&task_group_lock, flags);
6309 list_del_rcu(&tg->list);
6310 list_del_rcu(&tg->siblings);
6311 spin_unlock_irqrestore(&task_group_lock, flags);
6312}
6313
6314static void sched_change_group(struct task_struct *tsk, int type)
6315{
6316 struct task_group *tg;
6317
 /*
 * All callers are synchronized by task_rq_lock(); we do not use RCU
 * which is pointless here. Thus, we pass "true" to task_css_check()
 * to prevent lockdep warnings.
 */
6323 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
6324 struct task_group, css);
6325 tg = autogroup_task_group(tsk, tg);
6326 tsk->sched_task_group = tg;
6327
6328#ifdef CONFIG_FAIR_GROUP_SCHED
6329 if (tsk->sched_class->task_change_group)
6330 tsk->sched_class->task_change_group(tsk, type);
6331 else
6332#endif
6333 set_task_rq(tsk, task_cpu(tsk));
6334}
6335
/*
 * Change a task's runqueue when it moves between groups.
 *
 * The caller of this function should have put the task in its new group by
 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
 * reflect its new group.
 */
6343void sched_move_task(struct task_struct *tsk)
6344{
6345 int queued, running, queue_flags =
6346 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6347 struct rq_flags rf;
6348 struct rq *rq;
6349
6350 rq = task_rq_lock(tsk, &rf);
6351 update_rq_clock(rq);
6352
6353 running = task_current(rq, tsk);
6354 queued = task_on_rq_queued(tsk);
6355
6356 if (queued)
6357 dequeue_task(rq, tsk, queue_flags);
6358 if (running)
6359 put_prev_task(rq, tsk);
6360
6361 sched_change_group(tsk, TASK_MOVE_GROUP);
6362
6363 if (queued)
6364 enqueue_task(rq, tsk, queue_flags);
6365 if (running)
6366 set_curr_task(rq, tsk);
6367
6368 task_rq_unlock(rq, tsk, &rf);
6369}
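
/*
 * For context (illustrative): with the cpu controller mounted, moving a task
 * between groups from userspace ends up here via cpu_cgroup_attach(), e.g.:
 *
 *	# echo $PID > /sys/fs/cgroup/cpu/mygroup/cgroup.procs
 *
 * (the mount point is up to userspace; /sys/fs/cgroup/cpu is merely the
 * common choice). The dequeue/put_prev + enqueue/set_curr pattern above keeps
 * the task's runqueue state consistent while its task_group changes.
 */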
6370
6371static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
6372{
6373 return css ? container_of(css, struct task_group, css) : NULL;
6374}
6375
6376static struct cgroup_subsys_state *
6377cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6378{
6379 struct task_group *parent = css_tg(parent_css);
6380 struct task_group *tg;
6381
6382 if (!parent) {
 /* This is early initialization for the top cgroup: */
6384 return &root_task_group.css;
6385 }
6386
6387 tg = sched_create_group(parent);
6388 if (IS_ERR(tg))
6389 return ERR_PTR(-ENOMEM);
6390
6391 return &tg->css;
6392}
6393
/* Expose the task group only after completing cgroup initialization: */
6395static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
6396{
6397 struct task_group *tg = css_tg(css);
6398 struct task_group *parent = css_tg(css->parent);
6399
6400 if (parent)
6401 sched_online_group(tg, parent);
6402 return 0;
6403}
6404
6405static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
6406{
6407 struct task_group *tg = css_tg(css);
6408
6409 sched_offline_group(tg);
6410}
6411
6412static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
6413{
6414 struct task_group *tg = css_tg(css);
6415
 /*
 * Relies on the RCU grace period between css_released() and this.
 */
6419 sched_free_group(tg);
6420}
6421
/*
 * This is called before wake_up_new_task(), therefore we really only
 * have to set its group bits, all the other stuff does not apply.
 */
6426static void cpu_cgroup_fork(struct task_struct *task)
6427{
6428 struct rq_flags rf;
6429 struct rq *rq;
6430
6431 rq = task_rq_lock(task, &rf);
6432
6433 update_rq_clock(rq);
6434 sched_change_group(task, TASK_SET_GROUP);
6435
6436 task_rq_unlock(rq, task, &rf);
6437}
6438
6439static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
6440{
6441 struct task_struct *task;
6442 struct cgroup_subsys_state *css;
6443 int ret = 0;
6444
6445 cgroup_taskset_for_each(task, css, tset) {
6446#ifdef CONFIG_RT_GROUP_SCHED
6447 if (!sched_rt_can_attach(css_tg(css), task))
6448 return -EINVAL;
6449#else
 /* We don't support RT-tasks being in separate groups */
6451 if (task->sched_class != &fair_sched_class)
6452 return -EINVAL;
6453#endif
 /*
 * Serialize against wake_up_new_task() such that if it's
 * running, we're sure to observe its full state.
 */
6458 raw_spin_lock_irq(&task->pi_lock);
 /*
 * Avoid calling sched_move_task() before wake_up_new_task()
 * has happened. This would lead to problems with PELT, due to
 * move wanting to detach+attach while we're not attached yet.
 */
6464 if (task->state == TASK_NEW)
6465 ret = -EINVAL;
6466 raw_spin_unlock_irq(&task->pi_lock);
6467
6468 if (ret)
6469 break;
6470 }
6471 return ret;
6472}
6473
6474static void cpu_cgroup_attach(struct cgroup_taskset *tset)
6475{
6476 struct task_struct *task;
6477 struct cgroup_subsys_state *css;
6478
6479 cgroup_taskset_for_each(task, css, tset)
6480 sched_move_task(task);
6481}
6482
6483#ifdef CONFIG_FAIR_GROUP_SCHED
6484static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
6485 struct cftype *cftype, u64 shareval)
6486{
6487 return sched_group_set_shares(css_tg(css), scale_load(shareval));
6488}
6489
6490static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
6491 struct cftype *cft)
6492{
6493 struct task_group *tg = css_tg(css);
6494
6495 return (u64) scale_load_down(tg->shares);
6496}
6497
6498#ifdef CONFIG_CFS_BANDWIDTH
6499static DEFINE_MUTEX(cfs_constraints_mutex);
6500
6501const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
6502const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
6503
6504static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
6505
6506static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
6507{
6508 int i, ret = 0, runtime_enabled, runtime_was_enabled;
6509 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6510
6511 if (tg == &root_task_group)
6512 return -EINVAL;
6513
 /*
 * Ensure we have at least some amount of bandwidth every period. This is
 * to prevent reaching a state of large arrears when throttled via
 * entity_tick(), resulting in prolonged exit starvation.
 */
6519 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
6520 return -EINVAL;
6521
 /*
 * Likewise, bound things on the other side by preventing insane quota
 * periods. This also allows us to normalize in computing quota
 * feasibility.
 */
6527 if (period > max_cfs_quota_period)
6528 return -EINVAL;
6529
 /*
 * Prevent a race between setting of cfs_rq->runtime_enabled and
 * unthrottle_offline_cfs_rqs().
 */
6534 get_online_cpus();
6535 mutex_lock(&cfs_constraints_mutex);
6536 ret = __cfs_schedulable(tg, period, quota);
6537 if (ret)
6538 goto out_unlock;
6539
6540 runtime_enabled = quota != RUNTIME_INF;
6541 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
 /*
 * If we need to toggle cfs_bandwidth_used, off->on must occur
 * before making related changes, and on->off must occur afterwards.
 */
6546 if (runtime_enabled && !runtime_was_enabled)
6547 cfs_bandwidth_usage_inc();
6548 raw_spin_lock_irq(&cfs_b->lock);
6549 cfs_b->period = ns_to_ktime(period);
6550 cfs_b->quota = quota;
6551
6552 __refill_cfs_bandwidth_runtime(cfs_b);
6553
 /* Restart the period timer (if active) to handle new period expiry: */
6555 if (runtime_enabled)
6556 start_cfs_bandwidth(cfs_b);
6557
6558 raw_spin_unlock_irq(&cfs_b->lock);
6559
6560 for_each_online_cpu(i) {
6561 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
6562 struct rq *rq = cfs_rq->rq;
6563 struct rq_flags rf;
6564
6565 rq_lock_irq(rq, &rf);
6566 cfs_rq->runtime_enabled = runtime_enabled;
6567 cfs_rq->runtime_remaining = 0;
6568
6569 if (cfs_rq->throttled)
6570 unthrottle_cfs_rq(cfs_rq);
6571 rq_unlock_irq(rq, &rf);
6572 }
6573 if (runtime_was_enabled && !runtime_enabled)
6574 cfs_bandwidth_usage_dec();
6575out_unlock:
6576 mutex_unlock(&cfs_constraints_mutex);
6577 put_online_cpus();
6578
6579 return ret;
6580}
6581
6582int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
6583{
6584 u64 quota, period;
6585
6586 period = ktime_to_ns(tg->cfs_bandwidth.period);
6587 if (cfs_quota_us < 0)
6588 quota = RUNTIME_INF;
6589 else
6590 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
6591
6592 return tg_set_cfs_bandwidth(tg, period, quota);
6593}
6594
6595long tg_get_cfs_quota(struct task_group *tg)
6596{
6597 u64 quota_us;
6598
6599 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
6600 return -1;
6601
6602 quota_us = tg->cfs_bandwidth.quota;
6603 do_div(quota_us, NSEC_PER_USEC);
6604
6605 return quota_us;
6606}
6607
6608int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
6609{
6610 u64 quota, period;
6611
6612 period = (u64)cfs_period_us * NSEC_PER_USEC;
6613 quota = tg->cfs_bandwidth.quota;
6614
6615 return tg_set_cfs_bandwidth(tg, period, quota);
6616}
6617
6618long tg_get_cfs_period(struct task_group *tg)
6619{
6620 u64 cfs_period_us;
6621
6622 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
6623 do_div(cfs_period_us, NSEC_PER_USEC);
6624
6625 return cfs_period_us;
6626}
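
/*
 * Worked example (illustrative values): with cfs_period_us = 100000 and
 * cfs_quota_us = 50000, the group may consume at most 50ms of CPU time per
 * 100ms period, i.e. half of one CPU; cfs_quota_us = 200000 with the same
 * period allows two CPUs worth of runtime per period, and cfs_quota_us = -1
 * maps to RUNTIME_INF (no limit).
 */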
6627
6628static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
6629 struct cftype *cft)
6630{
6631 return tg_get_cfs_quota(css_tg(css));
6632}
6633
6634static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
6635 struct cftype *cftype, s64 cfs_quota_us)
6636{
6637 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
6638}
6639
6640static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
6641 struct cftype *cft)
6642{
6643 return tg_get_cfs_period(css_tg(css));
6644}
6645
6646static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
6647 struct cftype *cftype, u64 cfs_period_us)
6648{
6649 return tg_set_cfs_period(css_tg(css), cfs_period_us);
6650}
6651
6652struct cfs_schedulable_data {
6653 struct task_group *tg;
6654 u64 period, quota;
6655};
6656
/*
 * Normalize group quota/period to be quota/max_period;
 * note: units are usecs.
 */
6661static u64 normalize_cfs_quota(struct task_group *tg,
6662 struct cfs_schedulable_data *d)
6663{
6664 u64 quota, period;
6665
6666 if (tg == d->tg) {
6667 period = d->period;
6668 quota = d->quota;
6669 } else {
6670 period = tg_get_cfs_period(tg);
6671 quota = tg_get_cfs_quota(tg);
6672 }
6673
 /* Note: these should typically be equivalent: */
6675 if (quota == RUNTIME_INF || quota == -1)
6676 return RUNTIME_INF;
6677
6678 return to_ratio(period, quota);
6679}
6680
6681static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
6682{
6683 struct cfs_schedulable_data *d = data;
6684 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6685 s64 quota = 0, parent_quota = -1;
6686
6687 if (!tg->parent) {
6688 quota = RUNTIME_INF;
6689 } else {
6690 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
6691
6692 quota = normalize_cfs_quota(tg, d);
6693 parent_quota = parent_b->hierarchical_quota;
6694
 /*
 * Ensure max(child_quota) <= parent_quota. On cgroup2,
 * always take the min. On cgroup1, only inherit when no
 * limit is set:
 */
6700 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
6701 quota = min(quota, parent_quota);
6702 } else {
6703 if (quota == RUNTIME_INF)
6704 quota = parent_quota;
6705 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
6706 return -EINVAL;
6707 }
6708 }
6709 cfs_b->hierarchical_quota = quota;
6710
6711 return 0;
6712}
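
/*
 * Worked example (illustrative, assuming the usual BW_SHIFT of 20 used by
 * to_ratio()): a child with quota 50000us over a 100000us period normalizes
 * to (50000 << 20) / 100000 = 524288. On cgroup1 such a child is rejected
 * with -EINVAL if its parent's hierarchical_quota is a smaller, finite
 * ratio; on cgroup2 the child's value is simply clamped to
 * min(child, parent).
 */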
6713
6714static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
6715{
6716 int ret;
6717 struct cfs_schedulable_data data = {
6718 .tg = tg,
6719 .period = period,
6720 .quota = quota,
6721 };
6722
6723 if (quota != RUNTIME_INF) {
6724 do_div(data.period, NSEC_PER_USEC);
6725 do_div(data.quota, NSEC_PER_USEC);
6726 }
6727
6728 rcu_read_lock();
6729 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
6730 rcu_read_unlock();
6731
6732 return ret;
6733}
6734
6735static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
6736{
6737 struct task_group *tg = css_tg(seq_css(sf));
6738 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6739
6740 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
6741 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
6742 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
6743
6744 if (schedstat_enabled() && tg != &root_task_group) {
6745 u64 ws = 0;
6746 int i;
6747
6748 for_each_possible_cpu(i)
6749 ws += schedstat_val(tg->se[i]->statistics.wait_sum);
6750
6751 seq_printf(sf, "wait_sum %llu\n", ws);
6752 }
6753
6754 return 0;
6755}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
6758
6759#ifdef CONFIG_RT_GROUP_SCHED
6760static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
6761 struct cftype *cft, s64 val)
6762{
6763 return sched_group_set_rt_runtime(css_tg(css), val);
6764}
6765
6766static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
6767 struct cftype *cft)
6768{
6769 return sched_group_rt_runtime(css_tg(css));
6770}
6771
6772static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
6773 struct cftype *cftype, u64 rt_period_us)
6774{
6775 return sched_group_set_rt_period(css_tg(css), rt_period_us);
6776}
6777
6778static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
6779 struct cftype *cft)
6780{
6781 return sched_group_rt_period(css_tg(css));
6782}
6783#endif
6784
6785static struct cftype cpu_legacy_files[] = {
6786#ifdef CONFIG_FAIR_GROUP_SCHED
6787 {
6788 .name = "shares",
6789 .read_u64 = cpu_shares_read_u64,
6790 .write_u64 = cpu_shares_write_u64,
6791 },
6792#endif
6793#ifdef CONFIG_CFS_BANDWIDTH
6794 {
6795 .name = "cfs_quota_us",
6796 .read_s64 = cpu_cfs_quota_read_s64,
6797 .write_s64 = cpu_cfs_quota_write_s64,
6798 },
6799 {
6800 .name = "cfs_period_us",
6801 .read_u64 = cpu_cfs_period_read_u64,
6802 .write_u64 = cpu_cfs_period_write_u64,
6803 },
6804 {
6805 .name = "stat",
6806 .seq_show = cpu_cfs_stat_show,
6807 },
6808#endif
6809#ifdef CONFIG_RT_GROUP_SCHED
6810 {
6811 .name = "rt_runtime_us",
6812 .read_s64 = cpu_rt_runtime_read,
6813 .write_s64 = cpu_rt_runtime_write,
6814 },
6815 {
6816 .name = "rt_period_us",
6817 .read_u64 = cpu_rt_period_read_uint,
6818 .write_u64 = cpu_rt_period_write_uint,
6819 },
6820#endif
6821 { }
6822};
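
/*
 * Illustrative cgroup v1 usage of the legacy files above (the mount point is
 * typically /sys/fs/cgroup/cpu, but that is up to userspace):
 *
 *	# echo 512    > /sys/fs/cgroup/cpu/mygroup/cpu.shares
 *	# echo 100000 > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_period_us
 *	# echo 50000  > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_quota_us
 *	# cat /sys/fs/cgroup/cpu/mygroup/cpu.stat
 */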
6823
6824static int cpu_extra_stat_show(struct seq_file *sf,
6825 struct cgroup_subsys_state *css)
6826{
6827#ifdef CONFIG_CFS_BANDWIDTH
6828 {
6829 struct task_group *tg = css_tg(css);
6830 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6831 u64 throttled_usec;
6832
6833 throttled_usec = cfs_b->throttled_time;
6834 do_div(throttled_usec, NSEC_PER_USEC);
6835
6836 seq_printf(sf, "nr_periods %d\n"
6837 "nr_throttled %d\n"
6838 "throttled_usec %llu\n",
6839 cfs_b->nr_periods, cfs_b->nr_throttled,
6840 throttled_usec);
6841 }
6842#endif
6843 return 0;
6844}
6845
6846#ifdef CONFIG_FAIR_GROUP_SCHED
6847static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
6848 struct cftype *cft)
6849{
6850 struct task_group *tg = css_tg(css);
6851 u64 weight = scale_load_down(tg->shares);
6852
6853 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
6854}
6855
6856static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
6857 struct cftype *cft, u64 weight)
6858{
 /*
 * cgroup weight knobs should use the common MIN, DFL and MAX
 * values which are 1, 100 and 10000 respectively. While it loses
 * a bit of range on both ends, it maps pretty well onto the shares
 * value used by the scheduler and the round-trip conversions preserve
 * the original value over the entire range.
 */
6866 if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
6867 return -ERANGE;
6868
6869 weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
6870
6871 return sched_group_set_shares(css_tg(css), scale_load(weight));
6872}
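
/*
 * Worked example for the two conversions above (illustrative): the cgroup2
 * default weight of 100 maps to the default 1024 shares, so cpu.weight = 200
 * becomes 2048 shares and cpu.weight = 50 becomes 512. Reading converts back
 * with the same rounding, so values survive a write/read round trip across
 * the 1..10000 range.
 */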
6873
6874static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
6875 struct cftype *cft)
6876{
6877 unsigned long weight = scale_load_down(css_tg(css)->shares);
6878 int last_delta = INT_MAX;
6879 int prio, delta;
6880
 /* Find the closest nice value to the current weight: */
6882 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
6883 delta = abs(sched_prio_to_weight[prio] - weight);
6884 if (delta >= last_delta)
6885 break;
6886 last_delta = delta;
6887 }
6888
6889 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
6890}
6891
6892static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
6893 struct cftype *cft, s64 nice)
6894{
6895 unsigned long weight;
6896 int idx;
6897
6898 if (nice < MIN_NICE || nice > MAX_NICE)
6899 return -ERANGE;
6900
6901 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
6902 idx = array_index_nospec(idx, 40);
6903 weight = sched_prio_to_weight[idx];
6904
6905 return sched_group_set_shares(css_tg(css), scale_load(weight));
6906}
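
/*
 * Worked example (illustrative): cpu.weight.nice uses the same table as task
 * nice values, so writing 0 selects sched_prio_to_weight[20] = 1024, -20
 * selects 88761 and 19 selects 15; reading reports the nice level whose
 * weight is closest to the group's current shares.
 */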
6907#endif
6908
6909static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
6910 long period, long quota)
6911{
6912 if (quota < 0)
6913 seq_puts(sf, "max");
6914 else
6915 seq_printf(sf, "%ld", quota);
6916
6917 seq_printf(sf, " %ld\n", period);
6918}
6919
/* The caller should put the current value in *@periodp before calling: */
6921static int __maybe_unused cpu_period_quota_parse(char *buf,
6922 u64 *periodp, u64 *quotap)
6923{
 char tok[21]; /* U64_MAX */

 /* Bound the token to the buffer size to avoid overflowing tok[]: */
 if (!sscanf(buf, "%20s %llu", tok, periodp))
6927 return -EINVAL;
6928
6929 *periodp *= NSEC_PER_USEC;
6930
6931 if (sscanf(tok, "%llu", quotap))
6932 *quotap *= NSEC_PER_USEC;
6933 else if (!strcmp(tok, "max"))
6934 *quotap = RUNTIME_INF;
6935 else
6936 return -EINVAL;
6937
6938 return 0;
6939}
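
/*
 * Accepted formats (illustrative): "max 100000" (unlimited quota, 100ms
 * period), "50000 100000" (50ms of runtime per 100ms period), or a single
 * "max"/"50000" token, which keeps the period previously stored in *periodp
 * by the caller. Values are converted to nanoseconds on the way in.
 */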
6940
6941#ifdef CONFIG_CFS_BANDWIDTH
6942static int cpu_max_show(struct seq_file *sf, void *v)
6943{
6944 struct task_group *tg = css_tg(seq_css(sf));
6945
6946 cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
6947 return 0;
6948}
6949
6950static ssize_t cpu_max_write(struct kernfs_open_file *of,
6951 char *buf, size_t nbytes, loff_t off)
6952{
6953 struct task_group *tg = css_tg(of_css(of));
6954 u64 period = tg_get_cfs_period(tg);
6955 u64 quota;
6956 int ret;
6957
 ret = cpu_period_quota_parse(buf, &period, &quota);
6959 if (!ret)
6960 ret = tg_set_cfs_bandwidth(tg, period, quota);
6961 return ret ?: nbytes;
6962}
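
/*
 * Illustrative cgroup v2 usage of cpu.max (the unified hierarchy is commonly
 * mounted at /sys/fs/cgroup):
 *
 *	# cat /sys/fs/cgroup/mygroup/cpu.max
 *	max 100000
 *	# echo "50000 100000" > /sys/fs/cgroup/mygroup/cpu.max
 */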
6963#endif
6964
6965static struct cftype cpu_files[] = {
6966#ifdef CONFIG_FAIR_GROUP_SCHED
6967 {
6968 .name = "weight",
6969 .flags = CFTYPE_NOT_ON_ROOT,
6970 .read_u64 = cpu_weight_read_u64,
6971 .write_u64 = cpu_weight_write_u64,
6972 },
6973 {
6974 .name = "weight.nice",
6975 .flags = CFTYPE_NOT_ON_ROOT,
6976 .read_s64 = cpu_weight_nice_read_s64,
6977 .write_s64 = cpu_weight_nice_write_s64,
6978 },
6979#endif
6980#ifdef CONFIG_CFS_BANDWIDTH
6981 {
6982 .name = "max",
6983 .flags = CFTYPE_NOT_ON_ROOT,
6984 .seq_show = cpu_max_show,
6985 .write = cpu_max_write,
6986 },
6987#endif
6988 { }
6989};
6990
6991struct cgroup_subsys cpu_cgrp_subsys = {
6992 .css_alloc = cpu_cgroup_css_alloc,
6993 .css_online = cpu_cgroup_css_online,
6994 .css_released = cpu_cgroup_css_released,
6995 .css_free = cpu_cgroup_css_free,
6996 .css_extra_stat_show = cpu_extra_stat_show,
6997 .fork = cpu_cgroup_fork,
6998 .can_attach = cpu_cgroup_can_attach,
6999 .attach = cpu_cgroup_attach,
7000 .legacy_cftypes = cpu_legacy_files,
7001 .dfl_cftypes = cpu_files,
7002 .early_init = true,
7003 .threaded = true,
7004};
7005
#endif /* CONFIG_CGROUP_SCHED */
7007
7008void dump_cpu_task(int cpu)
7009{
7010 pr_info("Task dump for CPU %d:\n", cpu);
7011 sched_show_task(cpu_curr(cpu));
7012}
7013
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25:
 * if a task goes up by ~10% and another task goes down by ~10%, then
 * the relative distance between them is ~25%.)
 */
7026const int sched_prio_to_weight[40] = {
7027 88761, 71755, 56483, 46273, 36291,
7028 29154, 23254, 18705, 14949, 11916,
7029 9548, 7620, 6100, 4904, 3906,
7030 3121, 2501, 1991, 1586, 1277,
7031 1024, 820, 655, 526, 423,
7032 335, 272, 215, 172, 137,
7033 110, 87, 70, 56, 45,
7034 36, 29, 23, 18, 15,
7035};
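
/*
 * Worked example (illustrative): adjacent entries differ by a factor of
 * about 1.25, e.g. 1277 / 1024 ~= 1.25. If a nice 0 task (weight 1024) and a
 * nice -1 task (weight 1277) compete for one CPU, they receive roughly
 * 1024 / 2301 ~= 44.5% and 1277 / 2301 ~= 55.5% of it, i.e. about a 10%
 * swing per nice level, as described above.
 */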
7036
/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
7044const u32 sched_prio_to_wmult[40] = {
7045 48388, 59856, 76040, 92818, 118348,
7046 147320, 184698, 229616, 287308, 360437,
7047 449829, 563644, 704093, 875809, 1099582,
7048 1376151, 1717300, 2157191, 2708050, 3363326,
7049 4194304, 5237765, 6557202, 8165337, 10153587,
7050 12820798, 15790321, 19976592, 24970740, 31350126,
7051 39045157, 49367440, 61356676, 76695844, 95443717,
7052 119304647, 148102320, 186737708, 238609294, 286331153,
7053};
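
/*
 * Worked example (illustrative): each entry is 2^32 divided by the matching
 * weight, e.g. 2^32 / 1024 = 4194304 (the nice 0 entry) and
 * 2^32 / 88761 ~= 48388 (the nice -20 entry), so "x / weight" can be
 * computed as "(x * wmult) >> 32".
 */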
7054
7055#undef CREATE_TRACE_POINTS
7056