// SPDX-License-Identifier: GPL-2.0
/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#include "sched.h"

#include <linux/nospec.h>

#include <linux/kcov.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>

#include "../workqueue_internal.h"
#include "../smpboot.h"

#include "pelt.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
26
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
/*
 * Debugging: various feature bits
 *
 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
 * sysctl_sched_features, defined in sched.h, to allow constants propagation
 * at compile time and compiler optimization based on features default.
 */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;
#undef SCHED_FEAT
#endif

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;
62
/*
 * __task_rq_lock - lock the rq @p resides on.
 */
66struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
67 __acquires(rq->lock)
68{
69 struct rq *rq;
70
71 lockdep_assert_held(&p->pi_lock);
72
73 for (;;) {
74 rq = task_rq(p);
75 raw_spin_lock(&rq->lock);
76 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
77 rq_pin_lock(rq, rf);
78 return rq;
79 }
80 raw_spin_unlock(&rq->lock);
81
82 while (unlikely(task_on_rq_migrating(p)))
83 cpu_relax();
84 }
85}
86
/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
90struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
91 __acquires(p->pi_lock)
92 __acquires(rq->lock)
93{
94 struct rq *rq;
95
96 for (;;) {
97 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
98 rq = task_rq(p);
99 raw_spin_lock(&rq->lock);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old CPU in task_rq_lock(), the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock(), the address
		 * dependency headed by '[L] rq = task_rq()' and the acquire
		 * will pair with the WMB so we then also observe migrating.
		 */
117 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
118 rq_pin_lock(rq, rf);
119 return rq;
120 }
121 raw_spin_unlock(&rq->lock);
122 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
123
124 while (unlikely(task_on_rq_migrating(p)))
125 cpu_relax();
126 }
127}
128
129
130
131
132
133static void update_rq_clock_task(struct rq *rq, s64 delta)
134{
135
136
137
138
139 s64 __maybe_unused steal = 0, irq_delta = 0;
140
141#ifdef CONFIG_IRQ_TIME_ACCOUNTING
142 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159 if (irq_delta > delta)
160 irq_delta = delta;
161
162 rq->prev_irq_time += irq_delta;
163 delta -= irq_delta;
164#endif
165#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
167 steal = paravirt_steal_clock(cpu_of(rq));
168 steal -= rq->prev_steal_time_rq;
169
170 if (unlikely(steal > delta))
171 steal = delta;
172
173 rq->prev_steal_time_rq += steal;
174 delta -= steal;
175 }
176#endif
177
178 rq->clock_task += delta;
179
180#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
181 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
182 update_irq_load_avg(rq, irq_delta + steal);
183#endif
184 update_rq_clock_pelt(rq, delta);
185}
186
187void update_rq_clock(struct rq *rq)
188{
189 s64 delta;
190
191 lockdep_assert_held(&rq->lock);
192
193 if (rq->clock_update_flags & RQCF_ACT_SKIP)
194 return;
195
196#ifdef CONFIG_SCHED_DEBUG
197 if (sched_feat(WARN_DOUBLE_CLOCK))
198 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
199 rq->clock_update_flags |= RQCF_UPDATED;
200#endif
201
202 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
203 if (delta < 0)
204 return;
205 rq->clock += delta;
206 update_rq_clock_task(rq, delta);
207}
208
209
210#ifdef CONFIG_SCHED_HRTICK
211
212
213
214
215static void hrtick_clear(struct rq *rq)
216{
217 if (hrtimer_active(&rq->hrtick_timer))
218 hrtimer_cancel(&rq->hrtick_timer);
219}
220
221
222
223
224
225static enum hrtimer_restart hrtick(struct hrtimer *timer)
226{
227 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
228 struct rq_flags rf;
229
230 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
231
232 rq_lock(rq, &rf);
233 update_rq_clock(rq);
234 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
235 rq_unlock(rq, &rf);
236
237 return HRTIMER_NORESTART;
238}
239
240#ifdef CONFIG_SMP
241
242static void __hrtick_restart(struct rq *rq)
243{
244 struct hrtimer *timer = &rq->hrtick_timer;
245
246 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
247}
248
249
250
251
252static void __hrtick_start(void *arg)
253{
254 struct rq *rq = arg;
255 struct rq_flags rf;
256
257 rq_lock(rq, &rf);
258 __hrtick_restart(rq);
259 rq->hrtick_csd_pending = 0;
260 rq_unlock(rq, &rf);
261}
262
263
264
265
266
267
268void hrtick_start(struct rq *rq, u64 delay)
269{
270 struct hrtimer *timer = &rq->hrtick_timer;
271 ktime_t time;
272 s64 delta;
273
274
275
276
277
278 delta = max_t(s64, delay, 10000LL);
279 time = ktime_add_ns(timer->base->get_time(), delta);
280
281 hrtimer_set_expires(timer, time);
282
283 if (rq == this_rq()) {
284 __hrtick_restart(rq);
285 } else if (!rq->hrtick_csd_pending) {
286 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
287 rq->hrtick_csd_pending = 1;
288 }
289}
290
291#else
292
293
294
295
296
297void hrtick_start(struct rq *rq, u64 delay)
298{
299
300
301
302
303 delay = max_t(u64, delay, 10000LL);
304 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
305 HRTIMER_MODE_REL_PINNED);
306}
307#endif
308
309static void hrtick_rq_init(struct rq *rq)
310{
311#ifdef CONFIG_SMP
312 rq->hrtick_csd_pending = 0;
313
314 rq->hrtick_csd.flags = 0;
315 rq->hrtick_csd.func = __hrtick_start;
316 rq->hrtick_csd.info = rq;
317#endif
318
319 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
320 rq->hrtick_timer.function = hrtick;
321}
322#else
323static inline void hrtick_clear(struct rq *rq)
324{
325}
326
327static inline void hrtick_rq_init(struct rq *rq)
328{
329}
330#endif
331
/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
335#define fetch_or(ptr, mask) \
336 ({ \
337 typeof(ptr) _ptr = (ptr); \
338 typeof(mask) _mask = (mask); \
339 typeof(*_ptr) _old, _val = *_ptr; \
340 \
341 for (;;) { \
342 _old = cmpxchg(_ptr, _val, _val | _mask); \
343 if (_old == _val) \
344 break; \
345 _val = _old; \
346 } \
347 _old; \
348})
349
350#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
351
352
353
354
355
356static bool set_nr_and_not_polling(struct task_struct *p)
357{
358 struct thread_info *ti = task_thread_info(p);
359 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
360}
361
362
363
364
365
366
367
368static bool set_nr_if_polling(struct task_struct *p)
369{
370 struct thread_info *ti = task_thread_info(p);
371 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
372
373 for (;;) {
374 if (!(val & _TIF_POLLING_NRFLAG))
375 return false;
376 if (val & _TIF_NEED_RESCHED)
377 return true;
378 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
379 if (old == val)
380 break;
381 val = old;
382 }
383 return true;
384}
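
/*
 * A rough sketch of how the TIF_POLLING_NRFLAG protocol avoids IPIs: an idle
 * CPU that polls its thread flags advertises TIF_POLLING_NRFLAG; a remote
 * waker then only needs to set TIF_NEED_RESCHED and the polling loop will
 * notice it:
 *
 *	CPU0 (idle, polling)		CPU1 (waker)
 *	set _TIF_POLLING_NRFLAG		fetch_or(&ti->flags, _TIF_NEED_RESCHED)
 *	poll ti->flags			observes _TIF_POLLING_NRFLAG was set
 *	observes _TIF_NEED_RESCHED	-> no smp_send_reschedule() needed
 *
 * When the polling bit is not set, callers such as resched_curr() and
 * wake_up_idle_cpu() fall back to sending an IPI.
 */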
385
386#else
387static bool set_nr_and_not_polling(struct task_struct *p)
388{
389 set_tsk_need_resched(p);
390 return true;
391}
392
393#ifdef CONFIG_SMP
394static bool set_nr_if_polling(struct task_struct *p)
395{
396 return false;
397}
398#endif
399#endif
400
401static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
402{
403 struct wake_q_node *node = &task->wake_q;
404
405
406
407
408
409
410
411
412
413 smp_mb__before_atomic();
414 if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
415 return false;
416
417
418
419
420 *head->lastp = node;
421 head->lastp = &node->next;
422 return true;
423}
424
425
426
427
428
429
430
431
432
433
434
435
436
437void wake_q_add(struct wake_q_head *head, struct task_struct *task)
438{
439 if (__wake_q_add(head, task))
440 get_task_struct(task);
441}
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
461{
462 if (!__wake_q_add(head, task))
463 put_task_struct(task);
464}
465
466void wake_up_q(struct wake_q_head *head)
467{
468 struct wake_q_node *node = head->first;
469
470 while (node != WAKE_Q_TAIL) {
471 struct task_struct *task;
472
473 task = container_of(node, struct task_struct, wake_q);
474 BUG_ON(!task);
475
476 node = node->next;
477 task->wake_q.next = NULL;
478
479
480
481
482
483 wake_up_process(task);
484 put_task_struct(task);
485 }
486}
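
/*
 * A minimal usage sketch (illustrative only, not taken from any particular
 * caller): wakeups are queued while holding a lock and issued once the lock
 * has been dropped, so the woken tasks never contend on it:
 *
 *	DEFINE_WAKE_Q(wake_q);
 *
 *	raw_spin_lock(&some_lock);
 *	...
 *	wake_q_add(&wake_q, task);
 *	...
 *	raw_spin_unlock(&some_lock);
 *	wake_up_q(&wake_q);
 */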
487
488
489
490
491
492
493
494
495void resched_curr(struct rq *rq)
496{
497 struct task_struct *curr = rq->curr;
498 int cpu;
499
500 lockdep_assert_held(&rq->lock);
501
502 if (test_tsk_need_resched(curr))
503 return;
504
505 cpu = cpu_of(rq);
506
507 if (cpu == smp_processor_id()) {
508 set_tsk_need_resched(curr);
509 set_preempt_need_resched();
510 return;
511 }
512
513 if (set_nr_and_not_polling(curr))
514 smp_send_reschedule(cpu);
515 else
516 trace_sched_wake_idle_without_ipi(cpu);
517}
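
/*
 * resched_curr() covers three cases: the local CPU (set TIF_NEED_RESCHED and
 * fold it into the preempt count so the next preemption point picks it up),
 * a remote CPU that is not polling (send a reschedule IPI), and a remote CPU
 * that is polling TIF_NEED_RESCHED (setting the flag is enough, no IPI).
 */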
518
519void resched_cpu(int cpu)
520{
521 struct rq *rq = cpu_rq(cpu);
522 unsigned long flags;
523
524 raw_spin_lock_irqsave(&rq->lock, flags);
525 if (cpu_online(cpu) || cpu == smp_processor_id())
526 resched_curr(rq);
527 raw_spin_unlock_irqrestore(&rq->lock, flags);
528}
529
530#ifdef CONFIG_SMP
531#ifdef CONFIG_NO_HZ_COMMON
532
533
534
535
536
537
538
539
540int get_nohz_timer_target(void)
541{
542 int i, cpu = smp_processor_id();
543 struct sched_domain *sd;
544
545 if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
546 return cpu;
547
548 rcu_read_lock();
549 for_each_domain(cpu, sd) {
550 for_each_cpu(i, sched_domain_span(sd)) {
551 if (cpu == i)
552 continue;
553
554 if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
555 cpu = i;
556 goto unlock;
557 }
558 }
559 }
560
561 if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
562 cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
563unlock:
564 rcu_read_unlock();
565 return cpu;
566}
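
/*
 * Preference order used above: keep the timer on the current CPU if it is
 * busy and a housekeeping CPU; otherwise take the nearest busy housekeeping
 * CPU found while walking the scheduler domains; as a last resort fall back
 * to any housekeeping CPU.
 */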
567
568
569
570
571
572
573
574
575
576
577
578static void wake_up_idle_cpu(int cpu)
579{
580 struct rq *rq = cpu_rq(cpu);
581
582 if (cpu == smp_processor_id())
583 return;
584
585 if (set_nr_and_not_polling(rq->idle))
586 smp_send_reschedule(cpu);
587 else
588 trace_sched_wake_idle_without_ipi(cpu);
589}
590
591static bool wake_up_full_nohz_cpu(int cpu)
592{
593
594
595
596
597
598
599 if (cpu_is_offline(cpu))
600 return true;
601 if (tick_nohz_full_cpu(cpu)) {
602 if (cpu != smp_processor_id() ||
603 tick_nohz_tick_stopped())
604 tick_nohz_full_kick_cpu(cpu);
605 return true;
606 }
607
608 return false;
609}
610
611
612
613
614
615
616void wake_up_nohz_cpu(int cpu)
617{
618 if (!wake_up_full_nohz_cpu(cpu))
619 wake_up_idle_cpu(cpu);
620}
621
622static inline bool got_nohz_idle_kick(void)
623{
624 int cpu = smp_processor_id();
625
626 if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
627 return false;
628
629 if (idle_cpu(cpu) && !need_resched())
630 return true;
631
632
633
634
635
636 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
637 return false;
638}
639
640#else
641
642static inline bool got_nohz_idle_kick(void)
643{
644 return false;
645}
646
647#endif
648
649#ifdef CONFIG_NO_HZ_FULL
650bool sched_can_stop_tick(struct rq *rq)
651{
652 int fifo_nr_running;
653
654
655 if (rq->dl.dl_nr_running)
656 return false;
657
658
659
660
661
662 if (rq->rt.rr_nr_running) {
663 if (rq->rt.rr_nr_running == 1)
664 return true;
665 else
666 return false;
667 }
668
669
670
671
672
673 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
674 if (fifo_nr_running)
675 return true;
676
677
678
679
680
681
682 if (rq->nr_running > 1)
683 return false;
684
685 return true;
686}
687#endif
688#endif
689
690#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
691 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
692
693
694
695
696
697
698int walk_tg_tree_from(struct task_group *from,
699 tg_visitor down, tg_visitor up, void *data)
700{
701 struct task_group *parent, *child;
702 int ret;
703
704 parent = from;
705
706down:
707 ret = (*down)(parent, data);
708 if (ret)
709 goto out;
710 list_for_each_entry_rcu(child, &parent->children, siblings) {
711 parent = child;
712 goto down;
713
714up:
715 continue;
716 }
717 ret = (*up)(parent, data);
718 if (ret || parent == from)
719 goto out;
720
721 child = parent;
722 parent = parent->parent;
723 if (parent)
724 goto up;
725out:
726 return ret;
727}
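
/*
 * walk_tg_tree_from() is an iterative depth-first walk of the task_group
 * tree rooted at @from: @down is invoked on each group on the way down and
 * @up on the way back up; a non-zero return from either callback aborts the
 * walk and is propagated to the caller.  The children list is iterated under
 * RCU, so callers must hold the RCU read lock (or otherwise keep the tree
 * stable).
 */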
728
729int tg_nop(struct task_group *tg, void *data)
730{
731 return 0;
732}
733#endif
734
735static void set_load_weight(struct task_struct *p, bool update_load)
736{
737 int prio = p->static_prio - MAX_RT_PRIO;
738 struct load_weight *load = &p->se.load;
739
740
741
742
743 if (task_has_idle_policy(p)) {
744 load->weight = scale_load(WEIGHT_IDLEPRIO);
745 load->inv_weight = WMULT_IDLEPRIO;
746 p->se.runnable_weight = load->weight;
747 return;
748 }
749
750
751
752
753
754 if (update_load && p->sched_class == &fair_sched_class) {
755 reweight_task(p, prio);
756 } else {
757 load->weight = scale_load(sched_prio_to_weight[prio]);
758 load->inv_weight = sched_prio_to_wmult[prio];
759 p->se.runnable_weight = load->weight;
760 }
761}
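
/*
 * The nice -> weight mapping used here (sched_prio_to_weight[]) is roughly
 * multiplicative: nice 0 maps to a weight of 1024 and adjacent nice levels
 * differ by about 1.25x (e.g. nice -1 ~= 1277, nice 1 ~= 820), so relative
 * CPU shares stay consistent regardless of the absolute nice level.
 */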
762
763static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
764{
765 if (!(flags & ENQUEUE_NOCLOCK))
766 update_rq_clock(rq);
767
768 if (!(flags & ENQUEUE_RESTORE)) {
769 sched_info_queued(rq, p);
770 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
771 }
772
773 p->sched_class->enqueue_task(rq, p, flags);
774}
775
776static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
777{
778 if (!(flags & DEQUEUE_NOCLOCK))
779 update_rq_clock(rq);
780
781 if (!(flags & DEQUEUE_SAVE)) {
782 sched_info_dequeued(rq, p);
783 psi_dequeue(p, flags & DEQUEUE_SLEEP);
784 }
785
786 p->sched_class->dequeue_task(rq, p, flags);
787}
788
789void activate_task(struct rq *rq, struct task_struct *p, int flags)
790{
791 if (task_contributes_to_load(p))
792 rq->nr_uninterruptible--;
793
794 enqueue_task(rq, p, flags);
795}
796
797void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
798{
799 if (task_contributes_to_load(p))
800 rq->nr_uninterruptible++;
801
802 dequeue_task(rq, p, flags);
803}
804
805
806
807
808static inline int __normal_prio(struct task_struct *p)
809{
810 return p->static_prio;
811}
812
813
814
815
816
817
818
819
820static inline int normal_prio(struct task_struct *p)
821{
822 int prio;
823
824 if (task_has_dl_policy(p))
825 prio = MAX_DL_PRIO-1;
826 else if (task_has_rt_policy(p))
827 prio = MAX_RT_PRIO-1 - p->rt_priority;
828 else
829 prio = __normal_prio(p);
830 return prio;
831}
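
/*
 * The priorities computed here all live on one scale: deadline tasks get
 * MAX_DL_PRIO-1 (i.e. -1, above every RT priority), RT tasks map into
 * [0, MAX_RT_PRIO-1) with higher rt_priority giving a lower (better) value,
 * and normal tasks use their static_prio, e.g. nice 0 -> 120.
 */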
832
833
834
835
836
837
838
839
840static int effective_prio(struct task_struct *p)
841{
842 p->normal_prio = normal_prio(p);
843
844
845
846
847
848 if (!rt_prio(p->prio))
849 return p->normal_prio;
850 return p->prio;
851}
852
853
854
855
856
857
858
859inline int task_curr(const struct task_struct *p)
860{
861 return cpu_curr(task_cpu(p)) == p;
862}
863
864
865
866
867
868
869
870
871static inline void check_class_changed(struct rq *rq, struct task_struct *p,
872 const struct sched_class *prev_class,
873 int oldprio)
874{
875 if (prev_class != p->sched_class) {
876 if (prev_class->switched_from)
877 prev_class->switched_from(rq, p);
878
879 p->sched_class->switched_to(rq, p);
880 } else if (oldprio != p->prio || dl_task(p))
881 p->sched_class->prio_changed(rq, p, oldprio);
882}
883
884void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
885{
886 const struct sched_class *class;
887
888 if (p->sched_class == rq->curr->sched_class) {
889 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
890 } else {
891 for_each_class(class) {
892 if (class == rq->curr->sched_class)
893 break;
894 if (class == p->sched_class) {
895 resched_curr(rq);
896 break;
897 }
898 }
899 }
900
901
902
903
904
905 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
906 rq_clock_skip_update(rq);
907}
908
909#ifdef CONFIG_SMP
910
911static inline bool is_per_cpu_kthread(struct task_struct *p)
912{
913 if (!(p->flags & PF_KTHREAD))
914 return false;
915
916 if (p->nr_cpus_allowed != 1)
917 return false;
918
919 return true;
920}
921
922
923
924
925
926static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
927{
928 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
929 return false;
930
931 if (is_per_cpu_kthread(p))
932 return cpu_online(cpu);
933
934 return cpu_active(cpu);
935}
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
957 struct task_struct *p, int new_cpu)
958{
959 lockdep_assert_held(&rq->lock);
960
961 WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
962 dequeue_task(rq, p, DEQUEUE_NOCLOCK);
963 set_task_cpu(p, new_cpu);
964 rq_unlock(rq, rf);
965
966 rq = cpu_rq(new_cpu);
967
968 rq_lock(rq, rf);
969 BUG_ON(task_cpu(p) != new_cpu);
970 enqueue_task(rq, p, 0);
971 p->on_rq = TASK_ON_RQ_QUEUED;
972 check_preempt_curr(rq, p, 0);
973
974 return rq;
975}
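
/*
 * Note the lock handoff above: the task is marked TASK_ON_RQ_MIGRATING and
 * dequeued under the old rq->lock, that lock is dropped, and only then is
 * the new rq->lock taken to enqueue it.  Anyone who observes the task in the
 * MIGRATING state (see task_rq_lock()) must wait for the migration to finish
 * before trusting task_rq().
 */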
976
977struct migration_arg {
978 struct task_struct *task;
979 int dest_cpu;
980};
981
982
983
984
985
986
987
988
989
990
991static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
992 struct task_struct *p, int dest_cpu)
993{
994
995 if (!is_cpu_allowed(p, dest_cpu))
996 return rq;
997
998 update_rq_clock(rq);
999 rq = move_queued_task(rq, rf, p, dest_cpu);
1000
1001 return rq;
1002}
1003
1004
1005
1006
1007
1008
1009static int migration_cpu_stop(void *data)
1010{
1011 struct migration_arg *arg = data;
1012 struct task_struct *p = arg->task;
1013 struct rq *rq = this_rq();
1014 struct rq_flags rf;
1015
1016
1017
1018
1019
1020 local_irq_disable();
1021
1022
1023
1024
1025
1026 sched_ttwu_pending();
1027
1028 raw_spin_lock(&p->pi_lock);
1029 rq_lock(rq, &rf);
1030
1031
1032
1033
1034
1035 if (task_rq(p) == rq) {
1036 if (task_on_rq_queued(p))
1037 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
1038 else
1039 p->wake_cpu = arg->dest_cpu;
1040 }
1041 rq_unlock(rq, &rf);
1042 raw_spin_unlock(&p->pi_lock);
1043
1044 local_irq_enable();
1045 return 0;
1046}
1047
1048
1049
1050
1051
1052void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1053{
1054 cpumask_copy(&p->cpus_allowed, new_mask);
1055 p->nr_cpus_allowed = cpumask_weight(new_mask);
1056}
1057
1058void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1059{
1060 struct rq *rq = task_rq(p);
1061 bool queued, running;
1062
1063 lockdep_assert_held(&p->pi_lock);
1064
1065 queued = task_on_rq_queued(p);
1066 running = task_current(rq, p);
1067
1068 if (queued) {
1069
1070
1071
1072
1073 lockdep_assert_held(&rq->lock);
1074 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
1075 }
1076 if (running)
1077 put_prev_task(rq, p);
1078
1079 p->sched_class->set_cpus_allowed(p, new_mask);
1080
1081 if (queued)
1082 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
1083 if (running)
1084 set_curr_task(rq, p);
1085}
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096static int __set_cpus_allowed_ptr(struct task_struct *p,
1097 const struct cpumask *new_mask, bool check)
1098{
1099 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1100 unsigned int dest_cpu;
1101 struct rq_flags rf;
1102 struct rq *rq;
1103 int ret = 0;
1104
1105 rq = task_rq_lock(p, &rf);
1106 update_rq_clock(rq);
1107
1108 if (p->flags & PF_KTHREAD) {
1109
1110
1111
1112 cpu_valid_mask = cpu_online_mask;
1113 }
1114
1115
1116
1117
1118
1119 if (check && (p->flags & PF_NO_SETAFFINITY)) {
1120 ret = -EINVAL;
1121 goto out;
1122 }
1123
1124 if (cpumask_equal(&p->cpus_allowed, new_mask))
1125 goto out;
1126
1127 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
1128 ret = -EINVAL;
1129 goto out;
1130 }
1131
1132 do_set_cpus_allowed(p, new_mask);
1133
1134 if (p->flags & PF_KTHREAD) {
1135
1136
1137
1138
1139 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
1140 !cpumask_intersects(new_mask, cpu_active_mask) &&
1141 p->nr_cpus_allowed != 1);
1142 }
1143
1144
1145 if (cpumask_test_cpu(task_cpu(p), new_mask))
1146 goto out;
1147
1148 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
1149 if (task_running(rq, p) || p->state == TASK_WAKING) {
1150 struct migration_arg arg = { p, dest_cpu };
1151
1152 task_rq_unlock(rq, p, &rf);
1153 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1154 tlb_migrate_finish(p->mm);
1155 return 0;
1156 } else if (task_on_rq_queued(p)) {
1157
1158
1159
1160
1161 rq = move_queued_task(rq, &rf, p, dest_cpu);
1162 }
1163out:
1164 task_rq_unlock(rq, p, &rf);
1165
1166 return ret;
1167}
1168
1169int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1170{
1171 return __set_cpus_allowed_ptr(p, new_mask, false);
1172}
1173EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1174
1175void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1176{
1177#ifdef CONFIG_SCHED_DEBUG
1178
1179
1180
1181
1182 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1183 !p->on_rq);
1184
1185
1186
1187
1188
1189
1190 WARN_ON_ONCE(p->state == TASK_RUNNING &&
1191 p->sched_class == &fair_sched_class &&
1192 (p->on_rq && !task_on_rq_migrating(p)));
1193
1194#ifdef CONFIG_LOCKDEP
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1206 lockdep_is_held(&task_rq(p)->lock)));
1207#endif
1208
1209
1210
1211 WARN_ON_ONCE(!cpu_online(new_cpu));
1212#endif
1213
1214 trace_sched_migrate_task(p, new_cpu);
1215
1216 if (task_cpu(p) != new_cpu) {
1217 if (p->sched_class->migrate_task_rq)
1218 p->sched_class->migrate_task_rq(p, new_cpu);
1219 p->se.nr_migrations++;
1220 rseq_migrate(p);
1221 perf_event_task_migrate(p);
1222 }
1223
1224 __set_task_cpu(p, new_cpu);
1225}
1226
1227#ifdef CONFIG_NUMA_BALANCING
1228static void __migrate_swap_task(struct task_struct *p, int cpu)
1229{
1230 if (task_on_rq_queued(p)) {
1231 struct rq *src_rq, *dst_rq;
1232 struct rq_flags srf, drf;
1233
1234 src_rq = task_rq(p);
1235 dst_rq = cpu_rq(cpu);
1236
1237 rq_pin_lock(src_rq, &srf);
1238 rq_pin_lock(dst_rq, &drf);
1239
1240 p->on_rq = TASK_ON_RQ_MIGRATING;
1241 deactivate_task(src_rq, p, 0);
1242 set_task_cpu(p, cpu);
1243 activate_task(dst_rq, p, 0);
1244 p->on_rq = TASK_ON_RQ_QUEUED;
1245 check_preempt_curr(dst_rq, p, 0);
1246
1247 rq_unpin_lock(dst_rq, &drf);
1248 rq_unpin_lock(src_rq, &srf);
1249
1250 } else {
1251
1252
1253
1254
1255
1256 p->wake_cpu = cpu;
1257 }
1258}
1259
1260struct migration_swap_arg {
1261 struct task_struct *src_task, *dst_task;
1262 int src_cpu, dst_cpu;
1263};
1264
1265static int migrate_swap_stop(void *data)
1266{
1267 struct migration_swap_arg *arg = data;
1268 struct rq *src_rq, *dst_rq;
1269 int ret = -EAGAIN;
1270
1271 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1272 return -EAGAIN;
1273
1274 src_rq = cpu_rq(arg->src_cpu);
1275 dst_rq = cpu_rq(arg->dst_cpu);
1276
1277 double_raw_lock(&arg->src_task->pi_lock,
1278 &arg->dst_task->pi_lock);
1279 double_rq_lock(src_rq, dst_rq);
1280
1281 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1282 goto unlock;
1283
1284 if (task_cpu(arg->src_task) != arg->src_cpu)
1285 goto unlock;
1286
1287 if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
1288 goto unlock;
1289
1290 if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
1291 goto unlock;
1292
1293 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1294 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1295
1296 ret = 0;
1297
1298unlock:
1299 double_rq_unlock(src_rq, dst_rq);
1300 raw_spin_unlock(&arg->dst_task->pi_lock);
1301 raw_spin_unlock(&arg->src_task->pi_lock);
1302
1303 return ret;
1304}
1305
1306
1307
1308
1309int migrate_swap(struct task_struct *cur, struct task_struct *p,
1310 int target_cpu, int curr_cpu)
1311{
1312 struct migration_swap_arg arg;
1313 int ret = -EINVAL;
1314
1315 arg = (struct migration_swap_arg){
1316 .src_task = cur,
1317 .src_cpu = curr_cpu,
1318 .dst_task = p,
1319 .dst_cpu = target_cpu,
1320 };
1321
1322 if (arg.src_cpu == arg.dst_cpu)
1323 goto out;
1324
1325
1326
1327
1328
1329 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1330 goto out;
1331
1332 if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
1333 goto out;
1334
1335 if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
1336 goto out;
1337
1338 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1339 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1340
1341out:
1342 return ret;
1343}
1344#endif
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1363{
1364 int running, queued;
1365 struct rq_flags rf;
1366 unsigned long ncsw;
1367 struct rq *rq;
1368
1369 for (;;) {
1370
1371
1372
1373
1374
1375
1376 rq = task_rq(p);
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389 while (task_running(rq, p)) {
1390 if (match_state && unlikely(p->state != match_state))
1391 return 0;
1392 cpu_relax();
1393 }
1394
1395
1396
1397
1398
1399
1400 rq = task_rq_lock(p, &rf);
1401 trace_sched_wait_task(p);
1402 running = task_running(rq, p);
1403 queued = task_on_rq_queued(p);
1404 ncsw = 0;
1405 if (!match_state || p->state == match_state)
1406 ncsw = p->nvcsw | LONG_MIN;
1407 task_rq_unlock(rq, p, &rf);
1408
1409
1410
1411
1412 if (unlikely(!ncsw))
1413 break;
1414
1415
1416
1417
1418
1419
1420
1421 if (unlikely(running)) {
1422 cpu_relax();
1423 continue;
1424 }
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435 if (unlikely(queued)) {
1436 ktime_t to = NSEC_PER_SEC / HZ;
1437
1438 set_current_state(TASK_UNINTERRUPTIBLE);
1439 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1440 continue;
1441 }
1442
1443
1444
1445
1446
1447
1448 break;
1449 }
1450
1451 return ncsw;
1452}
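
/*
 * The value returned above is a "switch count cookie": zero means @p changed
 * state (it no longer matched @match_state), while a non-zero value is
 * p->nvcsw with LONG_MIN or'ed in so it can never read as zero.  If a second
 * call a short while later returns the same cookie, the caller knows @p has
 * remained unscheduled the whole time.
 */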
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467void kick_process(struct task_struct *p)
1468{
1469 int cpu;
1470
1471 preempt_disable();
1472 cpu = task_cpu(p);
1473 if ((cpu != smp_processor_id()) && task_curr(p))
1474 smp_send_reschedule(cpu);
1475 preempt_enable();
1476}
1477EXPORT_SYMBOL_GPL(kick_process);
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501static int select_fallback_rq(int cpu, struct task_struct *p)
1502{
1503 int nid = cpu_to_node(cpu);
1504 const struct cpumask *nodemask = NULL;
1505 enum { cpuset, possible, fail } state = cpuset;
1506 int dest_cpu;
1507
1508
1509
1510
1511
1512
1513 if (nid != -1) {
1514 nodemask = cpumask_of_node(nid);
1515
1516
1517 for_each_cpu(dest_cpu, nodemask) {
1518 if (!cpu_active(dest_cpu))
1519 continue;
1520 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
1521 return dest_cpu;
1522 }
1523 }
1524
1525 for (;;) {
1526
1527 for_each_cpu(dest_cpu, &p->cpus_allowed) {
1528 if (!is_cpu_allowed(p, dest_cpu))
1529 continue;
1530
1531 goto out;
1532 }
1533
1534
1535 switch (state) {
1536 case cpuset:
1537 if (IS_ENABLED(CONFIG_CPUSETS)) {
1538 cpuset_cpus_allowed_fallback(p);
1539 state = possible;
1540 break;
1541 }
1542
1543 case possible:
1544 do_set_cpus_allowed(p, cpu_possible_mask);
1545 state = fail;
1546 break;
1547
1548 case fail:
1549 BUG();
1550 break;
1551 }
1552 }
1553
1554out:
1555 if (state != cpuset) {
1556
1557
1558
1559
1560
1561 if (p->mm && printk_ratelimit()) {
1562 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1563 task_pid_nr(p), p->comm, cpu);
1564 }
1565 }
1566
1567 return dest_cpu;
1568}
1569
1570
1571
1572
1573static inline
1574int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1575{
1576 lockdep_assert_held(&p->pi_lock);
1577
1578 if (p->nr_cpus_allowed > 1)
1579 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1580 else
1581 cpu = cpumask_any(&p->cpus_allowed);
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593 if (unlikely(!is_cpu_allowed(p, cpu)))
1594 cpu = select_fallback_rq(task_cpu(p), p);
1595
1596 return cpu;
1597}
1598
1599static void update_avg(u64 *avg, u64 sample)
1600{
1601 s64 diff = sample - *avg;
1602 *avg += diff >> 3;
1603}
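
/*
 * update_avg() keeps an exponentially weighted moving average with a 1/8
 * weight for new samples: avg += (sample - avg) / 8, i.e.
 * avg_new = 7/8 * avg_old + 1/8 * sample (used for rq->avg_idle).
 */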
1604
1605void sched_set_stop_task(int cpu, struct task_struct *stop)
1606{
1607 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
1608 struct task_struct *old_stop = cpu_rq(cpu)->stop;
1609
1610 if (stop) {
1611
1612
1613
1614
1615
1616
1617
1618
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
1620
1621 stop->sched_class = &stop_sched_class;
1622 }
1623
1624 cpu_rq(cpu)->stop = stop;
1625
1626 if (old_stop) {
1627
1628
1629
1630
1631 old_stop->sched_class = &rt_sched_class;
1632 }
1633}
1634
1635#else
1636
1637static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1638 const struct cpumask *new_mask, bool check)
1639{
1640 return set_cpus_allowed_ptr(p, new_mask);
1641}
1642
1643#endif
1644
1645static void
1646ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1647{
1648 struct rq *rq;
1649
1650 if (!schedstat_enabled())
1651 return;
1652
1653 rq = this_rq();
1654
1655#ifdef CONFIG_SMP
1656 if (cpu == rq->cpu) {
1657 __schedstat_inc(rq->ttwu_local);
1658 __schedstat_inc(p->se.statistics.nr_wakeups_local);
1659 } else {
1660 struct sched_domain *sd;
1661
1662 __schedstat_inc(p->se.statistics.nr_wakeups_remote);
1663 rcu_read_lock();
1664 for_each_domain(rq->cpu, sd) {
1665 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1666 __schedstat_inc(sd->ttwu_wake_remote);
1667 break;
1668 }
1669 }
1670 rcu_read_unlock();
1671 }
1672
1673 if (wake_flags & WF_MIGRATED)
1674 __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
1675#endif
1676
1677 __schedstat_inc(rq->ttwu_count);
1678 __schedstat_inc(p->se.statistics.nr_wakeups);
1679
1680 if (wake_flags & WF_SYNC)
1681 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
1682}
1683
1684static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1685{
1686 activate_task(rq, p, en_flags);
1687 p->on_rq = TASK_ON_RQ_QUEUED;
1688
1689
1690 if (p->flags & PF_WQ_WORKER)
1691 wq_worker_waking_up(p, cpu_of(rq));
1692}
1693
1694
1695
1696
1697static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
1698 struct rq_flags *rf)
1699{
1700 check_preempt_curr(rq, p, wake_flags);
1701 p->state = TASK_RUNNING;
1702 trace_sched_wakeup(p);
1703
1704#ifdef CONFIG_SMP
1705 if (p->sched_class->task_woken) {
1706
1707
1708
1709
1710 rq_unpin_lock(rq, rf);
1711 p->sched_class->task_woken(rq, p);
1712 rq_repin_lock(rq, rf);
1713 }
1714
1715 if (rq->idle_stamp) {
1716 u64 delta = rq_clock(rq) - rq->idle_stamp;
1717 u64 max = 2*rq->max_idle_balance_cost;
1718
1719 update_avg(&rq->avg_idle, delta);
1720
1721 if (rq->avg_idle > max)
1722 rq->avg_idle = max;
1723
1724 rq->idle_stamp = 0;
1725 }
1726#endif
1727}
1728
1729static void
1730ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1731 struct rq_flags *rf)
1732{
1733 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
1734
1735 lockdep_assert_held(&rq->lock);
1736
1737#ifdef CONFIG_SMP
1738 if (p->sched_contributes_to_load)
1739 rq->nr_uninterruptible--;
1740
1741 if (wake_flags & WF_MIGRATED)
1742 en_flags |= ENQUEUE_MIGRATED;
1743#endif
1744
1745 ttwu_activate(rq, p, en_flags);
1746 ttwu_do_wakeup(rq, p, wake_flags, rf);
1747}
1748
1749
1750
1751
1752
1753
1754
1755static int ttwu_remote(struct task_struct *p, int wake_flags)
1756{
1757 struct rq_flags rf;
1758 struct rq *rq;
1759 int ret = 0;
1760
1761 rq = __task_rq_lock(p, &rf);
1762 if (task_on_rq_queued(p)) {
1763
1764 update_rq_clock(rq);
1765 ttwu_do_wakeup(rq, p, wake_flags, &rf);
1766 ret = 1;
1767 }
1768 __task_rq_unlock(rq, &rf);
1769
1770 return ret;
1771}
1772
1773#ifdef CONFIG_SMP
1774void sched_ttwu_pending(void)
1775{
1776 struct rq *rq = this_rq();
1777 struct llist_node *llist = llist_del_all(&rq->wake_list);
1778 struct task_struct *p, *t;
1779 struct rq_flags rf;
1780
1781 if (!llist)
1782 return;
1783
1784 rq_lock_irqsave(rq, &rf);
1785 update_rq_clock(rq);
1786
1787 llist_for_each_entry_safe(p, t, llist, wake_entry)
1788 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
1789
1790 rq_unlock_irqrestore(rq, &rf);
1791}
1792
1793void scheduler_ipi(void)
1794{
1795
1796
1797
1798
1799
1800 preempt_fold_need_resched();
1801
1802 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1803 return;
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818 irq_enter();
1819 sched_ttwu_pending();
1820
1821
1822
1823
1824 if (unlikely(got_nohz_idle_kick())) {
1825 this_rq()->idle_balance = 1;
1826 raise_softirq_irqoff(SCHED_SOFTIRQ);
1827 }
1828 irq_exit();
1829}
1830
1831static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
1832{
1833 struct rq *rq = cpu_rq(cpu);
1834
1835 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
1836
1837 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1838 if (!set_nr_if_polling(rq->idle))
1839 smp_send_reschedule(cpu);
1840 else
1841 trace_sched_wake_idle_without_ipi(cpu);
1842 }
1843}
1844
1845void wake_up_if_idle(int cpu)
1846{
1847 struct rq *rq = cpu_rq(cpu);
1848 struct rq_flags rf;
1849
1850 rcu_read_lock();
1851
1852 if (!is_idle_task(rcu_dereference(rq->curr)))
1853 goto out;
1854
1855 if (set_nr_if_polling(rq->idle)) {
1856 trace_sched_wake_idle_without_ipi(cpu);
1857 } else {
1858 rq_lock_irqsave(rq, &rf);
1859 if (is_idle_task(rq->curr))
1860 smp_send_reschedule(cpu);
1861
1862 rq_unlock_irqrestore(rq, &rf);
1863 }
1864
1865out:
1866 rcu_read_unlock();
1867}
1868
1869bool cpus_share_cache(int this_cpu, int that_cpu)
1870{
1871 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1872}
1873#endif
1874
1875static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1876{
1877 struct rq *rq = cpu_rq(cpu);
1878 struct rq_flags rf;
1879
1880#if defined(CONFIG_SMP)
1881 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1882 sched_clock_cpu(cpu);
1883 ttwu_queue_remote(p, cpu, wake_flags);
1884 return;
1885 }
1886#endif
1887
1888 rq_lock(rq, &rf);
1889 update_rq_clock(rq);
1890 ttwu_do_activate(rq, p, wake_flags, &rf);
1891 rq_unlock(rq, &rf);
1892}
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996static int
1997try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1998{
1999 unsigned long flags;
2000 int cpu, success = 0;
2001
2002
2003
2004
2005
2006
2007
2008 raw_spin_lock_irqsave(&p->pi_lock, flags);
2009 smp_mb__after_spinlock();
2010 if (!(p->state & state))
2011 goto out;
2012
2013 trace_sched_waking(p);
2014
2015
2016 success = 1;
2017 cpu = task_cpu(p);
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039 smp_rmb();
2040 if (p->on_rq && ttwu_remote(p, wake_flags))
2041 goto stat;
2042
2043#ifdef CONFIG_SMP
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063 smp_rmb();
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074 smp_cond_load_acquire(&p->on_cpu, !VAL);
2075
2076 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2077 p->state = TASK_WAKING;
2078
2079 if (p->in_iowait) {
2080 delayacct_blkio_end(p);
2081 atomic_dec(&task_rq(p)->nr_iowait);
2082 }
2083
2084 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
2085 if (task_cpu(p) != cpu) {
2086 wake_flags |= WF_MIGRATED;
2087 psi_ttwu_dequeue(p);
2088 set_task_cpu(p, cpu);
2089 }
2090
2091#else
2092
2093 if (p->in_iowait) {
2094 delayacct_blkio_end(p);
2095 atomic_dec(&task_rq(p)->nr_iowait);
2096 }
2097
2098#endif
2099
2100 ttwu_queue(p, cpu, wake_flags);
2101stat:
2102 ttwu_stat(p, cpu, wake_flags);
2103out:
2104 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2105
2106 return success;
2107}
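
/*
 * Summary of the paths above: if the task is still on a runqueue (p->on_rq),
 * ttwu_remote() simply flips it back to TASK_RUNNING under the rq lock.
 * Otherwise we wait for any in-flight context switch to finish
 * (p->on_cpu == 0), pick a CPU with select_task_rq() and enqueue the task
 * there via ttwu_queue(), possibly by appending it to the remote CPU's
 * wake_list instead of taking the remote rq->lock here.
 *
 * Return: %true if @p was woken up, %false if it was already running or in
 * a state that precluded the wakeup.
 */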
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2119{
2120 struct rq *rq = task_rq(p);
2121
2122 if (WARN_ON_ONCE(rq != this_rq()) ||
2123 WARN_ON_ONCE(p == current))
2124 return;
2125
2126 lockdep_assert_held(&rq->lock);
2127
2128 if (!raw_spin_trylock(&p->pi_lock)) {
2129
2130
2131
2132
2133
2134
2135 rq_unlock(rq, rf);
2136 raw_spin_lock(&p->pi_lock);
2137 rq_relock(rq, rf);
2138 }
2139
2140 if (!(p->state & TASK_NORMAL))
2141 goto out;
2142
2143 trace_sched_waking(p);
2144
2145 if (!task_on_rq_queued(p)) {
2146 if (p->in_iowait) {
2147 delayacct_blkio_end(p);
2148 atomic_dec(&rq->nr_iowait);
2149 }
2150 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
2151 }
2152
2153 ttwu_do_wakeup(rq, p, 0, rf);
2154 ttwu_stat(p, smp_processor_id(), 0);
2155out:
2156 raw_spin_unlock(&p->pi_lock);
2157}
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170int wake_up_process(struct task_struct *p)
2171{
2172 return try_to_wake_up(p, TASK_NORMAL, 0);
2173}
2174EXPORT_SYMBOL(wake_up_process);
2175
2176int wake_up_state(struct task_struct *p, unsigned int state)
2177{
2178 return try_to_wake_up(p, state, 0);
2179}
2180
2181
2182
2183
2184
2185
2186
2187static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2188{
2189 p->on_rq = 0;
2190
2191 p->se.on_rq = 0;
2192 p->se.exec_start = 0;
2193 p->se.sum_exec_runtime = 0;
2194 p->se.prev_sum_exec_runtime = 0;
2195 p->se.nr_migrations = 0;
2196 p->se.vruntime = 0;
2197 INIT_LIST_HEAD(&p->se.group_node);
2198
2199#ifdef CONFIG_FAIR_GROUP_SCHED
2200 p->se.cfs_rq = NULL;
2201#endif
2202
2203#ifdef CONFIG_SCHEDSTATS
2204
2205 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2206#endif
2207
2208 RB_CLEAR_NODE(&p->dl.rb_node);
2209 init_dl_task_timer(&p->dl);
2210 init_dl_inactive_task_timer(&p->dl);
2211 __dl_clear_params(p);
2212
2213 INIT_LIST_HEAD(&p->rt.run_list);
2214 p->rt.timeout = 0;
2215 p->rt.time_slice = sched_rr_timeslice;
2216 p->rt.on_rq = 0;
2217 p->rt.on_list = 0;
2218
2219#ifdef CONFIG_PREEMPT_NOTIFIERS
2220 INIT_HLIST_HEAD(&p->preempt_notifiers);
2221#endif
2222
2223#ifdef CONFIG_COMPACTION
2224 p->capture_control = NULL;
2225#endif
2226 init_numa_balancing(clone_flags, p);
2227}
2228
2229DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2230
2231#ifdef CONFIG_NUMA_BALANCING
2232
2233void set_numabalancing_state(bool enabled)
2234{
2235 if (enabled)
2236 static_branch_enable(&sched_numa_balancing);
2237 else
2238 static_branch_disable(&sched_numa_balancing);
2239}
2240
2241#ifdef CONFIG_PROC_SYSCTL
2242int sysctl_numa_balancing(struct ctl_table *table, int write,
2243 void __user *buffer, size_t *lenp, loff_t *ppos)
2244{
2245 struct ctl_table t;
2246 int err;
2247 int state = static_branch_likely(&sched_numa_balancing);
2248
2249 if (write && !capable(CAP_SYS_ADMIN))
2250 return -EPERM;
2251
2252 t = *table;
2253 t.data = &state;
2254 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2255 if (err < 0)
2256 return err;
2257 if (write)
2258 set_numabalancing_state(state);
2259 return err;
2260}
2261#endif
2262#endif
2263
2264#ifdef CONFIG_SCHEDSTATS
2265
2266DEFINE_STATIC_KEY_FALSE(sched_schedstats);
2267static bool __initdata __sched_schedstats = false;
2268
2269static void set_schedstats(bool enabled)
2270{
2271 if (enabled)
2272 static_branch_enable(&sched_schedstats);
2273 else
2274 static_branch_disable(&sched_schedstats);
2275}
2276
2277void force_schedstat_enabled(void)
2278{
2279 if (!schedstat_enabled()) {
2280 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
2281 static_branch_enable(&sched_schedstats);
2282 }
2283}
2284
2285static int __init setup_schedstats(char *str)
2286{
2287 int ret = 0;
2288 if (!str)
2289 goto out;
2290
2291
2292
2293
2294
2295
2296 if (!strcmp(str, "enable")) {
2297 __sched_schedstats = true;
2298 ret = 1;
2299 } else if (!strcmp(str, "disable")) {
2300 __sched_schedstats = false;
2301 ret = 1;
2302 }
2303out:
2304 if (!ret)
2305 pr_warn("Unable to parse schedstats=\n");
2306
2307 return ret;
2308}
2309__setup("schedstats=", setup_schedstats);
2310
2311static void __init init_schedstats(void)
2312{
2313 set_schedstats(__sched_schedstats);
2314}
2315
2316#ifdef CONFIG_PROC_SYSCTL
2317int sysctl_schedstats(struct ctl_table *table, int write,
2318 void __user *buffer, size_t *lenp, loff_t *ppos)
2319{
2320 struct ctl_table t;
2321 int err;
2322 int state = static_branch_likely(&sched_schedstats);
2323
2324 if (write && !capable(CAP_SYS_ADMIN))
2325 return -EPERM;
2326
2327 t = *table;
2328 t.data = &state;
2329 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2330 if (err < 0)
2331 return err;
2332 if (write)
2333 set_schedstats(state);
2334 return err;
2335}
2336#endif
2337#else
2338static inline void init_schedstats(void) {}
2339#endif
2340
2341
2342
2343
2344int sched_fork(unsigned long clone_flags, struct task_struct *p)
2345{
2346 unsigned long flags;
2347
2348 __sched_fork(clone_flags, p);
2349
2350
2351
2352
2353
2354 p->state = TASK_NEW;
2355
2356
2357
2358
2359 p->prio = current->normal_prio;
2360
2361
2362
2363
2364 if (unlikely(p->sched_reset_on_fork)) {
2365 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2366 p->policy = SCHED_NORMAL;
2367 p->static_prio = NICE_TO_PRIO(0);
2368 p->rt_priority = 0;
2369 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2370 p->static_prio = NICE_TO_PRIO(0);
2371
2372 p->prio = p->normal_prio = __normal_prio(p);
2373 set_load_weight(p, false);
2374
2375
2376
2377
2378
2379 p->sched_reset_on_fork = 0;
2380 }
2381
2382 if (dl_prio(p->prio))
2383 return -EAGAIN;
2384 else if (rt_prio(p->prio))
2385 p->sched_class = &rt_sched_class;
2386 else
2387 p->sched_class = &fair_sched_class;
2388
2389 init_entity_runnable_average(&p->se);
2390
2391
2392
2393
2394
2395
2396
2397
2398 raw_spin_lock_irqsave(&p->pi_lock, flags);
2399
2400
2401
2402
2403 __set_task_cpu(p, smp_processor_id());
2404 if (p->sched_class->task_fork)
2405 p->sched_class->task_fork(p);
2406 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2407
2408#ifdef CONFIG_SCHED_INFO
2409 if (likely(sched_info_on()))
2410 memset(&p->sched_info, 0, sizeof(p->sched_info));
2411#endif
2412#if defined(CONFIG_SMP)
2413 p->on_cpu = 0;
2414#endif
2415 init_task_preempt_count(p);
2416#ifdef CONFIG_SMP
2417 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2418 RB_CLEAR_NODE(&p->pushable_dl_tasks);
2419#endif
2420 return 0;
2421}
2422
2423unsigned long to_ratio(u64 period, u64 runtime)
2424{
2425 if (runtime == RUNTIME_INF)
2426 return BW_UNIT;
2427
2428
2429
2430
2431
2432
2433 if (period == 0)
2434 return 0;
2435
2436 return div64_u64(runtime << BW_SHIFT, period);
2437}
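
/*
 * to_ratio() returns runtime/period as a fixed-point value scaled by
 * BW_UNIT (1 << BW_SHIFT).  For example, the default rt settings of
 * runtime = 950000us and period = 1000000us yield roughly 0.95 * BW_UNIT.
 */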
2438
2439
2440
2441
2442
2443
2444
2445
2446void wake_up_new_task(struct task_struct *p)
2447{
2448 struct rq_flags rf;
2449 struct rq *rq;
2450
2451 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
2452 p->state = TASK_RUNNING;
2453#ifdef CONFIG_SMP
2454
2455
2456
2457
2458
2459
2460
2461
2462 p->recent_used_cpu = task_cpu(p);
2463 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2464#endif
2465 rq = __task_rq_lock(p, &rf);
2466 update_rq_clock(rq);
2467 post_init_entity_util_avg(p);
2468
2469 activate_task(rq, p, ENQUEUE_NOCLOCK);
2470 p->on_rq = TASK_ON_RQ_QUEUED;
2471 trace_sched_wakeup_new(p);
2472 check_preempt_curr(rq, p, WF_FORK);
2473#ifdef CONFIG_SMP
2474 if (p->sched_class->task_woken) {
2475
2476
2477
2478
2479 rq_unpin_lock(rq, &rf);
2480 p->sched_class->task_woken(rq, p);
2481 rq_repin_lock(rq, &rf);
2482 }
2483#endif
2484 task_rq_unlock(rq, p, &rf);
2485}
2486
2487#ifdef CONFIG_PREEMPT_NOTIFIERS
2488
2489static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
2490
2491void preempt_notifier_inc(void)
2492{
2493 static_branch_inc(&preempt_notifier_key);
2494}
2495EXPORT_SYMBOL_GPL(preempt_notifier_inc);
2496
2497void preempt_notifier_dec(void)
2498{
2499 static_branch_dec(&preempt_notifier_key);
2500}
2501EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2502
2503
2504
2505
2506
2507void preempt_notifier_register(struct preempt_notifier *notifier)
2508{
2509 if (!static_branch_unlikely(&preempt_notifier_key))
2510 WARN(1, "registering preempt_notifier while notifiers disabled\n");
2511
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
2513}
2514EXPORT_SYMBOL_GPL(preempt_notifier_register);
2515
2516
2517
2518
2519
2520
2521
2522void preempt_notifier_unregister(struct preempt_notifier *notifier)
2523{
	hlist_del(&notifier->link);
2525}
2526EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2527
2528static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
2529{
2530 struct preempt_notifier *notifier;
2531
2532 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2533 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2534}
2535
2536static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2537{
2538 if (static_branch_unlikely(&preempt_notifier_key))
2539 __fire_sched_in_preempt_notifiers(curr);
2540}
2541
2542static void
2543__fire_sched_out_preempt_notifiers(struct task_struct *curr,
2544 struct task_struct *next)
2545{
2546 struct preempt_notifier *notifier;
2547
2548 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2549 notifier->ops->sched_out(notifier, next);
2550}
2551
2552static __always_inline void
2553fire_sched_out_preempt_notifiers(struct task_struct *curr,
2554 struct task_struct *next)
2555{
2556 if (static_branch_unlikely(&preempt_notifier_key))
2557 __fire_sched_out_preempt_notifiers(curr, next);
2558}
2559
2560#else
2561
2562static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2563{
2564}
2565
2566static inline void
2567fire_sched_out_preempt_notifiers(struct task_struct *curr,
2568 struct task_struct *next)
2569{
2570}
2571
2572#endif
2573
2574static inline void prepare_task(struct task_struct *next)
2575{
2576#ifdef CONFIG_SMP
2577
2578
2579
2580
2581 next->on_cpu = 1;
2582#endif
2583}
2584
2585static inline void finish_task(struct task_struct *prev)
2586{
2587#ifdef CONFIG_SMP
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598 smp_store_release(&prev->on_cpu, 0);
2599#endif
2600}
2601
2602static inline void
2603prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
2604{
2605
2606
2607
2608
2609
2610
2611 rq_unpin_lock(rq, rf);
2612 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2613#ifdef CONFIG_DEBUG_SPINLOCK
2614
2615 rq->lock.owner = next;
2616#endif
2617}
2618
2619static inline void finish_lock_switch(struct rq *rq)
2620{
2621
2622
2623
2624
2625
2626 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
2627 raw_spin_unlock_irq(&rq->lock);
2628}
2629
2630
2631
2632
2633
2634#ifndef prepare_arch_switch
2635# define prepare_arch_switch(next) do { } while (0)
2636#endif
2637
2638#ifndef finish_arch_post_lock_switch
2639# define finish_arch_post_lock_switch() do { } while (0)
2640#endif
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655static inline void
2656prepare_task_switch(struct rq *rq, struct task_struct *prev,
2657 struct task_struct *next)
2658{
2659 kcov_prepare_switch(prev);
2660 sched_info_switch(rq, prev, next);
2661 perf_event_task_sched_out(prev, next);
2662 rseq_preempt(prev);
2663 fire_sched_out_preempt_notifiers(prev, next);
2664 prepare_task(next);
2665 prepare_arch_switch(next);
2666}
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687static struct rq *finish_task_switch(struct task_struct *prev)
2688 __releases(rq->lock)
2689{
2690 struct rq *rq = this_rq();
2691 struct mm_struct *mm = rq->prev_mm;
2692 long prev_state;
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2706 "corrupted preempt_count: %s/%d/0x%x\n",
2707 current->comm, current->pid, preempt_count()))
2708 preempt_count_set(FORK_PREEMPT_COUNT);
2709
2710 rq->prev_mm = NULL;
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723 prev_state = prev->state;
2724 vtime_task_switch(prev);
2725 perf_event_task_sched_in(prev, current);
2726 finish_task(prev);
2727 finish_lock_switch(rq);
2728 finish_arch_post_lock_switch();
2729 kcov_finish_switch(current);
2730
2731 fire_sched_in_preempt_notifiers(current);
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744 if (mm) {
2745 membarrier_mm_sync_core_before_usermode(mm);
2746 mmdrop(mm);
2747 }
2748 if (unlikely(prev_state == TASK_DEAD)) {
2749 if (prev->sched_class->task_dead)
2750 prev->sched_class->task_dead(prev);
2751
2752
2753
2754
2755
2756 kprobe_flush_task(prev);
2757
2758
2759 put_task_stack(prev);
2760
2761 put_task_struct(prev);
2762 }
2763
2764 tick_nohz_task_switch();
2765 return rq;
2766}
2767
2768#ifdef CONFIG_SMP
2769
2770
2771static void __balance_callback(struct rq *rq)
2772{
2773 struct callback_head *head, *next;
2774 void (*func)(struct rq *rq);
2775 unsigned long flags;
2776
2777 raw_spin_lock_irqsave(&rq->lock, flags);
2778 head = rq->balance_callback;
2779 rq->balance_callback = NULL;
2780 while (head) {
2781 func = (void (*)(struct rq *))head->func;
2782 next = head->next;
2783 head->next = NULL;
2784 head = next;
2785
2786 func(rq);
2787 }
2788 raw_spin_unlock_irqrestore(&rq->lock, flags);
2789}
2790
2791static inline void balance_callback(struct rq *rq)
2792{
2793 if (unlikely(rq->balance_callback))
2794 __balance_callback(rq);
2795}
2796
2797#else
2798
2799static inline void balance_callback(struct rq *rq)
2800{
2801}
2802
2803#endif
2804
2805
2806
2807
2808
2809asmlinkage __visible void schedule_tail(struct task_struct *prev)
2810 __releases(rq->lock)
2811{
2812 struct rq *rq;
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823 rq = finish_task_switch(prev);
2824 balance_callback(rq);
2825 preempt_enable();
2826
2827 if (current->set_child_tid)
2828 put_user(task_pid_vnr(current), current->set_child_tid);
2829
2830 calculate_sigpending();
2831}
2832
2833
2834
2835
2836static __always_inline struct rq *
2837context_switch(struct rq *rq, struct task_struct *prev,
2838 struct task_struct *next, struct rq_flags *rf)
2839{
2840 struct mm_struct *mm, *oldmm;
2841
2842 prepare_task_switch(rq, prev, next);
2843
2844 mm = next->mm;
2845 oldmm = prev->active_mm;
2846
2847
2848
2849
2850
2851 arch_start_context_switch(prev);
2852
2853
2854
2855
2856
2857
2858
2859
2860 if (!mm) {
2861 next->active_mm = oldmm;
2862 mmgrab(oldmm);
2863 enter_lazy_tlb(oldmm, next);
2864 } else
2865 switch_mm_irqs_off(oldmm, mm, next);
2866
2867 if (!prev->mm) {
2868 prev->active_mm = NULL;
2869 rq->prev_mm = oldmm;
2870 }
2871
2872 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
2873
2874 prepare_lock_switch(rq, next, rf);
2875
2876
2877 switch_to(prev, next, prev);
2878 barrier();
2879
2880 return finish_task_switch(prev);
2881}
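
/*
 * mm handling in context_switch(), in short: a kernel thread (next->mm ==
 * NULL) borrows the previous task's active_mm, pinned with mmgrab(), and
 * only enters lazy-TLB mode; a user task switches address spaces with
 * switch_mm_irqs_off().  The matching mmdrop() for a borrowed mm is deferred
 * to finish_task_switch(), once we are off the old task's stack.
 */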
2882
2883
2884
2885
2886
2887
2888
2889unsigned long nr_running(void)
2890{
2891 unsigned long i, sum = 0;
2892
2893 for_each_online_cpu(i)
2894 sum += cpu_rq(i)->nr_running;
2895
2896 return sum;
2897}
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912bool single_task_running(void)
2913{
2914 return raw_rq()->nr_running == 1;
2915}
2916EXPORT_SYMBOL(single_task_running);
2917
2918unsigned long long nr_context_switches(void)
2919{
2920 int i;
2921 unsigned long long sum = 0;
2922
2923 for_each_possible_cpu(i)
2924 sum += cpu_rq(i)->nr_switches;
2925
2926 return sum;
2927}
2928
2929
2930
2931
2932
2933
2934
2935
2936unsigned long nr_iowait_cpu(int cpu)
2937{
2938 return atomic_read(&cpu_rq(cpu)->nr_iowait);
2939}
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971unsigned long nr_iowait(void)
2972{
2973 unsigned long i, sum = 0;
2974
2975 for_each_possible_cpu(i)
2976 sum += nr_iowait_cpu(i);
2977
2978 return sum;
2979}
2980
2981#ifdef CONFIG_SMP
2982
2983
2984
2985
2986
2987void sched_exec(void)
2988{
2989 struct task_struct *p = current;
2990 unsigned long flags;
2991 int dest_cpu;
2992
2993 raw_spin_lock_irqsave(&p->pi_lock, flags);
2994 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2995 if (dest_cpu == smp_processor_id())
2996 goto unlock;
2997
2998 if (likely(cpu_active(dest_cpu))) {
2999 struct migration_arg arg = { p, dest_cpu };
3000
3001 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3002 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3003 return;
3004 }
3005unlock:
3006 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3007}
3008
3009#endif
3010
3011DEFINE_PER_CPU(struct kernel_stat, kstat);
3012DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3013
3014EXPORT_PER_CPU_SYMBOL(kstat);
3015EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3016
3017
3018
3019
3020
3021
3022
3023static inline void prefetch_curr_exec_start(struct task_struct *p)
3024{
3025#ifdef CONFIG_FAIR_GROUP_SCHED
3026 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
3027#else
3028 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
3029#endif
3030 prefetch(curr);
3031 prefetch(&curr->exec_start);
3032}
3033
3034
3035
3036
3037
3038
3039unsigned long long task_sched_runtime(struct task_struct *p)
3040{
3041 struct rq_flags rf;
3042 struct rq *rq;
3043 u64 ns;
3044
3045#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057 if (!p->on_cpu || !task_on_rq_queued(p))
3058 return p->se.sum_exec_runtime;
3059#endif
3060
3061 rq = task_rq_lock(p, &rf);
3062
3063
3064
3065
3066
3067 if (task_current(rq, p) && task_on_rq_queued(p)) {
3068 prefetch_curr_exec_start(p);
3069 update_rq_clock(rq);
3070 p->sched_class->update_curr(rq);
3071 }
3072 ns = p->se.sum_exec_runtime;
3073 task_rq_unlock(rq, p, &rf);
3074
3075 return ns;
3076}
3077
3078
3079
3080
3081
3082void scheduler_tick(void)
3083{
3084 int cpu = smp_processor_id();
3085 struct rq *rq = cpu_rq(cpu);
3086 struct task_struct *curr = rq->curr;
3087 struct rq_flags rf;
3088
3089 sched_clock_tick();
3090
3091 rq_lock(rq, &rf);
3092
3093 update_rq_clock(rq);
3094 curr->sched_class->task_tick(rq, curr, 0);
3095 cpu_load_update_active(rq);
3096 calc_global_load_tick(rq);
3097 psi_task_tick(rq);
3098
3099 rq_unlock(rq, &rf);
3100
3101 perf_event_task_tick();
3102
3103#ifdef CONFIG_SMP
3104 rq->idle_balance = idle_cpu(cpu);
3105 trigger_load_balance(rq);
3106#endif
3107}
3108
3109#ifdef CONFIG_NO_HZ_FULL
3110
3111struct tick_work {
3112 int cpu;
3113 struct delayed_work work;
3114};
3115
3116static struct tick_work __percpu *tick_work_cpu;
3117
3118static void sched_tick_remote(struct work_struct *work)
3119{
3120 struct delayed_work *dwork = to_delayed_work(work);
3121 struct tick_work *twork = container_of(dwork, struct tick_work, work);
3122 int cpu = twork->cpu;
3123 struct rq *rq = cpu_rq(cpu);
3124 struct task_struct *curr;
3125 struct rq_flags rf;
3126 u64 delta;
3127
3128
3129
3130
3131
3132
3133
3134
3135 if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
3136 goto out_requeue;
3137
3138 rq_lock_irq(rq, &rf);
3139 curr = rq->curr;
3140 if (is_idle_task(curr))
3141 goto out_unlock;
3142
3143 update_rq_clock(rq);
3144 delta = rq_clock_task(rq) - curr->se.exec_start;
3145
3146
3147
3148
3149
3150 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
3151 curr->sched_class->task_tick(rq, curr, 0);
3152
3153out_unlock:
3154 rq_unlock_irq(rq, &rf);
3155
3156out_requeue:
3157
3158
3159
3160
3161
3162 queue_delayed_work(system_unbound_wq, dwork, HZ);
3163}
3164
3165static void sched_tick_start(int cpu)
3166{
3167 struct tick_work *twork;
3168
3169 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3170 return;
3171
3172 WARN_ON_ONCE(!tick_work_cpu);
3173
3174 twork = per_cpu_ptr(tick_work_cpu, cpu);
3175 twork->cpu = cpu;
3176 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
3177 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
3178}
3179
3180#ifdef CONFIG_HOTPLUG_CPU
3181static void sched_tick_stop(int cpu)
3182{
3183 struct tick_work *twork;
3184
3185 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3186 return;
3187
3188 WARN_ON_ONCE(!tick_work_cpu);
3189
3190 twork = per_cpu_ptr(tick_work_cpu, cpu);
3191 cancel_delayed_work_sync(&twork->work);
3192}
3193#endif
3194
3195int __init sched_tick_offload_init(void)
3196{
3197 tick_work_cpu = alloc_percpu(struct tick_work);
3198 BUG_ON(!tick_work_cpu);
3199
3200 return 0;
3201}
3202
3203#else
3204static inline void sched_tick_start(int cpu) { }
3205static inline void sched_tick_stop(int cpu) { }
3206#endif
3207
3208#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3209 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
3210
3211
3212
3213
3214static inline void preempt_latency_start(int val)
3215{
3216 if (preempt_count() == val) {
3217 unsigned long ip = get_lock_parent_ip();
3218#ifdef CONFIG_DEBUG_PREEMPT
3219 current->preempt_disable_ip = ip;
3220#endif
3221 trace_preempt_off(CALLER_ADDR0, ip);
3222 }
3223}
3224
3225void preempt_count_add(int val)
3226{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	__preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Spinlock count overflowing soon?
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
3242 preempt_latency_start(val);
3243}
3244EXPORT_SYMBOL(preempt_count_add);
3245NOKPROBE_SYMBOL(preempt_count_add);
3246
/*
 * If the value passed in equals to the current preempt count
 * then we just enabled preemption. Stop timing the latency.
 */
static inline void preempt_latency_stop(int val)
3252{
3253 if (preempt_count() == val)
3254 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
3255}
3256
3257void preempt_count_sub(int val)
3258{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * Is the spinlock portion underflowing?
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;
#endif
3272
3273 preempt_latency_stop(val);
3274 __preempt_count_sub(val);
3275}
3276EXPORT_SYMBOL(preempt_count_sub);
3277NOKPROBE_SYMBOL(preempt_count_sub);
3278
3279#else
3280static inline void preempt_latency_start(int val) { }
3281static inline void preempt_latency_stop(int val) { }
3282#endif
3283
3284static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
3285{
3286#ifdef CONFIG_DEBUG_PREEMPT
3287 return p->preempt_disable_ip;
3288#else
3289 return 0;
3290#endif
3291}
3292
/*
 * Print scheduling while atomic bug:
 */
static noinline void __schedule_bug(struct task_struct *prev)
3297{
3298
3299 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
3300
3301 if (oops_in_progress)
3302 return;
3303
3304 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3305 prev->comm, prev->pid, preempt_count());
3306
3307 debug_show_held_locks(prev);
3308 print_modules();
3309 if (irqs_disabled())
3310 print_irqtrace_events(prev);
3311 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
3312 && in_atomic_preempt_off()) {
3313 pr_err("Preemption disabled at:");
3314 print_ip_sym(preempt_disable_ip);
3315 pr_cont("\n");
3316 }
3317 if (panic_on_warn)
3318 panic("scheduling while atomic\n");
3319
3320 dump_stack();
3321 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3322}
3323
/*
 * Various schedule()-time debugging checks and statistics:
 */
static inline void schedule_debug(struct task_struct *prev)
3328{
3329#ifdef CONFIG_SCHED_STACK_END_CHECK
3330 if (task_stack_end_corrupted(prev))
3331 panic("corrupted stack end detected inside scheduler\n");
3332#endif
3333
3334 if (unlikely(in_atomic_preempt_off())) {
3335 __schedule_bug(prev);
3336 preempt_count_set(PREEMPT_DISABLED);
3337 }
3338 rcu_sleep_check();
3339
3340 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3341
3342 schedstat_inc(this_rq()->sched_count);
3343}
3344
/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	const struct sched_class *class;
	struct task_struct *p;

	/*
	 * Optimization: we know that if all tasks are in the fair class we can
	 * call that function directly, but only if the @prev task wasn't of a
	 * higher scheduling class, because otherwise those lose the
	 * opportunity to pull in more work from other CPUs.
	 */
	if (likely((prev->sched_class == &idle_sched_class ||
		    prev->sched_class == &fair_sched_class) &&
		   rq->nr_running == rq->cfs.h_nr_running)) {

		p = fair_sched_class.pick_next_task(rq, prev, rf);
		if (unlikely(p == RETRY_TASK))
			goto again;

		/* Assumes fair_sched_class->next == idle_sched_class */
		if (unlikely(!p))
			p = idle_sched_class.pick_next_task(rq, prev, rf);

		return p;
	}
3374
3375again:
3376 for_each_class(class) {
3377 p = class->pick_next_task(rq, prev, rf);
3378 if (p) {
3379 if (unlikely(p == RETRY_TASK))
3380 goto again;
3381 return p;
3382 }
3383 }
3384
	/* The idle class should always have a runnable task: */
	BUG();
3387}
3388
/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. For example, see arch/x86/entry_64.S.
 *
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable(). (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *
 *         - in IRQ context, return from interrupt-handler to
 *           preemptible context
 *
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *         then at the next:
 *
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 *
 * WARNING: must be called with preemption disabled!
 */
3428static void __sched notrace __schedule(bool preempt)
3429{
3430 struct task_struct *prev, *next;
3431 unsigned long *switch_count;
3432 struct rq_flags rf;
3433 struct rq *rq;
3434 int cpu;
3435
3436 cpu = smp_processor_id();
3437 rq = cpu_rq(cpu);
3438 prev = rq->curr;
3439
3440 schedule_debug(prev);
3441
3442 if (sched_feat(HRTICK))
3443 hrtick_clear(rq);
3444
3445 local_irq_disable();
3446 rcu_note_context_switch(preempt);
3447
	/*
	 * Make sure that signal_pending_state()->signal_pending() below
	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
	 * done by the caller to avoid the race with signal_wake_up().
	 *
	 * The membarrier system call requires a full memory barrier
	 * after coming from user-space, before storing to rq->curr.
	 */
	rq_lock(rq, &rf);
	smp_mb__after_spinlock();

	/* Promote REQ to ACT */
	rq->clock_update_flags <<= 1;
	update_rq_clock(rq);
3462
3463 switch_count = &prev->nivcsw;
3464 if (!preempt && prev->state) {
3465 if (signal_pending_state(prev->state, prev)) {
3466 prev->state = TASK_RUNNING;
3467 } else {
3468 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
3469 prev->on_rq = 0;
3470
3471 if (prev->in_iowait) {
3472 atomic_inc(&rq->nr_iowait);
3473 delayacct_blkio_start();
3474 }
3475
			/*
			 * If a worker went to sleep, notify and ask workqueue
			 * whether it wants to wake up a task to maintain
			 * concurrency.
			 */
			if (prev->flags & PF_WQ_WORKER) {
3482 struct task_struct *to_wakeup;
3483
3484 to_wakeup = wq_worker_sleeping(prev);
3485 if (to_wakeup)
3486 try_to_wake_up_local(to_wakeup, &rf);
3487 }
3488 }
3489 switch_count = &prev->nvcsw;
3490 }
3491
3492 next = pick_next_task(rq, prev, &rf);
3493 clear_tsk_need_resched(prev);
3494 clear_preempt_need_resched();
3495
3496 if (likely(prev != next)) {
3497 rq->nr_switches++;
3498 rq->curr = next;
		/*
		 * The membarrier system call requires each architecture
		 * to have a full memory barrier after updating
		 * rq->curr, before returning to user-space.
		 *
		 * Here are the schemes providing that barrier on the
		 * various architectures:
		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
		 *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
		 * - finish_lock_switch() for weakly-ordered
		 *   architectures where spin_unlock is a full barrier,
		 * - switch_to() for arm64 (weakly-ordered, spin_unlock
		 *   is a RELEASE barrier),
		 */
		++*switch_count;

		trace_sched_switch(preempt, prev, next);

		/* Also unlocks the rq: */
		rq = context_switch(rq, prev, next, &rf);
3519 } else {
3520 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
3521 rq_unlock_irq(rq, &rf);
3522 }
3523
3524 balance_callback(rq);
3525}
3526
3527void __noreturn do_task_dead(void)
3528{
3529
3530 set_special_state(TASK_DEAD);
3531
3532
3533 current->flags |= PF_NOFREEZE;
3534
3535 __schedule(false);
3536 BUG();
3537
3538
3539 for (;;)
3540 cpu_relax();
3541}
3542
3543static inline void sched_submit_work(struct task_struct *tsk)
3544{
3545 if (!tsk->state || tsk_is_pi_blocked(tsk))
3546 return;
3547
	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	if (blk_needs_flush_plug(tsk))
		blk_schedule_flush_plug(tsk);
3553}
3554
3555asmlinkage __visible void __sched schedule(void)
3556{
3557 struct task_struct *tsk = current;
3558
3559 sched_submit_work(tsk);
3560 do {
3561 preempt_disable();
3562 __schedule(false);
3563 sched_preempt_enable_no_resched();
3564 } while (need_resched());
3565}
3566EXPORT_SYMBOL(schedule);
3567
/*
 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
 * state (have scheduled out non-voluntarily) by making sure that all
 * tasks have either left the run queue or have gone into user space.
 * As idle tasks do not do either, they must not ever be preempted
 * (schedule out non-voluntarily).
 *
 * schedule_idle() is similar to schedule_preempt_disabled() except that it
 * never enables preemption because it does not call sched_submit_work().
 */
void __sched schedule_idle(void)
{
	/*
	 * As this skips calling sched_submit_work(), which the idle task does
	 * regardless because that function is a nop when the task is in a
	 * TASK_RUNNING state, make sure this isn't used someplace that the
	 * current task can be in any other state. Note, idle is always in the
	 * TASK_RUNNING state.
	 */
	WARN_ON_ONCE(current->state);
3588 do {
3589 __schedule(false);
3590 } while (need_resched());
3591}
3592
3593#ifdef CONFIG_CONTEXT_TRACKING
3594asmlinkage __visible void __sched schedule_user(void)
3595{
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606 enum ctx_state prev_state = exception_enter();
3607 schedule();
3608 exception_exit(prev_state);
3609}
3610#endif
3611
3612
3613
3614
3615
3616
3617void __sched schedule_preempt_disabled(void)
3618{
3619 sched_preempt_enable_no_resched();
3620 schedule();
3621 preempt_disable();
3622}
3623
3624static void __sched notrace preempt_schedule_common(void)
3625{
3626 do {
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640 preempt_disable_notrace();
3641 preempt_latency_start(1);
3642 __schedule(true);
3643 preempt_latency_stop(1);
3644 preempt_enable_no_resched_notrace();
3645
3646
3647
3648
3649
3650 } while (need_resched());
3651}
3652
3653#ifdef CONFIG_PREEMPT
/*
 * This is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable.
 */
asmlinkage __visible void __sched notrace preempt_schedule(void)
3660{
	/*
	 * If there is a non-zero preempt_count or interrupts are disabled,
	 * we do not want to preempt the current task. Just return..
	 */
	if (likely(!preemptible()))
		return;
3667
3668 preempt_schedule_common();
3669}
3670NOKPROBE_SYMBOL(preempt_schedule);
3671EXPORT_SYMBOL(preempt_schedule);
3672
/**
 * preempt_schedule_notrace - preempt_schedule called by tracing
 *
 * The tracing infrastructure uses preempt_enable_notrace to prevent
 * recursion and tracing preempt enabling caused by the tracing
 * infrastructure itself. But as tracing can happen in areas coming
 * from userspace or just about to enter userspace, a preempt enable
 * can occur before user_exit() is called. This will cause the scheduler
 * to be called when the system is still in usermode.
 *
 * To prevent this, the preempt_enable_notrace will use this function
 * instead of preempt_schedule() to exit user context if needed before
 * calling the scheduler.
 */
asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
3688{
3689 enum ctx_state prev_ctx;
3690
3691 if (likely(!preemptible()))
3692 return;
3693
3694 do {
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708 preempt_disable_notrace();
3709 preempt_latency_start(1);
3710
3711
3712
3713
3714
3715 prev_ctx = exception_enter();
3716 __schedule(true);
3717 exception_exit(prev_ctx);
3718
3719 preempt_latency_stop(1);
3720 preempt_enable_no_resched_notrace();
3721 } while (need_resched());
3722}
3723EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
3724
3725#endif
3726
/*
 * This is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */
asmlinkage __visible void __sched preempt_schedule_irq(void)
3734{
3735 enum ctx_state prev_state;
3736
3737
3738 BUG_ON(preempt_count() || !irqs_disabled());
3739
3740 prev_state = exception_enter();
3741
3742 do {
3743 preempt_disable();
3744 local_irq_enable();
3745 __schedule(true);
3746 local_irq_disable();
3747 sched_preempt_enable_no_resched();
3748 } while (need_resched());
3749
3750 exception_exit(prev_state);
3751}
3752
3753int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
3754 void *key)
3755{
3756 return try_to_wake_up(curr->private, mode, wake_flags);
3757}
3758EXPORT_SYMBOL(default_wake_function);
3759
3760#ifdef CONFIG_RT_MUTEXES
3761
3762static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
3763{
3764 if (pi_task)
3765 prio = min(prio, pi_task->prio);
3766
3767 return prio;
3768}
3769
3770static inline int rt_effective_prio(struct task_struct *p, int prio)
3771{
3772 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3773
3774 return __rt_effective_prio(pi_task, prio);
3775}
3776
/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task to boost
 * @pi_task: donor task
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance
 * logic. Call site only calls if the priority of the task changed.
 */
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
3789{
3790 int prio, oldprio, queued, running, queue_flag =
3791 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
3792 const struct sched_class *prev_class;
3793 struct rq_flags rf;
3794 struct rq *rq;
3795
3796
3797 prio = __rt_effective_prio(pi_task, p->normal_prio);
3798
3799
3800
3801
3802 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
3803 return;
3804
3805 rq = __task_rq_lock(p, &rf);
3806 update_rq_clock(rq);
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817 p->pi_top_task = pi_task;
3818
3819
3820
3821
3822 if (prio == p->prio && !dl_prio(prio))
3823 goto out_unlock;
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837 if (unlikely(p == rq->idle)) {
3838 WARN_ON(p != rq->curr);
3839 WARN_ON(p->pi_blocked_on);
3840 goto out_unlock;
3841 }
3842
3843 trace_sched_pi_setprio(p, pi_task);
3844 oldprio = p->prio;
3845
3846 if (oldprio == prio)
3847 queue_flag &= ~DEQUEUE_MOVE;
3848
3849 prev_class = p->sched_class;
3850 queued = task_on_rq_queued(p);
3851 running = task_current(rq, p);
3852 if (queued)
3853 dequeue_task(rq, p, queue_flag);
3854 if (running)
3855 put_prev_task(rq, p);
3856
	/*
	 * Boosting conditions are:
	 * 1. -rt task is running and holds mutex A
	 *      --> -dl task blocks on mutex A
	 *
	 * 2. -dl task is running and holds mutex A
	 *      --> -dl task blocks on mutex A and could preempt the
	 *          running task
	 */
	if (dl_prio(prio)) {
		if (!dl_prio(p->normal_prio) ||
		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3869 p->dl.dl_boosted = 1;
3870 queue_flag |= ENQUEUE_REPLENISH;
3871 } else
3872 p->dl.dl_boosted = 0;
3873 p->sched_class = &dl_sched_class;
3874 } else if (rt_prio(prio)) {
3875 if (dl_prio(oldprio))
3876 p->dl.dl_boosted = 0;
3877 if (oldprio < prio)
3878 queue_flag |= ENQUEUE_HEAD;
3879 p->sched_class = &rt_sched_class;
3880 } else {
3881 if (dl_prio(oldprio))
3882 p->dl.dl_boosted = 0;
3883 if (rt_prio(oldprio))
3884 p->rt.timeout = 0;
3885 p->sched_class = &fair_sched_class;
3886 }
3887
3888 p->prio = prio;
3889
3890 if (queued)
3891 enqueue_task(rq, p, queue_flag);
3892 if (running)
3893 set_curr_task(rq, p);
3894
3895 check_class_changed(rq, p, prev_class, oldprio);
3896out_unlock:
3897
3898 preempt_disable();
3899 __task_rq_unlock(rq, &rf);
3900
3901 balance_callback(rq);
3902 preempt_enable();
3903}
3904#else
3905static inline int rt_effective_prio(struct task_struct *p, int prio)
3906{
3907 return prio;
3908}
3909#endif
3910
3911void set_user_nice(struct task_struct *p, long nice)
3912{
3913 bool queued, running;
3914 int old_prio, delta;
3915 struct rq_flags rf;
3916 struct rq *rq;
3917
3918 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3919 return;
3920
3921
3922
3923
3924 rq = task_rq_lock(p, &rf);
3925 update_rq_clock(rq);
3926
	/*
	 * The RT priorities are set via sched_setscheduler(), but we still
	 * allow the 'normal' nice value to be set - but as expected
	 * it wont have any effect on scheduling until the task is
	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
	 */
	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3934 p->static_prio = NICE_TO_PRIO(nice);
3935 goto out_unlock;
3936 }
3937 queued = task_on_rq_queued(p);
3938 running = task_current(rq, p);
3939 if (queued)
3940 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
3941 if (running)
3942 put_prev_task(rq, p);
3943
3944 p->static_prio = NICE_TO_PRIO(nice);
3945 set_load_weight(p, true);
3946 old_prio = p->prio;
3947 p->prio = effective_prio(p);
3948 delta = p->prio - old_prio;
3949
3950 if (queued) {
3951 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
		/*
		 * If the task increased its priority or is running and
		 * lowered its priority, then reschedule its CPU:
		 */
		if (delta < 0 || (delta > 0 && task_running(rq, p)))
3957 resched_curr(rq);
3958 }
3959 if (running)
3960 set_curr_task(rq, p);
3961out_unlock:
3962 task_rq_unlock(rq, p, &rf);
3963}
3964EXPORT_SYMBOL(set_user_nice);
3965
/**
 * can_nice - check if a task can reduce its nice value
 * @p: task
 * @nice: nice value
 */
int can_nice(const struct task_struct *p, const int nice)
{
	/* Convert nice value [19,-20] to rlimit style value [1,40]: */
	int nice_rlim = nice_to_rlimit(nice);
3975
3976 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3977 capable(CAP_SYS_NICE));
3978}
3979
3980#ifdef __ARCH_WANT_SYS_NICE
3981
3982
3983
3984
3985
3986
3987
3988
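/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */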
3989SYSCALL_DEFINE1(nice, int, increment)
3990{
3991 long nice, retval;
3992
3993
3994
3995
3996
3997
3998 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3999 nice = task_nice(current) + increment;
4000
4001 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
4002 if (increment < 0 && !can_nice(current, nice))
4003 return -EPERM;
4004
4005 retval = security_task_setnice(current, nice);
4006 if (retval)
4007 return retval;
4008
4009 set_user_nice(current, nice);
4010 return 0;
4011}
4012
4013#endif
4014
4015
4016
4017
4018
4019
4020
4021
4022
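/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Return: The priority value as seen by users in /proc.
 * RT tasks are offset by -200. Normal tasks are centered
 * around 0, value goes from -16 to +15.
 */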
4023int task_prio(const struct task_struct *p)
4024{
4025 return p->prio - MAX_RT_PRIO;
4026}
4027
4028
4029
4030
4031
4032
4033
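/**
 * idle_cpu - is a given CPU idle currently?
 * @cpu: the processor in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */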
4034int idle_cpu(int cpu)
4035{
4036 struct rq *rq = cpu_rq(cpu);
4037
4038 if (rq->curr != rq->idle)
4039 return 0;
4040
4041 if (rq->nr_running)
4042 return 0;
4043
4044#ifdef CONFIG_SMP
4045 if (!llist_empty(&rq->wake_list))
4046 return 0;
4047#endif
4048
4049 return 1;
4050}
4051
4052
4053
4054
4055
4056
4057
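/**
 * available_idle_cpu - is a given CPU idle for enqueuing work.
 * @cpu: the CPU in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */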
4058int available_idle_cpu(int cpu)
4059{
4060 if (!idle_cpu(cpu))
4061 return 0;
4062
4063 if (vcpu_is_preempted(cpu))
4064 return 0;
4065
4066 return 1;
4067}
4068
4069
4070
4071
4072
4073
4074
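/**
 * idle_task - return the idle task for a given CPU.
 * @cpu: the processor in question.
 *
 * Return: The idle task for the CPU @cpu.
 */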
4075struct task_struct *idle_task(int cpu)
4076{
4077 return cpu_rq(cpu)->idle;
4078}
4079
4080
4081
4082
4083
4084
4085
4086static struct task_struct *find_process_by_pid(pid_t pid)
4087{
4088 return pid ? find_task_by_vpid(pid) : current;
4089}
4090
4091
4092
4093
4094
4095#define SETPARAM_POLICY -1
4096
4097static void __setscheduler_params(struct task_struct *p,
4098 const struct sched_attr *attr)
4099{
4100 int policy = attr->sched_policy;
4101
4102 if (policy == SETPARAM_POLICY)
4103 policy = p->policy;
4104
4105 p->policy = policy;
4106
4107 if (dl_policy(policy))
4108 __setparam_dl(p, attr);
4109 else if (fair_policy(policy))
4110 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
4111
4112
4113
4114
4115
4116
4117 p->rt_priority = attr->sched_priority;
4118 p->normal_prio = normal_prio(p);
4119 set_load_weight(p, true);
4120}
4121
4122
4123static void __setscheduler(struct rq *rq, struct task_struct *p,
4124 const struct sched_attr *attr, bool keep_boost)
4125{
4126 __setscheduler_params(p, attr);
4127
4128
4129
4130
4131
4132 p->prio = normal_prio(p);
4133 if (keep_boost)
4134 p->prio = rt_effective_prio(p, p->prio);
4135
4136 if (dl_prio(p->prio))
4137 p->sched_class = &dl_sched_class;
4138 else if (rt_prio(p->prio))
4139 p->sched_class = &rt_sched_class;
4140 else
4141 p->sched_class = &fair_sched_class;
4142}
4143
4144
4145
4146
4147static bool check_same_owner(struct task_struct *p)
4148{
4149 const struct cred *cred = current_cred(), *pcred;
4150 bool match;
4151
4152 rcu_read_lock();
4153 pcred = __task_cred(p);
4154 match = (uid_eq(cred->euid, pcred->euid) ||
4155 uid_eq(cred->euid, pcred->uid));
4156 rcu_read_unlock();
4157 return match;
4158}
4159
4160static int __sched_setscheduler(struct task_struct *p,
4161 const struct sched_attr *attr,
4162 bool user, bool pi)
4163{
4164 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4165 MAX_RT_PRIO - 1 - attr->sched_priority;
4166 int retval, oldprio, oldpolicy = -1, queued, running;
4167 int new_effective_prio, policy = attr->sched_policy;
4168 const struct sched_class *prev_class;
4169 struct rq_flags rf;
4170 int reset_on_fork;
4171 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4172 struct rq *rq;
4173
4174
4175 BUG_ON(pi && in_interrupt());
4176recheck:
4177
4178 if (policy < 0) {
4179 reset_on_fork = p->sched_reset_on_fork;
4180 policy = oldpolicy = p->policy;
4181 } else {
4182 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
4183
4184 if (!valid_policy(policy))
4185 return -EINVAL;
4186 }
4187
4188 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
4189 return -EINVAL;
4190
4191
4192
4193
4194
4195
4196 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
4197 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
4198 return -EINVAL;
4199 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
4200 (rt_policy(policy) != (attr->sched_priority != 0)))
4201 return -EINVAL;
4202
4203
4204
4205
4206 if (user && !capable(CAP_SYS_NICE)) {
4207 if (fair_policy(policy)) {
4208 if (attr->sched_nice < task_nice(p) &&
4209 !can_nice(p, attr->sched_nice))
4210 return -EPERM;
4211 }
4212
4213 if (rt_policy(policy)) {
4214 unsigned long rlim_rtprio =
4215 task_rlimit(p, RLIMIT_RTPRIO);
4216
4217
4218 if (policy != p->policy && !rlim_rtprio)
4219 return -EPERM;
4220
4221
4222 if (attr->sched_priority > p->rt_priority &&
4223 attr->sched_priority > rlim_rtprio)
4224 return -EPERM;
4225 }
4226
4227
4228
4229
4230
4231
4232
4233 if (dl_policy(policy))
4234 return -EPERM;
4235
4236
4237
4238
4239
4240 if (task_has_idle_policy(p) && !idle_policy(policy)) {
4241 if (!can_nice(p, task_nice(p)))
4242 return -EPERM;
4243 }
4244
4245
4246 if (!check_same_owner(p))
4247 return -EPERM;
4248
4249
4250 if (p->sched_reset_on_fork && !reset_on_fork)
4251 return -EPERM;
4252 }
4253
4254 if (user) {
4255 if (attr->sched_flags & SCHED_FLAG_SUGOV)
4256 return -EINVAL;
4257
4258 retval = security_task_setscheduler(p);
4259 if (retval)
4260 return retval;
4261 }
4262
4263
4264
4265
4266
4267
4268
4269
4270 rq = task_rq_lock(p, &rf);
4271 update_rq_clock(rq);
4272
4273
4274
4275
4276 if (p == rq->stop) {
4277 task_rq_unlock(rq, p, &rf);
4278 return -EINVAL;
4279 }
4280
4281
4282
4283
4284
4285 if (unlikely(policy == p->policy)) {
4286 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
4287 goto change;
4288 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
4289 goto change;
4290 if (dl_policy(policy) && dl_param_changed(p, attr))
4291 goto change;
4292
4293 p->sched_reset_on_fork = reset_on_fork;
4294 task_rq_unlock(rq, p, &rf);
4295 return 0;
4296 }
4297change:
4298
4299 if (user) {
4300#ifdef CONFIG_RT_GROUP_SCHED
4301
4302
4303
4304
4305 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4306 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4307 !task_group_is_autogroup(task_group(p))) {
4308 task_rq_unlock(rq, p, &rf);
4309 return -EPERM;
4310 }
4311#endif
4312#ifdef CONFIG_SMP
4313 if (dl_bandwidth_enabled() && dl_policy(policy) &&
4314 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
4315 cpumask_t *span = rq->rd->span;
4316
4317
4318
4319
4320
4321
4322 if (!cpumask_subset(span, &p->cpus_allowed) ||
4323 rq->rd->dl_bw.bw == 0) {
4324 task_rq_unlock(rq, p, &rf);
4325 return -EPERM;
4326 }
4327 }
4328#endif
4329 }
4330
4331
4332 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4333 policy = oldpolicy = -1;
4334 task_rq_unlock(rq, p, &rf);
4335 goto recheck;
4336 }
4337
4338
4339
4340
4341
4342
4343 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
4344 task_rq_unlock(rq, p, &rf);
4345 return -EBUSY;
4346 }
4347
4348 p->sched_reset_on_fork = reset_on_fork;
4349 oldprio = p->prio;
4350
4351 if (pi) {
4352
4353
4354
4355
4356
4357
4358
4359 new_effective_prio = rt_effective_prio(p, newprio);
4360 if (new_effective_prio == oldprio)
4361 queue_flags &= ~DEQUEUE_MOVE;
4362 }
4363
4364 queued = task_on_rq_queued(p);
4365 running = task_current(rq, p);
4366 if (queued)
4367 dequeue_task(rq, p, queue_flags);
4368 if (running)
4369 put_prev_task(rq, p);
4370
4371 prev_class = p->sched_class;
4372 __setscheduler(rq, p, attr, pi);
4373
4374 if (queued) {
4375
4376
4377
4378
4379 if (oldprio < p->prio)
4380 queue_flags |= ENQUEUE_HEAD;
4381
4382 enqueue_task(rq, p, queue_flags);
4383 }
4384 if (running)
4385 set_curr_task(rq, p);
4386
4387 check_class_changed(rq, p, prev_class, oldprio);
4388
4389
4390 preempt_disable();
4391 task_rq_unlock(rq, p, &rf);
4392
4393 if (pi)
4394 rt_mutex_adjust_pi(p);
4395
4396
4397 balance_callback(rq);
4398 preempt_enable();
4399
4400 return 0;
4401}
4402
4403static int _sched_setscheduler(struct task_struct *p, int policy,
4404 const struct sched_param *param, bool check)
4405{
4406 struct sched_attr attr = {
4407 .sched_policy = policy,
4408 .sched_priority = param->sched_priority,
4409 .sched_nice = PRIO_TO_NICE(p->static_prio),
4410 };
4411
4412
4413 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
4414 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4415 policy &= ~SCHED_RESET_ON_FORK;
4416 attr.sched_policy = policy;
4417 }
4418
4419 return __sched_setscheduler(p, &attr, check, true);
4420}
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
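/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 *
 * NOTE that the task may be already dead.
 */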
4431int sched_setscheduler(struct task_struct *p, int policy,
4432 const struct sched_param *param)
4433{
4434 return _sched_setscheduler(p, policy, param, true);
4435}
4436EXPORT_SYMBOL_GPL(sched_setscheduler);
4437
4438int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
4439{
4440 return __sched_setscheduler(p, attr, true, true);
4441}
4442EXPORT_SYMBOL_GPL(sched_setattr);
4443
4444int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
4445{
4446 return __sched_setscheduler(p, attr, false, true);
4447}
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
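/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler, only don't bother checking if the
 * current context has permission.  For example, this is needed in
 * stop_machine(): we create temporary high priority worker threads,
 * but our caller might not have that capability.
 *
 * Return: 0 on success. An error code otherwise.
 */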
4462int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4463 const struct sched_param *param)
4464{
4465 return _sched_setscheduler(p, policy, param, false);
4466}
4467EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4468
4469static int
4470do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4471{
4472 struct sched_param lparam;
4473 struct task_struct *p;
4474 int retval;
4475
4476 if (!param || pid < 0)
4477 return -EINVAL;
4478 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4479 return -EFAULT;
4480
4481 rcu_read_lock();
4482 retval = -ESRCH;
4483 p = find_process_by_pid(pid);
4484 if (p != NULL)
4485 retval = sched_setscheduler(p, policy, &lparam);
4486 rcu_read_unlock();
4487
4488 return retval;
4489}
4490
4491
4492
4493
4494static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
4495{
4496 u32 size;
4497 int ret;
4498
4499 if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0))
4500 return -EFAULT;
4501
4502
4503 memset(attr, 0, sizeof(*attr));
4504
4505 ret = get_user(size, &uattr->size);
4506 if (ret)
4507 return ret;
4508
4509
4510 if (size > PAGE_SIZE)
4511 goto err_size;
4512
4513
4514 if (!size)
4515 size = SCHED_ATTR_SIZE_VER0;
4516
4517 if (size < SCHED_ATTR_SIZE_VER0)
4518 goto err_size;
4519
4520
4521
4522
4523
4524
4525
4526 if (size > sizeof(*attr)) {
4527 unsigned char __user *addr;
4528 unsigned char __user *end;
4529 unsigned char val;
4530
4531 addr = (void __user *)uattr + sizeof(*attr);
4532 end = (void __user *)uattr + size;
4533
4534 for (; addr < end; addr++) {
4535 ret = get_user(val, addr);
4536 if (ret)
4537 return ret;
4538 if (val)
4539 goto err_size;
4540 }
4541 size = sizeof(*attr);
4542 }
4543
4544 ret = copy_from_user(attr, uattr, size);
4545 if (ret)
4546 return -EFAULT;
4547
4548
4549
4550
4551
4552 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
4553
4554 return 0;
4555
4556err_size:
4557 put_user(sizeof(*attr), &uattr->size);
4558 return -E2BIG;
4559}
4560
4561
4562
4563
4564
4565
4566
4567
4568
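/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */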
4569SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
4570{
4571 if (policy < 0)
4572 return -EINVAL;
4573
4574 return do_sched_setscheduler(pid, policy, param);
4575}
4576
4577
4578
4579
4580
4581
4582
4583
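/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */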
4584SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4585{
4586 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
4587}
4588
4589
4590
4591
4592
4593
4594
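/**
 * sys_sched_setattr - same as above, but with extended sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @flags: for future extension.
 */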
4595SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4596 unsigned int, flags)
4597{
4598 struct sched_attr attr;
4599 struct task_struct *p;
4600 int retval;
4601
4602 if (!uattr || pid < 0 || flags)
4603 return -EINVAL;
4604
4605 retval = sched_copy_attr(uattr, &attr);
4606 if (retval)
4607 return retval;
4608
4609 if ((int)attr.sched_policy < 0)
4610 return -EINVAL;
4611
4612 rcu_read_lock();
4613 retval = -ESRCH;
4614 p = find_process_by_pid(pid);
4615 if (p != NULL)
4616 retval = sched_setattr(p, &attr);
4617 rcu_read_unlock();
4618
4619 return retval;
4620}
4621
4622
4623
4624
4625
4626
4627
4628
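/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 *
 * Return: On success, the policy of the thread. Otherwise, a negative error
 * code.
 */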
4629SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4630{
4631 struct task_struct *p;
4632 int retval;
4633
4634 if (pid < 0)
4635 return -EINVAL;
4636
4637 retval = -ESRCH;
4638 rcu_read_lock();
4639 p = find_process_by_pid(pid);
4640 if (p) {
4641 retval = security_task_getscheduler(p);
4642 if (!retval)
4643 retval = p->policy
4644 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4645 }
4646 rcu_read_unlock();
4647 return retval;
4648}
4649
4650
4651
4652
4653
4654
4655
4656
4657
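/**
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the RT priority.
 *
 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
 * code.
 */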
4658SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4659{
4660 struct sched_param lp = { .sched_priority = 0 };
4661 struct task_struct *p;
4662 int retval;
4663
4664 if (!param || pid < 0)
4665 return -EINVAL;
4666
4667 rcu_read_lock();
4668 p = find_process_by_pid(pid);
4669 retval = -ESRCH;
4670 if (!p)
4671 goto out_unlock;
4672
4673 retval = security_task_getscheduler(p);
4674 if (retval)
4675 goto out_unlock;
4676
4677 if (task_has_rt_policy(p))
4678 lp.sched_priority = p->rt_priority;
4679 rcu_read_unlock();
4680
4681
4682
4683
4684 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4685
4686 return retval;
4687
4688out_unlock:
4689 rcu_read_unlock();
4690 return retval;
4691}
4692
4693static int sched_read_attr(struct sched_attr __user *uattr,
4694 struct sched_attr *attr,
4695 unsigned int usize)
4696{
4697 int ret;
4698
4699 if (!access_ok(uattr, usize))
4700 return -EFAULT;
4701
4702
4703
4704
4705
4706
4707 if (usize < sizeof(*attr)) {
4708 unsigned char *addr;
4709 unsigned char *end;
4710
4711 addr = (void *)attr + usize;
4712 end = (void *)attr + sizeof(*attr);
4713
4714 for (; addr < end; addr++) {
4715 if (*addr)
4716 return -EFBIG;
4717 }
4718
4719 attr->size = usize;
4720 }
4721
4722 ret = copy_to_user(uattr, attr, attr->size);
4723 if (ret)
4724 return -EFAULT;
4725
4726 return 0;
4727}
4728
4729
4730
4731
4732
4733
4734
4735
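/**
 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @size: sizeof(attr) for fwd/bwd comp.
 * @flags: for future extension.
 */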
4736SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4737 unsigned int, size, unsigned int, flags)
4738{
4739 struct sched_attr attr = {
4740 .size = sizeof(struct sched_attr),
4741 };
4742 struct task_struct *p;
4743 int retval;
4744
4745 if (!uattr || pid < 0 || size > PAGE_SIZE ||
4746 size < SCHED_ATTR_SIZE_VER0 || flags)
4747 return -EINVAL;
4748
4749 rcu_read_lock();
4750 p = find_process_by_pid(pid);
4751 retval = -ESRCH;
4752 if (!p)
4753 goto out_unlock;
4754
4755 retval = security_task_getscheduler(p);
4756 if (retval)
4757 goto out_unlock;
4758
4759 attr.sched_policy = p->policy;
4760 if (p->sched_reset_on_fork)
4761 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4762 if (task_has_dl_policy(p))
4763 __getparam_dl(p, &attr);
4764 else if (task_has_rt_policy(p))
4765 attr.sched_priority = p->rt_priority;
4766 else
4767 attr.sched_nice = task_nice(p);
4768
4769 rcu_read_unlock();
4770
4771 retval = sched_read_attr(uattr, &attr, size);
4772 return retval;
4773
4774out_unlock:
4775 rcu_read_unlock();
4776 return retval;
4777}
4778
4779long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4780{
4781 cpumask_var_t cpus_allowed, new_mask;
4782 struct task_struct *p;
4783 int retval;
4784
4785 rcu_read_lock();
4786
4787 p = find_process_by_pid(pid);
4788 if (!p) {
4789 rcu_read_unlock();
4790 return -ESRCH;
4791 }
4792
4793
4794 get_task_struct(p);
4795 rcu_read_unlock();
4796
4797 if (p->flags & PF_NO_SETAFFINITY) {
4798 retval = -EINVAL;
4799 goto out_put_task;
4800 }
4801 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4802 retval = -ENOMEM;
4803 goto out_put_task;
4804 }
4805 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4806 retval = -ENOMEM;
4807 goto out_free_cpus_allowed;
4808 }
4809 retval = -EPERM;
4810 if (!check_same_owner(p)) {
4811 rcu_read_lock();
4812 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4813 rcu_read_unlock();
4814 goto out_free_new_mask;
4815 }
4816 rcu_read_unlock();
4817 }
4818
4819 retval = security_task_setscheduler(p);
4820 if (retval)
4821 goto out_free_new_mask;
4822
4823
4824 cpuset_cpus_allowed(p, cpus_allowed);
4825 cpumask_and(new_mask, in_mask, cpus_allowed);
4826
4827
4828
4829
4830
4831
4832
4833#ifdef CONFIG_SMP
4834 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4835 rcu_read_lock();
4836 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4837 retval = -EBUSY;
4838 rcu_read_unlock();
4839 goto out_free_new_mask;
4840 }
4841 rcu_read_unlock();
4842 }
4843#endif
4844again:
4845 retval = __set_cpus_allowed_ptr(p, new_mask, true);
4846
4847 if (!retval) {
4848 cpuset_cpus_allowed(p, cpus_allowed);
4849 if (!cpumask_subset(new_mask, cpus_allowed)) {
4850
4851
4852
4853
4854
4855 cpumask_copy(new_mask, cpus_allowed);
4856 goto again;
4857 }
4858 }
4859out_free_new_mask:
4860 free_cpumask_var(new_mask);
4861out_free_cpus_allowed:
4862 free_cpumask_var(cpus_allowed);
4863out_put_task:
4864 put_task_struct(p);
4865 return retval;
4866}
4867
4868static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4869 struct cpumask *new_mask)
4870{
4871 if (len < cpumask_size())
4872 cpumask_clear(new_mask);
4873 else if (len > cpumask_size())
4874 len = cpumask_size();
4875
4876 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4877}
4878
4879
4880
4881
4882
4883
4884
4885
4886
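/**
 * sys_sched_setaffinity - set the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new CPU mask
 *
 * Return: 0 on success. An error code otherwise.
 */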
4887SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4888 unsigned long __user *, user_mask_ptr)
4889{
4890 cpumask_var_t new_mask;
4891 int retval;
4892
4893 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4894 return -ENOMEM;
4895
4896 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4897 if (retval == 0)
4898 retval = sched_setaffinity(pid, new_mask);
4899 free_cpumask_var(new_mask);
4900 return retval;
4901}
4902
4903long sched_getaffinity(pid_t pid, struct cpumask *mask)
4904{
4905 struct task_struct *p;
4906 unsigned long flags;
4907 int retval;
4908
4909 rcu_read_lock();
4910
4911 retval = -ESRCH;
4912 p = find_process_by_pid(pid);
4913 if (!p)
4914 goto out_unlock;
4915
4916 retval = security_task_getscheduler(p);
4917 if (retval)
4918 goto out_unlock;
4919
4920 raw_spin_lock_irqsave(&p->pi_lock, flags);
4921 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
4922 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4923
4924out_unlock:
4925 rcu_read_unlock();
4926
4927 return retval;
4928}
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
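/**
 * sys_sched_getaffinity - get the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current CPU mask
 *
 * Return: size of CPU mask copied to user_mask_ptr on success. An
 * error code otherwise.
 */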
4939SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4940 unsigned long __user *, user_mask_ptr)
4941{
4942 int ret;
4943 cpumask_var_t mask;
4944
4945 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4946 return -EINVAL;
4947 if (len & (sizeof(unsigned long)-1))
4948 return -EINVAL;
4949
4950 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4951 return -ENOMEM;
4952
4953 ret = sched_getaffinity(pid, mask);
4954 if (ret == 0) {
4955 unsigned int retlen = min(len, cpumask_size());
4956
4957 if (copy_to_user(user_mask_ptr, mask, retlen))
4958 ret = -EFAULT;
4959 else
4960 ret = retlen;
4961 }
4962 free_cpumask_var(mask);
4963
4964 return ret;
4965}
4966
4967
4968
4969
4970
4971
4972
4973
4974
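/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */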
4975static void do_sched_yield(void)
4976{
4977 struct rq_flags rf;
4978 struct rq *rq;
4979
4980 rq = this_rq_lock_irq(&rf);
4981
4982 schedstat_inc(rq->yld_count);
4983 current->sched_class->yield_task(rq);
4984
4985
4986
4987
4988
4989 preempt_disable();
4990 rq_unlock(rq, &rf);
4991 sched_preempt_enable_no_resched();
4992
4993 schedule();
4994}
4995
4996SYSCALL_DEFINE0(sched_yield)
4997{
4998 do_sched_yield();
4999 return 0;
5000}
5001
5002#ifndef CONFIG_PREEMPT
5003int __sched _cond_resched(void)
5004{
5005 if (should_resched(0)) {
5006 preempt_schedule_common();
5007 return 1;
5008 }
5009 rcu_all_qs();
5010 return 0;
5011}
5012EXPORT_SYMBOL(_cond_resched);
5013#endif
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023int __cond_resched_lock(spinlock_t *lock)
5024{
5025 int resched = should_resched(PREEMPT_LOCK_OFFSET);
5026 int ret = 0;
5027
5028 lockdep_assert_held(lock);
5029
5030 if (spin_needbreak(lock) || resched) {
5031 spin_unlock(lock);
5032 if (resched)
5033 preempt_schedule_common();
5034 else
5035 cpu_relax();
5036 ret = 1;
5037 spin_lock(lock);
5038 }
5039 return ret;
5040}
5041EXPORT_SYMBOL(__cond_resched_lock);
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
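/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run, if removing the yield() call from your code breaks
 * it, its already broken.
 *
 * Typical broken usage is:
 *
 * wait_for_completion(&event)
 * while (!event)
 *	yield();
 *
 * where one assumes that yield() will let 'the other' process run that will
 * make event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still want to use yield(), do not!
 */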
5065void __sched yield(void)
5066{
5067 set_current_state(TASK_RUNNING);
5068 do_sched_yield();
5069}
5070EXPORT_SYMBOL(yield);
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
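/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */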
5087int __sched yield_to(struct task_struct *p, bool preempt)
5088{
5089 struct task_struct *curr = current;
5090 struct rq *rq, *p_rq;
5091 unsigned long flags;
5092 int yielded = 0;
5093
5094 local_irq_save(flags);
5095 rq = this_rq();
5096
5097again:
5098 p_rq = task_rq(p);
5099
5100
5101
5102
5103 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
5104 yielded = -ESRCH;
5105 goto out_irq;
5106 }
5107
5108 double_rq_lock(rq, p_rq);
5109 if (task_rq(p) != p_rq) {
5110 double_rq_unlock(rq, p_rq);
5111 goto again;
5112 }
5113
5114 if (!curr->sched_class->yield_to_task)
5115 goto out_unlock;
5116
5117 if (curr->sched_class != p->sched_class)
5118 goto out_unlock;
5119
5120 if (task_running(p_rq, p) || p->state)
5121 goto out_unlock;
5122
5123 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5124 if (yielded) {
5125 schedstat_inc(rq->yld_count);
5126
5127
5128
5129
5130 if (preempt && rq != p_rq)
5131 resched_curr(p_rq);
5132 }
5133
5134out_unlock:
5135 double_rq_unlock(rq, p_rq);
5136out_irq:
5137 local_irq_restore(flags);
5138
5139 if (yielded > 0)
5140 schedule();
5141
5142 return yielded;
5143}
5144EXPORT_SYMBOL_GPL(yield_to);
5145
5146int io_schedule_prepare(void)
5147{
5148 int old_iowait = current->in_iowait;
5149
5150 current->in_iowait = 1;
5151 blk_schedule_flush_plug(current);
5152
5153 return old_iowait;
5154}
5155
5156void io_schedule_finish(int token)
5157{
5158 current->in_iowait = token;
5159}
5160
5161
5162
5163
5164
5165long __sched io_schedule_timeout(long timeout)
5166{
5167 int token;
5168 long ret;
5169
5170 token = io_schedule_prepare();
5171 ret = schedule_timeout(timeout);
5172 io_schedule_finish(token);
5173
5174 return ret;
5175}
5176EXPORT_SYMBOL(io_schedule_timeout);
5177
5178void io_schedule(void)
5179{
5180 int token;
5181
5182 token = io_schedule_prepare();
5183 schedule();
5184 io_schedule_finish(token);
5185}
5186EXPORT_SYMBOL(io_schedule);
5187
5188
5189
5190
5191
5192
5193
5194
5195
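/**
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the maximum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */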
5196SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5197{
5198 int ret = -EINVAL;
5199
5200 switch (policy) {
5201 case SCHED_FIFO:
5202 case SCHED_RR:
5203 ret = MAX_USER_RT_PRIO-1;
5204 break;
5205 case SCHED_DEADLINE:
5206 case SCHED_NORMAL:
5207 case SCHED_BATCH:
5208 case SCHED_IDLE:
5209 ret = 0;
5210 break;
5211 }
5212 return ret;
5213}
5214
5215
5216
5217
5218
5219
5220
5221
5222
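/**
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the minimum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */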
5223SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5224{
5225 int ret = -EINVAL;
5226
5227 switch (policy) {
5228 case SCHED_FIFO:
5229 case SCHED_RR:
5230 ret = 1;
5231 break;
5232 case SCHED_DEADLINE:
5233 case SCHED_NORMAL:
5234 case SCHED_BATCH:
5235 case SCHED_IDLE:
5236 ret = 0;
5237 }
5238 return ret;
5239}
5240
5241static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
5242{
5243 struct task_struct *p;
5244 unsigned int time_slice;
5245 struct rq_flags rf;
5246 struct rq *rq;
5247 int retval;
5248
5249 if (pid < 0)
5250 return -EINVAL;
5251
5252 retval = -ESRCH;
5253 rcu_read_lock();
5254 p = find_process_by_pid(pid);
5255 if (!p)
5256 goto out_unlock;
5257
5258 retval = security_task_getscheduler(p);
5259 if (retval)
5260 goto out_unlock;
5261
5262 rq = task_rq_lock(p, &rf);
5263 time_slice = 0;
5264 if (p->sched_class->get_rr_interval)
5265 time_slice = p->sched_class->get_rr_interval(rq, p);
5266 task_rq_unlock(rq, p, &rf);
5267
5268 rcu_read_unlock();
5269 jiffies_to_timespec64(time_slice, t);
5270 return 0;
5271
5272out_unlock:
5273 rcu_read_unlock();
5274 return retval;
5275}
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
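/**
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * this syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 *
 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
 * an error code.
 */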
5288SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5289 struct __kernel_timespec __user *, interval)
5290{
5291 struct timespec64 t;
5292 int retval = sched_rr_get_interval(pid, &t);
5293
5294 if (retval == 0)
5295 retval = put_timespec64(&t, interval);
5296
5297 return retval;
5298}
5299
5300#ifdef CONFIG_COMPAT_32BIT_TIME
5301SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
5302 struct old_timespec32 __user *, interval)
5303{
5304 struct timespec64 t;
5305 int retval = sched_rr_get_interval(pid, &t);
5306
5307 if (retval == 0)
5308 retval = put_old_timespec32(&t, interval);
5309 return retval;
5310}
5311#endif
5312
5313void sched_show_task(struct task_struct *p)
5314{
5315 unsigned long free = 0;
5316 int ppid;
5317
5318 if (!try_get_task_stack(p))
5319 return;
5320
5321 printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
5322
5323 if (p->state == TASK_RUNNING)
5324 printk(KERN_CONT " running task ");
5325#ifdef CONFIG_DEBUG_STACK_USAGE
5326 free = stack_not_used(p);
5327#endif
5328 ppid = 0;
5329 rcu_read_lock();
5330 if (pid_alive(p))
5331 ppid = task_pid_nr(rcu_dereference(p->real_parent));
5332 rcu_read_unlock();
5333 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5334 task_pid_nr(p), ppid,
5335 (unsigned long)task_thread_info(p)->flags);
5336
5337 print_worker_info(KERN_INFO, p);
5338 show_stack(p, NULL);
5339 put_task_stack(p);
5340}
5341EXPORT_SYMBOL_GPL(sched_show_task);
5342
5343static inline bool
5344state_filter_match(unsigned long state_filter, struct task_struct *p)
5345{
5346
5347 if (!state_filter)
5348 return true;
5349
5350
5351 if (!(p->state & state_filter))
5352 return false;
5353
5354
5355
5356
5357
5358 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
5359 return false;
5360
5361 return true;
5362}
5363
5364
5365void show_state_filter(unsigned long state_filter)
5366{
5367 struct task_struct *g, *p;
5368
#if BITS_PER_LONG == 32
	printk(KERN_INFO
		"  task                PC stack   pid father\n");
#else
	printk(KERN_INFO
		"  task                        PC stack   pid father\n");
#endif
5376 rcu_read_lock();
5377 for_each_process_thread(g, p) {
5378
5379
5380
5381
5382
5383
5384
5385 touch_nmi_watchdog();
5386 touch_all_softlockup_watchdogs();
5387 if (state_filter_match(state_filter, p))
5388 sched_show_task(p);
5389 }
5390
5391#ifdef CONFIG_SCHED_DEBUG
5392 if (!state_filter)
5393 sysrq_sched_debug_show();
5394#endif
5395 rcu_read_unlock();
5396
5397
5398
5399 if (!state_filter)
5400 debug_show_all_locks();
5401}
5402
/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: CPU the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
void init_idle(struct task_struct *idle, int cpu)
5412{
5413 struct rq *rq = cpu_rq(cpu);
5414 unsigned long flags;
5415
5416 raw_spin_lock_irqsave(&idle->pi_lock, flags);
5417 raw_spin_lock(&rq->lock);
5418
5419 __sched_fork(0, idle);
5420 idle->state = TASK_RUNNING;
5421 idle->se.exec_start = sched_clock();
5422 idle->flags |= PF_IDLE;
5423
5424 kasan_unpoison_task_stack(idle);
5425
5426#ifdef CONFIG_SMP
5427
5428
5429
5430
5431
5432
5433 set_cpus_allowed_common(idle, cpumask_of(cpu));
5434#endif
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445 rcu_read_lock();
5446 __set_task_cpu(idle, cpu);
5447 rcu_read_unlock();
5448
5449 rq->curr = rq->idle = idle;
5450 idle->on_rq = TASK_ON_RQ_QUEUED;
5451#ifdef CONFIG_SMP
5452 idle->on_cpu = 1;
5453#endif
5454 raw_spin_unlock(&rq->lock);
5455 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
5456
5457
5458 init_idle_preempt_count(idle, cpu);
5459
5460
5461
5462
5463 idle->sched_class = &idle_sched_class;
5464 ftrace_graph_init_idle_task(idle, cpu);
5465 vtime_init_idle(idle, cpu);
5466#ifdef CONFIG_SMP
5467 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5468#endif
5469}
5470
5471#ifdef CONFIG_SMP
5472
5473int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5474 const struct cpumask *trial)
5475{
5476 int ret = 1;
5477
5478 if (!cpumask_weight(cur))
5479 return ret;
5480
5481 ret = dl_cpuset_cpumask_can_shrink(cur, trial);
5482
5483 return ret;
5484}
5485
5486int task_can_attach(struct task_struct *p,
5487 const struct cpumask *cs_cpus_allowed)
5488{
5489 int ret = 0;
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500 if (p->flags & PF_NO_SETAFFINITY) {
5501 ret = -EINVAL;
5502 goto out;
5503 }
5504
5505 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5506 cs_cpus_allowed))
5507 ret = dl_task_can_attach(p, cs_cpus_allowed);
5508
5509out:
5510 return ret;
5511}
5512
5513bool sched_smp_initialized __read_mostly;
5514
5515#ifdef CONFIG_NUMA_BALANCING
5516
5517int migrate_task_to(struct task_struct *p, int target_cpu)
5518{
5519 struct migration_arg arg = { p, target_cpu };
5520 int curr_cpu = task_cpu(p);
5521
5522 if (curr_cpu == target_cpu)
5523 return 0;
5524
5525 if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
5526 return -EINVAL;
5527
5528
5529
5530 trace_sched_move_numa(p, curr_cpu, target_cpu);
5531 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
5532}
5533
5534
5535
5536
5537
5538void sched_setnuma(struct task_struct *p, int nid)
5539{
5540 bool queued, running;
5541 struct rq_flags rf;
5542 struct rq *rq;
5543
5544 rq = task_rq_lock(p, &rf);
5545 queued = task_on_rq_queued(p);
5546 running = task_current(rq, p);
5547
5548 if (queued)
5549 dequeue_task(rq, p, DEQUEUE_SAVE);
5550 if (running)
5551 put_prev_task(rq, p);
5552
5553 p->numa_preferred_nid = nid;
5554
5555 if (queued)
5556 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
5557 if (running)
5558 set_curr_task(rq, p);
5559 task_rq_unlock(rq, p, &rf);
5560}
5561#endif
5562
5563#ifdef CONFIG_HOTPLUG_CPU
5564
5565
5566
5567
5568void idle_task_exit(void)
5569{
5570 struct mm_struct *mm = current->active_mm;
5571
5572 BUG_ON(cpu_online(smp_processor_id()));
5573
5574 if (mm != &init_mm) {
5575 switch_mm(mm, &init_mm, current);
5576 current->active_mm = &init_mm;
5577 finish_arch_post_lock_switch();
5578 }
5579 mmdrop(mm);
5580}
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591static void calc_load_migrate(struct rq *rq)
5592{
5593 long delta = calc_load_fold_active(rq, 1);
5594 if (delta)
5595 atomic_long_add(delta, &calc_load_tasks);
5596}
5597
5598static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
5599{
5600}
5601
5602static const struct sched_class fake_sched_class = {
5603 .put_prev_task = put_prev_task_fake,
5604};
5605
5606static struct task_struct fake_task = {
5607
5608
5609
5610 .prio = MAX_PRIO + 1,
5611 .sched_class = &fake_sched_class,
5612};
5613
/*
 * Migrate all tasks from the rq, sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we'er in stop_machine() and
 * there's no concurrency possible, we hold the required locks anyway
 * because of lock validation efforts.
 */
static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
5623{
5624 struct rq *rq = dead_rq;
5625 struct task_struct *next, *stop = rq->stop;
5626 struct rq_flags orf = *rf;
5627 int dest_cpu;
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638 rq->stop = NULL;
5639
5640
5641
5642
5643
5644
5645 update_rq_clock(rq);
5646
5647 for (;;) {
5648
5649
5650
5651
5652 if (rq->nr_running == 1)
5653 break;
5654
5655
5656
5657
5658 next = pick_next_task(rq, &fake_task, rf);
5659 BUG_ON(!next);
5660 put_prev_task(rq, next);
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671 rq_unlock(rq, rf);
5672 raw_spin_lock(&next->pi_lock);
5673 rq_relock(rq, rf);
5674
5675
5676
5677
5678
5679
5680 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
5681 raw_spin_unlock(&next->pi_lock);
5682 continue;
5683 }
5684
5685
5686 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
5687 rq = __migrate_task(rq, rf, next, dest_cpu);
5688 if (rq != dead_rq) {
5689 rq_unlock(rq, rf);
5690 rq = dead_rq;
5691 *rf = orf;
5692 rq_relock(rq, rf);
5693 }
5694 raw_spin_unlock(&next->pi_lock);
5695 }
5696
5697 rq->stop = stop;
5698}
5699#endif
5700
5701void set_rq_online(struct rq *rq)
5702{
5703 if (!rq->online) {
5704 const struct sched_class *class;
5705
5706 cpumask_set_cpu(rq->cpu, rq->rd->online);
5707 rq->online = 1;
5708
5709 for_each_class(class) {
5710 if (class->rq_online)
5711 class->rq_online(rq);
5712 }
5713 }
5714}
5715
5716void set_rq_offline(struct rq *rq)
5717{
5718 if (rq->online) {
5719 const struct sched_class *class;
5720
5721 for_each_class(class) {
5722 if (class->rq_offline)
5723 class->rq_offline(rq);
5724 }
5725
5726 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5727 rq->online = 0;
5728 }
5729}
5730
5731
5732
5733
5734static int num_cpus_frozen;
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744static void cpuset_cpu_active(void)
5745{
5746 if (cpuhp_tasks_frozen) {
5747
5748
5749
5750
5751
5752
5753 partition_sched_domains(1, NULL, NULL);
5754 if (--num_cpus_frozen)
5755 return;
5756
5757
5758
5759
5760
5761 cpuset_force_rebuild();
5762 }
5763 cpuset_update_active_cpus();
5764}
5765
5766static int cpuset_cpu_inactive(unsigned int cpu)
5767{
5768 if (!cpuhp_tasks_frozen) {
5769 if (dl_cpu_busy(cpu))
5770 return -EBUSY;
5771 cpuset_update_active_cpus();
5772 } else {
5773 num_cpus_frozen++;
5774 partition_sched_domains(1, NULL, NULL);
5775 }
5776 return 0;
5777}
5778
5779int sched_cpu_activate(unsigned int cpu)
5780{
5781 struct rq *rq = cpu_rq(cpu);
5782 struct rq_flags rf;
5783
5784#ifdef CONFIG_SCHED_SMT
5785
5786
5787
5788 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
5789 static_branch_inc_cpuslocked(&sched_smt_present);
5790#endif
5791 set_cpu_active(cpu, true);
5792
5793 if (sched_smp_initialized) {
5794 sched_domains_numa_masks_set(cpu);
5795 cpuset_cpu_active();
5796 }
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807 rq_lock_irqsave(rq, &rf);
5808 if (rq->rd) {
5809 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5810 set_rq_online(rq);
5811 }
5812 rq_unlock_irqrestore(rq, &rf);
5813
5814 update_max_interval();
5815
5816 return 0;
5817}
5818
5819int sched_cpu_deactivate(unsigned int cpu)
5820{
5821 int ret;
5822
5823 set_cpu_active(cpu, false);
5824
5825
5826
5827
5828
5829
5830
5831 synchronize_rcu();
5832
5833#ifdef CONFIG_SCHED_SMT
5834
5835
5836
5837 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
5838 static_branch_dec_cpuslocked(&sched_smt_present);
5839#endif
5840
5841 if (!sched_smp_initialized)
5842 return 0;
5843
5844 ret = cpuset_cpu_inactive(cpu);
5845 if (ret) {
5846 set_cpu_active(cpu, true);
5847 return ret;
5848 }
5849 sched_domains_numa_masks_clear(cpu);
5850 return 0;
5851}
5852
5853static void sched_rq_cpu_starting(unsigned int cpu)
5854{
5855 struct rq *rq = cpu_rq(cpu);
5856
5857 rq->calc_load_update = calc_load_update;
5858 update_max_interval();
5859}
5860
5861int sched_cpu_starting(unsigned int cpu)
5862{
5863 sched_rq_cpu_starting(cpu);
5864 sched_tick_start(cpu);
5865 return 0;
5866}
5867
5868#ifdef CONFIG_HOTPLUG_CPU
5869int sched_cpu_dying(unsigned int cpu)
5870{
5871 struct rq *rq = cpu_rq(cpu);
5872 struct rq_flags rf;
5873
5874
5875 sched_ttwu_pending();
5876 sched_tick_stop(cpu);
5877
5878 rq_lock_irqsave(rq, &rf);
5879 if (rq->rd) {
5880 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5881 set_rq_offline(rq);
5882 }
5883 migrate_tasks(rq, &rf);
5884 BUG_ON(rq->nr_running != 1);
5885 rq_unlock_irqrestore(rq, &rf);
5886
5887 calc_load_migrate(rq);
5888 update_max_interval();
5889 nohz_balance_exit_idle(rq);
5890 hrtick_clear(rq);
5891 return 0;
5892}
5893#endif
5894
5895void __init sched_init_smp(void)
5896{
5897 sched_init_numa();
5898
5899
5900
5901
5902
5903
5904 mutex_lock(&sched_domains_mutex);
5905 sched_init_domains(cpu_active_mask);
5906 mutex_unlock(&sched_domains_mutex);
5907
5908
5909 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
5910 BUG();
5911 sched_init_granularity();
5912
5913 init_sched_rt_class();
5914 init_sched_dl_class();
5915
5916 sched_smp_initialized = true;
5917}
5918
5919static int __init migration_init(void)
5920{
5921 sched_rq_cpu_starting(smp_processor_id());
5922 return 0;
5923}
5924early_initcall(migration_init);
5925
5926#else
5927void __init sched_init_smp(void)
5928{
5929 sched_init_granularity();
5930}
5931#endif
5932
5933int in_sched_functions(unsigned long addr)
5934{
5935 return in_lock_functions(addr) ||
5936 (addr >= (unsigned long)__sched_text_start
5937 && addr < (unsigned long)__sched_text_end);
5938}
5939
5940#ifdef CONFIG_CGROUP_SCHED
5941
5942
5943
5944
5945struct task_group root_task_group;
5946LIST_HEAD(task_groups);
5947
5948
5949static struct kmem_cache *task_group_cache __read_mostly;
5950#endif
5951
5952DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
5953DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
5954
5955void __init sched_init(void)
5956{
5957 int i, j;
5958 unsigned long alloc_size = 0, ptr;
5959
5960 wait_bit_init();
5961
5962#ifdef CONFIG_FAIR_GROUP_SCHED
5963 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
5964#endif
5965#ifdef CONFIG_RT_GROUP_SCHED
5966 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
5967#endif
5968 if (alloc_size) {
5969 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
5970
5971#ifdef CONFIG_FAIR_GROUP_SCHED
5972 root_task_group.se = (struct sched_entity **)ptr;
5973 ptr += nr_cpu_ids * sizeof(void **);
5974
5975 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
5976 ptr += nr_cpu_ids * sizeof(void **);
5977
5978#endif
5979#ifdef CONFIG_RT_GROUP_SCHED
5980 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
5981 ptr += nr_cpu_ids * sizeof(void **);
5982
5983 root_task_group.rt_rq = (struct rt_rq **)ptr;
5984 ptr += nr_cpu_ids * sizeof(void **);
5985
5986#endif
5987 }
5988#ifdef CONFIG_CPUMASK_OFFSTACK
5989 for_each_possible_cpu(i) {
5990 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
5991 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
5992 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
5993 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
5994 }
5995#endif
5996
5997 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
5998 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
5999
6000#ifdef CONFIG_SMP
6001 init_defrootdomain();
6002#endif
6003
6004#ifdef CONFIG_RT_GROUP_SCHED
6005 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6006 global_rt_period(), global_rt_runtime());
6007#endif
6008
6009#ifdef CONFIG_CGROUP_SCHED
6010 task_group_cache = KMEM_CACHE(task_group, 0);
6011
6012 list_add(&root_task_group.list, &task_groups);
6013 INIT_LIST_HEAD(&root_task_group.children);
6014 INIT_LIST_HEAD(&root_task_group.siblings);
6015 autogroup_init(&init_task);
6016#endif
6017
6018 for_each_possible_cpu(i) {
6019 struct rq *rq;
6020
6021 rq = cpu_rq(i);
6022 raw_spin_lock_init(&rq->lock);
6023 rq->nr_running = 0;
6024 rq->calc_load_active = 0;
6025 rq->calc_load_update = jiffies + LOAD_FREQ;
6026 init_cfs_rq(&rq->cfs);
6027 init_rt_rq(&rq->rt);
6028 init_dl_rq(&rq->dl);
6029#ifdef CONFIG_FAIR_GROUP_SCHED
6030 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6031 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6032 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6053 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6054#endif
6055
6056 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6057#ifdef CONFIG_RT_GROUP_SCHED
6058 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6059#endif
6060
6061 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6062 rq->cpu_load[j] = 0;
6063
6064#ifdef CONFIG_SMP
6065 rq->sd = NULL;
6066 rq->rd = NULL;
6067 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
6068 rq->balance_callback = NULL;
6069 rq->active_balance = 0;
6070 rq->next_balance = jiffies;
6071 rq->push_cpu = 0;
6072 rq->cpu = i;
6073 rq->online = 0;
6074 rq->idle_stamp = 0;
6075 rq->avg_idle = 2*sysctl_sched_migration_cost;
6076 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6077
6078 INIT_LIST_HEAD(&rq->cfs_tasks);
6079
6080 rq_attach_root(rq, &def_root_domain);
6081#ifdef CONFIG_NO_HZ_COMMON
6082 rq->last_load_update_tick = jiffies;
6083 rq->last_blocked_load_update_tick = jiffies;
6084 atomic_set(&rq->nohz_flags, 0);
6085#endif
6086#endif
6087 hrtick_rq_init(rq);
6088 atomic_set(&rq->nr_iowait, 0);
6089 }
6090
6091 set_load_weight(&init_task, false);
6092
6093
6094
6095
6096 mmgrab(&init_mm);
6097 enter_lazy_tlb(&init_mm, current);
6098
6099
6100
6101
6102
6103
6104
6105 init_idle(current, smp_processor_id());
6106
6107 calc_load_update = jiffies + LOAD_FREQ;
6108
6109#ifdef CONFIG_SMP
6110 idle_thread_set_boot_cpu();
6111#endif
6112 init_sched_fair_class();
6113
6114 init_schedstats();
6115
6116 psi_init();
6117
6118 scheduler_running = 1;
6119}
6120
6121#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6122static inline int preempt_count_equals(int preempt_offset)
6123{
6124 int nested = preempt_count() + rcu_preempt_depth();
6125
6126 return (nested == preempt_offset);
6127}
6128
6129void __might_sleep(const char *file, int line, int preempt_offset)
6130{
6131
6132
6133
6134
6135
6136 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
6137 "do not call blocking ops when !TASK_RUNNING; "
6138 "state=%lx set at [<%p>] %pS\n",
6139 current->state,
6140 (void *)current->task_state_change,
6141 (void *)current->task_state_change);
6142
6143 ___might_sleep(file, line, preempt_offset);
6144}
6145EXPORT_SYMBOL(__might_sleep);
6146
6147void ___might_sleep(const char *file, int line, int preempt_offset)
6148{
6149
6150 static unsigned long prev_jiffy;
6151
6152 unsigned long preempt_disable_ip;
6153
6154
6155 rcu_sleep_check();
6156
6157 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6158 !is_idle_task(current)) ||
6159 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
6160 oops_in_progress)
6161 return;
6162
6163 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6164 return;
6165 prev_jiffy = jiffies;
6166
6167
6168 preempt_disable_ip = get_preempt_disable_ip(current);
6169
6170 printk(KERN_ERR
6171 "BUG: sleeping function called from invalid context at %s:%d\n",
6172 file, line);
6173 printk(KERN_ERR
6174 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6175 in_atomic(), irqs_disabled(),
6176 current->pid, current->comm);
6177
6178 if (task_stack_end_corrupted(current))
6179 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
6180
6181 debug_show_held_locks(current);
6182 if (irqs_disabled())
6183 print_irqtrace_events(current);
6184 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
6185 && !preempt_count_equals(preempt_offset)) {
6186 pr_err("Preemption disabled at:");
6187 print_ip_sym(preempt_disable_ip);
6188 pr_cont("\n");
6189 }
6190 dump_stack();
6191 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
6192}
6193EXPORT_SYMBOL(___might_sleep);
6194
6195void __cant_sleep(const char *file, int line, int preempt_offset)
6196{
6197 static unsigned long prev_jiffy;
6198
6199 if (irqs_disabled())
6200 return;
6201
6202 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
6203 return;
6204
6205 if (preempt_count() > preempt_offset)
6206 return;
6207
6208 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6209 return;
6210 prev_jiffy = jiffies;
6211
6212 printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
6213 printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6214 in_atomic(), irqs_disabled(),
6215 current->pid, current->comm);
6216
6217 debug_show_held_locks(current);
6218 dump_stack();
6219 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
6220}
6221EXPORT_SYMBOL_GPL(__cant_sleep);
6222#endif
6223
6224#ifdef CONFIG_MAGIC_SYSRQ
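/*
 * Used by the SysRq handler: push user RT/deadline tasks back to
 * SCHED_NORMAL and renice negatively-niced regular tasks to 0.
 */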
6225void normalize_rt_tasks(void)
6226{
6227 struct task_struct *g, *p;
6228 struct sched_attr attr = {
6229 .sched_policy = SCHED_NORMAL,
6230 };
6231
6232 read_lock(&tasklist_lock);
6233 for_each_process_thread(g, p) {
		/*
		 * Only normalize user tasks:
		 */
6237 if (p->flags & PF_KTHREAD)
6238 continue;
6239
6240 p->se.exec_start = 0;
6241 schedstat_set(p->se.statistics.wait_start, 0);
6242 schedstat_set(p->se.statistics.sleep_start, 0);
6243 schedstat_set(p->se.statistics.block_start, 0);
6244
6245 if (!dl_task(p) && !rt_task(p)) {
			/*
			 * Renice negative nice level userspace
			 * tasks back to 0:
			 */
6250 if (task_nice(p) < 0)
6251 set_user_nice(p, 0);
6252 continue;
6253 }
6254
6255 __sched_setscheduler(p, &attr, false, false);
6256 }
6257 read_unlock(&tasklist_lock);
6258}
6259
6260#endif
6261
6262#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for the IA64 MCA handling, or kdb.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given CPU.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The current task for @cpu.
 */
6281struct task_struct *curr_task(int cpu)
6282{
6283 return cpu_curr(cpu);
6284}
6285
6286#endif
6287
6288#ifdef CONFIG_IA64
/**
 * ia64_set_curr_task - set the current task for a given CPU.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a CPU in a non-blocking manner. This function
 * must be called with all CPUs synchronized and interrupts disabled; the
 * caller must save the original value of the current task (see curr_task()
 * above) and restore that value before re-enabling interrupts and restarting
 * the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
6304void ia64_set_curr_task(int cpu, struct task_struct *p)
6305{
6306 cpu_curr(cpu) = p;
6307}
6308
6309#endif
6310
6311#ifdef CONFIG_CGROUP_SCHED
6312
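/* task_group_lock serializes the addition/removal of task groups: */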
6313static DEFINE_SPINLOCK(task_group_lock);
6314
6315static void sched_free_group(struct task_group *tg)
6316{
6317 free_fair_sched_group(tg);
6318 free_rt_sched_group(tg);
6319 autogroup_free(tg);
6320 kmem_cache_free(task_group_cache, tg);
6321}

/* Allocate runqueue etc for a new task group */
6324struct task_group *sched_create_group(struct task_group *parent)
6325{
6326 struct task_group *tg;
6327
6328 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
6329 if (!tg)
6330 return ERR_PTR(-ENOMEM);
6331
6332 if (!alloc_fair_sched_group(tg, parent))
6333 goto err;
6334
6335 if (!alloc_rt_sched_group(tg, parent))
6336 goto err;
6337
6338 return tg;
6339
6340err:
6341 sched_free_group(tg);
6342 return ERR_PTR(-ENOMEM);
6343}
6344
6345void sched_online_group(struct task_group *tg, struct task_group *parent)
6346{
6347 unsigned long flags;
6348
6349 spin_lock_irqsave(&task_group_lock, flags);
6350 list_add_rcu(&tg->list, &task_groups);
6351

	/* Root should already exist: */
6353 WARN_ON(!parent);
6354
6355 tg->parent = parent;
6356 INIT_LIST_HEAD(&tg->children);
6357 list_add_rcu(&tg->siblings, &parent->children);
6358 spin_unlock_irqrestore(&task_group_lock, flags);
6359
6360 online_fair_sched_group(tg);
6361}
6362
/* RCU callback to free various structures associated with a task group */
6364static void sched_free_group_rcu(struct rcu_head *rhp)
6365{
	/* Now it should be safe to free those cfs_rqs: */
6367 sched_free_group(container_of(rhp, struct task_group, rcu));
6368}
6369
6370void sched_destroy_group(struct task_group *tg)
6371{
	/* Wait for possible concurrent references to cfs_rqs to complete: */
6373 call_rcu(&tg->rcu, sched_free_group_rcu);
6374}
6375
6376void sched_offline_group(struct task_group *tg)
6377{
6378 unsigned long flags;
6379
	/* End participation in shares distribution: */
6381 unregister_fair_sched_group(tg);
6382
6383 spin_lock_irqsave(&task_group_lock, flags);
6384 list_del_rcu(&tg->list);
6385 list_del_rcu(&tg->siblings);
6386 spin_unlock_irqrestore(&task_group_lock, flags);
6387}
6388
6389static void sched_change_group(struct task_struct *tsk, int type)
6390{
6391 struct task_group *tg;
6392
	/*
	 * All callers are synchronized by task_rq_lock(); we do not use RCU
	 * which is pointless here. Thus, we pass "true" to task_css_check()
	 * to prevent lockdep warnings.
	 */
6398 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
6399 struct task_group, css);
6400 tg = autogroup_task_group(tsk, tg);
6401 tsk->sched_task_group = tg;
6402
6403#ifdef CONFIG_FAIR_GROUP_SCHED
6404 if (tsk->sched_class->task_change_group)
6405 tsk->sched_class->task_change_group(tsk, type);
6406 else
6407#endif
6408 set_task_rq(tsk, task_cpu(tsk));
6409}
6410
/*
 * Change task's runqueue when it moves between groups.
 *
 * The caller of this function should have put the task in its new group by
 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
 * reflect its new group.
 */
6418void sched_move_task(struct task_struct *tsk)
6419{
6420 int queued, running, queue_flags =
6421 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6422 struct rq_flags rf;
6423 struct rq *rq;
6424
6425 rq = task_rq_lock(tsk, &rf);
6426 update_rq_clock(rq);
6427
6428 running = task_current(rq, tsk);
6429 queued = task_on_rq_queued(tsk);
6430
6431 if (queued)
6432 dequeue_task(rq, tsk, queue_flags);
6433 if (running)
6434 put_prev_task(rq, tsk);
6435
6436 sched_change_group(tsk, TASK_MOVE_GROUP);
6437
6438 if (queued)
6439 enqueue_task(rq, tsk, queue_flags);
6440 if (running)
6441 set_curr_task(rq, tsk);
6442
6443 task_rq_unlock(rq, tsk, &rf);
6444}
6445
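/* Map a cgroup_subsys_state back to its containing task_group (or NULL). */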
6446static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
6447{
6448 return css ? container_of(css, struct task_group, css) : NULL;
6449}
6450
6451static struct cgroup_subsys_state *
6452cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6453{
6454 struct task_group *parent = css_tg(parent_css);
6455 struct task_group *tg;
6456
6457 if (!parent) {
		/* This is early initialization for the top cgroup */
6459 return &root_task_group.css;
6460 }
6461
6462 tg = sched_create_group(parent);
6463 if (IS_ERR(tg))
6464 return ERR_PTR(-ENOMEM);
6465
6466 return &tg->css;
6467}
6468

/* Expose the task group only after cgroup initialization has completed: */
6470static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
6471{
6472 struct task_group *tg = css_tg(css);
6473 struct task_group *parent = css_tg(css->parent);
6474
6475 if (parent)
6476 sched_online_group(tg, parent);
6477 return 0;
6478}
6479
6480static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
6481{
6482 struct task_group *tg = css_tg(css);
6483
6484 sched_offline_group(tg);
6485}
6486
6487static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
6488{
6489 struct task_group *tg = css_tg(css);
6490
	/*
	 * Relies on the RCU grace period between css_released() and this.
	 */
6494 sched_free_group(tg);
6495}
6496
/*
 * This is called before wake_up_new_task(), therefore we really only
 * have to set its group bits, all the other stuff does not apply.
 */
6501static void cpu_cgroup_fork(struct task_struct *task)
6502{
6503 struct rq_flags rf;
6504 struct rq *rq;
6505
6506 rq = task_rq_lock(task, &rf);
6507
6508 update_rq_clock(rq);
6509 sched_change_group(task, TASK_SET_GROUP);
6510
6511 task_rq_unlock(rq, task, &rf);
6512}
6513
6514static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
6515{
6516 struct task_struct *task;
6517 struct cgroup_subsys_state *css;
6518 int ret = 0;
6519
6520 cgroup_taskset_for_each(task, css, tset) {
6521#ifdef CONFIG_RT_GROUP_SCHED
6522 if (!sched_rt_can_attach(css_tg(css), task))
6523 return -EINVAL;
6524#else
		/* We don't support RT-tasks being in separate groups */
6526 if (task->sched_class != &fair_sched_class)
6527 return -EINVAL;
6528#endif
6529
		/*
		 * Serialize against wake_up_new_task() such that if it is
		 * running, we're sure to observe its full state.
		 */
6533 raw_spin_lock_irq(&task->pi_lock);
6534
		/*
		 * Avoid calling sched_move_task() before wake_up_new_task()
		 * has happened. This would lead to problems with PELT, due to
		 * move wanting to detach+attach while we're not attached yet.
		 */
6539 if (task->state == TASK_NEW)
6540 ret = -EINVAL;
6541 raw_spin_unlock_irq(&task->pi_lock);
6542
6543 if (ret)
6544 break;
6545 }
6546 return ret;
6547}
6548
6549static void cpu_cgroup_attach(struct cgroup_taskset *tset)
6550{
6551 struct task_struct *task;
6552 struct cgroup_subsys_state *css;
6553
6554 cgroup_taskset_for_each(task, css, tset)
6555 sched_move_task(task);
6556}
6557
6558#ifdef CONFIG_FAIR_GROUP_SCHED
6559static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
6560 struct cftype *cftype, u64 shareval)
6561{
6562 return sched_group_set_shares(css_tg(css), scale_load(shareval));
6563}
6564
6565static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
6566 struct cftype *cft)
6567{
6568 struct task_group *tg = css_tg(css);
6569
6570 return (u64) scale_load_down(tg->shares);
6571}
6572
6573#ifdef CONFIG_CFS_BANDWIDTH
6574static DEFINE_MUTEX(cfs_constraints_mutex);
6575
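/* Allowed CFS bandwidth period range: 1 millisecond to 1 second. */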
6576const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
6577const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
6578
6579static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
6580
6581static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
6582{
6583 int i, ret = 0, runtime_enabled, runtime_was_enabled;
6584 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6585
6586 if (tg == &root_task_group)
6587 return -EINVAL;
6588
	/*
	 * Ensure we have some amount of bandwidth every period. This is to
	 * prevent reaching a state of large arrears when throttled via
	 * entity_tick() resulting in prolonged exit starvation.
	 */
6594 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
6595 return -EINVAL;

	/*
	 * Likewise, bound things on the other side by preventing insane quota
	 * periods. This also allows us to normalize in computing quota
	 * feasibility.
	 */
6602 if (period > max_cfs_quota_period)
6603 return -EINVAL;
6604
	/*
	 * Prevent race between setting of cfs_rq->runtime_enabled and
	 * unthrottle_offline_cfs_rqs().
	 */
6609 get_online_cpus();
6610 mutex_lock(&cfs_constraints_mutex);
6611 ret = __cfs_schedulable(tg, period, quota);
6612 if (ret)
6613 goto out_unlock;
6614
6615 runtime_enabled = quota != RUNTIME_INF;
6616 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
6617
	/*
	 * If we need to toggle cfs_bandwidth_used, off->on must occur
	 * before making related changes, and on->off must occur afterwards.
	 */
6621 if (runtime_enabled && !runtime_was_enabled)
6622 cfs_bandwidth_usage_inc();
6623 raw_spin_lock_irq(&cfs_b->lock);
6624 cfs_b->period = ns_to_ktime(period);
6625 cfs_b->quota = quota;
6626
6627 __refill_cfs_bandwidth_runtime(cfs_b);

	/* Restart the period timer (if active) to handle new period expiry: */
6630 if (runtime_enabled)
6631 start_cfs_bandwidth(cfs_b);
6632
6633 raw_spin_unlock_irq(&cfs_b->lock);
6634
6635 for_each_online_cpu(i) {
6636 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
6637 struct rq *rq = cfs_rq->rq;
6638 struct rq_flags rf;
6639
6640 rq_lock_irq(rq, &rf);
6641 cfs_rq->runtime_enabled = runtime_enabled;
6642 cfs_rq->runtime_remaining = 0;
6643
6644 if (cfs_rq->throttled)
6645 unthrottle_cfs_rq(cfs_rq);
6646 rq_unlock_irq(rq, &rf);
6647 }
6648 if (runtime_was_enabled && !runtime_enabled)
6649 cfs_bandwidth_usage_dec();
6650out_unlock:
6651 mutex_unlock(&cfs_constraints_mutex);
6652 put_online_cpus();
6653
6654 return ret;
6655}
6656
6657int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
6658{
6659 u64 quota, period;
6660
6661 period = ktime_to_ns(tg->cfs_bandwidth.period);
6662 if (cfs_quota_us < 0)
6663 quota = RUNTIME_INF;
6664 else
6665 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
6666
6667 return tg_set_cfs_bandwidth(tg, period, quota);
6668}
6669
6670long tg_get_cfs_quota(struct task_group *tg)
6671{
6672 u64 quota_us;
6673
6674 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
6675 return -1;
6676
6677 quota_us = tg->cfs_bandwidth.quota;
6678 do_div(quota_us, NSEC_PER_USEC);
6679
6680 return quota_us;
6681}
6682
6683int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
6684{
6685 u64 quota, period;
6686
6687 period = (u64)cfs_period_us * NSEC_PER_USEC;
6688 quota = tg->cfs_bandwidth.quota;
6689
6690 return tg_set_cfs_bandwidth(tg, period, quota);
6691}
6692
6693long tg_get_cfs_period(struct task_group *tg)
6694{
6695 u64 cfs_period_us;
6696
6697 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
6698 do_div(cfs_period_us, NSEC_PER_USEC);
6699
6700 return cfs_period_us;
6701}
6702
6703static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
6704 struct cftype *cft)
6705{
6706 return tg_get_cfs_quota(css_tg(css));
6707}
6708
6709static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
6710 struct cftype *cftype, s64 cfs_quota_us)
6711{
6712 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
6713}
6714
6715static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
6716 struct cftype *cft)
6717{
6718 return tg_get_cfs_period(css_tg(css));
6719}
6720
6721static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
6722 struct cftype *cftype, u64 cfs_period_us)
6723{
6724 return tg_set_cfs_period(css_tg(css), cfs_period_us);
6725}
6726
6727struct cfs_schedulable_data {
6728 struct task_group *tg;
6729 u64 period, quota;
6730};
6731
/*
 * Normalize a group's quota/period pair into an absolute utilization ratio
 * so that it can be compared against the parent's hierarchical quota.
 */
6736static u64 normalize_cfs_quota(struct task_group *tg,
6737 struct cfs_schedulable_data *d)
6738{
6739 u64 quota, period;
6740
6741 if (tg == d->tg) {
6742 period = d->period;
6743 quota = d->quota;
6744 } else {
6745 period = tg_get_cfs_period(tg);
6746 quota = tg_get_cfs_quota(tg);
6747 }
6748
	/* Both RUNTIME_INF and the cgroup-visible -1 mean "no limit": */
6750 if (quota == RUNTIME_INF || quota == -1)
6751 return RUNTIME_INF;
6752
6753 return to_ratio(period, quota);
6754}
6755
6756static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
6757{
6758 struct cfs_schedulable_data *d = data;
6759 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6760 s64 quota = 0, parent_quota = -1;
6761
6762 if (!tg->parent) {
6763 quota = RUNTIME_INF;
6764 } else {
6765 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
6766
6767 quota = normalize_cfs_quota(tg, d);
6768 parent_quota = parent_b->hierarchical_quota;
6769
		/*
		 * Ensure max(child_quota) <= parent_quota. On cgroup2,
		 * always take the min. On cgroup1, only inherit when no
		 * limit is set:
		 */
6775 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
6776 quota = min(quota, parent_quota);
6777 } else {
6778 if (quota == RUNTIME_INF)
6779 quota = parent_quota;
6780 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
6781 return -EINVAL;
6782 }
6783 }
6784 cfs_b->hierarchical_quota = quota;
6785
6786 return 0;
6787}
6788
6789static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
6790{
6791 int ret;
6792 struct cfs_schedulable_data data = {
6793 .tg = tg,
6794 .period = period,
6795 .quota = quota,
6796 };
6797
6798 if (quota != RUNTIME_INF) {
6799 do_div(data.period, NSEC_PER_USEC);
6800 do_div(data.quota, NSEC_PER_USEC);
6801 }
6802
6803 rcu_read_lock();
6804 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
6805 rcu_read_unlock();
6806
6807 return ret;
6808}
6809
6810static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
6811{
6812 struct task_group *tg = css_tg(seq_css(sf));
6813 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6814
6815 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
6816 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
6817 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
6818
6819 if (schedstat_enabled() && tg != &root_task_group) {
6820 u64 ws = 0;
6821 int i;
6822
6823 for_each_possible_cpu(i)
6824 ws += schedstat_val(tg->se[i]->statistics.wait_sum);
6825
6826 seq_printf(sf, "wait_sum %llu\n", ws);
6827 }
6828
6829 return 0;
6830}
6831#endif
6832#endif
6833
6834#ifdef CONFIG_RT_GROUP_SCHED
6835static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
6836 struct cftype *cft, s64 val)
6837{
6838 return sched_group_set_rt_runtime(css_tg(css), val);
6839}
6840
6841static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
6842 struct cftype *cft)
6843{
6844 return sched_group_rt_runtime(css_tg(css));
6845}
6846
6847static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
6848 struct cftype *cftype, u64 rt_period_us)
6849{
6850 return sched_group_set_rt_period(css_tg(css), rt_period_us);
6851}
6852
6853static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
6854 struct cftype *cft)
6855{
6856 return sched_group_rt_period(css_tg(css));
6857}
6858#endif
6859
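/* Interface files exposed on the cgroup v1 (legacy) hierarchy: */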
6860static struct cftype cpu_legacy_files[] = {
6861#ifdef CONFIG_FAIR_GROUP_SCHED
6862 {
6863 .name = "shares",
6864 .read_u64 = cpu_shares_read_u64,
6865 .write_u64 = cpu_shares_write_u64,
6866 },
6867#endif
6868#ifdef CONFIG_CFS_BANDWIDTH
6869 {
6870 .name = "cfs_quota_us",
6871 .read_s64 = cpu_cfs_quota_read_s64,
6872 .write_s64 = cpu_cfs_quota_write_s64,
6873 },
6874 {
6875 .name = "cfs_period_us",
6876 .read_u64 = cpu_cfs_period_read_u64,
6877 .write_u64 = cpu_cfs_period_write_u64,
6878 },
6879 {
6880 .name = "stat",
6881 .seq_show = cpu_cfs_stat_show,
6882 },
6883#endif
6884#ifdef CONFIG_RT_GROUP_SCHED
6885 {
6886 .name = "rt_runtime_us",
6887 .read_s64 = cpu_rt_runtime_read,
6888 .write_s64 = cpu_rt_runtime_write,
6889 },
6890 {
6891 .name = "rt_period_us",
6892 .read_u64 = cpu_rt_period_read_uint,
6893 .write_u64 = cpu_rt_period_write_uint,
6894 },
6895#endif
6896 { }
6897};
6898
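/* Extra statistics appended to the cgroup v2 "cpu.stat" file: */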
6899static int cpu_extra_stat_show(struct seq_file *sf,
6900 struct cgroup_subsys_state *css)
6901{
6902#ifdef CONFIG_CFS_BANDWIDTH
6903 {
6904 struct task_group *tg = css_tg(css);
6905 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6906 u64 throttled_usec;
6907
6908 throttled_usec = cfs_b->throttled_time;
6909 do_div(throttled_usec, NSEC_PER_USEC);
6910
6911 seq_printf(sf, "nr_periods %d\n"
6912 "nr_throttled %d\n"
6913 "throttled_usec %llu\n",
6914 cfs_b->nr_periods, cfs_b->nr_throttled,
6915 throttled_usec);
6916 }
6917#endif
6918 return 0;
6919}
6920
6921#ifdef CONFIG_FAIR_GROUP_SCHED
6922static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
6923 struct cftype *cft)
6924{
6925 struct task_group *tg = css_tg(css);
6926 u64 weight = scale_load_down(tg->shares);
6927
6928 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
6929}
6930
6931static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
6932 struct cftype *cft, u64 weight)
6933{
	/*
	 * cgroup weight knobs should use the common MIN, DFL and MAX
	 * values which are 1, 100 and 10000 respectively. While it loses
	 * a bit of range on both ends, it maps pretty well onto the shares
	 * value used by the scheduler and the round-trip conversions preserve
	 * the original value over the entire range.
	 */
6941 if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
6942 return -ERANGE;
6943
6944 weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
6945
6946 return sched_group_set_shares(css_tg(css), scale_load(weight));
6947}
6948
6949static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
6950 struct cftype *cft)
6951{
6952 unsigned long weight = scale_load_down(css_tg(css)->shares);
6953 int last_delta = INT_MAX;
6954 int prio, delta;
6955
	/* Find the closest nice value to the current weight: */
6957 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
6958 delta = abs(sched_prio_to_weight[prio] - weight);
6959 if (delta >= last_delta)
6960 break;
6961 last_delta = delta;
6962 }
6963
6964 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
6965}
6966
6967static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
6968 struct cftype *cft, s64 nice)
6969{
6970 unsigned long weight;
6971 int idx;
6972
6973 if (nice < MIN_NICE || nice > MAX_NICE)
6974 return -ERANGE;
6975
6976 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
6977 idx = array_index_nospec(idx, 40);
6978 weight = sched_prio_to_weight[idx];
6979
6980 return sched_group_set_shares(css_tg(css), scale_load(weight));
6981}
6982#endif
6983
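/* Print "$QUOTA $PERIOD", with "max" standing in for an unlimited quota. */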
6984static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
6985 long period, long quota)
6986{
6987 if (quota < 0)
6988 seq_puts(sf, "max");
6989 else
6990 seq_printf(sf, "%ld", quota);
6991
6992 seq_printf(sf, " %ld\n", period);
6993}
6994
6995
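/*
 * Parse "$QUOTA [$PERIOD]" in microseconds, where $QUOTA may be "max";
 * the results are returned in nanoseconds (RUNTIME_INF for "max").
 */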
6996static int __maybe_unused cpu_period_quota_parse(char *buf,
6997 u64 *periodp, u64 *quotap)
6998{
6999 char tok[21];
7000
7001 if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
7002 return -EINVAL;
7003
7004 *periodp *= NSEC_PER_USEC;
7005
7006 if (sscanf(tok, "%llu", quotap))
7007 *quotap *= NSEC_PER_USEC;
7008 else if (!strcmp(tok, "max"))
7009 *quotap = RUNTIME_INF;
7010 else
7011 return -EINVAL;
7012
7013 return 0;
7014}
7015
7016#ifdef CONFIG_CFS_BANDWIDTH
7017static int cpu_max_show(struct seq_file *sf, void *v)
7018{
7019 struct task_group *tg = css_tg(seq_css(sf));
7020
7021 cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
7022 return 0;
7023}
7024
7025static ssize_t cpu_max_write(struct kernfs_open_file *of,
7026 char *buf, size_t nbytes, loff_t off)
7027{
7028 struct task_group *tg = css_tg(of_css(of));
7029 u64 period = tg_get_cfs_period(tg);
7030 u64 quota;
7031 int ret;
7032
	ret = cpu_period_quota_parse(buf, &period, &quota);
7034 if (!ret)
7035 ret = tg_set_cfs_bandwidth(tg, period, quota);
7036 return ret ?: nbytes;
7037}
7038#endif
7039
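/* Interface files exposed on the cgroup v2 (default) hierarchy: */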
7040static struct cftype cpu_files[] = {
7041#ifdef CONFIG_FAIR_GROUP_SCHED
7042 {
7043 .name = "weight",
7044 .flags = CFTYPE_NOT_ON_ROOT,
7045 .read_u64 = cpu_weight_read_u64,
7046 .write_u64 = cpu_weight_write_u64,
7047 },
7048 {
7049 .name = "weight.nice",
7050 .flags = CFTYPE_NOT_ON_ROOT,
7051 .read_s64 = cpu_weight_nice_read_s64,
7052 .write_s64 = cpu_weight_nice_write_s64,
7053 },
7054#endif
7055#ifdef CONFIG_CFS_BANDWIDTH
7056 {
7057 .name = "max",
7058 .flags = CFTYPE_NOT_ON_ROOT,
7059 .seq_show = cpu_max_show,
7060 .write = cpu_max_write,
7061 },
7062#endif
7063 { }
7064};
7065
7066struct cgroup_subsys cpu_cgrp_subsys = {
7067 .css_alloc = cpu_cgroup_css_alloc,
7068 .css_online = cpu_cgroup_css_online,
7069 .css_released = cpu_cgroup_css_released,
7070 .css_free = cpu_cgroup_css_free,
7071 .css_extra_stat_show = cpu_extra_stat_show,
7072 .fork = cpu_cgroup_fork,
7073 .can_attach = cpu_cgroup_can_attach,
7074 .attach = cpu_cgroup_attach,
7075 .legacy_cftypes = cpu_legacy_files,
7076 .dfl_cftypes = cpu_files,
7077 .early_init = true,
7078 .threaded = true,
7079};
7080
7081#endif
7082
7083void dump_cpu_task(int cpu)
7084{
7085 pr_info("Task dump for CPU %d:\n", cpu);
7086 sched_show_task(cpu_curr(cpu));
7087}
7088
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
7101const int sched_prio_to_weight[40] = {
7102 88761, 71755, 56483, 46273, 36291,
7103 29154, 23254, 18705, 14949, 11916,
7104 9548, 7620, 6100, 4904, 3906,
7105 3121, 2501, 1991, 1586, 1277,
7106 1024, 820, 655, 526, 423,
7107 335, 272, 215, 172, 137,
7108 110, 87, 70, 56, 45,
7109 36, 29, 23, 18, 15,
7110};
7111
/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
7119const u32 sched_prio_to_wmult[40] = {
7120 48388, 59856, 76040, 92818, 118348,
7121 147320, 184698, 229616, 287308, 360437,
7122 449829, 563644, 704093, 875809, 1099582,
7123 1376151, 1717300, 2157191, 2708050, 3363326,
7124 4194304, 5237765, 6557202, 8165337, 10153587,
7125 12820798, 15790321, 19976592, 24970740, 31350126,
7126 39045157, 49367440, 61356676, 76695844, 95443717,
7127 119304647, 148102320, 186737708, 238609294, 286331153,
7128};
7129
7130#undef CREATE_TRACE_POINTS
7131