1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140static int psi_bug __read_mostly;
141
142DEFINE_STATIC_KEY_FALSE(psi_disabled);
143DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
144
145#ifdef CONFIG_PSI_DEFAULT_DISABLED
146static bool psi_enable;
147#else
148static bool psi_enable = true;
149#endif
150static int __init setup_psi(char *str)
151{
152 return kstrtobool(str, &psi_enable) == 0;
153}
154__setup("psi=", setup_psi);
155
156
/*
 * Running averages: PSI_FREQ is the aggregation interval in jiffies; the
 * EXP_* values are the decay factors handed to calc_load()/calc_load_n()
 * for the avg10, avg60 and avg300 outputs respectively.
 */
#define PSI_FREQ	(2 * HZ + 1)
#define EXP_10s		1677
#define EXP_60s		1981
#define EXP_300s	2034

/*
 * Trigger windows: accepted window size range in microseconds, and the
 * number of polling updates performed per window.
 */
#define WINDOW_MIN_US		500000
#define WINDOW_MAX_US		10000000
#define UPDATES_PER_WINDOW	10
166
167
168static u64 psi_period __read_mostly;
169
170
171static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
172struct psi_group psi_system = {
173 .pcpu = &system_group_pcpu,
174};
175
/* Forward declarations: both callbacks are registered by group_init(). */
static void poll_timer_fn(struct timer_list *t);
static void psi_avgs_work(struct work_struct *work);
179
180static void group_init(struct psi_group *group)
181{
182 int cpu;
183
184 for_each_possible_cpu(cpu)
185 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
186 group->avg_last_update = sched_clock();
187 group->avg_next_update = group->avg_last_update + psi_period;
188 INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
189 mutex_init(&group->avgs_lock);
190
191 mutex_init(&group->trigger_lock);
192 INIT_LIST_HEAD(&group->triggers);
193 memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
194 group->poll_states = 0;
195 group->poll_min_period = U32_MAX;
196 memset(group->polling_total, 0, sizeof(group->polling_total));
197 group->polling_next_update = ULLONG_MAX;
198 group->polling_until = 0;
199 init_waitqueue_head(&group->poll_wait);
200 timer_setup(&group->poll_timer, poll_timer_fn, 0);
201 rcu_assign_pointer(group->poll_task, NULL);
202}
203
204void __init psi_init(void)
205{
206 if (!psi_enable) {
207 static_branch_enable(&psi_disabled);
208 return;
209 }
210
211 if (!cgroup_psi_enabled())
212 static_branch_disable(&psi_cgroups_enabled);
213
214 psi_period = jiffies_to_nsecs(PSI_FREQ);
215 group_init(&psi_system);
216}
217
218static bool test_state(unsigned int *tasks, enum psi_states state)
219{
220 switch (state) {
221 case PSI_IO_SOME:
222 return unlikely(tasks[NR_IOWAIT]);
223 case PSI_IO_FULL:
224 return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
225 case PSI_MEM_SOME:
226 return unlikely(tasks[NR_MEMSTALL]);
227 case PSI_MEM_FULL:
228 return unlikely(tasks[NR_MEMSTALL] &&
229 tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
230 case PSI_CPU_SOME:
231 return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
232 case PSI_CPU_FULL:
233 return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
234 case PSI_NONIDLE:
235 return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
236 tasks[NR_RUNNING];
237 default:
238 return false;
239 }
240}
241
242static void get_recent_times(struct psi_group *group, int cpu,
243 enum psi_aggregators aggregator, u32 *times,
244 u32 *pchanged_states)
245{
246 struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
247 u64 now, state_start;
248 enum psi_states s;
249 unsigned int seq;
250 u32 state_mask;
251
252 *pchanged_states = 0;
253
254
255 do {
256 seq = read_seqcount_begin(&groupc->seq);
257 now = cpu_clock(cpu);
258 memcpy(times, groupc->times, sizeof(groupc->times));
259 state_mask = groupc->state_mask;
260 state_start = groupc->state_start;
261 } while (read_seqcount_retry(&groupc->seq, seq));
262
263
264 for (s = 0; s < NR_PSI_STATES; s++) {
265 u32 delta;
266
267
268
269
270
271
272
273
274
275 if (state_mask & (1 << s))
276 times[s] += now - state_start;
277
278 delta = times[s] - groupc->times_prev[aggregator][s];
279 groupc->times_prev[aggregator][s] = times[s];
280
281 times[s] = delta;
282 if (delta)
283 *pchanged_states |= (1 << s);
284 }
285}
286
287static void calc_avgs(unsigned long avg[3], int missed_periods,
288 u64 time, u64 period)
289{
290 unsigned long pct;
291
292
293 if (missed_periods) {
294 avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
295 avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
296 avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
297 }
298
299
300 pct = div_u64(time * 100, period);
301 pct *= FIXED_1;
302 avg[0] = calc_load(avg[0], EXP_10s, pct);
303 avg[1] = calc_load(avg[1], EXP_60s, pct);
304 avg[2] = calc_load(avg[2], EXP_300s, pct);
305}
306
307static void collect_percpu_times(struct psi_group *group,
308 enum psi_aggregators aggregator,
309 u32 *pchanged_states)
310{
311 u64 deltas[NR_PSI_STATES - 1] = { 0, };
312 unsigned long nonidle_total = 0;
313 u32 changed_states = 0;
314 int cpu;
315 int s;
316
317
318
319
320
321
322
323
324
325 for_each_possible_cpu(cpu) {
326 u32 times[NR_PSI_STATES];
327 u32 nonidle;
328 u32 cpu_changed_states;
329
330 get_recent_times(group, cpu, aggregator, times,
331 &cpu_changed_states);
332 changed_states |= cpu_changed_states;
333
334 nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
335 nonidle_total += nonidle;
336
337 for (s = 0; s < PSI_NONIDLE; s++)
338 deltas[s] += (u64)times[s] * nonidle;
339 }
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354 for (s = 0; s < NR_PSI_STATES - 1; s++)
355 group->total[aggregator][s] +=
356 div_u64(deltas[s], max(nonidle_total, 1UL));
357
358 if (pchanged_states)
359 *pchanged_states = changed_states;
360}
361
362static u64 update_averages(struct psi_group *group, u64 now)
363{
364 unsigned long missed_periods = 0;
365 u64 expires, period;
366 u64 avg_next_update;
367 int s;
368
369
370 expires = group->avg_next_update;
371 if (now - expires >= psi_period)
372 missed_periods = div_u64(now - expires, psi_period);
373
374
375
376
377
378
379
380
381 avg_next_update = expires + ((1 + missed_periods) * psi_period);
382 period = now - (group->avg_last_update + (missed_periods * psi_period));
383 group->avg_last_update = now;
384
385 for (s = 0; s < NR_PSI_STATES - 1; s++) {
386 u32 sample;
387
388 sample = group->total[PSI_AVGS][s] - group->avg_total[s];
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406 if (sample > period)
407 sample = period;
408 group->avg_total[s] += sample;
409 calc_avgs(group->avg[s], missed_periods, sample, period);
410 }
411
412 return avg_next_update;
413}
414
415static void psi_avgs_work(struct work_struct *work)
416{
417 struct delayed_work *dwork;
418 struct psi_group *group;
419 u32 changed_states;
420 bool nonidle;
421 u64 now;
422
423 dwork = to_delayed_work(work);
424 group = container_of(dwork, struct psi_group, avgs_work);
425
426 mutex_lock(&group->avgs_lock);
427
428 now = sched_clock();
429
430 collect_percpu_times(group, PSI_AVGS, &changed_states);
431 nonidle = changed_states & (1 << PSI_NONIDLE);
432
433
434
435
436
437
438
439 if (now >= group->avg_next_update)
440 group->avg_next_update = update_averages(group, now);
441
442 if (nonidle) {
443 schedule_delayed_work(dwork, nsecs_to_jiffies(
444 group->avg_next_update - now) + 1);
445 }
446
447 mutex_unlock(&group->avgs_lock);
448}
449
450
451static void window_reset(struct psi_window *win, u64 now, u64 value,
452 u64 prev_growth)
453{
454 win->start_time = now;
455 win->start_value = value;
456 win->prev_growth = prev_growth;
457}
458
459
460
461
462
463
464
465
466
467
468
469
470static u64 window_update(struct psi_window *win, u64 now, u64 value)
471{
472 u64 elapsed;
473 u64 growth;
474
475 elapsed = now - win->start_time;
476 growth = value - win->start_value;
477
478
479
480
481
482
483
484 if (elapsed > win->size)
485 window_reset(win, now, value, growth);
486 else {
487 u32 remaining;
488
489 remaining = win->size - elapsed;
490 growth += div64_u64(win->prev_growth * remaining, win->size);
491 }
492
493 return growth;
494}
495
496static void init_triggers(struct psi_group *group, u64 now)
497{
498 struct psi_trigger *t;
499
500 list_for_each_entry(t, &group->triggers, node)
501 window_reset(&t->win, now,
502 group->total[PSI_POLL][t->state], 0);
503 memcpy(group->polling_total, group->total[PSI_POLL],
504 sizeof(group->polling_total));
505 group->polling_next_update = now + group->poll_min_period;
506}
507
508static u64 update_triggers(struct psi_group *group, u64 now)
509{
510 struct psi_trigger *t;
511 bool update_total = false;
512 u64 *total = group->total[PSI_POLL];
513
514
515
516
517
518 list_for_each_entry(t, &group->triggers, node) {
519 u64 growth;
520 bool new_stall;
521
522 new_stall = group->polling_total[t->state] != total[t->state];
523
524
525 if (!new_stall && !t->pending_event)
526 continue;
527
528
529
530
531
532
533 if (new_stall) {
534
535
536
537
538
539
540 update_total = true;
541
542
543 growth = window_update(&t->win, now, total[t->state]);
544 if (growth < t->threshold)
545 continue;
546
547 t->pending_event = true;
548 }
549
550 if (now < t->last_event_time + t->win.size)
551 continue;
552
553
554 if (cmpxchg(&t->event, 0, 1) == 0)
555 wake_up_interruptible(&t->event_wait);
556 t->last_event_time = now;
557
558 t->pending_event = false;
559 }
560
561 if (update_total)
562 memcpy(group->polling_total, total,
563 sizeof(group->polling_total));
564
565 return now + group->poll_min_period;
566}
567
568
569static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
570{
571 struct task_struct *task;
572
573
574
575
576
577
578
579 if (timer_pending(&group->poll_timer))
580 return;
581
582 rcu_read_lock();
583
584 task = rcu_dereference(group->poll_task);
585
586
587
588
589 if (likely(task))
590 mod_timer(&group->poll_timer, jiffies + delay);
591
592 rcu_read_unlock();
593}
594
595static void psi_poll_work(struct psi_group *group)
596{
597 u32 changed_states;
598 u64 now;
599
600 mutex_lock(&group->trigger_lock);
601
602 now = sched_clock();
603
604 collect_percpu_times(group, PSI_POLL, &changed_states);
605
606 if (changed_states & group->poll_states) {
607
608 if (now > group->polling_until)
609 init_triggers(group, now);
610
611
612
613
614
615
616 group->polling_until = now +
617 group->poll_min_period * UPDATES_PER_WINDOW;
618 }
619
620 if (now > group->polling_until) {
621 group->polling_next_update = ULLONG_MAX;
622 goto out;
623 }
624
625 if (now >= group->polling_next_update)
626 group->polling_next_update = update_triggers(group, now);
627
628 psi_schedule_poll_work(group,
629 nsecs_to_jiffies(group->polling_next_update - now) + 1);
630
631out:
632 mutex_unlock(&group->trigger_lock);
633}
634
635static int psi_poll_worker(void *data)
636{
637 struct psi_group *group = (struct psi_group *)data;
638
639 sched_set_fifo_low(current);
640
641 while (true) {
642 wait_event_interruptible(group->poll_wait,
643 atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
644 kthread_should_stop());
645 if (kthread_should_stop())
646 break;
647
648 psi_poll_work(group);
649 }
650 return 0;
651}
652
653static void poll_timer_fn(struct timer_list *t)
654{
655 struct psi_group *group = from_timer(group, t, poll_timer);
656
657 atomic_set(&group->poll_wakeup, 1);
658 wake_up_interruptible(&group->poll_wait);
659}
660
661static void record_times(struct psi_group_cpu *groupc, u64 now)
662{
663 u32 delta;
664
665 delta = now - groupc->state_start;
666 groupc->state_start = now;
667
668 if (groupc->state_mask & (1 << PSI_IO_SOME)) {
669 groupc->times[PSI_IO_SOME] += delta;
670 if (groupc->state_mask & (1 << PSI_IO_FULL))
671 groupc->times[PSI_IO_FULL] += delta;
672 }
673
674 if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
675 groupc->times[PSI_MEM_SOME] += delta;
676 if (groupc->state_mask & (1 << PSI_MEM_FULL))
677 groupc->times[PSI_MEM_FULL] += delta;
678 }
679
680 if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
681 groupc->times[PSI_CPU_SOME] += delta;
682 if (groupc->state_mask & (1 << PSI_CPU_FULL))
683 groupc->times[PSI_CPU_FULL] += delta;
684 }
685
686 if (groupc->state_mask & (1 << PSI_NONIDLE))
687 groupc->times[PSI_NONIDLE] += delta;
688}
689
690static void psi_group_change(struct psi_group *group, int cpu,
691 unsigned int clear, unsigned int set, u64 now,
692 bool wake_clock)
693{
694 struct psi_group_cpu *groupc;
695 u32 state_mask = 0;
696 unsigned int t, m;
697 enum psi_states s;
698
699 groupc = per_cpu_ptr(group->pcpu, cpu);
700
701
702
703
704
705
706
707
708
709 write_seqcount_begin(&groupc->seq);
710
711 record_times(groupc, now);
712
713 for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
714 if (!(m & (1 << t)))
715 continue;
716 if (groupc->tasks[t]) {
717 groupc->tasks[t]--;
718 } else if (!psi_bug) {
719 printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
720 cpu, t, groupc->tasks[0],
721 groupc->tasks[1], groupc->tasks[2],
722 groupc->tasks[3], groupc->tasks[4],
723 clear, set);
724 psi_bug = 1;
725 }
726 }
727
728 for (t = 0; set; set &= ~(1 << t), t++)
729 if (set & (1 << t))
730 groupc->tasks[t]++;
731
732
733 for (s = 0; s < NR_PSI_STATES; s++) {
734 if (test_state(groupc->tasks, s))
735 state_mask |= (1 << s);
736 }
737
738
739
740
741
742
743
744
745
746 if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
747 state_mask |= (1 << PSI_MEM_FULL);
748
749 groupc->state_mask = state_mask;
750
751 write_seqcount_end(&groupc->seq);
752
753 if (state_mask & group->poll_states)
754 psi_schedule_poll_work(group, 1);
755
756 if (wake_clock && !delayed_work_pending(&group->avgs_work))
757 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
758}
759
760static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
761{
762 if (*iter == &psi_system)
763 return NULL;
764
765#ifdef CONFIG_CGROUPS
766 if (static_branch_likely(&psi_cgroups_enabled)) {
767 struct cgroup *cgroup = NULL;
768
769 if (!*iter)
770 cgroup = task->cgroups->dfl_cgrp;
771 else
772 cgroup = cgroup_parent(*iter);
773
774 if (cgroup && cgroup_parent(cgroup)) {
775 *iter = cgroup;
776 return cgroup_psi(cgroup);
777 }
778 }
779#endif
780 *iter = &psi_system;
781 return &psi_system;
782}
783
784static void psi_flags_change(struct task_struct *task, int clear, int set)
785{
786 if (((task->psi_flags & set) ||
787 (task->psi_flags & clear) != clear) &&
788 !psi_bug) {
789 printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
790 task->pid, task->comm, task_cpu(task),
791 task->psi_flags, clear, set);
792 psi_bug = 1;
793 }
794
795 task->psi_flags &= ~clear;
796 task->psi_flags |= set;
797}
798
799void psi_task_change(struct task_struct *task, int clear, int set)
800{
801 int cpu = task_cpu(task);
802 struct psi_group *group;
803 bool wake_clock = true;
804 void *iter = NULL;
805 u64 now;
806
807 if (!task->pid)
808 return;
809
810 psi_flags_change(task, clear, set);
811
812 now = cpu_clock(cpu);
813
814
815
816
817
818
819 if (unlikely((clear & TSK_RUNNING) &&
820 (task->flags & PF_WQ_WORKER) &&
821 wq_worker_last_func(task) == psi_avgs_work))
822 wake_clock = false;
823
824 while ((group = iterate_groups(task, &iter)))
825 psi_group_change(group, cpu, clear, set, now, wake_clock);
826}
827
828void psi_task_switch(struct task_struct *prev, struct task_struct *next,
829 bool sleep)
830{
831 struct psi_group *group, *common = NULL;
832 int cpu = task_cpu(prev);
833 void *iter;
834 u64 now = cpu_clock(cpu);
835
836 if (next->pid) {
837 bool identical_state;
838
839 psi_flags_change(next, 0, TSK_ONCPU);
840
841
842
843
844
845
846 identical_state = prev->psi_flags == next->psi_flags;
847 iter = NULL;
848 while ((group = iterate_groups(next, &iter))) {
849 if (identical_state &&
850 per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
851 common = group;
852 break;
853 }
854
855 psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
856 }
857 }
858
859 if (prev->pid) {
860 int clear = TSK_ONCPU, set = 0;
861
862
863
864
865
866
867
868 if (sleep) {
869 clear |= TSK_RUNNING;
870 if (prev->in_memstall)
871 clear |= TSK_MEMSTALL_RUNNING;
872 if (prev->in_iowait)
873 set |= TSK_IOWAIT;
874 }
875
876 psi_flags_change(prev, clear, set);
877
878 iter = NULL;
879 while ((group = iterate_groups(prev, &iter)) && group != common)
880 psi_group_change(group, cpu, clear, set, now, true);
881
882
883
884
885
886 if (sleep) {
887 clear &= ~TSK_ONCPU;
888 for (; group; group = iterate_groups(prev, &iter))
889 psi_group_change(group, cpu, clear, set, now, true);
890 }
891 }
892}
893
894
895
896
897
898
899
900
901void psi_memstall_enter(unsigned long *flags)
902{
903 struct rq_flags rf;
904 struct rq *rq;
905
906 if (static_branch_likely(&psi_disabled))
907 return;
908
909 *flags = current->in_memstall;
910 if (*flags)
911 return;
912
913
914
915
916
917 rq = this_rq_lock_irq(&rf);
918
919 current->in_memstall = 1;
920 psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
921
922 rq_unlock_irq(rq, &rf);
923}
924
925
926
927
928
929
930
931void psi_memstall_leave(unsigned long *flags)
932{
933 struct rq_flags rf;
934 struct rq *rq;
935
936 if (static_branch_likely(&psi_disabled))
937 return;
938
939 if (*flags)
940 return;
941
942
943
944
945
946 rq = this_rq_lock_irq(&rf);
947
948 current->in_memstall = 0;
949 psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
950
951 rq_unlock_irq(rq, &rf);
952}
953
954#ifdef CONFIG_CGROUPS
955int psi_cgroup_alloc(struct cgroup *cgroup)
956{
957 if (static_branch_likely(&psi_disabled))
958 return 0;
959
960 cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
961 if (!cgroup->psi.pcpu)
962 return -ENOMEM;
963 group_init(&cgroup->psi);
964 return 0;
965}
966
967void psi_cgroup_free(struct cgroup *cgroup)
968{
969 if (static_branch_likely(&psi_disabled))
970 return;
971
972 cancel_delayed_work_sync(&cgroup->psi.avgs_work);
973 free_percpu(cgroup->psi.pcpu);
974
975 WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
976}
977
978
979
980
981
982
983
984
985
986
987
988
989
990void cgroup_move_task(struct task_struct *task, struct css_set *to)
991{
992 unsigned int task_flags;
993 struct rq_flags rf;
994 struct rq *rq;
995
996 if (static_branch_likely(&psi_disabled)) {
997
998
999
1000
1001 rcu_assign_pointer(task->cgroups, to);
1002 return;
1003 }
1004
1005 rq = task_rq_lock(task, &rf);
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031 task_flags = task->psi_flags;
1032
1033 if (task_flags)
1034 psi_task_change(task, task_flags, 0);
1035
1036
1037 rcu_assign_pointer(task->cgroups, to);
1038
1039 if (task_flags)
1040 psi_task_change(task, 0, task_flags);
1041
1042 task_rq_unlock(rq, task, &rf);
1043}
1044#endif
1045
1046int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
1047{
1048 int full;
1049 u64 now;
1050
1051 if (static_branch_likely(&psi_disabled))
1052 return -EOPNOTSUPP;
1053
1054
1055 mutex_lock(&group->avgs_lock);
1056 now = sched_clock();
1057 collect_percpu_times(group, PSI_AVGS, NULL);
1058 if (now >= group->avg_next_update)
1059 group->avg_next_update = update_averages(group, now);
1060 mutex_unlock(&group->avgs_lock);
1061
1062 for (full = 0; full < 2; full++) {
1063 unsigned long avg[3] = { 0, };
1064 u64 total = 0;
1065 int w;
1066
1067
1068 if (!(group == &psi_system && res == PSI_CPU && full)) {
1069 for (w = 0; w < 3; w++)
1070 avg[w] = group->avg[res * 2 + full][w];
1071 total = div_u64(group->total[PSI_AVGS][res * 2 + full],
1072 NSEC_PER_USEC);
1073 }
1074
1075 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
1076 full ? "full" : "some",
1077 LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
1078 LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
1079 LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
1080 total);
1081 }
1082
1083 return 0;
1084}
1085
1086struct psi_trigger *psi_trigger_create(struct psi_group *group,
1087 char *buf, size_t nbytes, enum psi_res res)
1088{
1089 struct psi_trigger *t;
1090 enum psi_states state;
1091 u32 threshold_us;
1092 u32 window_us;
1093
1094 if (static_branch_likely(&psi_disabled))
1095 return ERR_PTR(-EOPNOTSUPP);
1096
1097 if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
1098 state = PSI_IO_SOME + res * 2;
1099 else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
1100 state = PSI_IO_FULL + res * 2;
1101 else
1102 return ERR_PTR(-EINVAL);
1103
1104 if (state >= PSI_NONIDLE)
1105 return ERR_PTR(-EINVAL);
1106
1107 if (window_us < WINDOW_MIN_US ||
1108 window_us > WINDOW_MAX_US)
1109 return ERR_PTR(-EINVAL);
1110
1111
1112 if (threshold_us == 0 || threshold_us > window_us)
1113 return ERR_PTR(-EINVAL);
1114
1115 t = kmalloc(sizeof(*t), GFP_KERNEL);
1116 if (!t)
1117 return ERR_PTR(-ENOMEM);
1118
1119 t->group = group;
1120 t->state = state;
1121 t->threshold = threshold_us * NSEC_PER_USEC;
1122 t->win.size = window_us * NSEC_PER_USEC;
1123 window_reset(&t->win, sched_clock(),
1124 group->total[PSI_POLL][t->state], 0);
1125
1126 t->event = 0;
1127 t->last_event_time = 0;
1128 init_waitqueue_head(&t->event_wait);
1129 t->pending_event = false;
1130
1131 mutex_lock(&group->trigger_lock);
1132
1133 if (!rcu_access_pointer(group->poll_task)) {
1134 struct task_struct *task;
1135
1136 task = kthread_create(psi_poll_worker, group, "psimon");
1137 if (IS_ERR(task)) {
1138 kfree(t);
1139 mutex_unlock(&group->trigger_lock);
1140 return ERR_CAST(task);
1141 }
1142 atomic_set(&group->poll_wakeup, 0);
1143 wake_up_process(task);
1144 rcu_assign_pointer(group->poll_task, task);
1145 }
1146
1147 list_add(&t->node, &group->triggers);
1148 group->poll_min_period = min(group->poll_min_period,
1149 div_u64(t->win.size, UPDATES_PER_WINDOW));
1150 group->nr_triggers[t->state]++;
1151 group->poll_states |= (1 << t->state);
1152
1153 mutex_unlock(&group->trigger_lock);
1154
1155 return t;
1156}
1157
1158void psi_trigger_destroy(struct psi_trigger *t)
1159{
1160 struct psi_group *group;
1161 struct task_struct *task_to_destroy = NULL;
1162
1163
1164
1165
1166
1167 if (!t)
1168 return;
1169
1170 group = t->group;
1171
1172
1173
1174
1175 wake_up_interruptible(&t->event_wait);
1176
1177 mutex_lock(&group->trigger_lock);
1178
1179 if (!list_empty(&t->node)) {
1180 struct psi_trigger *tmp;
1181 u64 period = ULLONG_MAX;
1182
1183 list_del(&t->node);
1184 group->nr_triggers[t->state]--;
1185 if (!group->nr_triggers[t->state])
1186 group->poll_states &= ~(1 << t->state);
1187
1188 list_for_each_entry(tmp, &group->triggers, node)
1189 period = min(period, div_u64(tmp->win.size,
1190 UPDATES_PER_WINDOW));
1191 group->poll_min_period = period;
1192
1193 if (group->poll_states == 0) {
1194 group->polling_until = 0;
1195 task_to_destroy = rcu_dereference_protected(
1196 group->poll_task,
1197 lockdep_is_held(&group->trigger_lock));
1198 rcu_assign_pointer(group->poll_task, NULL);
1199 del_timer(&group->poll_timer);
1200 }
1201 }
1202
1203 mutex_unlock(&group->trigger_lock);
1204
1205
1206
1207
1208
1209
1210 synchronize_rcu();
1211
1212
1213
1214
1215 if (task_to_destroy) {
1216
1217
1218
1219
1220 kthread_stop(task_to_destroy);
1221 }
1222 kfree(t);
1223}
1224
1225__poll_t psi_trigger_poll(void **trigger_ptr,
1226 struct file *file, poll_table *wait)
1227{
1228 __poll_t ret = DEFAULT_POLLMASK;
1229 struct psi_trigger *t;
1230
1231 if (static_branch_likely(&psi_disabled))
1232 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1233
1234 t = smp_load_acquire(trigger_ptr);
1235 if (!t)
1236 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1237
1238 poll_wait(file, &t->event_wait, wait);
1239
1240 if (cmpxchg(&t->event, 1, 0) == 1)
1241 ret |= EPOLLPRI;
1242
1243 return ret;
1244}
1245
1246#ifdef CONFIG_PROC_FS
1247static int psi_io_show(struct seq_file *m, void *v)
1248{
1249 return psi_show(m, &psi_system, PSI_IO);
1250}
1251
1252static int psi_memory_show(struct seq_file *m, void *v)
1253{
1254 return psi_show(m, &psi_system, PSI_MEM);
1255}
1256
1257static int psi_cpu_show(struct seq_file *m, void *v)
1258{
1259 return psi_show(m, &psi_system, PSI_CPU);
1260}
1261
1262static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
1263{
1264 if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
1265 return -EPERM;
1266
1267 return single_open(file, psi_show, NULL);
1268}
1269
1270static int psi_io_open(struct inode *inode, struct file *file)
1271{
1272 return psi_open(file, psi_io_show);
1273}
1274
1275static int psi_memory_open(struct inode *inode, struct file *file)
1276{
1277 return psi_open(file, psi_memory_show);
1278}
1279
1280static int psi_cpu_open(struct inode *inode, struct file *file)
1281{
1282 return psi_open(file, psi_cpu_show);
1283}
1284
1285static ssize_t psi_write(struct file *file, const char __user *user_buf,
1286 size_t nbytes, enum psi_res res)
1287{
1288 char buf[32];
1289 size_t buf_size;
1290 struct seq_file *seq;
1291 struct psi_trigger *new;
1292
1293 if (static_branch_likely(&psi_disabled))
1294 return -EOPNOTSUPP;
1295
1296 if (!nbytes)
1297 return -EINVAL;
1298
1299 buf_size = min(nbytes, sizeof(buf));
1300 if (copy_from_user(buf, user_buf, buf_size))
1301 return -EFAULT;
1302
1303 buf[buf_size - 1] = '\0';
1304
1305 seq = file->private_data;
1306
1307
1308 mutex_lock(&seq->lock);
1309
1310
1311 if (seq->private) {
1312 mutex_unlock(&seq->lock);
1313 return -EBUSY;
1314 }
1315
1316 new = psi_trigger_create(&psi_system, buf, nbytes, res);
1317 if (IS_ERR(new)) {
1318 mutex_unlock(&seq->lock);
1319 return PTR_ERR(new);
1320 }
1321
1322 smp_store_release(&seq->private, new);
1323 mutex_unlock(&seq->lock);
1324
1325 return nbytes;
1326}
1327
1328static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
1329 size_t nbytes, loff_t *ppos)
1330{
1331 return psi_write(file, user_buf, nbytes, PSI_IO);
1332}
1333
1334static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
1335 size_t nbytes, loff_t *ppos)
1336{
1337 return psi_write(file, user_buf, nbytes, PSI_MEM);
1338}
1339
1340static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
1341 size_t nbytes, loff_t *ppos)
1342{
1343 return psi_write(file, user_buf, nbytes, PSI_CPU);
1344}
1345
1346static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
1347{
1348 struct seq_file *seq = file->private_data;
1349
1350 return psi_trigger_poll(&seq->private, file, wait);
1351}
1352
1353static int psi_fop_release(struct inode *inode, struct file *file)
1354{
1355 struct seq_file *seq = file->private_data;
1356
1357 psi_trigger_destroy(seq->private);
1358 return single_release(inode, file);
1359}
1360
1361static const struct proc_ops psi_io_proc_ops = {
1362 .proc_open = psi_io_open,
1363 .proc_read = seq_read,
1364 .proc_lseek = seq_lseek,
1365 .proc_write = psi_io_write,
1366 .proc_poll = psi_fop_poll,
1367 .proc_release = psi_fop_release,
1368};
1369
1370static const struct proc_ops psi_memory_proc_ops = {
1371 .proc_open = psi_memory_open,
1372 .proc_read = seq_read,
1373 .proc_lseek = seq_lseek,
1374 .proc_write = psi_memory_write,
1375 .proc_poll = psi_fop_poll,
1376 .proc_release = psi_fop_release,
1377};
1378
1379static const struct proc_ops psi_cpu_proc_ops = {
1380 .proc_open = psi_cpu_open,
1381 .proc_read = seq_read,
1382 .proc_lseek = seq_lseek,
1383 .proc_write = psi_cpu_write,
1384 .proc_poll = psi_fop_poll,
1385 .proc_release = psi_fop_release,
1386};
1387
1388static int __init psi_proc_init(void)
1389{
1390 if (psi_enable) {
1391 proc_mkdir("pressure", NULL);
1392 proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
1393 proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
1394 proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
1395 }
1396 return 0;
1397}
1398module_init(psi_proc_init);
1399
1400#endif
1401