1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133#include "../workqueue_internal.h"
134#include <linux/sched/loadavg.h>
135#include <linux/seq_file.h>
136#include <linux/proc_fs.h>
137#include <linux/seqlock.h>
138#include <linux/uaccess.h>
139#include <linux/cgroup.h>
140#include <linux/module.h>
141#include <linux/sched.h>
142#include <linux/ctype.h>
143#include <linux/file.h>
144#include <linux/poll.h>
145#include <linux/psi.h>
146#include "sched.h"
147
148static int psi_bug __read_mostly;
149
150DEFINE_STATIC_KEY_FALSE(psi_disabled);
151DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
152
153#ifdef CONFIG_PSI_DEFAULT_DISABLED
154static bool psi_enable;
155#else
156static bool psi_enable = true;
157#endif
158static int __init setup_psi(char *str)
159{
160 return kstrtobool(str, &psi_enable) == 0;
161}
162__setup("psi=", setup_psi);
163
164
/* Averaging period in jiffies: two seconds plus one tick. */
#define PSI_FREQ		(2*HZ+1)
/* Decay factors handed to calc_load() for the 10s/60s/300s averages. */
#define EXP_10s			1677
#define EXP_60s			1981
#define EXP_300s		2034

/* Bounds on a trigger's tracking window, in microseconds (0.5s .. 10s). */
#define WINDOW_MIN_US		500000
#define WINDOW_MAX_US		10000000
/* A group polls at 1/UPDATES_PER_WINDOW of its smallest trigger window. */
#define UPDATES_PER_WINDOW	10
174
175
176static u64 psi_period __read_mostly;
177
178
179static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
180struct psi_group psi_system = {
181 .pcpu = &system_group_pcpu,
182};
183
184static void psi_avgs_work(struct work_struct *work);
185
186static void poll_timer_fn(struct timer_list *t);
187
188static void group_init(struct psi_group *group)
189{
190 int cpu;
191
192 for_each_possible_cpu(cpu)
193 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
194 group->avg_last_update = sched_clock();
195 group->avg_next_update = group->avg_last_update + psi_period;
196 INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
197 mutex_init(&group->avgs_lock);
198
199 mutex_init(&group->trigger_lock);
200 INIT_LIST_HEAD(&group->triggers);
201 memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
202 group->poll_states = 0;
203 group->poll_min_period = U32_MAX;
204 memset(group->polling_total, 0, sizeof(group->polling_total));
205 group->polling_next_update = ULLONG_MAX;
206 group->polling_until = 0;
207 init_waitqueue_head(&group->poll_wait);
208 timer_setup(&group->poll_timer, poll_timer_fn, 0);
209 rcu_assign_pointer(group->poll_task, NULL);
210}
211
212void __init psi_init(void)
213{
214 if (!psi_enable) {
215 static_branch_enable(&psi_disabled);
216 return;
217 }
218
219 if (!cgroup_psi_enabled())
220 static_branch_disable(&psi_cgroups_enabled);
221
222 psi_period = jiffies_to_nsecs(PSI_FREQ);
223 group_init(&psi_system);
224}
225
226static bool test_state(unsigned int *tasks, enum psi_states state)
227{
228 switch (state) {
229 case PSI_IO_SOME:
230 return unlikely(tasks[NR_IOWAIT]);
231 case PSI_IO_FULL:
232 return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
233 case PSI_MEM_SOME:
234 return unlikely(tasks[NR_MEMSTALL]);
235 case PSI_MEM_FULL:
236 return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
237 case PSI_CPU_SOME:
238 return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
239 case PSI_CPU_FULL:
240 return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
241 case PSI_NONIDLE:
242 return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
243 tasks[NR_RUNNING];
244 default:
245 return false;
246 }
247}
248
249static void get_recent_times(struct psi_group *group, int cpu,
250 enum psi_aggregators aggregator, u32 *times,
251 u32 *pchanged_states)
252{
253 struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
254 u64 now, state_start;
255 enum psi_states s;
256 unsigned int seq;
257 u32 state_mask;
258
259 *pchanged_states = 0;
260
261
262 do {
263 seq = read_seqcount_begin(&groupc->seq);
264 now = cpu_clock(cpu);
265 memcpy(times, groupc->times, sizeof(groupc->times));
266 state_mask = groupc->state_mask;
267 state_start = groupc->state_start;
268 } while (read_seqcount_retry(&groupc->seq, seq));
269
270
271 for (s = 0; s < NR_PSI_STATES; s++) {
272 u32 delta;
273
274
275
276
277
278
279
280
281
282 if (state_mask & (1 << s))
283 times[s] += now - state_start;
284
285 delta = times[s] - groupc->times_prev[aggregator][s];
286 groupc->times_prev[aggregator][s] = times[s];
287
288 times[s] = delta;
289 if (delta)
290 *pchanged_states |= (1 << s);
291 }
292}
293
294static void calc_avgs(unsigned long avg[3], int missed_periods,
295 u64 time, u64 period)
296{
297 unsigned long pct;
298
299
300 if (missed_periods) {
301 avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
302 avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
303 avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
304 }
305
306
307 pct = div_u64(time * 100, period);
308 pct *= FIXED_1;
309 avg[0] = calc_load(avg[0], EXP_10s, pct);
310 avg[1] = calc_load(avg[1], EXP_60s, pct);
311 avg[2] = calc_load(avg[2], EXP_300s, pct);
312}
313
314static void collect_percpu_times(struct psi_group *group,
315 enum psi_aggregators aggregator,
316 u32 *pchanged_states)
317{
318 u64 deltas[NR_PSI_STATES - 1] = { 0, };
319 unsigned long nonidle_total = 0;
320 u32 changed_states = 0;
321 int cpu;
322 int s;
323
324
325
326
327
328
329
330
331
332 for_each_possible_cpu(cpu) {
333 u32 times[NR_PSI_STATES];
334 u32 nonidle;
335 u32 cpu_changed_states;
336
337 get_recent_times(group, cpu, aggregator, times,
338 &cpu_changed_states);
339 changed_states |= cpu_changed_states;
340
341 nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
342 nonidle_total += nonidle;
343
344 for (s = 0; s < PSI_NONIDLE; s++)
345 deltas[s] += (u64)times[s] * nonidle;
346 }
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361 for (s = 0; s < NR_PSI_STATES - 1; s++)
362 group->total[aggregator][s] +=
363 div_u64(deltas[s], max(nonidle_total, 1UL));
364
365 if (pchanged_states)
366 *pchanged_states = changed_states;
367}
368
369static u64 update_averages(struct psi_group *group, u64 now)
370{
371 unsigned long missed_periods = 0;
372 u64 expires, period;
373 u64 avg_next_update;
374 int s;
375
376
377 expires = group->avg_next_update;
378 if (now - expires >= psi_period)
379 missed_periods = div_u64(now - expires, psi_period);
380
381
382
383
384
385
386
387
388 avg_next_update = expires + ((1 + missed_periods) * psi_period);
389 period = now - (group->avg_last_update + (missed_periods * psi_period));
390 group->avg_last_update = now;
391
392 for (s = 0; s < NR_PSI_STATES - 1; s++) {
393 u32 sample;
394
395 sample = group->total[PSI_AVGS][s] - group->avg_total[s];
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413 if (sample > period)
414 sample = period;
415 group->avg_total[s] += sample;
416 calc_avgs(group->avg[s], missed_periods, sample, period);
417 }
418
419 return avg_next_update;
420}
421
422static void psi_avgs_work(struct work_struct *work)
423{
424 struct delayed_work *dwork;
425 struct psi_group *group;
426 u32 changed_states;
427 bool nonidle;
428 u64 now;
429
430 dwork = to_delayed_work(work);
431 group = container_of(dwork, struct psi_group, avgs_work);
432
433 mutex_lock(&group->avgs_lock);
434
435 now = sched_clock();
436
437 collect_percpu_times(group, PSI_AVGS, &changed_states);
438 nonidle = changed_states & (1 << PSI_NONIDLE);
439
440
441
442
443
444
445
446 if (now >= group->avg_next_update)
447 group->avg_next_update = update_averages(group, now);
448
449 if (nonidle) {
450 schedule_delayed_work(dwork, nsecs_to_jiffies(
451 group->avg_next_update - now) + 1);
452 }
453
454 mutex_unlock(&group->avgs_lock);
455}
456
457
458static void window_reset(struct psi_window *win, u64 now, u64 value,
459 u64 prev_growth)
460{
461 win->start_time = now;
462 win->start_value = value;
463 win->prev_growth = prev_growth;
464}
465
466
467
468
469
470
471
472
473
474
475
476
477static u64 window_update(struct psi_window *win, u64 now, u64 value)
478{
479 u64 elapsed;
480 u64 growth;
481
482 elapsed = now - win->start_time;
483 growth = value - win->start_value;
484
485
486
487
488
489
490
491 if (elapsed > win->size)
492 window_reset(win, now, value, growth);
493 else {
494 u32 remaining;
495
496 remaining = win->size - elapsed;
497 growth += div64_u64(win->prev_growth * remaining, win->size);
498 }
499
500 return growth;
501}
502
503static void init_triggers(struct psi_group *group, u64 now)
504{
505 struct psi_trigger *t;
506
507 list_for_each_entry(t, &group->triggers, node)
508 window_reset(&t->win, now,
509 group->total[PSI_POLL][t->state], 0);
510 memcpy(group->polling_total, group->total[PSI_POLL],
511 sizeof(group->polling_total));
512 group->polling_next_update = now + group->poll_min_period;
513}
514
515static u64 update_triggers(struct psi_group *group, u64 now)
516{
517 struct psi_trigger *t;
518 bool new_stall = false;
519 u64 *total = group->total[PSI_POLL];
520
521
522
523
524
525 list_for_each_entry(t, &group->triggers, node) {
526 u64 growth;
527
528
529 if (group->polling_total[t->state] == total[t->state])
530 continue;
531
532
533
534
535
536
537
538 new_stall = true;
539
540
541 growth = window_update(&t->win, now, total[t->state]);
542 if (growth < t->threshold)
543 continue;
544
545
546 if (now < t->last_event_time + t->win.size)
547 continue;
548
549
550 if (cmpxchg(&t->event, 0, 1) == 0)
551 wake_up_interruptible(&t->event_wait);
552 t->last_event_time = now;
553 }
554
555 if (new_stall)
556 memcpy(group->polling_total, total,
557 sizeof(group->polling_total));
558
559 return now + group->poll_min_period;
560}
561
562
563static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
564{
565 struct task_struct *task;
566
567
568
569
570
571
572
573 if (timer_pending(&group->poll_timer))
574 return;
575
576 rcu_read_lock();
577
578 task = rcu_dereference(group->poll_task);
579
580
581
582
583 if (likely(task))
584 mod_timer(&group->poll_timer, jiffies + delay);
585
586 rcu_read_unlock();
587}
588
589static void psi_poll_work(struct psi_group *group)
590{
591 u32 changed_states;
592 u64 now;
593
594 mutex_lock(&group->trigger_lock);
595
596 now = sched_clock();
597
598 collect_percpu_times(group, PSI_POLL, &changed_states);
599
600 if (changed_states & group->poll_states) {
601
602 if (now > group->polling_until)
603 init_triggers(group, now);
604
605
606
607
608
609
610 group->polling_until = now +
611 group->poll_min_period * UPDATES_PER_WINDOW;
612 }
613
614 if (now > group->polling_until) {
615 group->polling_next_update = ULLONG_MAX;
616 goto out;
617 }
618
619 if (now >= group->polling_next_update)
620 group->polling_next_update = update_triggers(group, now);
621
622 psi_schedule_poll_work(group,
623 nsecs_to_jiffies(group->polling_next_update - now) + 1);
624
625out:
626 mutex_unlock(&group->trigger_lock);
627}
628
629static int psi_poll_worker(void *data)
630{
631 struct psi_group *group = (struct psi_group *)data;
632
633 sched_set_fifo_low(current);
634
635 while (true) {
636 wait_event_interruptible(group->poll_wait,
637 atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
638 kthread_should_stop());
639 if (kthread_should_stop())
640 break;
641
642 psi_poll_work(group);
643 }
644 return 0;
645}
646
647static void poll_timer_fn(struct timer_list *t)
648{
649 struct psi_group *group = from_timer(group, t, poll_timer);
650
651 atomic_set(&group->poll_wakeup, 1);
652 wake_up_interruptible(&group->poll_wait);
653}
654
655static void record_times(struct psi_group_cpu *groupc, u64 now)
656{
657 u32 delta;
658
659 delta = now - groupc->state_start;
660 groupc->state_start = now;
661
662 if (groupc->state_mask & (1 << PSI_IO_SOME)) {
663 groupc->times[PSI_IO_SOME] += delta;
664 if (groupc->state_mask & (1 << PSI_IO_FULL))
665 groupc->times[PSI_IO_FULL] += delta;
666 }
667
668 if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
669 groupc->times[PSI_MEM_SOME] += delta;
670 if (groupc->state_mask & (1 << PSI_MEM_FULL))
671 groupc->times[PSI_MEM_FULL] += delta;
672 }
673
674 if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
675 groupc->times[PSI_CPU_SOME] += delta;
676 if (groupc->state_mask & (1 << PSI_CPU_FULL))
677 groupc->times[PSI_CPU_FULL] += delta;
678 }
679
680 if (groupc->state_mask & (1 << PSI_NONIDLE))
681 groupc->times[PSI_NONIDLE] += delta;
682}
683
684static void psi_group_change(struct psi_group *group, int cpu,
685 unsigned int clear, unsigned int set, u64 now,
686 bool wake_clock)
687{
688 struct psi_group_cpu *groupc;
689 u32 state_mask = 0;
690 unsigned int t, m;
691 enum psi_states s;
692
693 groupc = per_cpu_ptr(group->pcpu, cpu);
694
695
696
697
698
699
700
701
702
703 write_seqcount_begin(&groupc->seq);
704
705 record_times(groupc, now);
706
707 for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
708 if (!(m & (1 << t)))
709 continue;
710 if (groupc->tasks[t]) {
711 groupc->tasks[t]--;
712 } else if (!psi_bug) {
713 printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
714 cpu, t, groupc->tasks[0],
715 groupc->tasks[1], groupc->tasks[2],
716 groupc->tasks[3], clear, set);
717 psi_bug = 1;
718 }
719 }
720
721 for (t = 0; set; set &= ~(1 << t), t++)
722 if (set & (1 << t))
723 groupc->tasks[t]++;
724
725
726 for (s = 0; s < NR_PSI_STATES; s++) {
727 if (test_state(groupc->tasks, s))
728 state_mask |= (1 << s);
729 }
730
731
732
733
734
735
736
737
738
739 if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
740 state_mask |= (1 << PSI_MEM_FULL);
741
742 groupc->state_mask = state_mask;
743
744 write_seqcount_end(&groupc->seq);
745
746 if (state_mask & group->poll_states)
747 psi_schedule_poll_work(group, 1);
748
749 if (wake_clock && !delayed_work_pending(&group->avgs_work))
750 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
751}
752
753static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
754{
755 if (*iter == &psi_system)
756 return NULL;
757
758#ifdef CONFIG_CGROUPS
759 if (static_branch_likely(&psi_cgroups_enabled)) {
760 struct cgroup *cgroup = NULL;
761
762 if (!*iter)
763 cgroup = task->cgroups->dfl_cgrp;
764 else
765 cgroup = cgroup_parent(*iter);
766
767 if (cgroup && cgroup_parent(cgroup)) {
768 *iter = cgroup;
769 return cgroup_psi(cgroup);
770 }
771 }
772#endif
773 *iter = &psi_system;
774 return &psi_system;
775}
776
777static void psi_flags_change(struct task_struct *task, int clear, int set)
778{
779 if (((task->psi_flags & set) ||
780 (task->psi_flags & clear) != clear) &&
781 !psi_bug) {
782 printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
783 task->pid, task->comm, task_cpu(task),
784 task->psi_flags, clear, set);
785 psi_bug = 1;
786 }
787
788 task->psi_flags &= ~clear;
789 task->psi_flags |= set;
790}
791
792void psi_task_change(struct task_struct *task, int clear, int set)
793{
794 int cpu = task_cpu(task);
795 struct psi_group *group;
796 bool wake_clock = true;
797 void *iter = NULL;
798 u64 now;
799
800 if (!task->pid)
801 return;
802
803 psi_flags_change(task, clear, set);
804
805 now = cpu_clock(cpu);
806
807
808
809
810
811
812 if (unlikely((clear & TSK_RUNNING) &&
813 (task->flags & PF_WQ_WORKER) &&
814 wq_worker_last_func(task) == psi_avgs_work))
815 wake_clock = false;
816
817 while ((group = iterate_groups(task, &iter)))
818 psi_group_change(group, cpu, clear, set, now, wake_clock);
819}
820
821void psi_task_switch(struct task_struct *prev, struct task_struct *next,
822 bool sleep)
823{
824 struct psi_group *group, *common = NULL;
825 int cpu = task_cpu(prev);
826 void *iter;
827 u64 now = cpu_clock(cpu);
828
829 if (next->pid) {
830 bool identical_state;
831
832 psi_flags_change(next, 0, TSK_ONCPU);
833
834
835
836
837
838
839
840 identical_state = prev->psi_flags == next->psi_flags;
841 iter = NULL;
842 while ((group = iterate_groups(next, &iter))) {
843 if (identical_state &&
844 per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
845 common = group;
846 break;
847 }
848
849 psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
850 }
851 }
852
853 if (prev->pid) {
854 int clear = TSK_ONCPU, set = 0;
855
856
857
858
859
860
861 if (sleep) {
862 clear |= TSK_RUNNING;
863 if (prev->in_iowait)
864 set |= TSK_IOWAIT;
865 }
866
867 psi_flags_change(prev, clear, set);
868
869 iter = NULL;
870 while ((group = iterate_groups(prev, &iter)) && group != common)
871 psi_group_change(group, cpu, clear, set, now, true);
872
873
874
875
876
877 if (sleep) {
878 clear &= ~TSK_ONCPU;
879 for (; group; group = iterate_groups(prev, &iter))
880 psi_group_change(group, cpu, clear, set, now, true);
881 }
882 }
883}
884
885
886
887
888
889
890
891
892void psi_memstall_enter(unsigned long *flags)
893{
894 struct rq_flags rf;
895 struct rq *rq;
896
897 if (static_branch_likely(&psi_disabled))
898 return;
899
900 *flags = current->in_memstall;
901 if (*flags)
902 return;
903
904
905
906
907
908 rq = this_rq_lock_irq(&rf);
909
910 current->in_memstall = 1;
911 psi_task_change(current, 0, TSK_MEMSTALL);
912
913 rq_unlock_irq(rq, &rf);
914}
915
916
917
918
919
920
921
922void psi_memstall_leave(unsigned long *flags)
923{
924 struct rq_flags rf;
925 struct rq *rq;
926
927 if (static_branch_likely(&psi_disabled))
928 return;
929
930 if (*flags)
931 return;
932
933
934
935
936
937 rq = this_rq_lock_irq(&rf);
938
939 current->in_memstall = 0;
940 psi_task_change(current, TSK_MEMSTALL, 0);
941
942 rq_unlock_irq(rq, &rf);
943}
944
945#ifdef CONFIG_CGROUPS
946int psi_cgroup_alloc(struct cgroup *cgroup)
947{
948 if (static_branch_likely(&psi_disabled))
949 return 0;
950
951 cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
952 if (!cgroup->psi.pcpu)
953 return -ENOMEM;
954 group_init(&cgroup->psi);
955 return 0;
956}
957
958void psi_cgroup_free(struct cgroup *cgroup)
959{
960 if (static_branch_likely(&psi_disabled))
961 return;
962
963 cancel_delayed_work_sync(&cgroup->psi.avgs_work);
964 free_percpu(cgroup->psi.pcpu);
965
966 WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
967}
968
969
970
971
972
973
974
975
976
977
978
979
980
981void cgroup_move_task(struct task_struct *task, struct css_set *to)
982{
983 unsigned int task_flags;
984 struct rq_flags rf;
985 struct rq *rq;
986
987 if (static_branch_likely(&psi_disabled)) {
988
989
990
991
992 rcu_assign_pointer(task->cgroups, to);
993 return;
994 }
995
996 rq = task_rq_lock(task, &rf);
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022 task_flags = task->psi_flags;
1023
1024 if (task_flags)
1025 psi_task_change(task, task_flags, 0);
1026
1027
1028 rcu_assign_pointer(task->cgroups, to);
1029
1030 if (task_flags)
1031 psi_task_change(task, 0, task_flags);
1032
1033 task_rq_unlock(rq, task, &rf);
1034}
1035#endif
1036
1037int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
1038{
1039 int full;
1040 u64 now;
1041
1042 if (static_branch_likely(&psi_disabled))
1043 return -EOPNOTSUPP;
1044
1045
1046 mutex_lock(&group->avgs_lock);
1047 now = sched_clock();
1048 collect_percpu_times(group, PSI_AVGS, NULL);
1049 if (now >= group->avg_next_update)
1050 group->avg_next_update = update_averages(group, now);
1051 mutex_unlock(&group->avgs_lock);
1052
1053 for (full = 0; full < 2; full++) {
1054 unsigned long avg[3];
1055 u64 total;
1056 int w;
1057
1058 for (w = 0; w < 3; w++)
1059 avg[w] = group->avg[res * 2 + full][w];
1060 total = div_u64(group->total[PSI_AVGS][res * 2 + full],
1061 NSEC_PER_USEC);
1062
1063 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
1064 full ? "full" : "some",
1065 LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
1066 LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
1067 LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
1068 total);
1069 }
1070
1071 return 0;
1072}
1073
1074static int psi_io_show(struct seq_file *m, void *v)
1075{
1076 return psi_show(m, &psi_system, PSI_IO);
1077}
1078
1079static int psi_memory_show(struct seq_file *m, void *v)
1080{
1081 return psi_show(m, &psi_system, PSI_MEM);
1082}
1083
1084static int psi_cpu_show(struct seq_file *m, void *v)
1085{
1086 return psi_show(m, &psi_system, PSI_CPU);
1087}
1088
1089static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
1090{
1091 if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
1092 return -EPERM;
1093
1094 return single_open(file, psi_show, NULL);
1095}
1096
1097static int psi_io_open(struct inode *inode, struct file *file)
1098{
1099 return psi_open(file, psi_io_show);
1100}
1101
1102static int psi_memory_open(struct inode *inode, struct file *file)
1103{
1104 return psi_open(file, psi_memory_show);
1105}
1106
1107static int psi_cpu_open(struct inode *inode, struct file *file)
1108{
1109 return psi_open(file, psi_cpu_show);
1110}
1111
1112struct psi_trigger *psi_trigger_create(struct psi_group *group,
1113 char *buf, size_t nbytes, enum psi_res res)
1114{
1115 struct psi_trigger *t;
1116 enum psi_states state;
1117 u32 threshold_us;
1118 u32 window_us;
1119
1120 if (static_branch_likely(&psi_disabled))
1121 return ERR_PTR(-EOPNOTSUPP);
1122
1123 if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
1124 state = PSI_IO_SOME + res * 2;
1125 else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
1126 state = PSI_IO_FULL + res * 2;
1127 else
1128 return ERR_PTR(-EINVAL);
1129
1130 if (state >= PSI_NONIDLE)
1131 return ERR_PTR(-EINVAL);
1132
1133 if (window_us < WINDOW_MIN_US ||
1134 window_us > WINDOW_MAX_US)
1135 return ERR_PTR(-EINVAL);
1136
1137
1138 if (threshold_us == 0 || threshold_us > window_us)
1139 return ERR_PTR(-EINVAL);
1140
1141 t = kmalloc(sizeof(*t), GFP_KERNEL);
1142 if (!t)
1143 return ERR_PTR(-ENOMEM);
1144
1145 t->group = group;
1146 t->state = state;
1147 t->threshold = threshold_us * NSEC_PER_USEC;
1148 t->win.size = window_us * NSEC_PER_USEC;
1149 window_reset(&t->win, 0, 0, 0);
1150
1151 t->event = 0;
1152 t->last_event_time = 0;
1153 init_waitqueue_head(&t->event_wait);
1154 kref_init(&t->refcount);
1155
1156 mutex_lock(&group->trigger_lock);
1157
1158 if (!rcu_access_pointer(group->poll_task)) {
1159 struct task_struct *task;
1160
1161 task = kthread_create(psi_poll_worker, group, "psimon");
1162 if (IS_ERR(task)) {
1163 kfree(t);
1164 mutex_unlock(&group->trigger_lock);
1165 return ERR_CAST(task);
1166 }
1167 atomic_set(&group->poll_wakeup, 0);
1168 wake_up_process(task);
1169 rcu_assign_pointer(group->poll_task, task);
1170 }
1171
1172 list_add(&t->node, &group->triggers);
1173 group->poll_min_period = min(group->poll_min_period,
1174 div_u64(t->win.size, UPDATES_PER_WINDOW));
1175 group->nr_triggers[t->state]++;
1176 group->poll_states |= (1 << t->state);
1177
1178 mutex_unlock(&group->trigger_lock);
1179
1180 return t;
1181}
1182
1183static void psi_trigger_destroy(struct kref *ref)
1184{
1185 struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
1186 struct psi_group *group = t->group;
1187 struct task_struct *task_to_destroy = NULL;
1188
1189 if (static_branch_likely(&psi_disabled))
1190 return;
1191
1192
1193
1194
1195
1196 wake_up_interruptible(&t->event_wait);
1197
1198 mutex_lock(&group->trigger_lock);
1199
1200 if (!list_empty(&t->node)) {
1201 struct psi_trigger *tmp;
1202 u64 period = ULLONG_MAX;
1203
1204 list_del(&t->node);
1205 group->nr_triggers[t->state]--;
1206 if (!group->nr_triggers[t->state])
1207 group->poll_states &= ~(1 << t->state);
1208
1209 list_for_each_entry(tmp, &group->triggers, node)
1210 period = min(period, div_u64(tmp->win.size,
1211 UPDATES_PER_WINDOW));
1212 group->poll_min_period = period;
1213
1214 if (group->poll_states == 0) {
1215 group->polling_until = 0;
1216 task_to_destroy = rcu_dereference_protected(
1217 group->poll_task,
1218 lockdep_is_held(&group->trigger_lock));
1219 rcu_assign_pointer(group->poll_task, NULL);
1220 del_timer(&group->poll_timer);
1221 }
1222 }
1223
1224 mutex_unlock(&group->trigger_lock);
1225
1226
1227
1228
1229
1230
1231 synchronize_rcu();
1232
1233
1234
1235
1236 if (task_to_destroy) {
1237
1238
1239
1240
1241 kthread_stop(task_to_destroy);
1242 }
1243 kfree(t);
1244}
1245
1246void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
1247{
1248 struct psi_trigger *old = *trigger_ptr;
1249
1250 if (static_branch_likely(&psi_disabled))
1251 return;
1252
1253 rcu_assign_pointer(*trigger_ptr, new);
1254 if (old)
1255 kref_put(&old->refcount, psi_trigger_destroy);
1256}
1257
1258__poll_t psi_trigger_poll(void **trigger_ptr,
1259 struct file *file, poll_table *wait)
1260{
1261 __poll_t ret = DEFAULT_POLLMASK;
1262 struct psi_trigger *t;
1263
1264 if (static_branch_likely(&psi_disabled))
1265 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1266
1267 rcu_read_lock();
1268
1269 t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
1270 if (!t) {
1271 rcu_read_unlock();
1272 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1273 }
1274 kref_get(&t->refcount);
1275
1276 rcu_read_unlock();
1277
1278 poll_wait(file, &t->event_wait, wait);
1279
1280 if (cmpxchg(&t->event, 1, 0) == 1)
1281 ret |= EPOLLPRI;
1282
1283 kref_put(&t->refcount, psi_trigger_destroy);
1284
1285 return ret;
1286}
1287
1288static ssize_t psi_write(struct file *file, const char __user *user_buf,
1289 size_t nbytes, enum psi_res res)
1290{
1291 char buf[32];
1292 size_t buf_size;
1293 struct seq_file *seq;
1294 struct psi_trigger *new;
1295
1296 if (static_branch_likely(&psi_disabled))
1297 return -EOPNOTSUPP;
1298
1299 if (!nbytes)
1300 return -EINVAL;
1301
1302 buf_size = min(nbytes, sizeof(buf));
1303 if (copy_from_user(buf, user_buf, buf_size))
1304 return -EFAULT;
1305
1306 buf[buf_size - 1] = '\0';
1307
1308 new = psi_trigger_create(&psi_system, buf, nbytes, res);
1309 if (IS_ERR(new))
1310 return PTR_ERR(new);
1311
1312 seq = file->private_data;
1313
1314 mutex_lock(&seq->lock);
1315 psi_trigger_replace(&seq->private, new);
1316 mutex_unlock(&seq->lock);
1317
1318 return nbytes;
1319}
1320
1321static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
1322 size_t nbytes, loff_t *ppos)
1323{
1324 return psi_write(file, user_buf, nbytes, PSI_IO);
1325}
1326
1327static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
1328 size_t nbytes, loff_t *ppos)
1329{
1330 return psi_write(file, user_buf, nbytes, PSI_MEM);
1331}
1332
1333static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
1334 size_t nbytes, loff_t *ppos)
1335{
1336 return psi_write(file, user_buf, nbytes, PSI_CPU);
1337}
1338
1339static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
1340{
1341 struct seq_file *seq = file->private_data;
1342
1343 return psi_trigger_poll(&seq->private, file, wait);
1344}
1345
1346static int psi_fop_release(struct inode *inode, struct file *file)
1347{
1348 struct seq_file *seq = file->private_data;
1349
1350 psi_trigger_replace(&seq->private, NULL);
1351 return single_release(inode, file);
1352}
1353
1354static const struct proc_ops psi_io_proc_ops = {
1355 .proc_open = psi_io_open,
1356 .proc_read = seq_read,
1357 .proc_lseek = seq_lseek,
1358 .proc_write = psi_io_write,
1359 .proc_poll = psi_fop_poll,
1360 .proc_release = psi_fop_release,
1361};
1362
1363static const struct proc_ops psi_memory_proc_ops = {
1364 .proc_open = psi_memory_open,
1365 .proc_read = seq_read,
1366 .proc_lseek = seq_lseek,
1367 .proc_write = psi_memory_write,
1368 .proc_poll = psi_fop_poll,
1369 .proc_release = psi_fop_release,
1370};
1371
1372static const struct proc_ops psi_cpu_proc_ops = {
1373 .proc_open = psi_cpu_open,
1374 .proc_read = seq_read,
1375 .proc_lseek = seq_lseek,
1376 .proc_write = psi_cpu_write,
1377 .proc_poll = psi_fop_poll,
1378 .proc_release = psi_fop_release,
1379};
1380
1381static int __init psi_proc_init(void)
1382{
1383 if (psi_enable) {
1384 proc_mkdir("pressure", NULL);
1385 proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
1386 proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
1387 proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
1388 }
1389 return 0;
1390}
1391module_init(psi_proc_init);
1392