/* Pressure stall information for CPU, memory and IO */
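
/*
 * Summary of the model implemented below (reconstructed note; the
 * original header comment was lost in this copy):
 *
 * For each resource (CPU, memory, IO) we track two pressure states
 * per CPU, derived from the task counts in test_state():
 *
 *   SOME: at least one task is stalled on the resource
 *   FULL: all non-idle tasks are stalled on the resource simultaneously
 *
 * The time spent in these states is aggregated into running averages
 * (avg10/avg60/avg300) and cumulative totals, exported through
 * /proc/pressure/{io,memory,cpu} (see psi_proc_init()) and, with
 * CONFIG_CGROUPS, per cgroup. Userspace can also register triggers
 * that fire when stall time within a sliding window exceeds a
 * threshold (see psi_trigger_create()).
 */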
#include "../workqueue_internal.h"
#include <linux/sched/loadavg.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/seqlock.h>
#include <linux/uaccess.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/psi.h>
#include "sched.h"

static int psi_bug __read_mostly;

DEFINE_STATIC_KEY_FALSE(psi_disabled);

#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
#else
static bool psi_enable = true;
#endif
static int __init setup_psi(char *str)
{
	return kstrtobool(str, &psi_enable) == 0;
}
__setup("psi=", setup_psi);

/* Running averages - we need higher resolution than the load average */
#define PSI_FREQ	(2*HZ+1)	/* 2 sec intervals */
#define EXP_10s		1677		/* 1/exp(2s/10s) as fixed-point */
#define EXP_60s		1981		/* 1/exp(2s/60s) */
#define EXP_300s	2034		/* 1/exp(2s/300s) */

/* PSI trigger definitions */
#define WINDOW_MIN_US 500000	/* Min window size is 500ms */
#define WINDOW_MAX_US 10000000	/* Max window size is 10s */
#define UPDATES_PER_WINDOW 10	/* 10 updates per window */
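
/*
 * Worked derivation of the EXP_* constants above (explanatory note,
 * not part of the original source): the running averages use the
 * loadavg fixed-point scheme where 1.0 == FIXED_1 == 2048. With a
 * sampling period of roughly 2 seconds, the decay factor for an
 * N-second exponential average is exp(-2/N) scaled to fixed-point:
 *
 *   EXP_10s  = 2048 * exp(-2/10)  ~= 2048 * 0.8187 ~= 1677
 *   EXP_60s  = 2048 * exp(-2/60)  ~= 2048 * 0.9672 ~= 1981
 *   EXP_300s = 2048 * exp(-2/300) ~= 2048 * 0.9934 ~= 2034
 */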

/* Sampling frequency in nanoseconds */
static u64 psi_period __read_mostly;

/* System-level pressure and stall tracking */
static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
struct psi_group psi_system = {
	.pcpu = &system_group_pcpu,
};

static void psi_avgs_work(struct work_struct *work);

static void group_init(struct psi_group *group)
{
	int cpu;

	for_each_possible_cpu(cpu)
		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
	group->avg_last_update = sched_clock();
	group->avg_next_update = group->avg_last_update + psi_period;
	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
	mutex_init(&group->avgs_lock);
	/* Init trigger-related members */
	mutex_init(&group->trigger_lock);
	INIT_LIST_HEAD(&group->triggers);
	memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
	group->poll_states = 0;
	group->poll_min_period = U32_MAX;
	memset(group->polling_total, 0, sizeof(group->polling_total));
	group->polling_next_update = ULLONG_MAX;
	group->polling_until = 0;
	rcu_assign_pointer(group->poll_task, NULL);
}

void __init psi_init(void)
{
	if (!psi_enable) {
		static_branch_enable(&psi_disabled);
		return;
	}

	psi_period = jiffies_to_nsecs(PSI_FREQ);
	group_init(&psi_system);
}

static bool test_state(unsigned int *tasks, enum psi_states state)
{
	switch (state) {
	case PSI_IO_SOME:
		return tasks[NR_IOWAIT];
	case PSI_IO_FULL:
		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
	case PSI_MEM_SOME:
		return tasks[NR_MEMSTALL];
	case PSI_MEM_FULL:
		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
	case PSI_CPU_SOME:
		return tasks[NR_RUNNING] > 1;
	case PSI_NONIDLE:
		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
			tasks[NR_RUNNING];
	default:
		return false;
	}
}
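
/*
 * Explanatory note on test_state() (not part of the original source):
 * a CPU with one task in iowait and one runnable task is IO SOME but
 * not IO FULL, because productive work is still possible; with only
 * the iowait task it is both SOME and FULL. CPU pressure only has a
 * SOME state here - there is no PSI_CPU_FULL in this version - since
 * by definition at least one task is always making progress on the
 * CPU. PSI_NONIDLE marks time the CPU was doing anything at all and
 * is used to weight the per-CPU samples during aggregation.
 */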

static void get_recent_times(struct psi_group *group, int cpu,
			     enum psi_aggregators aggregator, u32 *times,
			     u32 *pchanged_states)
{
	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
	u64 now, state_start;
	enum psi_states s;
	unsigned int seq;
	u32 state_mask;

	*pchanged_states = 0;

	/* Snapshot a coherent view of the CPU state */
	do {
		seq = read_seqcount_begin(&groupc->seq);
		now = cpu_clock(cpu);
		memcpy(times, groupc->times, sizeof(groupc->times));
		state_mask = groupc->state_mask;
		state_start = groupc->state_start;
	} while (read_seqcount_retry(&groupc->seq, seq));

	/* Calculate state time deltas against the previous snapshot */
	for (s = 0; s < NR_PSI_STATES; s++) {
		u32 delta;

		/*
		 * In addition to already concluded states, we also
		 * incorporate currently active states on the CPU,
		 * since states may last for many sampling periods.
		 *
		 * This way we keep our delta sampling buckets small
		 * (u32) and our reported pressure close to what's
		 * actually happening.
		 */
		if (state_mask & (1 << s))
			times[s] += now - state_start;

		delta = times[s] - groupc->times_prev[aggregator][s];
		groupc->times_prev[aggregator][s] = times[s];

		times[s] = delta;
		if (delta)
			*pchanged_states |= (1 << s);
	}
}

static void calc_avgs(unsigned long avg[3], int missed_periods,
		      u64 time, u64 period)
{
	unsigned long pct;

	/* Fill in zeroes for periods of no activity */
	if (missed_periods) {
		avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
		avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
		avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
	}

	/* Sample the most recent active period */
	pct = div_u64(time * 100, period);
	pct *= FIXED_1;
	avg[0] = calc_load(avg[0], EXP_10s, pct);
	avg[1] = calc_load(avg[1], EXP_60s, pct);
	avg[2] = calc_load(avg[2], EXP_300s, pct);
}
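
/*
 * Worked example for calc_avgs() (explanatory note, not part of the
 * original source): calc_load(avg, exp, pct) computes, up to rounding,
 *
 *   avg' = (avg * exp + pct * (FIXED_1 - exp)) / FIXED_1
 *
 * i.e. an exponentially decaying average in loadavg fixed-point. If a
 * group was stalled for 500ms of a 2s period, pct = 25 * FIXED_1, and
 * with a previous avg10 of 0 the new avg10 becomes roughly
 * 25 * (2048 - 1677) / 2048 ~= 4.5%, converging towards 25% if the
 * pressure persists. calc_load_n() applies the same decay n times,
 * which is how whole idle periods are folded in at once.
 */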

static void collect_percpu_times(struct psi_group *group,
				 enum psi_aggregators aggregator,
				 u32 *pchanged_states)
{
	u64 deltas[NR_PSI_STATES - 1] = { 0, };
	unsigned long nonidle_total = 0;
	u32 changed_states = 0;
	int cpu;
	int s;

	/*
	 * Collect the per-cpu time buckets and average them into a
	 * single time sample that is normalized to wallclock time.
	 *
	 * For averaging, each CPU is weighted by its non-idle time in
	 * the sampling period. This eliminates artifacts from uneven
	 * loading, or even entirely idle CPUs.
	 */
	for_each_possible_cpu(cpu) {
		u32 times[NR_PSI_STATES];
		u32 nonidle;
		u32 cpu_changed_states;

		get_recent_times(group, cpu, aggregator, times,
				 &cpu_changed_states);
		changed_states |= cpu_changed_states;

		nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
		nonidle_total += nonidle;

		for (s = 0; s < PSI_NONIDLE; s++)
			deltas[s] += (u64)times[s] * nonidle;
	}

	/*
	 * Integrate the sample into the running statistics that are
	 * reported to userspace: the cumulative stall times and the
	 * averages.
	 *
	 * Dividing the weighted per-cpu deltas by the total non-idle
	 * time yields the average stall time over the non-idle part
	 * of the sampling period; idle CPUs neither add pressure nor
	 * dilute it.
	 */
	for (s = 0; s < NR_PSI_STATES - 1; s++)
		group->total[aggregator][s] +=
				div_u64(deltas[s], max(nonidle_total, 1UL));

	if (pchanged_states)
		*pchanged_states = changed_states;
}
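
/*
 * Worked example for the non-idle weighting above (explanatory note,
 * not part of the original source): suppose CPU0 was non-idle for the
 * whole 2s period with 1s of memory stall, and CPU1 was idle the whole
 * time. Then deltas[MEM_SOME] = 1s * w0 and nonidle_total = w0 (with
 * w0 = CPU0's non-idle time in jiffies), so the group total grows by
 * 1s - a 50% memory pressure sample - instead of being halved by the
 * idle CPU as a naive per-CPU average would be.
 */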

static u64 update_averages(struct psi_group *group, u64 now)
{
	unsigned long missed_periods = 0;
	u64 expires, period;
	u64 avg_next_update;
	int s;

	/* avgX= */
	expires = group->avg_next_update;
	if (now - expires >= psi_period)
		missed_periods = div_u64(now - expires, psi_period);

	/*
	 * The periodic clock tick can get delayed for various
	 * reasons, especially on loaded systems. To avoid clock
	 * drift, we schedule the clock in fixed psi_period intervals.
	 * But the deltas we sample out of the per-cpu buckets above
	 * are based on the actual elapsed time, so fold any missed
	 * periods into the averages as periods of zero pressure.
	 */
	avg_next_update = expires + ((1 + missed_periods) * psi_period);
	period = now - (group->avg_last_update + (missed_periods * psi_period));
	group->avg_last_update = now;

	for (s = 0; s < NR_PSI_STATES - 1; s++) {
		u32 sample;

		sample = group->total[PSI_AVGS][s] - group->avg_total[s];
		/*
		 * Due to the lockless sampling of the time buckets,
		 * recorded time deltas can slip into the next period,
		 * which under full pressure can result in samples in
		 * excess of the period length.
		 *
		 * We don't want to report non-sensical pressures in
		 * excess of 100%, nor do we want to drop such events
		 * on the floor. Instead we punt any overage into the
		 * future until pressure subsides. By doing this we
		 * don't underreport the occurring pressure curve, we
		 * just report it delayed by one period length.
		 *
		 * The error isn't cumulative. As soon as another
		 * delta slips from a period P to P+1, by definition
		 * it frees up its time T in P.
		 */
		if (sample > period)
			sample = period;
		group->avg_total[s] += sample;
		calc_avgs(group->avg[s], missed_periods, sample, period);
	}

	return avg_next_update;
}

static void psi_avgs_work(struct work_struct *work)
{
	struct delayed_work *dwork;
	struct psi_group *group;
	u32 changed_states;
	bool nonidle;
	u64 now;

	dwork = to_delayed_work(work);
	group = container_of(dwork, struct psi_group, avgs_work);

	mutex_lock(&group->avgs_lock);

	now = sched_clock();

	collect_percpu_times(group, PSI_AVGS, &changed_states);
	nonidle = changed_states & (1 << PSI_NONIDLE);
	/*
	 * If there is task activity, periodically fold the per-cpu
	 * times and feed samples into the running averages. If things
	 * are idle and there is no data to process, stop the clock.
	 * Once restarted, we'll catch up the missed periods in one
	 * go - see calc_avgs() and missed_periods.
	 */
	if (now >= group->avg_next_update)
		group->avg_next_update = update_averages(group, now);

	if (nonidle) {
		schedule_delayed_work(dwork, nsecs_to_jiffies(
				group->avg_next_update - now) + 1);
	}

	mutex_unlock(&group->avgs_lock);
}
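
/*
 * Explanatory note on the self-stopping clock (not part of the
 * original source): if the whole group goes idle, psi_avgs_work()
 * simply does not reschedule itself, so an idle system pays no
 * periodic cost. The next psi_task_change() restarts the work, and
 * update_averages() then sees e.g. missed_periods ~= 10 after 20s of
 * silence and decays the averages by ten zero-pressure periods in a
 * single calc_load_n() call before sampling the new activity.
 */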

/* Trigger tracking window manipulations */
static void window_reset(struct psi_window *win, u64 now, u64 value,
			 u64 prev_growth)
{
	win->start_time = now;
	win->start_value = value;
	win->prev_growth = prev_growth;
}

/*
 * PSI growth tracking window update and growth calculation routine.
 *
 * This approximates a sliding tracking window by interpolating
 * partially elapsed windows using historical growth data from the
 * previous intervals. This minimizes memory requirements (by not storing
 * all the intermediate values in the previous window) and simplifies
 * the calculations. It works well because PSI signal changes only in
 * positive direction and over relatively small window sizes the growth
 * is close to linear.
 */
static u64 window_update(struct psi_window *win, u64 now, u64 value)
{
	u64 elapsed;
	u64 growth;

	elapsed = now - win->start_time;
	growth = value - win->start_value;
	/*
	 * Once the window has fully elapsed, start a new one; until
	 * then, scale the previous window's growth by the portion of
	 * it that the current window has not yet replaced and add it
	 * to the growth observed so far.
	 */
	if (elapsed > win->size)
		window_reset(win, now, value, growth);
	else {
		u32 remaining;

		remaining = win->size - elapsed;
		growth += div64_u64(win->prev_growth * remaining, win->size);
	}

	return growth;
}
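
/*
 * Worked example for window_update() (explanatory note, not part of
 * the original source): with a 1s window that is 400ms into its
 * current period, having grown 50ms of stall so far and 200ms in the
 * previous period, the reported growth is
 *
 *   50ms + 200ms * (1s - 400ms) / 1s = 50ms + 120ms = 170ms
 *
 * i.e. the still-relevant 60% of the old window is carried over, which
 * approximates a true sliding 1s window without storing its history.
 */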

static void init_triggers(struct psi_group *group, u64 now)
{
	struct psi_trigger *t;

	list_for_each_entry(t, &group->triggers, node)
		window_reset(&t->win, now,
				group->total[PSI_POLL][t->state], 0);
	memcpy(group->polling_total, group->total[PSI_POLL],
		   sizeof(group->polling_total));
	group->polling_next_update = now + group->poll_min_period;
}

static u64 update_triggers(struct psi_group *group, u64 now)
{
	struct psi_trigger *t;
	bool new_stall = false;
	u64 *total = group->total[PSI_POLL];

	/*
	 * On subsequent updates, calculate growth deltas and let
	 * watchers know when their specified thresholds are exceeded.
	 */
	list_for_each_entry(t, &group->triggers, node) {
		u64 growth;

		/* Check for stall activity */
		if (group->polling_total[t->state] == total[t->state])
			continue;

		/*
		 * Multiple triggers might be looking at the same state,
		 * remember to update group->polling_total[] once we've
		 * been through all of them. Also remember to extend the
		 * polling time if we see new stall activity, regardless
		 * of whether the triggers are fired.
		 */
		new_stall = true;

		/* Calculate growth since last update */
		growth = window_update(&t->win, now, total[t->state]);
		if (growth < t->threshold)
			continue;

		/* Limit event signaling to once per window */
		if (now < t->last_event_time + t->win.size)
			continue;

		/* Generate an event */
		if (cmpxchg(&t->event, 0, 1) == 0)
			wake_up_interruptible(&t->event_wait);
		t->last_event_time = now;
	}

	if (new_stall)
		memcpy(group->polling_total, total,
				sizeof(group->polling_total));

	return now + group->poll_min_period;
}

/* Schedule polling if it's not already scheduled. */
static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
{
	struct task_struct *task;

	/*
	 * Do not reschedule if already scheduled.
	 * Possible race with a timer scheduled after this check but before
	 * mod_timer below can be tolerated because group->polling_next_update
	 * will keep updates on schedule.
	 */
	if (timer_pending(&group->poll_timer))
		return;

	rcu_read_lock();

	task = rcu_dereference(group->poll_task);
	/*
	 * poll_task might be NULL if psi_trigger_destroy() raced with
	 * a task state change; in that case there is nothing to kick.
	 */
	if (likely(task))
		mod_timer(&group->poll_timer, jiffies + delay);

	rcu_read_unlock();
}

static void psi_poll_work(struct psi_group *group)
{
	u32 changed_states;
	u64 now;

	mutex_lock(&group->trigger_lock);

	now = sched_clock();

	collect_percpu_times(group, PSI_POLL, &changed_states);

	if (changed_states & group->poll_states) {
		/* Initialize trigger windows when entering polling mode */
		if (now > group->polling_until)
			init_triggers(group, now);

		/*
		 * Keep the monitor active for at least the duration of
		 * the minimum tracking window as long as monitor states
		 * are changing.
		 */
		group->polling_until = now +
			group->poll_min_period * UPDATES_PER_WINDOW;
	}

	if (now > group->polling_until) {
		group->polling_next_update = ULLONG_MAX;
		goto out;
	}

	if (now >= group->polling_next_update)
		group->polling_next_update = update_triggers(group, now);

	psi_schedule_poll_work(group,
		nsecs_to_jiffies(group->polling_next_update - now) + 1);

out:
	mutex_unlock(&group->trigger_lock);
}

static int psi_poll_worker(void *data)
{
	struct psi_group *group = (struct psi_group *)data;
	struct sched_param param = {
		.sched_priority = 1,
	};

	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);

	while (true) {
		wait_event_interruptible(group->poll_wait,
				atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
				kthread_should_stop());
		if (kthread_should_stop())
			break;

		psi_poll_work(group);
	}
	return 0;
}
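
/*
 * Explanatory note (not part of the original source): the psimon
 * kthread runs at SCHED_FIFO priority 1 so that trigger evaluation is
 * not starved by the very CPU or memory pressure it is supposed to
 * report; userspace monitors that react to pressure (e.g. low-memory
 * handling daemons) rely on these wakeups being timely.
 */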

static void poll_timer_fn(struct timer_list *t)
{
	struct psi_group *group = from_timer(group, t, poll_timer);

	atomic_set(&group->poll_wakeup, 1);
	wake_up_interruptible(&group->poll_wait);
}

static void record_times(struct psi_group_cpu *groupc, int cpu,
			 bool memstall_tick)
{
	u32 delta;
	u64 now;

	now = cpu_clock(cpu);
	delta = now - groupc->state_start;
	groupc->state_start = now;

	if (groupc->state_mask & (1 << PSI_IO_SOME)) {
		groupc->times[PSI_IO_SOME] += delta;
		if (groupc->state_mask & (1 << PSI_IO_FULL))
			groupc->times[PSI_IO_FULL] += delta;
	}

	if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
		groupc->times[PSI_MEM_SOME] += delta;
		if (groupc->state_mask & (1 << PSI_MEM_FULL))
			groupc->times[PSI_MEM_FULL] += delta;
		else if (memstall_tick) {
			u32 sample;
			/*
			 * Since we care about lost potential, a
			 * memstall is FULL when there are no other
			 * working tasks, but also when the CPU is
			 * actively reclaiming and nothing productive
			 * could run even if it were runnable.
			 *
			 * When the timer tick sees a reclaiming CPU,
			 * regardless of runnable tasks, sample a FULL
			 * tick (or less if it hasn't been a full tick
			 * since the last state change).
			 */
			sample = min(delta, (u32)jiffies_to_nsecs(1));
			groupc->times[PSI_MEM_FULL] += sample;
		}
	}

	if (groupc->state_mask & (1 << PSI_CPU_SOME))
		groupc->times[PSI_CPU_SOME] += delta;

	if (groupc->state_mask & (1 << PSI_NONIDLE))
		groupc->times[PSI_NONIDLE] += delta;
}

static u32 psi_group_change(struct psi_group *group, int cpu,
			    unsigned int clear, unsigned int set)
{
	struct psi_group_cpu *groupc;
	unsigned int t, m;
	enum psi_states s;
	u32 state_mask = 0;

	groupc = per_cpu_ptr(group->pcpu, cpu);

	/*
	 * First we assess the aggregate resource states this CPU's
	 * tasks have been in since the last change, and account any
	 * SOME and FULL time these may have resulted in.
	 *
	 * Then we update the task counts according to the state
	 * change requested through the @clear and @set bits.
	 */
	write_seqcount_begin(&groupc->seq);

	record_times(groupc, cpu, false);

	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
		if (!(m & (1 << t)))
			continue;
		if (groupc->tasks[t] == 0 && !psi_bug) {
			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
					cpu, t, groupc->tasks[0],
					groupc->tasks[1], groupc->tasks[2],
					clear, set);
			psi_bug = 1;
		}
		groupc->tasks[t]--;
	}

	for (t = 0; set; set &= ~(1 << t), t++)
		if (set & (1 << t))
			groupc->tasks[t]++;

	/* Calculate state mask representing active states */
	for (s = 0; s < NR_PSI_STATES; s++) {
		if (test_state(groupc->tasks, s))
			state_mask |= (1 << s);
	}
	groupc->state_mask = state_mask;

	write_seqcount_end(&groupc->seq);

	return state_mask;
}

static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
{
#ifdef CONFIG_CGROUPS
	struct cgroup *cgroup = NULL;

	if (!*iter)
		cgroup = task->cgroups->dfl_cgrp;
	else if (*iter == &psi_system)
		return NULL;
	else
		cgroup = cgroup_parent(*iter);

	if (cgroup && cgroup_parent(cgroup)) {
		*iter = cgroup;
		return cgroup_psi(cgroup);
	}
#else
	if (*iter)
		return NULL;
#endif
	*iter = &psi_system;
	return &psi_system;
}
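
/*
 * Explanatory note on iterate_groups() (not part of the original
 * source): successive calls walk from the task's own cgroup up
 * through its ancestors on the default hierarchy and finish with
 * psi_system, after which NULL is returned. The root cgroup itself is
 * skipped (cgroup_parent(cgroup) is NULL for it) because its pressure
 * is exactly the system-wide pressure already tracked by psi_system.
 * A task state change is thus accounted once at every level that can
 * be queried.
 */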

void psi_task_change(struct task_struct *task, int clear, int set)
{
	int cpu = task_cpu(task);
	struct psi_group *group;
	bool wake_clock = true;
	void *iter = NULL;

	if (!task->pid)
		return;

	if (((task->psi_flags & set) ||
	     (task->psi_flags & clear) != clear) &&
	    !psi_bug) {
		printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
				task->pid, task->comm, cpu,
				task->psi_flags, clear, set);
		psi_bug = 1;
	}

	task->psi_flags &= ~clear;
	task->psi_flags |= set;

	/*
	 * Periodic aggregation shuts off if there is a period of no
	 * task changes, however we don't want to hide the silence
	 * signal from the averaging work: if a workqueue worker is
	 * going to sleep right after running psi_avgs_work(), don't
	 * let its own dequeue restart the clock, or it would never
	 * stop on an otherwise idle system.
	 */
	if (unlikely((clear & TSK_RUNNING) &&
		     (task->flags & PF_WQ_WORKER) &&
		     wq_worker_last_func(task) == psi_avgs_work))
		wake_clock = false;

	while ((group = iterate_groups(task, &iter))) {
		u32 state_mask = psi_group_change(group, cpu, clear, set);

		if (state_mask & group->poll_states)
			psi_schedule_poll_work(group, 1);

		if (wake_clock && !delayed_work_pending(&group->avgs_work))
			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
	}
}

void psi_memstall_tick(struct task_struct *task, int cpu)
{
	struct psi_group *group;
	void *iter = NULL;

	while ((group = iterate_groups(task, &iter))) {
		struct psi_group_cpu *groupc;

		groupc = per_cpu_ptr(group->pcpu, cpu);
		write_seqcount_begin(&groupc->seq);
		record_times(groupc, cpu, true);
		write_seqcount_end(&groupc->seq);
	}
}

/**
 * psi_memstall_enter - mark the beginning of a memory stall section
 * @flags: flags to handle nested sections
 *
 * Marks the calling task as being stalled due to a lack of memory,
 * such as waiting for a refault or performing reclaim.
 */
void psi_memstall_enter(unsigned long *flags)
{
	struct rq_flags rf;
	struct rq *rq;

	if (static_branch_likely(&psi_disabled))
		return;

	*flags = current->flags & PF_MEMSTALL;
	if (*flags)
		return;
	/*
	 * PF_MEMSTALL setting & accounting needs to be atomic wrt
	 * changes to the task's scheduling state, otherwise we can
	 * race with CPU migration.
	 */
	rq = this_rq_lock_irq(&rf);

	current->flags |= PF_MEMSTALL;
	psi_task_change(current, 0, TSK_MEMSTALL);

	rq_unlock_irq(rq, &rf);
}

/**
 * psi_memstall_leave - mark the end of a memory stall section
 * @flags: flags to handle nested sections
 *
 * Marks the calling task as no longer stalled due to lack of memory.
 */
void psi_memstall_leave(unsigned long *flags)
{
	struct rq_flags rf;
	struct rq *rq;

	if (static_branch_likely(&psi_disabled))
		return;

	if (*flags)
		return;
	/*
	 * PF_MEMSTALL clearing & accounting needs to be atomic wrt
	 * changes to the task's scheduling state, otherwise we could
	 * race with CPU migration.
	 */
	rq = this_rq_lock_irq(&rf);

	current->flags &= ~PF_MEMSTALL;
	psi_task_change(current, TSK_MEMSTALL, 0);

	rq_unlock_irq(rq, &rf);
}
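
/*
 * Illustrative usage of the memstall annotations above (explanatory
 * sketch, not part of the original source). A caller that is about to
 * block or burn CPU because of memory shortage brackets that section,
 * and the flags cookie makes nesting safe:
 *
 *	unsigned long pflags;
 *
 *	psi_memstall_enter(&pflags);
 *	do_reclaim_or_wait_for_refault();	// hypothetical stalled work
 *	psi_memstall_leave(&pflags);
 *
 * If the task was already inside a memstall section, pflags records
 * that and the inner enter/leave pair becomes a no-op.
 */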

#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
	if (static_branch_likely(&psi_disabled))
		return 0;

	cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
	if (!cgroup->psi.pcpu)
		return -ENOMEM;
	group_init(&cgroup->psi);
	return 0;
}

void psi_cgroup_free(struct cgroup *cgroup)
{
	if (static_branch_likely(&psi_disabled))
		return;

	cancel_delayed_work_sync(&cgroup->psi.avgs_work);
	free_percpu(cgroup->psi.pcpu);
	/* All triggers must be removed by now */
	WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
}

/**
 * cgroup_move_task - move task to a different cgroup
 * @task: the task
 * @to: the target css_set
 *
 * Move task to a new cgroup and safely migrate its associated stall
 * state between the different groups.
 *
 * This function acquires the task's rq lock to lock out concurrent
 * changes to the task's scheduling state and - in case the task is
 * running - concurrent changes to its stall state.
 */
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{
	unsigned int task_flags = 0;
	struct rq_flags rf;
	struct rq *rq;

	if (static_branch_likely(&psi_disabled)) {
		/*
		 * Lame to do this here, but the scheduler cannot be locked
		 * from the outside, so we move cgroups from inside sched/.
		 */
		rcu_assign_pointer(task->cgroups, to);
		return;
	}

	rq = task_rq_lock(task, &rf);

	if (task_on_rq_queued(task))
		task_flags = TSK_RUNNING;
	else if (task->in_iowait)
		task_flags = TSK_IOWAIT;

	if (task->flags & PF_MEMSTALL)
		task_flags |= TSK_MEMSTALL;

	/* Clear the task's stall state from the old groups... */
	if (task_flags)
		psi_task_change(task, task_flags, 0);

	/* ...switch cgroups under the rq lock... */
	rcu_assign_pointer(task->cgroups, to);

	/* ...and re-add the state to the new groups. */
	if (task_flags)
		psi_task_change(task, 0, task_flags);

	task_rq_unlock(rq, task, &rf);
}
#endif

int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
	int full;
	u64 now;

	if (static_branch_likely(&psi_disabled))
		return -EOPNOTSUPP;

	/* Update averages before reporting them */
	mutex_lock(&group->avgs_lock);
	now = sched_clock();
	collect_percpu_times(group, PSI_AVGS, NULL);
	if (now >= group->avg_next_update)
		group->avg_next_update = update_averages(group, now);
	mutex_unlock(&group->avgs_lock);

	for (full = 0; full < 2 - (res == PSI_CPU); full++) {
		unsigned long avg[3];
		u64 total;
		int w;

		for (w = 0; w < 3; w++)
			avg[w] = group->avg[res * 2 + full][w];
		total = div_u64(group->total[PSI_AVGS][res * 2 + full],
				NSEC_PER_USEC);

		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
			   full ? "full" : "some",
			   LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
			   LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
			   LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
			   total);
	}

	return 0;
}
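
/*
 * Example of the resulting /proc/pressure/memory format (explanatory
 * note, not part of the original source; numbers are made up):
 *
 *	some avg10=2.04 avg60=0.75 avg300=0.40 total=157622151
 *	full avg10=0.20 avg60=0.07 avg300=0.04 total=10651441
 *
 * avgN is the percentage of time over the last N seconds in which the
 * state was active, and total is the cumulative stall time in
 * microseconds. /proc/pressure/cpu prints only the "some" line in
 * this version.
 */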

static int psi_io_show(struct seq_file *m, void *v)
{
	return psi_show(m, &psi_system, PSI_IO);
}

static int psi_memory_show(struct seq_file *m, void *v)
{
	return psi_show(m, &psi_system, PSI_MEM);
}

static int psi_cpu_show(struct seq_file *m, void *v)
{
	return psi_show(m, &psi_system, PSI_CPU);
}

static int psi_io_open(struct inode *inode, struct file *file)
{
	return single_open(file, psi_io_show, NULL);
}

static int psi_memory_open(struct inode *inode, struct file *file)
{
	return single_open(file, psi_memory_show, NULL);
}

static int psi_cpu_open(struct inode *inode, struct file *file)
{
	return single_open(file, psi_cpu_show, NULL);
}

struct psi_trigger *psi_trigger_create(struct psi_group *group,
			char *buf, size_t nbytes, enum psi_res res)
{
	struct psi_trigger *t;
	enum psi_states state;
	u32 threshold_us;
	u32 window_us;

	if (static_branch_likely(&psi_disabled))
		return ERR_PTR(-EOPNOTSUPP);

	if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
		state = PSI_IO_SOME + res * 2;
	else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
		state = PSI_IO_FULL + res * 2;
	else
		return ERR_PTR(-EINVAL);

	if (state >= PSI_NONIDLE)
		return ERR_PTR(-EINVAL);

	if (window_us < WINDOW_MIN_US ||
		window_us > WINDOW_MAX_US)
		return ERR_PTR(-EINVAL);

	/* Check threshold */
	if (threshold_us == 0 || threshold_us > window_us)
		return ERR_PTR(-EINVAL);

	t = kmalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		return ERR_PTR(-ENOMEM);

	t->group = group;
	t->state = state;
	t->threshold = threshold_us * NSEC_PER_USEC;
	t->win.size = window_us * NSEC_PER_USEC;
	window_reset(&t->win, 0, 0, 0);

	t->event = 0;
	t->last_event_time = 0;
	init_waitqueue_head(&t->event_wait);
	kref_init(&t->refcount);

	mutex_lock(&group->trigger_lock);

	if (!rcu_access_pointer(group->poll_task)) {
		struct task_struct *task;

		task = kthread_create(psi_poll_worker, group, "psimon");
		if (IS_ERR(task)) {
			kfree(t);
			mutex_unlock(&group->trigger_lock);
			return ERR_CAST(task);
		}
		atomic_set(&group->poll_wakeup, 0);
		init_waitqueue_head(&group->poll_wait);
		wake_up_process(task);
		timer_setup(&group->poll_timer, poll_timer_fn, 0);
		rcu_assign_pointer(group->poll_task, task);
	}

	list_add(&t->node, &group->triggers);
	group->poll_min_period = min(group->poll_min_period,
		div_u64(t->win.size, UPDATES_PER_WINDOW));
	group->nr_triggers[t->state]++;
	group->poll_states |= (1 << t->state);

	mutex_unlock(&group->trigger_lock);

	return t;
}
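
/*
 * Example of the trigger string parsed above (explanatory note, not
 * part of the original source): writing
 *
 *	some 150000 1000000
 *
 * to /proc/pressure/memory registers a trigger that fires when memory
 * SOME stall time exceeds 150ms within any 1s window (both values are
 * in microseconds and the window must be between 500ms and 10s). The
 * shared psimon kthread then samples the group at win.size /
 * UPDATES_PER_WINDOW intervals, here every 100ms, and poll()/select()
 * on the file returns EPOLLPRI when the trigger fires.
 */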

static void psi_trigger_destroy(struct kref *ref)
{
	struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
	struct psi_group *group = t->group;
	struct task_struct *task_to_destroy = NULL;

	if (static_branch_likely(&psi_disabled))
		return;

	/*
	 * Wakeup waiters to stop polling. Can happen if cgroup is deleted
	 * from under a polling process.
	 */
	wake_up_interruptible(&t->event_wait);

	mutex_lock(&group->trigger_lock);

	if (!list_empty(&t->node)) {
		struct psi_trigger *tmp;
		u64 period = ULLONG_MAX;

		list_del(&t->node);
		group->nr_triggers[t->state]--;
		if (!group->nr_triggers[t->state])
			group->poll_states &= ~(1 << t->state);
		/* Recalculate the polling period from the remaining triggers */
		list_for_each_entry(tmp, &group->triggers, node)
			period = min(period, div_u64(tmp->win.size,
					UPDATES_PER_WINDOW));
		group->poll_min_period = period;
		/* Destroy the poll_task when the last trigger goes away */
		if (group->poll_states == 0) {
			group->polling_until = 0;
			task_to_destroy = rcu_dereference_protected(
					group->poll_task,
					lockdep_is_held(&group->trigger_lock));
			rcu_assign_pointer(group->poll_task, NULL);
		}
	}

	mutex_unlock(&group->trigger_lock);

	/*
	 * Wait for both *trigger_ptr from psi_trigger_replace and
	 * poll_task RCUs to complete their read-side critical sections
	 * before destroying the trigger and optionally the poll_task.
	 */
	synchronize_rcu();
	/*
	 * Stop the kthread after releasing trigger_lock to prevent a
	 * deadlock while waiting for psi_poll_work to acquire trigger_lock.
	 */
	if (task_to_destroy) {
		/*
		 * After the RCU grace period has expired, the worker
		 * can no longer be found through group->poll_task.
		 * But it might have been already scheduled before
		 * that - deschedule it cleanly before destroying it.
		 */
		del_timer_sync(&group->poll_timer);
		kthread_stop(task_to_destroy);
	}
	kfree(t);
}

void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
{
	struct psi_trigger *old = *trigger_ptr;

	if (static_branch_likely(&psi_disabled))
		return;

	rcu_assign_pointer(*trigger_ptr, new);
	if (old)
		kref_put(&old->refcount, psi_trigger_destroy);
}

__poll_t psi_trigger_poll(void **trigger_ptr,
				struct file *file, poll_table *wait)
{
	__poll_t ret = DEFAULT_POLLMASK;
	struct psi_trigger *t;

	if (static_branch_likely(&psi_disabled))
		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;

	rcu_read_lock();

	t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
	if (!t) {
		rcu_read_unlock();
		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
	}
	kref_get(&t->refcount);

	rcu_read_unlock();

	poll_wait(file, &t->event_wait, wait);

	if (cmpxchg(&t->event, 1, 0) == 1)
		ret |= EPOLLPRI;

	kref_put(&t->refcount, psi_trigger_destroy);

	return ret;
}

static ssize_t psi_write(struct file *file, const char __user *user_buf,
			 size_t nbytes, enum psi_res res)
{
	char buf[32];
	size_t buf_size;
	struct seq_file *seq;
	struct psi_trigger *new;

	if (static_branch_likely(&psi_disabled))
		return -EOPNOTSUPP;

	if (!nbytes)
		return -EINVAL;

	buf_size = min(nbytes, sizeof(buf));
	if (copy_from_user(buf, user_buf, buf_size))
		return -EFAULT;

	buf[buf_size - 1] = '\0';

	new = psi_trigger_create(&psi_system, buf, nbytes, res);
	if (IS_ERR(new))
		return PTR_ERR(new);

	seq = file->private_data;
	/* Take seq->lock to protect seq->private from concurrent writes */
	mutex_lock(&seq->lock);
	psi_trigger_replace(&seq->private, new);
	mutex_unlock(&seq->lock);

	return nbytes;
}

static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
			    size_t nbytes, loff_t *ppos)
{
	return psi_write(file, user_buf, nbytes, PSI_IO);
}

static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
				size_t nbytes, loff_t *ppos)
{
	return psi_write(file, user_buf, nbytes, PSI_MEM);
}

static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
			     size_t nbytes, loff_t *ppos)
{
	return psi_write(file, user_buf, nbytes, PSI_CPU);
}

static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
{
	struct seq_file *seq = file->private_data;

	return psi_trigger_poll(&seq->private, file, wait);
}

static int psi_fop_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;

	psi_trigger_replace(&seq->private, NULL);
	return single_release(inode, file);
}

static const struct file_operations psi_io_fops = {
	.open = psi_io_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.write = psi_io_write,
	.poll = psi_fop_poll,
	.release = psi_fop_release,
};

static const struct file_operations psi_memory_fops = {
	.open = psi_memory_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.write = psi_memory_write,
	.poll = psi_fop_poll,
	.release = psi_fop_release,
};

static const struct file_operations psi_cpu_fops = {
	.open = psi_cpu_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.write = psi_cpu_write,
	.poll = psi_fop_poll,
	.release = psi_fop_release,
};

static int __init psi_proc_init(void)
{
	if (psi_enable) {
		proc_mkdir("pressure", NULL);
		proc_create("pressure/io", 0, NULL, &psi_io_fops);
		proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
		proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
	}
	return 0;
}
module_init(psi_proc_init);