/*
 * Pressure stall information for CPU, memory and IO
 *
 * Copyright (c) 2018 Facebook, Inc.
 * Author: Johannes Weiner <hannes@cmpxchg.org>
 *
 * Polling support by Suren Baghdasaryan <surenb@google.com>
 * Copyright (c) 2018 Google, Inc.
 */
#include "../workqueue_internal.h"
#include <linux/sched/loadavg.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/seqlock.h>
#include <linux/uaccess.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/psi.h>
#include "sched.h"

static int psi_bug __read_mostly;

DEFINE_STATIC_KEY_FALSE(psi_disabled);

#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
#else
static bool psi_enable = true;
#endif
static int __init setup_psi(char *str)
{
	return kstrtobool(str, &psi_enable) == 0;
}
__setup("psi=", setup_psi);

/* Running averages - we need to be higher-res than loadavg */
#define PSI_FREQ	(2*HZ+1)	/* 2 sec intervals */
#define EXP_10s		1677		/* 1/exp(2s/10s) as fixed-point */
#define EXP_60s		1981		/* 1/exp(2s/60s) */
#define EXP_300s	2034		/* 1/exp(2s/300s) */

/* PSI trigger definitions */
#define WINDOW_MIN_US 500000	/* Min window size is 500ms */
#define WINDOW_MAX_US 10000000	/* Max window size is 10s */
#define UPDATES_PER_WINDOW 10	/* 10 updates per window */

/* Sampling frequency in nanoseconds */
static u64 psi_period __read_mostly;

/* System-level pressure and stall tracking */
static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
struct psi_group psi_system = {
	.pcpu = &system_group_pcpu,
};

static void psi_avgs_work(struct work_struct *work);

static void group_init(struct psi_group *group)
{
	int cpu;

	for_each_possible_cpu(cpu)
		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
	group->avg_last_update = sched_clock();
	group->avg_next_update = group->avg_last_update + psi_period;
	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
	mutex_init(&group->avgs_lock);
	/* Init trigger-related members */
	atomic_set(&group->poll_scheduled, 0);
	mutex_init(&group->trigger_lock);
	INIT_LIST_HEAD(&group->triggers);
	memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
	group->poll_states = 0;
	group->poll_min_period = U32_MAX;
	memset(group->polling_total, 0, sizeof(group->polling_total));
	group->polling_next_update = ULLONG_MAX;
	group->polling_until = 0;
	rcu_assign_pointer(group->poll_kworker, NULL);
}

void __init psi_init(void)
{
	if (!psi_enable) {
		static_branch_enable(&psi_disabled);
		return;
	}

	psi_period = jiffies_to_nsecs(PSI_FREQ);
	group_init(&psi_system);
}

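/*
 * Evaluate whether the given per-CPU task counts put this CPU in the
 * given pressure state. SOME: at least one task is stalled on the
 * resource. FULL: stalled tasks exist while nothing is running that
 * could make productive use of the CPU in the meantime.
 */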
static bool test_state(unsigned int *tasks, enum psi_states state)
{
	switch (state) {
	case PSI_IO_SOME:
		return tasks[NR_IOWAIT];
	case PSI_IO_FULL:
		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
	case PSI_MEM_SOME:
		return tasks[NR_MEMSTALL];
	case PSI_MEM_FULL:
		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
	case PSI_CPU_SOME:
		return tasks[NR_RUNNING] > 1;
	case PSI_NONIDLE:
		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
			tasks[NR_RUNNING];
	default:
		return false;
	}
}

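/*
 * Snapshot this CPU's state-time counters for one aggregator (averaging
 * or polling), fold in the time spent in any currently active states,
 * and return the deltas since that aggregator last sampled this CPU.
 */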
static void get_recent_times(struct psi_group *group, int cpu,
			     enum psi_aggregators aggregator, u32 *times,
			     u32 *pchanged_states)
{
	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
	u64 now, state_start;
	enum psi_states s;
	unsigned int seq;
	u32 state_mask;

	*pchanged_states = 0;

	/* Snapshot a coherent view of the CPU state */
	do {
		seq = read_seqcount_begin(&groupc->seq);
		now = cpu_clock(cpu);
		memcpy(times, groupc->times, sizeof(groupc->times));
		state_mask = groupc->state_mask;
		state_start = groupc->state_start;
	} while (read_seqcount_retry(&groupc->seq, seq));

	/* Calculate state time deltas against the previous snapshot */
	for (s = 0; s < NR_PSI_STATES; s++) {
		u32 delta;
		/*
		 * In addition to already concluded states, we also
		 * incorporate currently active states on the CPU,
		 * since states may last for many sampling periods.
		 *
		 * This way we keep our delta sampling buckets small
		 * (u32) and our reported pressure close to what's
		 * actually happening.
		 */
		if (state_mask & (1 << s))
			times[s] += now - state_start;

		delta = times[s] - groupc->times_prev[aggregator][s];
		groupc->times_prev[aggregator][s] = times[s];

		times[s] = delta;
		if (delta)
			*pchanged_states |= (1 << s);
	}
}

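/*
 * Fold one sampling period into the running 10s/60s/300s averages.
 * Periods with no sample (the averaging work was asleep) decay the
 * averages toward zero; the current period contributes its
 * stall-time-over-period ratio as a fixed-point percentage.
 */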
static void calc_avgs(unsigned long avg[3], int missed_periods,
		      u64 time, u64 period)
{
	unsigned long pct;

	/* Fill in zeroes for periods of no activity */
	if (missed_periods) {
		avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
		avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
		avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
	}

	/* Sample the most recent active period */
	pct = div_u64(time * 100, period);
	pct *= FIXED_1;
	avg[0] = calc_load(avg[0], EXP_10s, pct);
	avg[1] = calc_load(avg[1], EXP_60s, pct);
	avg[2] = calc_load(avg[2], EXP_300s, pct);
}

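/*
 * Aggregate the per-CPU state times into the group-wide totals for the
 * given aggregator. Each CPU's contribution is weighted by its non-idle
 * time, so one stalled CPU on an otherwise idle machine is not diluted
 * by the idle CPUs, while contention across many CPUs still averages
 * out to a wallclock-normalized sample.
 */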
static void collect_percpu_times(struct psi_group *group,
				 enum psi_aggregators aggregator,
				 u32 *pchanged_states)
{
	u64 deltas[NR_PSI_STATES - 1] = { 0, };
	unsigned long nonidle_total = 0;
	u32 changed_states = 0;
	int cpu;
	int s;

	/*
	 * Collect the per-cpu time buckets and average them into a
	 * single time sample that is normalized to wallclock time -
	 * less dependent on the amount of per-cpu history.
	 *
	 * For averaging, each CPU is weighted by its non-idle time in
	 * the sampling period. This eliminates artifacts from uneven
	 * loading, or even entirely idle CPUs.
	 */
	for_each_possible_cpu(cpu) {
		u32 times[NR_PSI_STATES];
		u32 nonidle;
		u32 cpu_changed_states;

		get_recent_times(group, cpu, aggregator, times,
				 &cpu_changed_states);
		changed_states |= cpu_changed_states;

		nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
		nonidle_total += nonidle;

		for (s = 0; s < PSI_NONIDLE; s++)
			deltas[s] += (u64)times[s] * nonidle;
	}

	/*
	 * Integrate the sample into the running statistics that are
	 * reported to userspace: the cumulative stall times and the
	 * averages.
	 */
	for (s = 0; s < NR_PSI_STATES - 1; s++)
		group->total[aggregator][s] +=
				div_u64(deltas[s], max(nonidle_total, 1UL));

	if (pchanged_states)
		*pchanged_states = changed_states;
}

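/*
 * Roll the periodic averaging forward: work out how much time has
 * elapsed (including any periods missed while the group was idle),
 * feed the new stall deltas into the running averages, and return the
 * time at which the next update is due.
 */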
static u64 update_averages(struct psi_group *group, u64 now)
{
	unsigned long missed_periods = 0;
	u64 expires, period;
	u64 avg_next_update;
	int s;

	/* Figure out how many full periods have elapsed since the last update */
	expires = group->avg_next_update;
	if (now - expires >= psi_period)
		missed_periods = div_u64(now - expires, psi_period);

	/*
	 * The periodic clock tick can get delayed for various
	 * reasons, especially on loaded systems. To avoid clock
	 * drift, we schedule the clock in fixed psi_period intervals.
	 * But the deltas we sample out of the per-cpu buckets above
	 * are based on the actual time elapsing between clock ticks.
	 */
	avg_next_update = expires + ((1 + missed_periods) * psi_period);
	period = now - (group->avg_last_update + (missed_periods * psi_period));
	group->avg_last_update = now;

	for (s = 0; s < NR_PSI_STATES - 1; s++) {
		u32 sample;

		sample = group->total[PSI_AVGS][s] - group->avg_total[s];
		/*
		 * Due to the lockless sampling of the time buckets,
		 * recorded time deltas can slip into the next period,
		 * which under full pressure can result in samples in
		 * excess of the period length. Clamp them: reporting
		 * more than 100% pressure would be nonsensical, and
		 * the excess is already accounted for by a short
		 * sample in the previous period.
		 */
		if (sample > period)
			sample = period;
		group->avg_total[s] += sample;
		calc_avgs(group->avg[s], missed_periods, sample, period);
	}

	return avg_next_update;
}

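/*
 * Periodic averaging work. Runs every PSI_FREQ while the group sees
 * non-idle activity, feeds samples into the running averages, and lets
 * the clock go to sleep when the group is idle; task state changes
 * restart it via psi_task_change().
 */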
static void psi_avgs_work(struct work_struct *work)
{
	struct delayed_work *dwork;
	struct psi_group *group;
	u32 changed_states;
	bool nonidle;
	u64 now;

	dwork = to_delayed_work(work);
	group = container_of(dwork, struct psi_group, avgs_work);

	mutex_lock(&group->avgs_lock);

	now = sched_clock();

	collect_percpu_times(group, PSI_AVGS, &changed_states);
	nonidle = changed_states & (1 << PSI_NONIDLE);
	/*
	 * If there is task activity, periodically fold the per-cpu
	 * times and feed samples into the running averages. If things
	 * are idle and there is no data to process, stop the clock.
	 * Once restarted, we'll catch up the running averages in one
	 * go - see calc_avgs() and missed_periods.
	 */
	if (now >= group->avg_next_update)
		group->avg_next_update = update_averages(group, now);

	if (nonidle) {
		schedule_delayed_work(dwork, nsecs_to_jiffies(
				group->avg_next_update - now) + 1);
	}

	mutex_unlock(&group->avgs_lock);
}

/* Trigger tracking window manipulation */
static void window_reset(struct psi_window *win, u64 now, u64 value,
			 u64 prev_growth)
{
	win->start_time = now;
	win->start_value = value;
	win->prev_growth = prev_growth;
}

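/*
 * PSI growth tracking window update and growth calculation routine.
 *
 * Approximates a sliding window: the reported growth is the stall time
 * accumulated in the current window plus a pro-rated share of the
 * previous window's growth for the portion of the window that has not
 * yet elapsed. This avoids storing intermediate samples while staying
 * close to a true sliding window, since stall time only ever grows.
 */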
static u64 window_update(struct psi_window *win, u64 now, u64 value)
{
	u64 elapsed;
	u64 growth;

	elapsed = now - win->start_time;
	growth = value - win->start_value;
	/*
	 * Reset the window once it has fully elapsed; otherwise add in
	 * the pro-rated remainder of the previous window's growth.
	 */
	if (elapsed > win->size)
		window_reset(win, now, value, growth);
	else {
		u32 remaining;

		remaining = win->size - elapsed;
		growth += div64_u64(win->prev_growth * remaining, win->size);
	}

	return growth;
}

static void init_triggers(struct psi_group *group, u64 now)
{
	struct psi_trigger *t;

	list_for_each_entry(t, &group->triggers, node)
		window_reset(&t->win, now,
				group->total[PSI_POLL][t->state], 0);
	memcpy(group->polling_total, group->total[PSI_POLL],
		   sizeof(group->polling_total));
	group->polling_next_update = now + group->poll_min_period;
}

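/*
 * Walk the group's triggers, compute each trigger's stall growth over
 * its tracking window, and generate wakeup events for triggers whose
 * threshold has been crossed (rate-limited to one event per window).
 * Returns the time of the next scheduled trigger update.
 */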
static u64 update_triggers(struct psi_group *group, u64 now)
{
	struct psi_trigger *t;
	bool new_stall = false;
	u64 *total = group->total[PSI_POLL];

	/*
	 * On subsequent updates, calculate growth deltas and let
	 * watchers know when their specified thresholds are crossed.
	 */
	list_for_each_entry(t, &group->triggers, node) {
		u64 growth;

		/* Check for stall activity */
		if (group->polling_total[t->state] == total[t->state])
			continue;

		/*
		 * Multiple triggers might be looking at the same state,
		 * remember to update group->polling_total[] once we've
		 * been through all of them. Also remember to extend the
		 * polling time if we see new stall activity.
		 */
		new_stall = true;

		/* Calculate growth since last update */
		growth = window_update(&t->win, now, total[t->state]);
		if (growth < t->threshold)
			continue;

		/* Limit event signalling to once per window */
		if (now < t->last_event_time + t->win.size)
			continue;

		/* Generate an event */
		if (cmpxchg(&t->event, 0, 1) == 0)
			wake_up_interruptible(&t->event_wait);
		t->last_event_time = now;
	}

	if (new_stall)
		memcpy(group->polling_total, total,
				sizeof(group->polling_total));

	return now + group->poll_min_period;
}

/*
 * Schedule the polling work unless it is already pending. The
 * poll_scheduled atomic ensures only one scheduling attempt is in
 * flight at a time, and the kworker pointer is read under RCU so the
 * worker can be torn down safely when the last trigger goes away.
 */
static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
{
	struct kthread_worker *kworker;

	/* Do not reschedule if already scheduled */
	if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
		return;

	rcu_read_lock();

	kworker = rcu_dereference(group->poll_kworker);
	/*
	 * kworker might be NULL if we race with psi_trigger_destroy().
	 */
	if (likely(kworker))
		kthread_queue_delayed_work(kworker, &group->poll_work, delay);
	else
		atomic_set(&group->poll_scheduled, 0);

	rcu_read_unlock();
}

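/*
 * Polling worker. Collects the per-CPU times for the polling aggregator,
 * (re)initializes the trigger windows when monitored stall activity
 * first appears, evaluates the triggers at poll_min_period intervals,
 * and stops rescheduling itself once the activity has been quiet for a
 * full tracking window's worth of updates.
 */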
static void psi_poll_work(struct kthread_work *work)
{
	struct kthread_delayed_work *dwork;
	struct psi_group *group;
	u32 changed_states;
	u64 now;

	dwork = container_of(work, struct kthread_delayed_work, work);
	group = container_of(dwork, struct psi_group, poll_work);

	atomic_set(&group->poll_scheduled, 0);

	mutex_lock(&group->trigger_lock);

	now = sched_clock();

	collect_percpu_times(group, PSI_POLL, &changed_states);

	if (changed_states & group->poll_states) {
		/* Initialize trigger windows when entering polling mode */
		if (now > group->polling_until)
			init_triggers(group, now);

		/*
		 * Keep the monitor active for at least the duration of the
		 * minimum tracking window as long as monitor states are
		 * changing.
		 */
		group->polling_until = now +
			group->poll_min_period * UPDATES_PER_WINDOW;
	}

	if (now > group->polling_until) {
		group->polling_next_update = ULLONG_MAX;
		goto out;
	}

	if (now >= group->polling_next_update)
		group->polling_next_update = update_triggers(group, now);

	psi_schedule_poll_work(group,
		nsecs_to_jiffies(group->polling_next_update - now) + 1);

out:
	mutex_unlock(&group->trigger_lock);
}

static void record_times(struct psi_group_cpu *groupc, int cpu,
			 bool memstall_tick)
{
	u32 delta;
	u64 now;

	now = cpu_clock(cpu);
	delta = now - groupc->state_start;
	groupc->state_start = now;

	if (groupc->state_mask & (1 << PSI_IO_SOME)) {
		groupc->times[PSI_IO_SOME] += delta;
		if (groupc->state_mask & (1 << PSI_IO_FULL))
			groupc->times[PSI_IO_FULL] += delta;
	}

	if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
		groupc->times[PSI_MEM_SOME] += delta;
		if (groupc->state_mask & (1 << PSI_MEM_FULL))
			groupc->times[PSI_MEM_FULL] += delta;
		else if (memstall_tick) {
			u32 sample;
			/*
			 * Since we care about lost potential, a
			 * memstall is FULL when there are no other
			 * working tasks, but also when the CPU is
			 * actively reclaiming and nothing productive
			 * could run even if it were runnable.
			 *
			 * When the timer tick sees a reclaiming CPU,
			 * regardless of runnable tasks, sample a FULL
			 * tick (or less if it hasn't been a full tick
			 * since the last state change).
			 */
			sample = min(delta, (u32)jiffies_to_nsecs(1));
			groupc->times[PSI_MEM_FULL] += sample;
		}
	}

	if (groupc->state_mask & (1 << PSI_CPU_SOME))
		groupc->times[PSI_CPU_SOME] += delta;

	if (groupc->state_mask & (1 << PSI_NONIDLE))
		groupc->times[PSI_NONIDLE] += delta;
}

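/*
 * Apply a task state change to this group's per-CPU counters: account
 * the time spent in the outgoing states, adjust the task counts for the
 * cleared and set flags, and recompute the resulting state mask.
 * Returns the new state mask so callers can kick off polling if a
 * monitored state became active.
 */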
static u32 psi_group_change(struct psi_group *group, int cpu,
			    unsigned int clear, unsigned int set)
{
	struct psi_group_cpu *groupc;
	unsigned int t, m;
	enum psi_states s;
	u32 state_mask = 0;

	groupc = per_cpu_ptr(group->pcpu, cpu);

	/*
	 * First we assess the aggregate resource states this CPU's
	 * tasks have been in since the last change, and account any
	 * SOME and FULL time these may have resulted in.
	 *
	 * Then we update the task counts according to the state
	 * change requested through the @clear and @set bits.
	 */
	write_seqcount_begin(&groupc->seq);

	record_times(groupc, cpu, false);

	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
		if (!(m & (1 << t)))
			continue;
		if (groupc->tasks[t] == 0 && !psi_bug) {
			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
					cpu, t, groupc->tasks[0],
					groupc->tasks[1], groupc->tasks[2],
					clear, set);
			psi_bug = 1;
		}
		groupc->tasks[t]--;
	}

	for (t = 0; set; set &= ~(1 << t), t++)
		if (set & (1 << t))
			groupc->tasks[t]++;

	/* Recalculate the resulting per-CPU state mask */
	for (s = 0; s < NR_PSI_STATES; s++) {
		if (test_state(groupc->tasks, s))
			state_mask |= (1 << s);
	}
	groupc->state_mask = state_mask;

	write_seqcount_end(&groupc->seq);

	return state_mask;
}

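/*
 * Iterate over all psi_groups a task belongs to: with cgroups enabled,
 * walk from the task's cgroup up to (but not including) the root, then
 * finish with the system-wide group; without cgroups, only the system
 * group is visited.
 */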
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
{
#ifdef CONFIG_CGROUPS
	struct cgroup *cgroup = NULL;

	if (!*iter)
		cgroup = task->cgroups->dfl_cgrp;
	else if (*iter == &psi_system)
		return NULL;
	else
		cgroup = cgroup_parent(*iter);

	if (cgroup && cgroup_parent(cgroup)) {
		*iter = cgroup;
		return cgroup_psi(cgroup);
	}
#else
	if (*iter)
		return NULL;
#endif
	*iter = &psi_system;
	return &psi_system;
}

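/*
 * Propagate a task's state change (e.g. going to sleep on IO, entering
 * a memory stall, becoming runnable) to every psi_group the task
 * belongs to, and kick the averaging and polling machinery as needed.
 */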
void psi_task_change(struct task_struct *task, int clear, int set)
{
	int cpu = task_cpu(task);
	struct psi_group *group;
	bool wake_clock = true;
	void *iter = NULL;

	if (!task->pid)
		return;

	if (((task->psi_flags & set) ||
	     (task->psi_flags & clear) != clear) &&
	    !psi_bug) {
		printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
				task->pid, task->comm, cpu,
				task->psi_flags, clear, set);
		psi_bug = 1;
	}

	task->psi_flags &= ~clear;
	task->psi_flags |= set;

	/*
	 * Periodic aggregation shuts off if there is a period of no
	 * task changes, so we wake it back up if necessary. However,
	 * don't do this if the task change is the aggregation worker
	 * itself going to sleep, or we'll ping-pong forever.
	 */
	if (unlikely((clear & TSK_RUNNING) &&
		     (task->flags & PF_WQ_WORKER) &&
		     wq_worker_last_func(task) == psi_avgs_work))
		wake_clock = false;

	while ((group = iterate_groups(task, &iter))) {
		u32 state_mask = psi_group_change(group, cpu, clear, set);

		if (state_mask & group->poll_states)
			psi_schedule_poll_work(group, 1);

		if (wake_clock && !delayed_work_pending(&group->avgs_work))
			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
	}
}

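/*
 * Account an extra FULL memory-stall sample for a task that is actively
 * reclaiming on this CPU; invoked from the scheduler tick while the
 * task runs with PF_MEMSTALL set (see record_times()).
 */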
void psi_memstall_tick(struct task_struct *task, int cpu)
{
	struct psi_group *group;
	void *iter = NULL;

	while ((group = iterate_groups(task, &iter))) {
		struct psi_group_cpu *groupc;

		groupc = per_cpu_ptr(group->pcpu, cpu);
		write_seqcount_begin(&groupc->seq);
		record_times(groupc, cpu, true);
		write_seqcount_end(&groupc->seq);
	}
}

/**
 * psi_memstall_enter - mark the beginning of a memory stall section
 * @flags: flags to handle nested sections
 *
 * Marks the calling task as being stalled due to a lack of memory,
 * such as waiting for a refault or performing reclaim.
 */
void psi_memstall_enter(unsigned long *flags)
{
	struct rq_flags rf;
	struct rq *rq;

	if (static_branch_likely(&psi_disabled))
		return;

	*flags = current->flags & PF_MEMSTALL;
	if (*flags)
		return;
	/*
	 * PF_MEMSTALL setting & accounting needs to be atomic wrt
	 * changes to the task's scheduling state, otherwise we can
	 * race with CPU migration.
	 */
	rq = this_rq_lock_irq(&rf);

	current->flags |= PF_MEMSTALL;
	psi_task_change(current, 0, TSK_MEMSTALL);

	rq_unlock_irq(rq, &rf);
}

/**
 * psi_memstall_leave - mark the end of a memory stall section
 * @flags: flags to handle nested sections
 *
 * Marks the calling task as no longer stalled due to lack of memory.
 */
void psi_memstall_leave(unsigned long *flags)
{
	struct rq_flags rf;
	struct rq *rq;

	if (static_branch_likely(&psi_disabled))
		return;

	if (*flags)
		return;
	/*
	 * PF_MEMSTALL clearing & accounting needs to be atomic wrt
	 * changes to the task's scheduling state, otherwise we could
	 * race with CPU migration.
	 */
	rq = this_rq_lock_irq(&rf);

	current->flags &= ~PF_MEMSTALL;
	psi_task_change(current, TSK_MEMSTALL, 0);

	rq_unlock_irq(rq, &rf);
}

#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
	if (static_branch_likely(&psi_disabled))
		return 0;

	cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
	if (!cgroup->psi.pcpu)
		return -ENOMEM;
	group_init(&cgroup->psi);
	return 0;
}

void psi_cgroup_free(struct cgroup *cgroup)
{
	if (static_branch_likely(&psi_disabled))
		return;

	cancel_delayed_work_sync(&cgroup->psi.avgs_work);
	free_percpu(cgroup->psi.pcpu);
	/* All triggers must be removed by now */
	WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
}

/**
 * cgroup_move_task - move task to a different cgroup
 * @task: the task
 * @to: the target css_set
 *
 * Move task to a new cgroup and safely migrate its associated stall
 * state between the different groups.
 *
 * This function acquires the task's rq lock to lock out concurrent
 * changes to the task's scheduling state and - in case the task is
 * running - concurrent changes to its stall state.
 */
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{
	unsigned int task_flags = 0;
	struct rq_flags rf;
	struct rq *rq;

	if (static_branch_likely(&psi_disabled)) {
		/*
		 * Lame to do this here, but the scheduler cannot be locked
		 * from the outside, so we move cgroups from inside sched/.
		 */
		rcu_assign_pointer(task->cgroups, to);
		return;
	}

	rq = task_rq_lock(task, &rf);

	if (task_on_rq_queued(task))
		task_flags = TSK_RUNNING;
	else if (task->in_iowait)
		task_flags = TSK_IOWAIT;

	if (task->flags & PF_MEMSTALL)
		task_flags |= TSK_MEMSTALL;

	if (task_flags)
		psi_task_change(task, task_flags, 0);

	/* See comment above */
	rcu_assign_pointer(task->cgroups, to);

	if (task_flags)
		psi_task_change(task, 0, task_flags);

	task_rq_unlock(rq, task, &rf);
}
#endif /* CONFIG_CGROUPS */

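/*
 * Produce the pressure file output for one resource: refresh the
 * averages if an update is due, then print the "some" (and, except for
 * CPU, "full") lines with their 10s/60s/300s averages and the
 * cumulative stall total in microseconds.
 */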
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
	int full;
	u64 now;

	if (static_branch_likely(&psi_disabled))
		return -EOPNOTSUPP;

	/* Update averages before reporting them */
	mutex_lock(&group->avgs_lock);
	now = sched_clock();
	collect_percpu_times(group, PSI_AVGS, NULL);
	if (now >= group->avg_next_update)
		group->avg_next_update = update_averages(group, now);
	mutex_unlock(&group->avgs_lock);

	for (full = 0; full < 2 - (res == PSI_CPU); full++) {
		unsigned long avg[3];
		u64 total;
		int w;

		for (w = 0; w < 3; w++)
			avg[w] = group->avg[res * 2 + full][w];
		total = div_u64(group->total[PSI_AVGS][res * 2 + full],
				NSEC_PER_USEC);

		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
			   full ? "full" : "some",
			   LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
			   LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
			   LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
			   total);
	}

	return 0;
}

static int psi_io_show(struct seq_file *m, void *v)
{
	return psi_show(m, &psi_system, PSI_IO);
}

static int psi_memory_show(struct seq_file *m, void *v)
{
	return psi_show(m, &psi_system, PSI_MEM);
}

static int psi_cpu_show(struct seq_file *m, void *v)
{
	return psi_show(m, &psi_system, PSI_CPU);
}

static int psi_io_open(struct inode *inode, struct file *file)
{
	return single_open(file, psi_io_show, NULL);
}

static int psi_memory_open(struct inode *inode, struct file *file)
{
	return single_open(file, psi_memory_show, NULL);
}

static int psi_cpu_open(struct inode *inode, struct file *file)
{
	return single_open(file, psi_cpu_show, NULL);
}

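/*
 * Create a pressure trigger from a user-supplied string of the form
 * "some <threshold_us> <window_us>" or "full <threshold_us> <window_us>",
 * e.g. "some 150000 1000000" to be woken when 150ms of stall accumulates
 * within any 1s window. The first trigger on a group also starts the
 * RT "psimon" kthread worker that drives the polling updates.
 */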
struct psi_trigger *psi_trigger_create(struct psi_group *group,
			char *buf, size_t nbytes, enum psi_res res)
{
	struct psi_trigger *t;
	enum psi_states state;
	u32 threshold_us;
	u32 window_us;

	if (static_branch_likely(&psi_disabled))
		return ERR_PTR(-EOPNOTSUPP);

	if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
		state = PSI_IO_SOME + res * 2;
	else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
		state = PSI_IO_FULL + res * 2;
	else
		return ERR_PTR(-EINVAL);

	if (state >= PSI_NONIDLE)
		return ERR_PTR(-EINVAL);

	if (window_us < WINDOW_MIN_US ||
		window_us > WINDOW_MAX_US)
		return ERR_PTR(-EINVAL);

	/* Check threshold */
	if (threshold_us == 0 || threshold_us > window_us)
		return ERR_PTR(-EINVAL);

	t = kmalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		return ERR_PTR(-ENOMEM);

	t->group = group;
	t->state = state;
	t->threshold = threshold_us * NSEC_PER_USEC;
	t->win.size = window_us * NSEC_PER_USEC;
	window_reset(&t->win, 0, 0, 0);

	t->event = 0;
	t->last_event_time = 0;
	init_waitqueue_head(&t->event_wait);
	kref_init(&t->refcount);

	mutex_lock(&group->trigger_lock);

	if (!rcu_access_pointer(group->poll_kworker)) {
		struct sched_param param = {
			.sched_priority = 1,
		};
		struct kthread_worker *kworker;

		kworker = kthread_create_worker(0, "psimon");
		if (IS_ERR(kworker)) {
			kfree(t);
			mutex_unlock(&group->trigger_lock);
			return ERR_CAST(kworker);
		}
		sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
		kthread_init_delayed_work(&group->poll_work,
				psi_poll_work);
		rcu_assign_pointer(group->poll_kworker, kworker);
	}

	list_add(&t->node, &group->triggers);
	group->poll_min_period = min(group->poll_min_period,
		div_u64(t->win.size, UPDATES_PER_WINDOW));
	group->nr_triggers[t->state]++;
	group->poll_states |= (1 << t->state);

	mutex_unlock(&group->trigger_lock);

	return t;
}

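/*
 * Final kref release for a trigger: unlink it from its group, recompute
 * the group's minimum polling period and monitored states, and tear
 * down the psimon worker once the last trigger is gone. The RCU grace
 * period ensures no lockless user still holds a reference to either the
 * trigger or the worker.
 */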
static void psi_trigger_destroy(struct kref *ref)
{
	struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
	struct psi_group *group = t->group;
	struct kthread_worker *kworker_to_destroy = NULL;

	if (static_branch_likely(&psi_disabled))
		return;

	/*
	 * Wakeup waiters to stop polling. Can happen if cgroup is deleted
	 * from under a polling process.
	 */
	wake_up_interruptible(&t->event_wait);

	mutex_lock(&group->trigger_lock);

	if (!list_empty(&t->node)) {
		struct psi_trigger *tmp;
		u64 period = ULLONG_MAX;

		list_del(&t->node);
		group->nr_triggers[t->state]--;
		if (!group->nr_triggers[t->state])
			group->poll_states &= ~(1 << t->state);
		/* Recalculate the minimum update period of the remaining triggers */
		list_for_each_entry(tmp, &group->triggers, node)
			period = min(period, div_u64(tmp->win.size,
					UPDATES_PER_WINDOW));
		group->poll_min_period = period;
		/* Destroy the kworker when the last trigger is destroyed */
		if (group->poll_states == 0) {
			group->polling_until = 0;
			kworker_to_destroy = rcu_dereference_protected(
					group->poll_kworker,
					lockdep_is_held(&group->trigger_lock));
			rcu_assign_pointer(group->poll_kworker, NULL);
		}
	}

	mutex_unlock(&group->trigger_lock);

	/*
	 * Wait for both *trigger_ptr from psi_trigger_replace and
	 * poll_kworker RCU read side critical sections to complete before
	 * destroying the trigger and possibly the poll_kworker.
	 */
	synchronize_rcu();
	/*
	 * Destroy the kworker after releasing trigger_lock to prevent a
	 * deadlock while waiting for psi_poll_work to acquire trigger_lock.
	 */
	if (kworker_to_destroy) {
		kthread_cancel_delayed_work_sync(&group->poll_work);
		kthread_destroy_worker(kworker_to_destroy);
	}
	kfree(t);
}

void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
{
	struct psi_trigger *old = *trigger_ptr;

	if (static_branch_likely(&psi_disabled))
		return;

	rcu_assign_pointer(*trigger_ptr, new);
	if (old)
		kref_put(&old->refcount, psi_trigger_destroy);
}

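/*
 * poll()/epoll() support for trigger file descriptors: take a reference
 * to the trigger under RCU, register on its waitqueue, and report
 * EPOLLPRI when a threshold event has fired since the last poll.
 */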
__poll_t psi_trigger_poll(void **trigger_ptr,
				struct file *file, poll_table *wait)
{
	__poll_t ret = DEFAULT_POLLMASK;
	struct psi_trigger *t;

	if (static_branch_likely(&psi_disabled))
		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;

	rcu_read_lock();

	t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
	if (!t) {
		rcu_read_unlock();
		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
	}
	kref_get(&t->refcount);

	rcu_read_unlock();

	poll_wait(file, &t->event_wait, wait);

	if (cmpxchg(&t->event, 1, 0) == 1)
		ret |= EPOLLPRI;

	kref_put(&t->refcount, psi_trigger_destroy);

	return ret;
}

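/*
 * Common write handler for the /proc/pressure/{io,memory,cpu} files:
 * copy the trigger description from userspace, create the trigger, and
 * attach it to this file's seq_file so subsequent poll() calls wait on
 * it. Writing a new trigger replaces (and releases) any previous one.
 */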
static ssize_t psi_write(struct file *file, const char __user *user_buf,
			 size_t nbytes, enum psi_res res)
{
	char buf[32];
	size_t buf_size;
	struct seq_file *seq;
	struct psi_trigger *new;

	if (static_branch_likely(&psi_disabled))
		return -EOPNOTSUPP;

	if (!nbytes)
		return -EINVAL;

	buf_size = min(nbytes, sizeof(buf));
	if (copy_from_user(buf, user_buf, buf_size))
		return -EFAULT;

	buf[buf_size - 1] = '\0';

	new = psi_trigger_create(&psi_system, buf, nbytes, res);
	if (IS_ERR(new))
		return PTR_ERR(new);

	seq = file->private_data;
	/* Take seq->lock to protect seq->private from concurrent writes */
	mutex_lock(&seq->lock);
	psi_trigger_replace(&seq->private, new);
	mutex_unlock(&seq->lock);

	return nbytes;
}

static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
			    size_t nbytes, loff_t *ppos)
{
	return psi_write(file, user_buf, nbytes, PSI_IO);
}

static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
				size_t nbytes, loff_t *ppos)
{
	return psi_write(file, user_buf, nbytes, PSI_MEM);
}

static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
			     size_t nbytes, loff_t *ppos)
{
	return psi_write(file, user_buf, nbytes, PSI_CPU);
}

static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
{
	struct seq_file *seq = file->private_data;

	return psi_trigger_poll(&seq->private, file, wait);
}

static int psi_fop_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;

	psi_trigger_replace(&seq->private, NULL);
	return single_release(inode, file);
}

static const struct file_operations psi_io_fops = {
	.open		= psi_io_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.write		= psi_io_write,
	.poll		= psi_fop_poll,
	.release	= psi_fop_release,
};

static const struct file_operations psi_memory_fops = {
	.open		= psi_memory_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.write		= psi_memory_write,
	.poll		= psi_fop_poll,
	.release	= psi_fop_release,
};

static const struct file_operations psi_cpu_fops = {
	.open		= psi_cpu_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.write		= psi_cpu_write,
	.poll		= psi_fop_poll,
	.release	= psi_fop_release,
};

static int __init psi_proc_init(void)
{
	if (psi_enable) {
		proc_mkdir("pressure", NULL);
		proc_create("pressure/io", 0, NULL, &psi_io_fops);
		proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
		proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
	}
	return 0;
}
module_init(psi_proc_init);