1
2
3
4
5
6
7
8
9
10
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/idr.h>
17#include <linux/file.h>
18#include <linux/poll.h>
19#include <linux/slab.h>
20#include <linux/hash.h>
21#include <linux/tick.h>
22#include <linux/sysfs.h>
23#include <linux/dcache.h>
24#include <linux/percpu.h>
25#include <linux/ptrace.h>
26#include <linux/reboot.h>
27#include <linux/vmstat.h>
28#include <linux/device.h>
29#include <linux/export.h>
30#include <linux/vmalloc.h>
31#include <linux/hardirq.h>
32#include <linux/rculist.h>
33#include <linux/uaccess.h>
34#include <linux/syscalls.h>
35#include <linux/anon_inodes.h>
36#include <linux/kernel_stat.h>
37#include <linux/cgroup.h>
38#include <linux/perf_event.h>
39#include <linux/trace_events.h>
40#include <linux/hw_breakpoint.h>
41#include <linux/mm_types.h>
42#include <linux/module.h>
43#include <linux/mman.h>
44#include <linux/compat.h>
45#include <linux/bpf.h>
46#include <linux/filter.h>
47#include <linux/namei.h>
48#include <linux/parser.h>
49#include <linux/sched/clock.h>
50#include <linux/sched/mm.h>
51#include <linux/proc_ns.h>
52#include <linux/mount.h>
53
54#include "internal.h"
55
56#include <asm/irq_regs.h>
57
58typedef int (*remote_function_f)(void *);
59
60struct remote_function_call {
61 struct task_struct *p;
62 remote_function_f func;
63 void *info;
64 int ret;
65};
66
67static void remote_function(void *data)
68{
69 struct remote_function_call *tfc = data;
70 struct task_struct *p = tfc->p;
71
72 if (p) {
73
74 if (task_cpu(p) != smp_processor_id())
75 return;
76
77
78
79
80
81
82 tfc->ret = -ESRCH;
83 if (p != current)
84 return;
85 }
86
87 tfc->ret = tfc->func(tfc->info);
88}
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103static int
104task_function_call(struct task_struct *p, remote_function_f func, void *info)
105{
106 struct remote_function_call data = {
107 .p = p,
108 .func = func,
109 .info = info,
110 .ret = -EAGAIN,
111 };
112 int ret;
113
114 do {
115 ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
116 if (!ret)
117 ret = data.ret;
118 } while (ret == -EAGAIN);
119
120 return ret;
121}
122
123
124
125
126
127
128
129
130
131
132static int cpu_function_call(int cpu, remote_function_f func, void *info)
133{
134 struct remote_function_call data = {
135 .p = NULL,
136 .func = func,
137 .info = info,
138 .ret = -ENXIO,
139 };
140
141 smp_call_function_single(cpu, remote_function, &data, 1);
142
143 return data.ret;
144}
145
146static inline struct perf_cpu_context *
147__get_cpu_context(struct perf_event_context *ctx)
148{
149 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
150}
151
152static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
153 struct perf_event_context *ctx)
154{
155 raw_spin_lock(&cpuctx->ctx.lock);
156 if (ctx)
157 raw_spin_lock(&ctx->lock);
158}
159
160static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
161 struct perf_event_context *ctx)
162{
163 if (ctx)
164 raw_spin_unlock(&ctx->lock);
165 raw_spin_unlock(&cpuctx->ctx.lock);
166}
167
168#define TASK_TOMBSTONE ((void *)-1L)
169
170static bool is_kernel_event(struct perf_event *event)
171{
172 return READ_ONCE(event->owner) == TASK_TOMBSTONE;
173}
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
195 struct perf_event_context *, void *);
196
197struct event_function_struct {
198 struct perf_event *event;
199 event_f func;
200 void *data;
201};
202
203static int event_function(void *info)
204{
205 struct event_function_struct *efs = info;
206 struct perf_event *event = efs->event;
207 struct perf_event_context *ctx = event->ctx;
208 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
209 struct perf_event_context *task_ctx = cpuctx->task_ctx;
210 int ret = 0;
211
212 WARN_ON_ONCE(!irqs_disabled());
213
214 perf_ctx_lock(cpuctx, task_ctx);
215
216
217
218
219 if (ctx->task) {
220 if (ctx->task != current) {
221 ret = -ESRCH;
222 goto unlock;
223 }
224
225
226
227
228
229
230
231
232 WARN_ON_ONCE(!ctx->is_active);
233
234
235
236
237 WARN_ON_ONCE(task_ctx != ctx);
238 } else {
239 WARN_ON_ONCE(&cpuctx->ctx != ctx);
240 }
241
242 efs->func(event, cpuctx, ctx, efs->data);
243unlock:
244 perf_ctx_unlock(cpuctx, task_ctx);
245
246 return ret;
247}
248
249static void event_function_call(struct perf_event *event, event_f func, void *data)
250{
251 struct perf_event_context *ctx = event->ctx;
252 struct task_struct *task = READ_ONCE(ctx->task);
253 struct event_function_struct efs = {
254 .event = event,
255 .func = func,
256 .data = data,
257 };
258
259 if (!event->parent) {
260
261
262
263
264
265 lockdep_assert_held(&ctx->mutex);
266 }
267
268 if (!task) {
269 cpu_function_call(event->cpu, event_function, &efs);
270 return;
271 }
272
273 if (task == TASK_TOMBSTONE)
274 return;
275
276again:
277 if (!task_function_call(task, event_function, &efs))
278 return;
279
280 raw_spin_lock_irq(&ctx->lock);
281
282
283
284
285 task = ctx->task;
286 if (task == TASK_TOMBSTONE) {
287 raw_spin_unlock_irq(&ctx->lock);
288 return;
289 }
290 if (ctx->is_active) {
291 raw_spin_unlock_irq(&ctx->lock);
292 goto again;
293 }
294 func(event, NULL, ctx, data);
295 raw_spin_unlock_irq(&ctx->lock);
296}
297
298
299
300
301
302static void event_function_local(struct perf_event *event, event_f func, void *data)
303{
304 struct perf_event_context *ctx = event->ctx;
305 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
306 struct task_struct *task = READ_ONCE(ctx->task);
307 struct perf_event_context *task_ctx = NULL;
308
309 WARN_ON_ONCE(!irqs_disabled());
310
311 if (task) {
312 if (task == TASK_TOMBSTONE)
313 return;
314
315 task_ctx = ctx;
316 }
317
318 perf_ctx_lock(cpuctx, task_ctx);
319
320 task = ctx->task;
321 if (task == TASK_TOMBSTONE)
322 goto unlock;
323
324 if (task) {
325
326
327
328
329
330 if (ctx->is_active) {
331 if (WARN_ON_ONCE(task != current))
332 goto unlock;
333
334 if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
335 goto unlock;
336 }
337 } else {
338 WARN_ON_ONCE(&cpuctx->ctx != ctx);
339 }
340
341 func(event, cpuctx, ctx, data);
342unlock:
343 perf_ctx_unlock(cpuctx, task_ctx);
344}
345
346#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
347 PERF_FLAG_FD_OUTPUT |\
348 PERF_FLAG_PID_CGROUP |\
349 PERF_FLAG_FD_CLOEXEC)
350
351
352
353
354#define PERF_SAMPLE_BRANCH_PERM_PLM \
355 (PERF_SAMPLE_BRANCH_KERNEL |\
356 PERF_SAMPLE_BRANCH_HV)
357
358enum event_type_t {
359 EVENT_FLEXIBLE = 0x1,
360 EVENT_PINNED = 0x2,
361 EVENT_TIME = 0x4,
362
363 EVENT_CPU = 0x8,
364 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
365};
366
367
368
369
370
371
372static void perf_sched_delayed(struct work_struct *work);
373DEFINE_STATIC_KEY_FALSE(perf_sched_events);
374static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
375static DEFINE_MUTEX(perf_sched_mutex);
376static atomic_t perf_sched_count;
377
378static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
379static DEFINE_PER_CPU(int, perf_sched_cb_usages);
380static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
381
382static atomic_t nr_mmap_events __read_mostly;
383static atomic_t nr_comm_events __read_mostly;
384static atomic_t nr_namespaces_events __read_mostly;
385static atomic_t nr_task_events __read_mostly;
386static atomic_t nr_freq_events __read_mostly;
387static atomic_t nr_switch_events __read_mostly;
388
389static LIST_HEAD(pmus);
390static DEFINE_MUTEX(pmus_lock);
391static struct srcu_struct pmus_srcu;
392static cpumask_var_t perf_online_mask;
393
394
395
396
397
398
399
400
401int sysctl_perf_event_paranoid __read_mostly = 2;
402
403
404int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
405
406
407
408
409#define DEFAULT_MAX_SAMPLE_RATE 100000
410#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
411#define DEFAULT_CPU_TIME_MAX_PERCENT 25
412
413int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
414
415static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
416static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
417
418static int perf_sample_allowed_ns __read_mostly =
419 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
420
421static void update_perf_cpu_limits(void)
422{
423 u64 tmp = perf_sample_period_ns;
424
425 tmp *= sysctl_perf_cpu_time_max_percent;
426 tmp = div_u64(tmp, 100);
427 if (!tmp)
428 tmp = 1;
429
430 WRITE_ONCE(perf_sample_allowed_ns, tmp);
431}
432
433static int perf_rotate_context(struct perf_cpu_context *cpuctx);
434
435int perf_proc_update_handler(struct ctl_table *table, int write,
436 void __user *buffer, size_t *lenp,
437 loff_t *ppos)
438{
439 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
440
441 if (ret || !write)
442 return ret;
443
444
445
446
447 if (sysctl_perf_cpu_time_max_percent == 100 ||
448 sysctl_perf_cpu_time_max_percent == 0)
449 return -EINVAL;
450
451 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
452 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
453 update_perf_cpu_limits();
454
455 return 0;
456}
457
458int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
459
460int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
461 void __user *buffer, size_t *lenp,
462 loff_t *ppos)
463{
464 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
465
466 if (ret || !write)
467 return ret;
468
469 if (sysctl_perf_cpu_time_max_percent == 100 ||
470 sysctl_perf_cpu_time_max_percent == 0) {
471 printk(KERN_WARNING
472 "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
473 WRITE_ONCE(perf_sample_allowed_ns, 0);
474 } else {
475 update_perf_cpu_limits();
476 }
477
478 return 0;
479}
480
481
482
483
484
485
486
487#define NR_ACCUMULATED_SAMPLES 128
488static DEFINE_PER_CPU(u64, running_sample_length);
489
490static u64 __report_avg;
491static u64 __report_allowed;
492
493static void perf_duration_warn(struct irq_work *w)
494{
495 printk_ratelimited(KERN_INFO
496 "perf: interrupt took too long (%lld > %lld), lowering "
497 "kernel.perf_event_max_sample_rate to %d\n",
498 __report_avg, __report_allowed,
499 sysctl_perf_event_sample_rate);
500}
501
502static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
503
504void perf_sample_event_took(u64 sample_len_ns)
505{
506 u64 max_len = READ_ONCE(perf_sample_allowed_ns);
507 u64 running_len;
508 u64 avg_len;
509 u32 max;
510
511 if (max_len == 0)
512 return;
513
514
515 running_len = __this_cpu_read(running_sample_length);
516 running_len -= running_len/NR_ACCUMULATED_SAMPLES;
517 running_len += sample_len_ns;
518 __this_cpu_write(running_sample_length, running_len);
519
520
521
522
523
524
525 avg_len = running_len/NR_ACCUMULATED_SAMPLES;
526 if (avg_len <= max_len)
527 return;
528
529 __report_avg = avg_len;
530 __report_allowed = max_len;
531
532
533
534
535 avg_len += avg_len / 4;
536 max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
537 if (avg_len < max)
538 max /= (u32)avg_len;
539 else
540 max = 1;
541
542 WRITE_ONCE(perf_sample_allowed_ns, avg_len);
543 WRITE_ONCE(max_samples_per_tick, max);
544
545 sysctl_perf_event_sample_rate = max * HZ;
546 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
547
548 if (!irq_work_queue(&perf_duration_work)) {
549 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
550 "kernel.perf_event_max_sample_rate to %d\n",
551 __report_avg, __report_allowed,
552 sysctl_perf_event_sample_rate);
553 }
554}
555
556static atomic64_t perf_event_id;
557
558static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
559 enum event_type_t event_type);
560
561static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
562 enum event_type_t event_type,
563 struct task_struct *task);
564
565static void update_context_time(struct perf_event_context *ctx);
566static u64 perf_event_time(struct perf_event *event);
567
568void __weak perf_event_print_debug(void) { }
569
570extern __weak const char *perf_pmu_name(void)
571{
572 return "pmu";
573}
574
575static inline u64 perf_clock(void)
576{
577 return local_clock();
578}
579
580static inline u64 perf_event_clock(struct perf_event *event)
581{
582 return event->clock();
583}
584
585#ifdef CONFIG_CGROUP_PERF
586
587static inline bool
588perf_cgroup_match(struct perf_event *event)
589{
590 struct perf_event_context *ctx = event->ctx;
591 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
592
593
594 if (!event->cgrp)
595 return true;
596
597
598 if (!cpuctx->cgrp)
599 return false;
600
601
602
603
604
605
606
607 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
608 event->cgrp->css.cgroup);
609}
610
611static inline void perf_detach_cgroup(struct perf_event *event)
612{
613 css_put(&event->cgrp->css);
614 event->cgrp = NULL;
615}
616
617static inline int is_cgroup_event(struct perf_event *event)
618{
619 return event->cgrp != NULL;
620}
621
622static inline u64 perf_cgroup_event_time(struct perf_event *event)
623{
624 struct perf_cgroup_info *t;
625
626 t = per_cpu_ptr(event->cgrp->info, event->cpu);
627 return t->time;
628}
629
630static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
631{
632 struct perf_cgroup_info *info;
633 u64 now;
634
635 now = perf_clock();
636
637 info = this_cpu_ptr(cgrp->info);
638
639 info->time += now - info->timestamp;
640 info->timestamp = now;
641}
642
643static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
644{
645 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
646 if (cgrp_out)
647 __update_cgrp_time(cgrp_out);
648}
649
650static inline void update_cgrp_time_from_event(struct perf_event *event)
651{
652 struct perf_cgroup *cgrp;
653
654
655
656
657
658 if (!is_cgroup_event(event))
659 return;
660
661 cgrp = perf_cgroup_from_task(current, event->ctx);
662
663
664
665 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
666 __update_cgrp_time(event->cgrp);
667}
668
669static inline void
670perf_cgroup_set_timestamp(struct task_struct *task,
671 struct perf_event_context *ctx)
672{
673 struct perf_cgroup *cgrp;
674 struct perf_cgroup_info *info;
675
676
677
678
679
680
681 if (!task || !ctx->nr_cgroups)
682 return;
683
684 cgrp = perf_cgroup_from_task(task, ctx);
685 info = this_cpu_ptr(cgrp->info);
686 info->timestamp = ctx->timestamp;
687}
688
689static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
690
691#define PERF_CGROUP_SWOUT 0x1
692#define PERF_CGROUP_SWIN 0x2
693
694
695
696
697
698
699
700static void perf_cgroup_switch(struct task_struct *task, int mode)
701{
702 struct perf_cpu_context *cpuctx;
703 struct list_head *list;
704 unsigned long flags;
705
706
707
708
709
710 local_irq_save(flags);
711
712 list = this_cpu_ptr(&cgrp_cpuctx_list);
713 list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
714 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
715
716 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
717 perf_pmu_disable(cpuctx->ctx.pmu);
718
719 if (mode & PERF_CGROUP_SWOUT) {
720 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
721
722
723
724
725 cpuctx->cgrp = NULL;
726 }
727
728 if (mode & PERF_CGROUP_SWIN) {
729 WARN_ON_ONCE(cpuctx->cgrp);
730
731
732
733
734
735
736
737 cpuctx->cgrp = perf_cgroup_from_task(task,
738 &cpuctx->ctx);
739 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
740 }
741 perf_pmu_enable(cpuctx->ctx.pmu);
742 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
743 }
744
745 local_irq_restore(flags);
746}
747
748static inline void perf_cgroup_sched_out(struct task_struct *task,
749 struct task_struct *next)
750{
751 struct perf_cgroup *cgrp1;
752 struct perf_cgroup *cgrp2 = NULL;
753
754 rcu_read_lock();
755
756
757
758
759
760 cgrp1 = perf_cgroup_from_task(task, NULL);
761 cgrp2 = perf_cgroup_from_task(next, NULL);
762
763
764
765
766
767
768 if (cgrp1 != cgrp2)
769 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
770
771 rcu_read_unlock();
772}
773
774static inline void perf_cgroup_sched_in(struct task_struct *prev,
775 struct task_struct *task)
776{
777 struct perf_cgroup *cgrp1;
778 struct perf_cgroup *cgrp2 = NULL;
779
780 rcu_read_lock();
781
782
783
784
785
786 cgrp1 = perf_cgroup_from_task(task, NULL);
787 cgrp2 = perf_cgroup_from_task(prev, NULL);
788
789
790
791
792
793
794 if (cgrp1 != cgrp2)
795 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
796
797 rcu_read_unlock();
798}
799
800static inline int perf_cgroup_connect(int fd, struct perf_event *event,
801 struct perf_event_attr *attr,
802 struct perf_event *group_leader)
803{
804 struct perf_cgroup *cgrp;
805 struct cgroup_subsys_state *css;
806 struct fd f = fdget(fd);
807 int ret = 0;
808
809 if (!f.file)
810 return -EBADF;
811
812 css = css_tryget_online_from_dir(f.file->f_path.dentry,
813 &perf_event_cgrp_subsys);
814 if (IS_ERR(css)) {
815 ret = PTR_ERR(css);
816 goto out;
817 }
818
819 cgrp = container_of(css, struct perf_cgroup, css);
820 event->cgrp = cgrp;
821
822
823
824
825
826
827 if (group_leader && group_leader->cgrp != cgrp) {
828 perf_detach_cgroup(event);
829 ret = -EINVAL;
830 }
831out:
832 fdput(f);
833 return ret;
834}
835
836static inline void
837perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
838{
839 struct perf_cgroup_info *t;
840 t = per_cpu_ptr(event->cgrp->info, event->cpu);
841 event->shadow_ctx_time = now - t->timestamp;
842}
843
844static inline void
845perf_cgroup_defer_enabled(struct perf_event *event)
846{
847
848
849
850
851
852
853 if (is_cgroup_event(event) && !perf_cgroup_match(event))
854 event->cgrp_defer_enabled = 1;
855}
856
857static inline void
858perf_cgroup_mark_enabled(struct perf_event *event,
859 struct perf_event_context *ctx)
860{
861 struct perf_event *sub;
862 u64 tstamp = perf_event_time(event);
863
864 if (!event->cgrp_defer_enabled)
865 return;
866
867 event->cgrp_defer_enabled = 0;
868
869 event->tstamp_enabled = tstamp - event->total_time_enabled;
870 list_for_each_entry(sub, &event->sibling_list, group_entry) {
871 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
872 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
873 sub->cgrp_defer_enabled = 0;
874 }
875 }
876}
877
878
879
880
881
882static inline void
883list_update_cgroup_event(struct perf_event *event,
884 struct perf_event_context *ctx, bool add)
885{
886 struct perf_cpu_context *cpuctx;
887 struct list_head *cpuctx_entry;
888
889 if (!is_cgroup_event(event))
890 return;
891
892 if (add && ctx->nr_cgroups++)
893 return;
894 else if (!add && --ctx->nr_cgroups)
895 return;
896
897
898
899
900 cpuctx = __get_cpu_context(ctx);
901 cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
902
903 if (add) {
904 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
905
906 list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
907 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
908 cpuctx->cgrp = cgrp;
909 } else {
910 list_del(cpuctx_entry);
911 cpuctx->cgrp = NULL;
912 }
913}
914
915#else
916
917static inline bool
918perf_cgroup_match(struct perf_event *event)
919{
920 return true;
921}
922
923static inline void perf_detach_cgroup(struct perf_event *event)
924{}
925
926static inline int is_cgroup_event(struct perf_event *event)
927{
928 return 0;
929}
930
931static inline void update_cgrp_time_from_event(struct perf_event *event)
932{
933}
934
935static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
936{
937}
938
939static inline void perf_cgroup_sched_out(struct task_struct *task,
940 struct task_struct *next)
941{
942}
943
944static inline void perf_cgroup_sched_in(struct task_struct *prev,
945 struct task_struct *task)
946{
947}
948
949static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
950 struct perf_event_attr *attr,
951 struct perf_event *group_leader)
952{
953 return -EINVAL;
954}
955
956static inline void
957perf_cgroup_set_timestamp(struct task_struct *task,
958 struct perf_event_context *ctx)
959{
960}
961
962void
963perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
964{
965}
966
967static inline void
968perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
969{
970}
971
972static inline u64 perf_cgroup_event_time(struct perf_event *event)
973{
974 return 0;
975}
976
977static inline void
978perf_cgroup_defer_enabled(struct perf_event *event)
979{
980}
981
982static inline void
983perf_cgroup_mark_enabled(struct perf_event *event,
984 struct perf_event_context *ctx)
985{
986}
987
988static inline void
989list_update_cgroup_event(struct perf_event *event,
990 struct perf_event_context *ctx, bool add)
991{
992}
993
994#endif
995
996
997
998
999
1000#define PERF_CPU_HRTIMER (1000 / HZ)
1001
1002
1003
1004static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1005{
1006 struct perf_cpu_context *cpuctx;
1007 int rotations = 0;
1008
1009 WARN_ON(!irqs_disabled());
1010
1011 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1012 rotations = perf_rotate_context(cpuctx);
1013
1014 raw_spin_lock(&cpuctx->hrtimer_lock);
1015 if (rotations)
1016 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1017 else
1018 cpuctx->hrtimer_active = 0;
1019 raw_spin_unlock(&cpuctx->hrtimer_lock);
1020
1021 return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1022}
1023
1024static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1025{
1026 struct hrtimer *timer = &cpuctx->hrtimer;
1027 struct pmu *pmu = cpuctx->ctx.pmu;
1028 u64 interval;
1029
1030
1031 if (pmu->task_ctx_nr == perf_sw_context)
1032 return;
1033
1034
1035
1036
1037
1038 interval = pmu->hrtimer_interval_ms;
1039 if (interval < 1)
1040 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1041
1042 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1043
1044 raw_spin_lock_init(&cpuctx->hrtimer_lock);
1045 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1046 timer->function = perf_mux_hrtimer_handler;
1047}
1048
1049static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1050{
1051 struct hrtimer *timer = &cpuctx->hrtimer;
1052 struct pmu *pmu = cpuctx->ctx.pmu;
1053 unsigned long flags;
1054
1055
1056 if (pmu->task_ctx_nr == perf_sw_context)
1057 return 0;
1058
1059 raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1060 if (!cpuctx->hrtimer_active) {
1061 cpuctx->hrtimer_active = 1;
1062 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1063 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1064 }
1065 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1066
1067 return 0;
1068}
1069
1070void perf_pmu_disable(struct pmu *pmu)
1071{
1072 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1073 if (!(*count)++)
1074 pmu->pmu_disable(pmu);
1075}
1076
1077void perf_pmu_enable(struct pmu *pmu)
1078{
1079 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1080 if (!--(*count))
1081 pmu->pmu_enable(pmu);
1082}
1083
1084static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1085
1086
1087
1088
1089
1090
1091
1092static void perf_event_ctx_activate(struct perf_event_context *ctx)
1093{
1094 struct list_head *head = this_cpu_ptr(&active_ctx_list);
1095
1096 WARN_ON(!irqs_disabled());
1097
1098 WARN_ON(!list_empty(&ctx->active_ctx_list));
1099
1100 list_add(&ctx->active_ctx_list, head);
1101}
1102
1103static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1104{
1105 WARN_ON(!irqs_disabled());
1106
1107 WARN_ON(list_empty(&ctx->active_ctx_list));
1108
1109 list_del_init(&ctx->active_ctx_list);
1110}
1111
1112static void get_ctx(struct perf_event_context *ctx)
1113{
1114 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1115}
1116
1117static void free_ctx(struct rcu_head *head)
1118{
1119 struct perf_event_context *ctx;
1120
1121 ctx = container_of(head, struct perf_event_context, rcu_head);
1122 kfree(ctx->task_ctx_data);
1123 kfree(ctx);
1124}
1125
1126static void put_ctx(struct perf_event_context *ctx)
1127{
1128 if (atomic_dec_and_test(&ctx->refcount)) {
1129 if (ctx->parent_ctx)
1130 put_ctx(ctx->parent_ctx);
1131 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1132 put_task_struct(ctx->task);
1133 call_rcu(&ctx->rcu_head, free_ctx);
1134 }
1135}
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198static struct perf_event_context *
1199perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1200{
1201 struct perf_event_context *ctx;
1202
1203again:
1204 rcu_read_lock();
1205 ctx = ACCESS_ONCE(event->ctx);
1206 if (!atomic_inc_not_zero(&ctx->refcount)) {
1207 rcu_read_unlock();
1208 goto again;
1209 }
1210 rcu_read_unlock();
1211
1212 mutex_lock_nested(&ctx->mutex, nesting);
1213 if (event->ctx != ctx) {
1214 mutex_unlock(&ctx->mutex);
1215 put_ctx(ctx);
1216 goto again;
1217 }
1218
1219 return ctx;
1220}
1221
1222static inline struct perf_event_context *
1223perf_event_ctx_lock(struct perf_event *event)
1224{
1225 return perf_event_ctx_lock_nested(event, 0);
1226}
1227
1228static void perf_event_ctx_unlock(struct perf_event *event,
1229 struct perf_event_context *ctx)
1230{
1231 mutex_unlock(&ctx->mutex);
1232 put_ctx(ctx);
1233}
1234
1235
1236
1237
1238
1239
1240static __must_check struct perf_event_context *
1241unclone_ctx(struct perf_event_context *ctx)
1242{
1243 struct perf_event_context *parent_ctx = ctx->parent_ctx;
1244
1245 lockdep_assert_held(&ctx->lock);
1246
1247 if (parent_ctx)
1248 ctx->parent_ctx = NULL;
1249 ctx->generation++;
1250
1251 return parent_ctx;
1252}
1253
1254static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1255 enum pid_type type)
1256{
1257 u32 nr;
1258
1259
1260
1261 if (event->parent)
1262 event = event->parent;
1263
1264 nr = __task_pid_nr_ns(p, type, event->ns);
1265
1266 if (!nr && !pid_alive(p))
1267 nr = -1;
1268 return nr;
1269}
1270
1271static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1272{
1273 return perf_event_pid_type(event, p, __PIDTYPE_TGID);
1274}
1275
1276static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1277{
1278 return perf_event_pid_type(event, p, PIDTYPE_PID);
1279}
1280
1281
1282
1283
1284
1285static u64 primary_event_id(struct perf_event *event)
1286{
1287 u64 id = event->id;
1288
1289 if (event->parent)
1290 id = event->parent->id;
1291
1292 return id;
1293}
1294
1295
1296
1297
1298
1299
1300
1301static struct perf_event_context *
1302perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1303{
1304 struct perf_event_context *ctx;
1305
1306retry:
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316 local_irq_save(*flags);
1317 rcu_read_lock();
1318 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1319 if (ctx) {
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330 raw_spin_lock(&ctx->lock);
1331 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1332 raw_spin_unlock(&ctx->lock);
1333 rcu_read_unlock();
1334 local_irq_restore(*flags);
1335 goto retry;
1336 }
1337
1338 if (ctx->task == TASK_TOMBSTONE ||
1339 !atomic_inc_not_zero(&ctx->refcount)) {
1340 raw_spin_unlock(&ctx->lock);
1341 ctx = NULL;
1342 } else {
1343 WARN_ON_ONCE(ctx->task != task);
1344 }
1345 }
1346 rcu_read_unlock();
1347 if (!ctx)
1348 local_irq_restore(*flags);
1349 return ctx;
1350}
1351
1352
1353
1354
1355
1356
1357static struct perf_event_context *
1358perf_pin_task_context(struct task_struct *task, int ctxn)
1359{
1360 struct perf_event_context *ctx;
1361 unsigned long flags;
1362
1363 ctx = perf_lock_task_context(task, ctxn, &flags);
1364 if (ctx) {
1365 ++ctx->pin_count;
1366 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1367 }
1368 return ctx;
1369}
1370
1371static void perf_unpin_context(struct perf_event_context *ctx)
1372{
1373 unsigned long flags;
1374
1375 raw_spin_lock_irqsave(&ctx->lock, flags);
1376 --ctx->pin_count;
1377 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1378}
1379
1380
1381
1382
1383static void update_context_time(struct perf_event_context *ctx)
1384{
1385 u64 now = perf_clock();
1386
1387 ctx->time += now - ctx->timestamp;
1388 ctx->timestamp = now;
1389}
1390
1391static u64 perf_event_time(struct perf_event *event)
1392{
1393 struct perf_event_context *ctx = event->ctx;
1394
1395 if (is_cgroup_event(event))
1396 return perf_cgroup_event_time(event);
1397
1398 return ctx ? ctx->time : 0;
1399}
1400
1401
1402
1403
1404static void update_event_times(struct perf_event *event)
1405{
1406 struct perf_event_context *ctx = event->ctx;
1407 u64 run_end;
1408
1409 lockdep_assert_held(&ctx->lock);
1410
1411 if (event->state < PERF_EVENT_STATE_INACTIVE ||
1412 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1413 return;
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425 if (is_cgroup_event(event))
1426 run_end = perf_cgroup_event_time(event);
1427 else if (ctx->is_active)
1428 run_end = ctx->time;
1429 else
1430 run_end = event->tstamp_stopped;
1431
1432 event->total_time_enabled = run_end - event->tstamp_enabled;
1433
1434 if (event->state == PERF_EVENT_STATE_INACTIVE)
1435 run_end = event->tstamp_stopped;
1436 else
1437 run_end = perf_event_time(event);
1438
1439 event->total_time_running = run_end - event->tstamp_running;
1440
1441}
1442
1443
1444
1445
1446static void update_group_times(struct perf_event *leader)
1447{
1448 struct perf_event *event;
1449
1450 update_event_times(leader);
1451 list_for_each_entry(event, &leader->sibling_list, group_entry)
1452 update_event_times(event);
1453}
1454
1455static enum event_type_t get_event_type(struct perf_event *event)
1456{
1457 struct perf_event_context *ctx = event->ctx;
1458 enum event_type_t event_type;
1459
1460 lockdep_assert_held(&ctx->lock);
1461
1462
1463
1464
1465
1466 if (event->group_leader != event)
1467 event = event->group_leader;
1468
1469 event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1470 if (!ctx->task)
1471 event_type |= EVENT_CPU;
1472
1473 return event_type;
1474}
1475
1476static struct list_head *
1477ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1478{
1479 if (event->attr.pinned)
1480 return &ctx->pinned_groups;
1481 else
1482 return &ctx->flexible_groups;
1483}
1484
1485
1486
1487
1488
1489static void
1490list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1491{
1492 lockdep_assert_held(&ctx->lock);
1493
1494 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1495 event->attach_state |= PERF_ATTACH_CONTEXT;
1496
1497
1498
1499
1500
1501
1502 if (event->group_leader == event) {
1503 struct list_head *list;
1504
1505 event->group_caps = event->event_caps;
1506
1507 list = ctx_group_list(event, ctx);
1508 list_add_tail(&event->group_entry, list);
1509 }
1510
1511 list_update_cgroup_event(event, ctx, true);
1512
1513 list_add_rcu(&event->event_entry, &ctx->event_list);
1514 ctx->nr_events++;
1515 if (event->attr.inherit_stat)
1516 ctx->nr_stat++;
1517
1518 ctx->generation++;
1519}
1520
1521
1522
1523
1524static inline void perf_event__state_init(struct perf_event *event)
1525{
1526 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1527 PERF_EVENT_STATE_INACTIVE;
1528}
1529
1530static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1531{
1532 int entry = sizeof(u64);
1533 int size = 0;
1534 int nr = 1;
1535
1536 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1537 size += sizeof(u64);
1538
1539 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1540 size += sizeof(u64);
1541
1542 if (event->attr.read_format & PERF_FORMAT_ID)
1543 entry += sizeof(u64);
1544
1545 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1546 nr += nr_siblings;
1547 size += sizeof(u64);
1548 }
1549
1550 size += entry * nr;
1551 event->read_size = size;
1552}
1553
1554static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1555{
1556 struct perf_sample_data *data;
1557 u16 size = 0;
1558
1559 if (sample_type & PERF_SAMPLE_IP)
1560 size += sizeof(data->ip);
1561
1562 if (sample_type & PERF_SAMPLE_ADDR)
1563 size += sizeof(data->addr);
1564
1565 if (sample_type & PERF_SAMPLE_PERIOD)
1566 size += sizeof(data->period);
1567
1568 if (sample_type & PERF_SAMPLE_WEIGHT)
1569 size += sizeof(data->weight);
1570
1571 if (sample_type & PERF_SAMPLE_READ)
1572 size += event->read_size;
1573
1574 if (sample_type & PERF_SAMPLE_DATA_SRC)
1575 size += sizeof(data->data_src.val);
1576
1577 if (sample_type & PERF_SAMPLE_TRANSACTION)
1578 size += sizeof(data->txn);
1579
1580 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1581 size += sizeof(data->phys_addr);
1582
1583 event->header_size = size;
1584}
1585
1586
1587
1588
1589
1590static void perf_event__header_size(struct perf_event *event)
1591{
1592 __perf_event_read_size(event,
1593 event->group_leader->nr_siblings);
1594 __perf_event_header_size(event, event->attr.sample_type);
1595}
1596
1597static void perf_event__id_header_size(struct perf_event *event)
1598{
1599 struct perf_sample_data *data;
1600 u64 sample_type = event->attr.sample_type;
1601 u16 size = 0;
1602
1603 if (sample_type & PERF_SAMPLE_TID)
1604 size += sizeof(data->tid_entry);
1605
1606 if (sample_type & PERF_SAMPLE_TIME)
1607 size += sizeof(data->time);
1608
1609 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1610 size += sizeof(data->id);
1611
1612 if (sample_type & PERF_SAMPLE_ID)
1613 size += sizeof(data->id);
1614
1615 if (sample_type & PERF_SAMPLE_STREAM_ID)
1616 size += sizeof(data->stream_id);
1617
1618 if (sample_type & PERF_SAMPLE_CPU)
1619 size += sizeof(data->cpu_entry);
1620
1621 event->id_header_size = size;
1622}
1623
1624static bool perf_event_validate_size(struct perf_event *event)
1625{
1626
1627
1628
1629
1630 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1631 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1632 perf_event__id_header_size(event);
1633
1634
1635
1636
1637
1638 if (event->read_size + event->header_size +
1639 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1640 return false;
1641
1642 return true;
1643}
1644
1645static void perf_group_attach(struct perf_event *event)
1646{
1647 struct perf_event *group_leader = event->group_leader, *pos;
1648
1649 lockdep_assert_held(&event->ctx->lock);
1650
1651
1652
1653
1654 if (event->attach_state & PERF_ATTACH_GROUP)
1655 return;
1656
1657 event->attach_state |= PERF_ATTACH_GROUP;
1658
1659 if (group_leader == event)
1660 return;
1661
1662 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1663
1664 group_leader->group_caps &= event->event_caps;
1665
1666 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1667 group_leader->nr_siblings++;
1668
1669 perf_event__header_size(group_leader);
1670
1671 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1672 perf_event__header_size(pos);
1673}
1674
1675
1676
1677
1678
1679static void
1680list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1681{
1682 WARN_ON_ONCE(event->ctx != ctx);
1683 lockdep_assert_held(&ctx->lock);
1684
1685
1686
1687
1688 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1689 return;
1690
1691 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1692
1693 list_update_cgroup_event(event, ctx, false);
1694
1695 ctx->nr_events--;
1696 if (event->attr.inherit_stat)
1697 ctx->nr_stat--;
1698
1699 list_del_rcu(&event->event_entry);
1700
1701 if (event->group_leader == event)
1702 list_del_init(&event->group_entry);
1703
1704 update_group_times(event);
1705
1706
1707
1708
1709
1710
1711
1712
1713 if (event->state > PERF_EVENT_STATE_OFF)
1714 event->state = PERF_EVENT_STATE_OFF;
1715
1716 ctx->generation++;
1717}
1718
1719static void perf_group_detach(struct perf_event *event)
1720{
1721 struct perf_event *sibling, *tmp;
1722 struct list_head *list = NULL;
1723
1724 lockdep_assert_held(&event->ctx->lock);
1725
1726
1727
1728
1729 if (!(event->attach_state & PERF_ATTACH_GROUP))
1730 return;
1731
1732 event->attach_state &= ~PERF_ATTACH_GROUP;
1733
1734
1735
1736
1737 if (event->group_leader != event) {
1738 list_del_init(&event->group_entry);
1739 event->group_leader->nr_siblings--;
1740 goto out;
1741 }
1742
1743 if (!list_empty(&event->group_entry))
1744 list = &event->group_entry;
1745
1746
1747
1748
1749
1750
1751 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1752 if (list)
1753 list_move_tail(&sibling->group_entry, list);
1754 sibling->group_leader = sibling;
1755
1756
1757 sibling->group_caps = event->group_caps;
1758
1759 WARN_ON_ONCE(sibling->ctx != event->ctx);
1760 }
1761
1762out:
1763 perf_event__header_size(event->group_leader);
1764
1765 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1766 perf_event__header_size(tmp);
1767}
1768
1769static bool is_orphaned_event(struct perf_event *event)
1770{
1771 return event->state == PERF_EVENT_STATE_DEAD;
1772}
1773
1774static inline int __pmu_filter_match(struct perf_event *event)
1775{
1776 struct pmu *pmu = event->pmu;
1777 return pmu->filter_match ? pmu->filter_match(event) : 1;
1778}
1779
1780
1781
1782
1783
1784
1785
1786static inline int pmu_filter_match(struct perf_event *event)
1787{
1788 struct perf_event *child;
1789
1790 if (!__pmu_filter_match(event))
1791 return 0;
1792
1793 list_for_each_entry(child, &event->sibling_list, group_entry) {
1794 if (!__pmu_filter_match(child))
1795 return 0;
1796 }
1797
1798 return 1;
1799}
1800
1801static inline int
1802event_filter_match(struct perf_event *event)
1803{
1804 return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1805 perf_cgroup_match(event) && pmu_filter_match(event);
1806}
1807
1808static void
1809event_sched_out(struct perf_event *event,
1810 struct perf_cpu_context *cpuctx,
1811 struct perf_event_context *ctx)
1812{
1813 u64 tstamp = perf_event_time(event);
1814 u64 delta;
1815
1816 WARN_ON_ONCE(event->ctx != ctx);
1817 lockdep_assert_held(&ctx->lock);
1818
1819
1820
1821
1822
1823
1824
1825 if (event->state == PERF_EVENT_STATE_INACTIVE &&
1826 !event_filter_match(event)) {
1827 delta = tstamp - event->tstamp_stopped;
1828 event->tstamp_running += delta;
1829 event->tstamp_stopped = tstamp;
1830 }
1831
1832 if (event->state != PERF_EVENT_STATE_ACTIVE)
1833 return;
1834
1835 perf_pmu_disable(event->pmu);
1836
1837 event->tstamp_stopped = tstamp;
1838 event->pmu->del(event, 0);
1839 event->oncpu = -1;
1840 event->state = PERF_EVENT_STATE_INACTIVE;
1841 if (event->pending_disable) {
1842 event->pending_disable = 0;
1843 event->state = PERF_EVENT_STATE_OFF;
1844 }
1845
1846 if (!is_software_event(event))
1847 cpuctx->active_oncpu--;
1848 if (!--ctx->nr_active)
1849 perf_event_ctx_deactivate(ctx);
1850 if (event->attr.freq && event->attr.sample_freq)
1851 ctx->nr_freq--;
1852 if (event->attr.exclusive || !cpuctx->active_oncpu)
1853 cpuctx->exclusive = 0;
1854
1855 perf_pmu_enable(event->pmu);
1856}
1857
1858static void
1859group_sched_out(struct perf_event *group_event,
1860 struct perf_cpu_context *cpuctx,
1861 struct perf_event_context *ctx)
1862{
1863 struct perf_event *event;
1864 int state = group_event->state;
1865
1866 perf_pmu_disable(ctx->pmu);
1867
1868 event_sched_out(group_event, cpuctx, ctx);
1869
1870
1871
1872
1873 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1874 event_sched_out(event, cpuctx, ctx);
1875
1876 perf_pmu_enable(ctx->pmu);
1877
1878 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1879 cpuctx->exclusive = 0;
1880}
1881
1882#define DETACH_GROUP 0x01UL
1883
1884
1885
1886
1887
1888
1889
1890static void
1891__perf_remove_from_context(struct perf_event *event,
1892 struct perf_cpu_context *cpuctx,
1893 struct perf_event_context *ctx,
1894 void *info)
1895{
1896 unsigned long flags = (unsigned long)info;
1897
1898 event_sched_out(event, cpuctx, ctx);
1899 if (flags & DETACH_GROUP)
1900 perf_group_detach(event);
1901 list_del_event(event, ctx);
1902
1903 if (!ctx->nr_events && ctx->is_active) {
1904 ctx->is_active = 0;
1905 if (ctx->task) {
1906 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
1907 cpuctx->task_ctx = NULL;
1908 }
1909 }
1910}
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
1923{
1924 struct perf_event_context *ctx = event->ctx;
1925
1926 lockdep_assert_held(&ctx->mutex);
1927
1928 event_function_call(event, __perf_remove_from_context, (void *)flags);
1929
1930
1931
1932
1933
1934
1935
1936 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1937 if ((flags & DETACH_GROUP) &&
1938 (event->attach_state & PERF_ATTACH_GROUP)) {
1939
1940
1941
1942
1943 raw_spin_lock_irq(&ctx->lock);
1944 perf_group_detach(event);
1945 raw_spin_unlock_irq(&ctx->lock);
1946 }
1947}
1948
1949
1950
1951
1952static void __perf_event_disable(struct perf_event *event,
1953 struct perf_cpu_context *cpuctx,
1954 struct perf_event_context *ctx,
1955 void *info)
1956{
1957 if (event->state < PERF_EVENT_STATE_INACTIVE)
1958 return;
1959
1960 update_context_time(ctx);
1961 update_cgrp_time_from_event(event);
1962 update_group_times(event);
1963 if (event == event->group_leader)
1964 group_sched_out(event, cpuctx, ctx);
1965 else
1966 event_sched_out(event, cpuctx, ctx);
1967 event->state = PERF_EVENT_STATE_OFF;
1968}
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984static void _perf_event_disable(struct perf_event *event)
1985{
1986 struct perf_event_context *ctx = event->ctx;
1987
1988 raw_spin_lock_irq(&ctx->lock);
1989 if (event->state <= PERF_EVENT_STATE_OFF) {
1990 raw_spin_unlock_irq(&ctx->lock);
1991 return;
1992 }
1993 raw_spin_unlock_irq(&ctx->lock);
1994
1995 event_function_call(event, __perf_event_disable, NULL);
1996}
1997
1998void perf_event_disable_local(struct perf_event *event)
1999{
2000 event_function_local(event, __perf_event_disable, NULL);
2001}
2002
2003
2004
2005
2006
2007void perf_event_disable(struct perf_event *event)
2008{
2009 struct perf_event_context *ctx;
2010
2011 ctx = perf_event_ctx_lock(event);
2012 _perf_event_disable(event);
2013 perf_event_ctx_unlock(event, ctx);
2014}
2015EXPORT_SYMBOL_GPL(perf_event_disable);
2016
2017void perf_event_disable_inatomic(struct perf_event *event)
2018{
2019 event->pending_disable = 1;
2020 irq_work_queue(&event->pending);
2021}
2022
2023static void perf_set_shadow_time(struct perf_event *event,
2024 struct perf_event_context *ctx,
2025 u64 tstamp)
2026{
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052 if (is_cgroup_event(event))
2053 perf_cgroup_set_shadow_time(event, tstamp);
2054 else
2055 event->shadow_ctx_time = tstamp - ctx->timestamp;
2056}
2057
2058#define MAX_INTERRUPTS (~0ULL)
2059
2060static void perf_log_throttle(struct perf_event *event, int enable);
2061static void perf_log_itrace_start(struct perf_event *event);
2062
2063static int
2064event_sched_in(struct perf_event *event,
2065 struct perf_cpu_context *cpuctx,
2066 struct perf_event_context *ctx)
2067{
2068 u64 tstamp = perf_event_time(event);
2069 int ret = 0;
2070
2071 lockdep_assert_held(&ctx->lock);
2072
2073 if (event->state <= PERF_EVENT_STATE_OFF)
2074 return 0;
2075
2076 WRITE_ONCE(event->oncpu, smp_processor_id());
2077
2078
2079
2080
2081 smp_wmb();
2082 WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
2083
2084
2085
2086
2087
2088
2089 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2090 perf_log_throttle(event, 1);
2091 event->hw.interrupts = 0;
2092 }
2093
2094
2095
2096
2097 smp_wmb();
2098
2099 perf_pmu_disable(event->pmu);
2100
2101 perf_set_shadow_time(event, ctx, tstamp);
2102
2103 perf_log_itrace_start(event);
2104
2105 if (event->pmu->add(event, PERF_EF_START)) {
2106 event->state = PERF_EVENT_STATE_INACTIVE;
2107 event->oncpu = -1;
2108 ret = -EAGAIN;
2109 goto out;
2110 }
2111
2112 event->tstamp_running += tstamp - event->tstamp_stopped;
2113
2114 if (!is_software_event(event))
2115 cpuctx->active_oncpu++;
2116 if (!ctx->nr_active++)
2117 perf_event_ctx_activate(ctx);
2118 if (event->attr.freq && event->attr.sample_freq)
2119 ctx->nr_freq++;
2120
2121 if (event->attr.exclusive)
2122 cpuctx->exclusive = 1;
2123
2124out:
2125 perf_pmu_enable(event->pmu);
2126
2127 return ret;
2128}
2129
2130static int
2131group_sched_in(struct perf_event *group_event,
2132 struct perf_cpu_context *cpuctx,
2133 struct perf_event_context *ctx)
2134{
2135 struct perf_event *event, *partial_group = NULL;
2136 struct pmu *pmu = ctx->pmu;
2137 u64 now = ctx->time;
2138 bool simulate = false;
2139
2140 if (group_event->state == PERF_EVENT_STATE_OFF)
2141 return 0;
2142
2143 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2144
2145 if (event_sched_in(group_event, cpuctx, ctx)) {
2146 pmu->cancel_txn(pmu);
2147 perf_mux_hrtimer_restart(cpuctx);
2148 return -EAGAIN;
2149 }
2150
2151
2152
2153
2154 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2155 if (event_sched_in(event, cpuctx, ctx)) {
2156 partial_group = event;
2157 goto group_error;
2158 }
2159 }
2160
2161 if (!pmu->commit_txn(pmu))
2162 return 0;
2163
2164group_error:
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2180 if (event == partial_group)
2181 simulate = true;
2182
2183 if (simulate) {
2184 event->tstamp_running += now - event->tstamp_stopped;
2185 event->tstamp_stopped = now;
2186 } else {
2187 event_sched_out(event, cpuctx, ctx);
2188 }
2189 }
2190 event_sched_out(group_event, cpuctx, ctx);
2191
2192 pmu->cancel_txn(pmu);
2193
2194 perf_mux_hrtimer_restart(cpuctx);
2195
2196 return -EAGAIN;
2197}
2198
2199
2200
2201
2202static int group_can_go_on(struct perf_event *event,
2203 struct perf_cpu_context *cpuctx,
2204 int can_add_hw)
2205{
2206
2207
2208
2209 if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2210 return 1;
2211
2212
2213
2214
2215 if (cpuctx->exclusive)
2216 return 0;
2217
2218
2219
2220
2221 if (event->attr.exclusive && cpuctx->active_oncpu)
2222 return 0;
2223
2224
2225
2226
2227 return can_add_hw;
2228}
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248static void __perf_event_enable_time(struct perf_event *event, u64 now)
2249{
2250 WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
2251
2252 event->tstamp_stopped = now;
2253 event->tstamp_enabled = now - event->total_time_enabled;
2254 event->tstamp_running = now - event->total_time_running;
2255}
2256
2257static void add_event_to_ctx(struct perf_event *event,
2258 struct perf_event_context *ctx)
2259{
2260 u64 tstamp = perf_event_time(event);
2261
2262 list_add_event(event, ctx);
2263 perf_group_attach(event);
2264
2265
2266
2267
2268 if (event->state == PERF_EVENT_STATE_INACTIVE)
2269 __perf_event_enable_time(event, tstamp);
2270}
2271
2272static void ctx_sched_out(struct perf_event_context *ctx,
2273 struct perf_cpu_context *cpuctx,
2274 enum event_type_t event_type);
2275static void
2276ctx_sched_in(struct perf_event_context *ctx,
2277 struct perf_cpu_context *cpuctx,
2278 enum event_type_t event_type,
2279 struct task_struct *task);
2280
2281static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2282 struct perf_event_context *ctx,
2283 enum event_type_t event_type)
2284{
2285 if (!cpuctx->task_ctx)
2286 return;
2287
2288 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2289 return;
2290
2291 ctx_sched_out(ctx, cpuctx, event_type);
2292}
2293
2294static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2295 struct perf_event_context *ctx,
2296 struct task_struct *task)
2297{
2298 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2299 if (ctx)
2300 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2301 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2302 if (ctx)
2303 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2304}
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321static void ctx_resched(struct perf_cpu_context *cpuctx,
2322 struct perf_event_context *task_ctx,
2323 enum event_type_t event_type)
2324{
2325 enum event_type_t ctx_event_type = event_type & EVENT_ALL;
2326 bool cpu_event = !!(event_type & EVENT_CPU);
2327
2328
2329
2330
2331
2332 if (event_type & EVENT_PINNED)
2333 event_type |= EVENT_FLEXIBLE;
2334
2335 perf_pmu_disable(cpuctx->ctx.pmu);
2336 if (task_ctx)
2337 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2338
2339
2340
2341
2342
2343
2344
2345
2346 if (cpu_event)
2347 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2348 else if (ctx_event_type & EVENT_PINNED)
2349 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2350
2351 perf_event_sched_in(cpuctx, task_ctx, current);
2352 perf_pmu_enable(cpuctx->ctx.pmu);
2353}
2354
2355
2356
2357
2358
2359
2360
2361static int __perf_install_in_context(void *info)
2362{
2363 struct perf_event *event = info;
2364 struct perf_event_context *ctx = event->ctx;
2365 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2366 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2367 bool reprogram = true;
2368 int ret = 0;
2369
2370 raw_spin_lock(&cpuctx->ctx.lock);
2371 if (ctx->task) {
2372 raw_spin_lock(&ctx->lock);
2373 task_ctx = ctx;
2374
2375 reprogram = (ctx->task == current);
2376
2377
2378
2379
2380
2381
2382
2383
2384 if (task_curr(ctx->task) && !reprogram) {
2385 ret = -ESRCH;
2386 goto unlock;
2387 }
2388
2389 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2390 } else if (task_ctx) {
2391 raw_spin_lock(&task_ctx->lock);
2392 }
2393
2394 if (reprogram) {
2395 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2396 add_event_to_ctx(event, ctx);
2397 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2398 } else {
2399 add_event_to_ctx(event, ctx);
2400 }
2401
2402unlock:
2403 perf_ctx_unlock(cpuctx, task_ctx);
2404
2405 return ret;
2406}
2407
2408
2409
2410
2411
2412
2413static void
2414perf_install_in_context(struct perf_event_context *ctx,
2415 struct perf_event *event,
2416 int cpu)
2417{
2418 struct task_struct *task = READ_ONCE(ctx->task);
2419
2420 lockdep_assert_held(&ctx->mutex);
2421
2422 if (event->cpu != -1)
2423 event->cpu = cpu;
2424
2425
2426
2427
2428
2429 smp_store_release(&event->ctx, ctx);
2430
2431 if (!task) {
2432 cpu_function_call(cpu, __perf_install_in_context, event);
2433 return;
2434 }
2435
2436
2437
2438
2439 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2440 return;
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472 smp_mb();
2473again:
2474 if (!task_function_call(task, __perf_install_in_context, event))
2475 return;
2476
2477 raw_spin_lock_irq(&ctx->lock);
2478 task = ctx->task;
2479 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2480
2481
2482
2483
2484
2485 raw_spin_unlock_irq(&ctx->lock);
2486 return;
2487 }
2488
2489
2490
2491
2492 if (task_curr(task)) {
2493 raw_spin_unlock_irq(&ctx->lock);
2494 goto again;
2495 }
2496 add_event_to_ctx(event, ctx);
2497 raw_spin_unlock_irq(&ctx->lock);
2498}
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508static void __perf_event_mark_enabled(struct perf_event *event)
2509{
2510 struct perf_event *sub;
2511 u64 tstamp = perf_event_time(event);
2512
2513 event->state = PERF_EVENT_STATE_INACTIVE;
2514 __perf_event_enable_time(event, tstamp);
2515 list_for_each_entry(sub, &event->sibling_list, group_entry) {
2516
2517 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2518 __perf_event_enable_time(sub, tstamp);
2519 }
2520}
2521
2522
2523
2524
2525static void __perf_event_enable(struct perf_event *event,
2526 struct perf_cpu_context *cpuctx,
2527 struct perf_event_context *ctx,
2528 void *info)
2529{
2530 struct perf_event *leader = event->group_leader;
2531 struct perf_event_context *task_ctx;
2532
2533 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2534 event->state <= PERF_EVENT_STATE_ERROR)
2535 return;
2536
2537 if (ctx->is_active)
2538 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2539
2540 __perf_event_mark_enabled(event);
2541
2542 if (!ctx->is_active)
2543 return;
2544
2545 if (!event_filter_match(event)) {
2546 if (is_cgroup_event(event))
2547 perf_cgroup_defer_enabled(event);
2548 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2549 return;
2550 }
2551
2552
2553
2554
2555
2556 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2557 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2558 return;
2559 }
2560
2561 task_ctx = cpuctx->task_ctx;
2562 if (ctx->task)
2563 WARN_ON_ONCE(task_ctx != ctx);
2564
2565 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2566}
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577static void _perf_event_enable(struct perf_event *event)
2578{
2579 struct perf_event_context *ctx = event->ctx;
2580
2581 raw_spin_lock_irq(&ctx->lock);
2582 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2583 event->state < PERF_EVENT_STATE_ERROR) {
2584 raw_spin_unlock_irq(&ctx->lock);
2585 return;
2586 }
2587
2588
2589
2590
2591
2592
2593
2594
2595 if (event->state == PERF_EVENT_STATE_ERROR)
2596 event->state = PERF_EVENT_STATE_OFF;
2597 raw_spin_unlock_irq(&ctx->lock);
2598
2599 event_function_call(event, __perf_event_enable, NULL);
2600}
2601
2602
2603
2604
2605void perf_event_enable(struct perf_event *event)
2606{
2607 struct perf_event_context *ctx;
2608
2609 ctx = perf_event_ctx_lock(event);
2610 _perf_event_enable(event);
2611 perf_event_ctx_unlock(event, ctx);
2612}
2613EXPORT_SYMBOL_GPL(perf_event_enable);
2614
2615struct stop_event_data {
2616 struct perf_event *event;
2617 unsigned int restart;
2618};
2619
2620static int __perf_event_stop(void *info)
2621{
2622 struct stop_event_data *sd = info;
2623 struct perf_event *event = sd->event;
2624
2625
2626 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2627 return 0;
2628
2629
2630 smp_rmb();
2631
2632
2633
2634
2635
2636 if (READ_ONCE(event->oncpu) != smp_processor_id())
2637 return -EAGAIN;
2638
2639 event->pmu->stop(event, PERF_EF_UPDATE);
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650 if (sd->restart)
2651 event->pmu->start(event, 0);
2652
2653 return 0;
2654}
2655
2656static int perf_event_stop(struct perf_event *event, int restart)
2657{
2658 struct stop_event_data sd = {
2659 .event = event,
2660 .restart = restart,
2661 };
2662 int ret = 0;
2663
2664 do {
2665 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2666 return 0;
2667
2668
2669 smp_rmb();
2670
2671
2672
2673
2674
2675
2676 ret = cpu_function_call(READ_ONCE(event->oncpu),
2677 __perf_event_stop, &sd);
2678 } while (ret == -EAGAIN);
2679
2680 return ret;
2681}
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705void perf_event_addr_filters_sync(struct perf_event *event)
2706{
2707 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2708
2709 if (!has_addr_filter(event))
2710 return;
2711
2712 raw_spin_lock(&ifh->lock);
2713 if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2714 event->pmu->addr_filters_sync(event);
2715 event->hw.addr_filters_gen = event->addr_filters_gen;
2716 }
2717 raw_spin_unlock(&ifh->lock);
2718}
2719EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2720
2721static int _perf_event_refresh(struct perf_event *event, int refresh)
2722{
2723
2724
2725
2726 if (event->attr.inherit || !is_sampling_event(event))
2727 return -EINVAL;
2728
2729 atomic_add(refresh, &event->event_limit);
2730 _perf_event_enable(event);
2731
2732 return 0;
2733}
2734
2735
2736
2737
2738int perf_event_refresh(struct perf_event *event, int refresh)
2739{
2740 struct perf_event_context *ctx;
2741 int ret;
2742
2743 ctx = perf_event_ctx_lock(event);
2744 ret = _perf_event_refresh(event, refresh);
2745 perf_event_ctx_unlock(event, ctx);
2746
2747 return ret;
2748}
2749EXPORT_SYMBOL_GPL(perf_event_refresh);
2750
2751static void ctx_sched_out(struct perf_event_context *ctx,
2752 struct perf_cpu_context *cpuctx,
2753 enum event_type_t event_type)
2754{
2755 int is_active = ctx->is_active;
2756 struct perf_event *event;
2757
2758 lockdep_assert_held(&ctx->lock);
2759
2760 if (likely(!ctx->nr_events)) {
2761
2762
2763
2764 WARN_ON_ONCE(ctx->is_active);
2765 if (ctx->task)
2766 WARN_ON_ONCE(cpuctx->task_ctx);
2767 return;
2768 }
2769
2770 ctx->is_active &= ~event_type;
2771 if (!(ctx->is_active & EVENT_ALL))
2772 ctx->is_active = 0;
2773
2774 if (ctx->task) {
2775 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2776 if (!ctx->is_active)
2777 cpuctx->task_ctx = NULL;
2778 }
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790 if (is_active & EVENT_TIME) {
2791
2792 update_context_time(ctx);
2793 update_cgrp_time_from_cpuctx(cpuctx);
2794 }
2795
2796 is_active ^= ctx->is_active;
2797
2798 if (!ctx->nr_active || !(is_active & EVENT_ALL))
2799 return;
2800
2801 perf_pmu_disable(ctx->pmu);
2802 if (is_active & EVENT_PINNED) {
2803 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2804 group_sched_out(event, cpuctx, ctx);
2805 }
2806
2807 if (is_active & EVENT_FLEXIBLE) {
2808 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2809 group_sched_out(event, cpuctx, ctx);
2810 }
2811 perf_pmu_enable(ctx->pmu);
2812}
2813
2814
2815
2816
2817
2818
2819
2820
2821
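/*
 * Test whether two contexts are equivalent, i.e. whether they were
 * cloned from the same source and have not been modified since (as
 * tracked by their generation counters).  Pinned contexts are never
 * considered equivalent.
 */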
2822static int context_equiv(struct perf_event_context *ctx1,
2823 struct perf_event_context *ctx2)
2824{
2825 lockdep_assert_held(&ctx1->lock);
2826 lockdep_assert_held(&ctx2->lock);
2827
2828
2829 if (ctx1->pin_count || ctx2->pin_count)
2830 return 0;
2831
2832
2833 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2834 return 1;
2835
2836
2837 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2838 return 1;
2839
2840
2841
2842
2843
2844 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2845 ctx1->parent_gen == ctx2->parent_gen)
2846 return 1;
2847
2848
2849 return 0;
2850}
2851
2852static void __perf_event_sync_stat(struct perf_event *event,
2853 struct perf_event *next_event)
2854{
2855 u64 value;
2856
2857 if (!event->attr.inherit_stat)
2858 return;
2859
2860
2861
2862
2863
2864
2865
2866
2867 switch (event->state) {
2868 case PERF_EVENT_STATE_ACTIVE:
2869 event->pmu->read(event);
2870
2871
2872 case PERF_EVENT_STATE_INACTIVE:
2873 update_event_times(event);
2874 break;
2875
2876 default:
2877 break;
2878 }
2879
2880
2881
2882
2883
2884 value = local64_read(&next_event->count);
2885 value = local64_xchg(&event->count, value);
2886 local64_set(&next_event->count, value);
2887
2888 swap(event->total_time_enabled, next_event->total_time_enabled);
2889 swap(event->total_time_running, next_event->total_time_running);
2890
2891
2892
2893
2894 perf_event_update_userpage(event);
2895 perf_event_update_userpage(next_event);
2896}
2897
2898static void perf_event_sync_stat(struct perf_event_context *ctx,
2899 struct perf_event_context *next_ctx)
2900{
2901 struct perf_event *event, *next_event;
2902
2903 if (!ctx->nr_stat)
2904 return;
2905
2906 update_context_time(ctx);
2907
2908 event = list_first_entry(&ctx->event_list,
2909 struct perf_event, event_entry);
2910
2911 next_event = list_first_entry(&next_ctx->event_list,
2912 struct perf_event, event_entry);
2913
2914 while (&event->event_entry != &ctx->event_list &&
2915 &next_event->event_entry != &next_ctx->event_list) {
2916
2917 __perf_event_sync_stat(event, next_event);
2918
2919 event = list_next_entry(event, event_entry);
2920 next_event = list_next_entry(next_event, event_entry);
2921 }
2922}
2923
2924static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2925 struct task_struct *next)
2926{
2927 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2928 struct perf_event_context *next_ctx;
2929 struct perf_event_context *parent, *next_parent;
2930 struct perf_cpu_context *cpuctx;
2931 int do_switch = 1;
2932
2933 if (likely(!ctx))
2934 return;
2935
2936 cpuctx = __get_cpu_context(ctx);
2937 if (!cpuctx->task_ctx)
2938 return;
2939
2940 rcu_read_lock();
2941 next_ctx = next->perf_event_ctxp[ctxn];
2942 if (!next_ctx)
2943 goto unlock;
2944
2945 parent = rcu_dereference(ctx->parent_ctx);
2946 next_parent = rcu_dereference(next_ctx->parent_ctx);
2947
2948
2949 if (!parent && !next_parent)
2950 goto unlock;
2951
2952 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962 raw_spin_lock(&ctx->lock);
2963 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2964 if (context_equiv(ctx, next_ctx)) {
2965 WRITE_ONCE(ctx->task, next);
2966 WRITE_ONCE(next_ctx->task, task);
2967
2968 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2969
2970
2971
2972
2973
2974
2975
2976
2977 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
2978 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
2979
2980 do_switch = 0;
2981
2982 perf_event_sync_stat(ctx, next_ctx);
2983 }
2984 raw_spin_unlock(&next_ctx->lock);
2985 raw_spin_unlock(&ctx->lock);
2986 }
2987unlock:
2988 rcu_read_unlock();
2989
2990 if (do_switch) {
2991 raw_spin_lock(&ctx->lock);
2992 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
2993 raw_spin_unlock(&ctx->lock);
2994 }
2995}
2996
2997static DEFINE_PER_CPU(struct list_head, sched_cb_list);
2998
2999void perf_sched_cb_dec(struct pmu *pmu)
3000{
3001 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3002
3003 this_cpu_dec(perf_sched_cb_usages);
3004
3005 if (!--cpuctx->sched_cb_usage)
3006 list_del(&cpuctx->sched_cb_entry);
3007}
3008
3009
3010void perf_sched_cb_inc(struct pmu *pmu)
3011{
3012 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3013
3014 if (!cpuctx->sched_cb_usage++)
3015 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3016
3017 this_cpu_inc(perf_sched_cb_usages);
3018}
3019
3020
3021
3022
3023
3024
3025
3026
3027
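/*
 * Invoke the ->sched_task() callback of every PMU that registered
 * interest via perf_sched_cb_inc(), with that PMU disabled and the
 * relevant contexts locked.  Nothing to do when switching to the same
 * task.
 */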
3028static void perf_pmu_sched_task(struct task_struct *prev,
3029 struct task_struct *next,
3030 bool sched_in)
3031{
3032 struct perf_cpu_context *cpuctx;
3033 struct pmu *pmu;
3034
3035 if (prev == next)
3036 return;
3037
3038 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3039 pmu = cpuctx->ctx.pmu;
3040
3041 if (WARN_ON_ONCE(!pmu->sched_task))
3042 continue;
3043
3044 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3045 perf_pmu_disable(pmu);
3046
3047 pmu->sched_task(cpuctx->task_ctx, sched_in);
3048
3049 perf_pmu_enable(pmu);
3050 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3051 }
3052}
3053
3054static void perf_event_switch(struct task_struct *task,
3055 struct task_struct *next_prev, bool sched_in);
3056
3057#define for_each_task_context_nr(ctxn) \
3058 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
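/*
 * Called from the scheduler, with interrupts disabled, when @task is
 * switched out in favour of @next: run the PMU sched_task callbacks,
 * emit context-switch records if requested, schedule out the task's
 * event contexts and, when cgroup events are in use, switch cgroup time
 * accounting as well.
 */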
3071void __perf_event_task_sched_out(struct task_struct *task,
3072 struct task_struct *next)
3073{
3074 int ctxn;
3075
3076 if (__this_cpu_read(perf_sched_cb_usages))
3077 perf_pmu_sched_task(task, next, false);
3078
3079 if (atomic_read(&nr_switch_events))
3080 perf_event_switch(task, next, false);
3081
3082 for_each_task_context_nr(ctxn)
3083 perf_event_context_sched_out(task, ctxn, next);
3084
3085
3086
3087
3088
3089
3090 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3091 perf_cgroup_sched_out(task, next);
3092}
3093
3094
3095
3096
3097static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3098 enum event_type_t event_type)
3099{
3100 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3101}
3102
3103static void
3104ctx_pinned_sched_in(struct perf_event_context *ctx,
3105 struct perf_cpu_context *cpuctx)
3106{
3107 struct perf_event *event;
3108
3109 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
3110 if (event->state <= PERF_EVENT_STATE_OFF)
3111 continue;
3112 if (!event_filter_match(event))
3113 continue;
3114
3115
3116 if (is_cgroup_event(event))
3117 perf_cgroup_mark_enabled(event, ctx);
3118
3119 if (group_can_go_on(event, cpuctx, 1))
3120 group_sched_in(event, cpuctx, ctx);
3121
3122
3123
3124
3125
3126 if (event->state == PERF_EVENT_STATE_INACTIVE) {
3127 update_group_times(event);
3128 event->state = PERF_EVENT_STATE_ERROR;
3129 }
3130 }
3131}
3132
3133static void
3134ctx_flexible_sched_in(struct perf_event_context *ctx,
3135 struct perf_cpu_context *cpuctx)
3136{
3137 struct perf_event *event;
3138 int can_add_hw = 1;
3139
3140 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
3141
3142 if (event->state <= PERF_EVENT_STATE_OFF)
3143 continue;
3144
3145
3146
3147
3148 if (!event_filter_match(event))
3149 continue;
3150
3151
3152 if (is_cgroup_event(event))
3153 perf_cgroup_mark_enabled(event, ctx);
3154
3155 if (group_can_go_on(event, cpuctx, can_add_hw)) {
3156 if (group_sched_in(event, cpuctx, ctx))
3157 can_add_hw = 0;
3158 }
3159 }
3160}
3161
3162static void
3163ctx_sched_in(struct perf_event_context *ctx,
3164 struct perf_cpu_context *cpuctx,
3165 enum event_type_t event_type,
3166 struct task_struct *task)
3167{
3168 int is_active = ctx->is_active;
3169 u64 now;
3170
3171 lockdep_assert_held(&ctx->lock);
3172
3173 if (likely(!ctx->nr_events))
3174 return;
3175
3176 ctx->is_active |= (event_type | EVENT_TIME);
3177 if (ctx->task) {
3178 if (!is_active)
3179 cpuctx->task_ctx = ctx;
3180 else
3181 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3182 }
3183
3184 is_active ^= ctx->is_active;
3185
3186 if (is_active & EVENT_TIME) {
3187
3188 now = perf_clock();
3189 ctx->timestamp = now;
3190 perf_cgroup_set_timestamp(task, ctx);
3191 }
3192
3193
3194
3195
3196
3197 if (is_active & EVENT_PINNED)
3198 ctx_pinned_sched_in(ctx, cpuctx);
3199
3200
3201 if (is_active & EVENT_FLEXIBLE)
3202 ctx_flexible_sched_in(ctx, cpuctx);
3203}
3204
3205static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3206 enum event_type_t event_type,
3207 struct task_struct *task)
3208{
3209 struct perf_event_context *ctx = &cpuctx->ctx;
3210
3211 ctx_sched_in(ctx, cpuctx, event_type, task);
3212}
3213
3214static void perf_event_context_sched_in(struct perf_event_context *ctx,
3215 struct task_struct *task)
3216{
3217 struct perf_cpu_context *cpuctx;
3218
3219 cpuctx = __get_cpu_context(ctx);
3220 if (cpuctx->task_ctx == ctx)
3221 return;
3222
3223 perf_ctx_lock(cpuctx, ctx);
3224
3225
3226
3227
3228 if (!ctx->nr_events)
3229 goto unlock;
3230
3231 perf_pmu_disable(ctx->pmu);
3232
3233
3234
3235
3236
3237
3238
3239
3240 if (!list_empty(&ctx->pinned_groups))
3241 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3242 perf_event_sched_in(cpuctx, ctx, task);
3243 perf_pmu_enable(ctx->pmu);
3244
3245unlock:
3246 perf_ctx_unlock(cpuctx, ctx);
3247}
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
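/*
 * Called from the scheduler, with interrupts disabled, when @task is
 * switched in: the mirror image of __perf_event_task_sched_out(),
 * scheduling the task's event contexts back in and re-running the PMU
 * sched_task callbacks.
 */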
3260void __perf_event_task_sched_in(struct task_struct *prev,
3261 struct task_struct *task)
3262{
3263 struct perf_event_context *ctx;
3264 int ctxn;
3265
3266
3267
3268
3269
3270
3271
3272
3273 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3274 perf_cgroup_sched_in(prev, task);
3275
3276 for_each_task_context_nr(ctxn) {
3277 ctx = task->perf_event_ctxp[ctxn];
3278 if (likely(!ctx))
3279 continue;
3280
3281 perf_event_context_sched_in(ctx, task);
3282 }
3283
3284 if (atomic_read(&nr_switch_events))
3285 perf_event_switch(task, prev, true);
3286
3287 if (__this_cpu_read(perf_sched_cb_usages))
3288 perf_pmu_sched_task(prev, task, true);
3289}
3290
3291static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3292{
3293 u64 frequency = event->attr.sample_freq;
3294 u64 sec = NSEC_PER_SEC;
3295 u64 divisor, dividend;
3296
3297 int count_fls, nsec_fls, frequency_fls, sec_fls;
3298
3299 count_fls = fls64(count);
3300 nsec_fls = fls64(nsec);
3301 frequency_fls = fls64(frequency);
3302 sec_fls = 30;
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318#define REDUCE_FLS(a, b) \
3319do { \
3320 if (a##_fls > b##_fls) { \
3321 a >>= 1; \
3322 a##_fls--; \
3323 } else { \
3324 b >>= 1; \
3325 b##_fls--; \
3326 } \
3327} while (0)
3328
3329
3330
3331
3332
3333 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3334 REDUCE_FLS(nsec, frequency);
3335 REDUCE_FLS(sec, count);
3336 }
3337
3338 if (count_fls + sec_fls > 64) {
3339 divisor = nsec * frequency;
3340
3341 while (count_fls + sec_fls > 64) {
3342 REDUCE_FLS(count, sec);
3343 divisor >>= 1;
3344 }
3345
3346 dividend = count * sec;
3347 } else {
3348 dividend = count * sec;
3349
3350 while (nsec_fls + frequency_fls > 64) {
3351 REDUCE_FLS(nsec, frequency);
3352 dividend >>= 1;
3353 }
3354
3355 divisor = nsec * frequency;
3356 }
3357
3358 if (!divisor)
3359 return dividend;
3360
3361 return div64_u64(dividend, divisor);
3362}
3363
3364static DEFINE_PER_CPU(int, perf_throttled_count);
3365static DEFINE_PER_CPU(u64, perf_throttled_seq);
3366
3367static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3368{
3369 struct hw_perf_event *hwc = &event->hw;
3370 s64 period, sample_period;
3371 s64 delta;
3372
3373 period = perf_calculate_period(event, nsec, count);
3374
3375 delta = (s64)(period - hwc->sample_period);
3376 delta = (delta + 7) / 8;
3377
3378 sample_period = hwc->sample_period + delta;
3379
3380 if (!sample_period)
3381 sample_period = 1;
3382
3383 hwc->sample_period = sample_period;
3384
3385 if (local64_read(&hwc->period_left) > 8*sample_period) {
3386 if (disable)
3387 event->pmu->stop(event, PERF_EF_UPDATE);
3388
3389 local64_set(&hwc->period_left, 0);
3390
3391 if (disable)
3392 event->pmu->start(event, PERF_EF_RELOAD);
3393 }
3394}
3395
3396
3397
3398
3399
3400
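/*
 * Combine frequency adjustment and unthrottling into a single pass over
 * the context's event list, called from the timer tick: restart events
 * that hit MAX_INTERRUPTS, and recompute the sample period of freq-based
 * events from the counts accumulated since the previous tick.
 */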
3401static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3402 int needs_unthr)
3403{
3404 struct perf_event *event;
3405 struct hw_perf_event *hwc;
3406 u64 now, period = TICK_NSEC;
3407 s64 delta;
3408
3409
3410
3411
3412
3413
3414 if (!(ctx->nr_freq || needs_unthr))
3415 return;
3416
3417 raw_spin_lock(&ctx->lock);
3418 perf_pmu_disable(ctx->pmu);
3419
3420 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3421 if (event->state != PERF_EVENT_STATE_ACTIVE)
3422 continue;
3423
3424 if (!event_filter_match(event))
3425 continue;
3426
3427 perf_pmu_disable(event->pmu);
3428
3429 hwc = &event->hw;
3430
3431 if (hwc->interrupts == MAX_INTERRUPTS) {
3432 hwc->interrupts = 0;
3433 perf_log_throttle(event, 1);
3434 event->pmu->start(event, 0);
3435 }
3436
3437 if (!event->attr.freq || !event->attr.sample_freq)
3438 goto next;
3439
3440
3441
3442
3443 event->pmu->stop(event, PERF_EF_UPDATE);
3444
3445 now = local64_read(&event->count);
3446 delta = now - hwc->freq_count_stamp;
3447 hwc->freq_count_stamp = now;
3448
3449
3450
3451
3452
3453
3454
3455
3456 if (delta > 0)
3457 perf_adjust_period(event, period, delta, false);
3458
3459 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3460 next:
3461 perf_pmu_enable(event->pmu);
3462 }
3463
3464 perf_pmu_enable(ctx->pmu);
3465 raw_spin_unlock(&ctx->lock);
3466}
3467
3468
3469
3470
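/*
 * Round-robin the flexible groups so that, over time, every group gets a
 * chance at the hardware when there are more events than counters.
 * Rotation can be temporarily suppressed via ctx->rotate_disable.
 */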
3471static void rotate_ctx(struct perf_event_context *ctx)
3472{
3473
3474
3475
3476
3477 if (!ctx->rotate_disable)
3478 list_rotate_left(&ctx->flexible_groups);
3479}
3480
3481static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3482{
3483 struct perf_event_context *ctx = NULL;
3484 int rotate = 0;
3485
3486 if (cpuctx->ctx.nr_events) {
3487 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3488 rotate = 1;
3489 }
3490
3491 ctx = cpuctx->task_ctx;
3492 if (ctx && ctx->nr_events) {
3493 if (ctx->nr_events != ctx->nr_active)
3494 rotate = 1;
3495 }
3496
3497 if (!rotate)
3498 goto done;
3499
3500 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3501 perf_pmu_disable(cpuctx->ctx.pmu);
3502
3503 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3504 if (ctx)
3505 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3506
3507 rotate_ctx(&cpuctx->ctx);
3508 if (ctx)
3509 rotate_ctx(ctx);
3510
3511 perf_event_sched_in(cpuctx, ctx, current);
3512
3513 perf_pmu_enable(cpuctx->ctx.pmu);
3514 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3515done:
3516
3517 return rotate;
3518}
3519
3520void perf_event_task_tick(void)
3521{
3522 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3523 struct perf_event_context *ctx, *tmp;
3524 int throttled;
3525
3526 WARN_ON(!irqs_disabled());
3527
3528 __this_cpu_inc(perf_throttled_seq);
3529 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3530 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3531
3532 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3533 perf_adjust_freq_unthr_context(ctx, throttled);
3534}
3535
3536static int event_enable_on_exec(struct perf_event *event,
3537 struct perf_event_context *ctx)
3538{
3539 if (!event->attr.enable_on_exec)
3540 return 0;
3541
3542 event->attr.enable_on_exec = 0;
3543 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3544 return 0;
3545
3546 __perf_event_mark_enabled(event);
3547
3548 return 1;
3549}
3550
3551
3552
3553
3554
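/*
 * Enable all events in the current task's context @ctxn that were
 * created with attr.enable_on_exec; used on the exec() path.  If
 * anything was enabled, the context is uncloned and rescheduled so the
 * newly enabled events start counting.
 */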
3555static void perf_event_enable_on_exec(int ctxn)
3556{
3557 struct perf_event_context *ctx, *clone_ctx = NULL;
3558 enum event_type_t event_type = 0;
3559 struct perf_cpu_context *cpuctx;
3560 struct perf_event *event;
3561 unsigned long flags;
3562 int enabled = 0;
3563
3564 local_irq_save(flags);
3565 ctx = current->perf_event_ctxp[ctxn];
3566 if (!ctx || !ctx->nr_events)
3567 goto out;
3568
3569 cpuctx = __get_cpu_context(ctx);
3570 perf_ctx_lock(cpuctx, ctx);
3571 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3572 list_for_each_entry(event, &ctx->event_list, event_entry) {
3573 enabled |= event_enable_on_exec(event, ctx);
3574 event_type |= get_event_type(event);
3575 }
3576
3577
3578
3579
3580 if (enabled) {
3581 clone_ctx = unclone_ctx(ctx);
3582 ctx_resched(cpuctx, ctx, event_type);
3583 } else {
3584 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
3585 }
3586 perf_ctx_unlock(cpuctx, ctx);
3587
3588out:
3589 local_irq_restore(flags);
3590
3591 if (clone_ctx)
3592 put_ctx(clone_ctx);
3593}
3594
3595struct perf_read_data {
3596 struct perf_event *event;
3597 bool group;
3598 int ret;
3599};
3600
3601static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3602{
3603 u16 local_pkg, event_pkg;
3604
3605 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3606 int local_cpu = smp_processor_id();
3607
3608 event_pkg = topology_physical_package_id(event_cpu);
3609 local_pkg = topology_physical_package_id(local_cpu);
3610
3611 if (event_pkg == local_pkg)
3612 return local_cpu;
3613 }
3614
3615 return event_cpu;
3616}
3617
3618
3619
3620
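/*
 * Cross-CPU call to read the hardware event: runs on the CPU the event
 * is active on.  For group reads, the PMU's read transaction is used so
 * the whole group is read as one unit.
 */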
3621static void __perf_event_read(void *info)
3622{
3623 struct perf_read_data *data = info;
3624 struct perf_event *sub, *event = data->event;
3625 struct perf_event_context *ctx = event->ctx;
3626 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3627 struct pmu *pmu = event->pmu;
3628
3629
3630
3631
3632
3633
3634
3635
3636 if (ctx->task && cpuctx->task_ctx != ctx)
3637 return;
3638
3639 raw_spin_lock(&ctx->lock);
3640 if (ctx->is_active) {
3641 update_context_time(ctx);
3642 update_cgrp_time_from_event(event);
3643 }
3644
3645 update_event_times(event);
3646 if (event->state != PERF_EVENT_STATE_ACTIVE)
3647 goto unlock;
3648
3649 if (!data->group) {
3650 pmu->read(event);
3651 data->ret = 0;
3652 goto unlock;
3653 }
3654
3655 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3656
3657 pmu->read(event);
3658
3659 list_for_each_entry(sub, &event->sibling_list, group_entry) {
3660 update_event_times(sub);
3661 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3662
3663
3664
3665
3666 sub->pmu->read(sub);
3667 }
3668 }
3669
3670 data->ret = pmu->commit_txn(pmu);
3671
3672unlock:
3673 raw_spin_unlock(&ctx->lock);
3674}
3675
3676static inline u64 perf_event_count(struct perf_event *event)
3677{
3678 return local64_read(&event->count) + atomic64_read(&event->child_count);
3679}
3680
3681
3682
3683
3684
3685
3686
3687
3688
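/*
 * Lockless read of an event's current count: IRQs are disabled around
 * the read, inherited events are rejected, and the event must either
 * target the current task or be bound to the current CPU.  The PMU is
 * only re-read when the event is running on this CPU right now.
 */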
3689int perf_event_read_local(struct perf_event *event, u64 *value)
3690{
3691 unsigned long flags;
3692 int ret = 0;
3693
3694
3695
3696
3697
3698 local_irq_save(flags);
3699
3700
3701
3702
3703
3704 if (event->attr.inherit) {
3705 ret = -EOPNOTSUPP;
3706 goto out;
3707 }
3708
3709
3710 if ((event->attach_state & PERF_ATTACH_TASK) &&
3711 event->hw.target != current) {
3712 ret = -EINVAL;
3713 goto out;
3714 }
3715
3716
3717 if (!(event->attach_state & PERF_ATTACH_TASK) &&
3718 event->cpu != smp_processor_id()) {
3719 ret = -EINVAL;
3720 goto out;
3721 }
3722
3723
3724
3725
3726
3727
3728 if (event->oncpu == smp_processor_id())
3729 event->pmu->read(event);
3730
3731 *value = local64_read(&event->count);
3732out:
3733 local_irq_restore(flags);
3734
3735 return ret;
3736}
3737
3738static int perf_event_read(struct perf_event *event, bool group)
3739{
3740 int event_cpu, ret = 0;
3741
3742
3743
3744
3745
3746 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3747 struct perf_read_data data = {
3748 .event = event,
3749 .group = group,
3750 .ret = 0,
3751 };
3752
3753 event_cpu = READ_ONCE(event->oncpu);
3754 if ((unsigned)event_cpu >= nr_cpu_ids)
3755 return 0;
3756
3757 preempt_disable();
3758 event_cpu = __perf_event_read_cpu(event, event_cpu);
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
3771 preempt_enable();
3772 ret = data.ret;
3773 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3774 struct perf_event_context *ctx = event->ctx;
3775 unsigned long flags;
3776
3777 raw_spin_lock_irqsave(&ctx->lock, flags);
3778
3779
3780
3781
3782
3783 if (ctx->is_active) {
3784 update_context_time(ctx);
3785 update_cgrp_time_from_event(event);
3786 }
3787 if (group)
3788 update_group_times(event);
3789 else
3790 update_event_times(event);
3791 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3792 }
3793
3794 return ret;
3795}
3796
3797
3798
3799
3800static void __perf_event_init_context(struct perf_event_context *ctx)
3801{
3802 raw_spin_lock_init(&ctx->lock);
3803 mutex_init(&ctx->mutex);
3804 INIT_LIST_HEAD(&ctx->active_ctx_list);
3805 INIT_LIST_HEAD(&ctx->pinned_groups);
3806 INIT_LIST_HEAD(&ctx->flexible_groups);
3807 INIT_LIST_HEAD(&ctx->event_list);
3808 atomic_set(&ctx->refcount, 1);
3809}
3810
3811static struct perf_event_context *
3812alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3813{
3814 struct perf_event_context *ctx;
3815
3816 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3817 if (!ctx)
3818 return NULL;
3819
3820 __perf_event_init_context(ctx);
3821 if (task) {
3822 ctx->task = task;
3823 get_task_struct(task);
3824 }
3825 ctx->pmu = pmu;
3826
3827 return ctx;
3828}
3829
3830static struct task_struct *
3831find_lively_task_by_vpid(pid_t vpid)
3832{
3833 struct task_struct *task;
3834
3835 rcu_read_lock();
3836 if (!vpid)
3837 task = current;
3838 else
3839 task = find_task_by_vpid(vpid);
3840 if (task)
3841 get_task_struct(task);
3842 rcu_read_unlock();
3843
3844 if (!task)
3845 return ERR_PTR(-ESRCH);
3846
3847 return task;
3848}
3849
3850
3851
3852
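/*
 * Find (or allocate) the perf_event_context for @task, or, when @task is
 * NULL, the per-CPU context for event->cpu.  The context is returned
 * with an extra reference and pin count held.
 */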
3853static struct perf_event_context *
3854find_get_context(struct pmu *pmu, struct task_struct *task,
3855 struct perf_event *event)
3856{
3857 struct perf_event_context *ctx, *clone_ctx = NULL;
3858 struct perf_cpu_context *cpuctx;
3859 void *task_ctx_data = NULL;
3860 unsigned long flags;
3861 int ctxn, err;
3862 int cpu = event->cpu;
3863
3864 if (!task) {
3865
3866 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3867 return ERR_PTR(-EACCES);
3868
3869 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3870 ctx = &cpuctx->ctx;
3871 get_ctx(ctx);
3872 ++ctx->pin_count;
3873
3874 return ctx;
3875 }
3876
3877 err = -EINVAL;
3878 ctxn = pmu->task_ctx_nr;
3879 if (ctxn < 0)
3880 goto errout;
3881
3882 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3883 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3884 if (!task_ctx_data) {
3885 err = -ENOMEM;
3886 goto errout;
3887 }
3888 }
3889
3890retry:
3891 ctx = perf_lock_task_context(task, ctxn, &flags);
3892 if (ctx) {
3893 clone_ctx = unclone_ctx(ctx);
3894 ++ctx->pin_count;
3895
3896 if (task_ctx_data && !ctx->task_ctx_data) {
3897 ctx->task_ctx_data = task_ctx_data;
3898 task_ctx_data = NULL;
3899 }
3900 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3901
3902 if (clone_ctx)
3903 put_ctx(clone_ctx);
3904 } else {
3905 ctx = alloc_perf_context(pmu, task);
3906 err = -ENOMEM;
3907 if (!ctx)
3908 goto errout;
3909
3910 if (task_ctx_data) {
3911 ctx->task_ctx_data = task_ctx_data;
3912 task_ctx_data = NULL;
3913 }
3914
3915 err = 0;
3916 mutex_lock(&task->perf_event_mutex);
3917
3918
3919
3920
3921 if (task->flags & PF_EXITING)
3922 err = -ESRCH;
3923 else if (task->perf_event_ctxp[ctxn])
3924 err = -EAGAIN;
3925 else {
3926 get_ctx(ctx);
3927 ++ctx->pin_count;
3928 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3929 }
3930 mutex_unlock(&task->perf_event_mutex);
3931
3932 if (unlikely(err)) {
3933 put_ctx(ctx);
3934
3935 if (err == -EAGAIN)
3936 goto retry;
3937 goto errout;
3938 }
3939 }
3940
3941 kfree(task_ctx_data);
3942 return ctx;
3943
3944errout:
3945 kfree(task_ctx_data);
3946 return ERR_PTR(err);
3947}
3948
3949static void perf_event_free_filter(struct perf_event *event);
3950static void perf_event_free_bpf_prog(struct perf_event *event);
3951
3952static void free_event_rcu(struct rcu_head *head)
3953{
3954 struct perf_event *event;
3955
3956 event = container_of(head, struct perf_event, rcu_head);
3957 if (event->ns)
3958 put_pid_ns(event->ns);
3959 perf_event_free_filter(event);
3960 kfree(event);
3961}
3962
3963static void ring_buffer_attach(struct perf_event *event,
3964 struct ring_buffer *rb);
3965
3966static void detach_sb_event(struct perf_event *event)
3967{
3968 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
3969
3970 raw_spin_lock(&pel->lock);
3971 list_del_rcu(&event->sb_list);
3972 raw_spin_unlock(&pel->lock);
3973}
3974
3975static bool is_sb_event(struct perf_event *event)
3976{
3977 struct perf_event_attr *attr = &event->attr;
3978
3979 if (event->parent)
3980 return false;
3981
3982 if (event->attach_state & PERF_ATTACH_TASK)
3983 return false;
3984
3985 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
3986 attr->comm || attr->comm_exec ||
3987 attr->task ||
3988 attr->context_switch)
3989 return true;
3990 return false;
3991}
3992
3993static void unaccount_pmu_sb_event(struct perf_event *event)
3994{
3995 if (is_sb_event(event))
3996 detach_sb_event(event);
3997}
3998
3999static void unaccount_event_cpu(struct perf_event *event, int cpu)
4000{
4001 if (event->parent)
4002 return;
4003
4004 if (is_cgroup_event(event))
4005 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4006}
4007
4008#ifdef CONFIG_NO_HZ_FULL
4009static DEFINE_SPINLOCK(nr_freq_lock);
4010#endif
4011
4012static void unaccount_freq_event_nohz(void)
4013{
4014#ifdef CONFIG_NO_HZ_FULL
4015 spin_lock(&nr_freq_lock);
4016 if (atomic_dec_and_test(&nr_freq_events))
4017 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4018 spin_unlock(&nr_freq_lock);
4019#endif
4020}
4021
4022static void unaccount_freq_event(void)
4023{
4024 if (tick_nohz_full_enabled())
4025 unaccount_freq_event_nohz();
4026 else
4027 atomic_dec(&nr_freq_events);
4028}
4029
4030static void unaccount_event(struct perf_event *event)
4031{
4032 bool dec = false;
4033
4034 if (event->parent)
4035 return;
4036
4037 if (event->attach_state & PERF_ATTACH_TASK)
4038 dec = true;
4039 if (event->attr.mmap || event->attr.mmap_data)
4040 atomic_dec(&nr_mmap_events);
4041 if (event->attr.comm)
4042 atomic_dec(&nr_comm_events);
4043 if (event->attr.namespaces)
4044 atomic_dec(&nr_namespaces_events);
4045 if (event->attr.task)
4046 atomic_dec(&nr_task_events);
4047 if (event->attr.freq)
4048 unaccount_freq_event();
4049 if (event->attr.context_switch) {
4050 dec = true;
4051 atomic_dec(&nr_switch_events);
4052 }
4053 if (is_cgroup_event(event))
4054 dec = true;
4055 if (has_branch_stack(event))
4056 dec = true;
4057
4058 if (dec) {
4059 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4060 schedule_delayed_work(&perf_sched_work, HZ);
4061 }
4062
4063 unaccount_event_cpu(event, event->cpu);
4064
4065 unaccount_pmu_sb_event(event);
4066}
4067
4068static void perf_sched_delayed(struct work_struct *work)
4069{
4070 mutex_lock(&perf_sched_mutex);
4071 if (atomic_dec_and_test(&perf_sched_count))
4072 static_branch_disable(&perf_sched_events);
4073 mutex_unlock(&perf_sched_mutex);
4074}
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
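/*
 * PMUs with PERF_PMU_CAP_EXCLUSIVE can only serve one kind of consumer
 * at a time: either per-task or per-CPU events, not both.  exclusive_cnt
 * counts per-task users positively and per-CPU users negatively, so an
 * attempt to cross zero from the wrong side fails with -EBUSY.
 */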
4088static int exclusive_event_init(struct perf_event *event)
4089{
4090 struct pmu *pmu = event->pmu;
4091
4092 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4093 return 0;
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108 if (event->attach_state & PERF_ATTACH_TASK) {
4109 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4110 return -EBUSY;
4111 } else {
4112 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4113 return -EBUSY;
4114 }
4115
4116 return 0;
4117}
4118
4119static void exclusive_event_destroy(struct perf_event *event)
4120{
4121 struct pmu *pmu = event->pmu;
4122
4123 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4124 return;
4125
4126
4127 if (event->attach_state & PERF_ATTACH_TASK)
4128 atomic_dec(&pmu->exclusive_cnt);
4129 else
4130 atomic_inc(&pmu->exclusive_cnt);
4131}
4132
4133static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4134{
4135 if ((e1->pmu == e2->pmu) &&
4136 (e1->cpu == e2->cpu ||
4137 e1->cpu == -1 ||
4138 e2->cpu == -1))
4139 return true;
4140 return false;
4141}
4142
4143
4144static bool exclusive_event_installable(struct perf_event *event,
4145 struct perf_event_context *ctx)
4146{
4147 struct perf_event *iter_event;
4148 struct pmu *pmu = event->pmu;
4149
4150 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4151 return true;
4152
4153 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4154 if (exclusive_event_match(iter_event, event))
4155 return false;
4156 }
4157
4158 return true;
4159}
4160
4161static void perf_addr_filters_splice(struct perf_event *event,
4162 struct list_head *head);
4163
4164static void _free_event(struct perf_event *event)
4165{
4166 irq_work_sync(&event->pending);
4167
4168 unaccount_event(event);
4169
4170 if (event->rb) {
4171
4172
4173
4174
4175
4176
4177 mutex_lock(&event->mmap_mutex);
4178 ring_buffer_attach(event, NULL);
4179 mutex_unlock(&event->mmap_mutex);
4180 }
4181
4182 if (is_cgroup_event(event))
4183 perf_detach_cgroup(event);
4184
4185 if (!event->parent) {
4186 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4187 put_callchain_buffers();
4188 }
4189
4190 perf_event_free_bpf_prog(event);
4191 perf_addr_filters_splice(event, NULL);
4192 kfree(event->addr_filters_offs);
4193
4194 if (event->destroy)
4195 event->destroy(event);
4196
4197 if (event->ctx)
4198 put_ctx(event->ctx);
4199
4200 exclusive_event_destroy(event);
4201 module_put(event->pmu->module);
4202
4203 call_rcu(&event->rcu_head, free_event_rcu);
4204}
4205
4206
4207
4208
4209
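/*
 * Free an event whose reference count is expected to be exactly 1; if
 * another reference is still held, warn and leave the event alone rather
 * than freeing it.
 */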
4210static void free_event(struct perf_event *event)
4211{
4212 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4213 "unexpected event refcount: %ld; ptr=%p\n",
4214 atomic_long_read(&event->refcount), event)) {
4215
4216 return;
4217 }
4218
4219 _free_event(event);
4220}
4221
4222
4223
4224
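/*
 * Detach the event from its owner task: take a reference on the owner
 * under RCU, then remove the event from the owner's list and clear
 * event->owner under the owner's perf_event_mutex.
 */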
4225static void perf_remove_from_owner(struct perf_event *event)
4226{
4227 struct task_struct *owner;
4228
4229 rcu_read_lock();
4230
4231
4232
4233
4234
4235
4236 owner = lockless_dereference(event->owner);
4237 if (owner) {
4238
4239
4240
4241
4242
4243 get_task_struct(owner);
4244 }
4245 rcu_read_unlock();
4246
4247 if (owner) {
4248
4249
4250
4251
4252
4253
4254
4255
4256 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4257
4258
4259
4260
4261
4262
4263
4264 if (event->owner) {
4265 list_del_init(&event->owner_entry);
4266 smp_store_release(&event->owner, NULL);
4267 }
4268 mutex_unlock(&owner->perf_event_mutex);
4269 put_task_struct(owner);
4270 }
4271}
4272
4273static void put_event(struct perf_event *event)
4274{
4275 if (!atomic_long_dec_and_test(&event->refcount))
4276 return;
4277
4278 _free_event(event);
4279}
4280
4281
4282
4283
4284
4285
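/*
 * Kill an event: remove it from its context, mark it DEAD so concurrent
 * users can see it is going away, then tear down all existing child
 * events before dropping the final reference.
 */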
4286int perf_event_release_kernel(struct perf_event *event)
4287{
4288 struct perf_event_context *ctx = event->ctx;
4289 struct perf_event *child, *tmp;
4290
4291
4292
4293
4294
4295 if (!ctx) {
4296 WARN_ON_ONCE(event->attach_state &
4297 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4298 goto no_ctx;
4299 }
4300
4301 if (!is_kernel_event(event))
4302 perf_remove_from_owner(event);
4303
4304 ctx = perf_event_ctx_lock(event);
4305 WARN_ON_ONCE(ctx->parent_ctx);
4306 perf_remove_from_context(event, DETACH_GROUP);
4307
4308 raw_spin_lock_irq(&ctx->lock);
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320 event->state = PERF_EVENT_STATE_DEAD;
4321 raw_spin_unlock_irq(&ctx->lock);
4322
4323 perf_event_ctx_unlock(event, ctx);
4324
4325again:
4326 mutex_lock(&event->child_mutex);
4327 list_for_each_entry(child, &event->child_list, child_list) {
4328
4329
4330
4331
4332
4333 ctx = lockless_dereference(child->ctx);
4334
4335
4336
4337
4338
4339
4340
4341
4342 get_ctx(ctx);
4343
4344
4345
4346
4347
4348
4349 mutex_unlock(&event->child_mutex);
4350 mutex_lock(&ctx->mutex);
4351 mutex_lock(&event->child_mutex);
4352
4353
4354
4355
4356
4357
4358 tmp = list_first_entry_or_null(&event->child_list,
4359 struct perf_event, child_list);
4360 if (tmp == child) {
4361 perf_remove_from_context(child, DETACH_GROUP);
4362 list_del(&child->child_list);
4363 free_event(child);
4364
4365
4366
4367
4368 put_event(event);
4369 }
4370
4371 mutex_unlock(&event->child_mutex);
4372 mutex_unlock(&ctx->mutex);
4373 put_ctx(ctx);
4374 goto again;
4375 }
4376 mutex_unlock(&event->child_mutex);
4377
4378no_ctx:
4379 put_event(event);
4380 return 0;
4381}
4382EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4383
4384
4385
4386
4387static int perf_release(struct inode *inode, struct file *file)
4388{
4389 perf_event_release_kernel(file->private_data);
4390 return 0;
4391}
4392
4393u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4394{
4395 struct perf_event *child;
4396 u64 total = 0;
4397
4398 *enabled = 0;
4399 *running = 0;
4400
4401 mutex_lock(&event->child_mutex);
4402
4403 (void)perf_event_read(event, false);
4404 total += perf_event_count(event);
4405
4406 *enabled += event->total_time_enabled +
4407 atomic64_read(&event->child_total_time_enabled);
4408 *running += event->total_time_running +
4409 atomic64_read(&event->child_total_time_running);
4410
4411 list_for_each_entry(child, &event->child_list, child_list) {
4412 (void)perf_event_read(child, false);
4413 total += perf_event_count(child);
4414 *enabled += child->total_time_enabled;
4415 *running += child->total_time_running;
4416 }
4417 mutex_unlock(&event->child_mutex);
4418
4419 return total;
4420}
4421EXPORT_SYMBOL_GPL(perf_event_read_value);
4422
4423static int __perf_read_group_add(struct perf_event *leader,
4424 u64 read_format, u64 *values)
4425{
4426 struct perf_event_context *ctx = leader->ctx;
4427 struct perf_event *sub;
4428 unsigned long flags;
4429 int n = 1;
4430 int ret;
4431
4432 ret = perf_event_read(leader, true);
4433 if (ret)
4434 return ret;
4435
4436
4437
4438
4439
4440
4441 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4442 values[n++] += leader->total_time_enabled +
4443 atomic64_read(&leader->child_total_time_enabled);
4444 }
4445
4446 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4447 values[n++] += leader->total_time_running +
4448 atomic64_read(&leader->child_total_time_running);
4449 }
4450
4451
4452
4453
4454 values[n++] += perf_event_count(leader);
4455 if (read_format & PERF_FORMAT_ID)
4456 values[n++] = primary_event_id(leader);
4457
4458 raw_spin_lock_irqsave(&ctx->lock, flags);
4459
4460 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4461 values[n++] += perf_event_count(sub);
4462 if (read_format & PERF_FORMAT_ID)
4463 values[n++] = primary_event_id(sub);
4464 }
4465
4466 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4467 return 0;
4468}
4469
4470static int perf_read_group(struct perf_event *event,
4471 u64 read_format, char __user *buf)
4472{
4473 struct perf_event *leader = event->group_leader, *child;
4474 struct perf_event_context *ctx = leader->ctx;
4475 int ret;
4476 u64 *values;
4477
4478 lockdep_assert_held(&ctx->mutex);
4479
4480 values = kzalloc(event->read_size, GFP_KERNEL);
4481 if (!values)
4482 return -ENOMEM;
4483
4484 values[0] = 1 + leader->nr_siblings;
4485
4486
4487
4488
4489
4490 mutex_lock(&leader->child_mutex);
4491
4492 ret = __perf_read_group_add(leader, read_format, values);
4493 if (ret)
4494 goto unlock;
4495
4496 list_for_each_entry(child, &leader->child_list, child_list) {
4497 ret = __perf_read_group_add(child, read_format, values);
4498 if (ret)
4499 goto unlock;
4500 }
4501
4502 mutex_unlock(&leader->child_mutex);
4503
4504 ret = event->read_size;
4505 if (copy_to_user(buf, values, event->read_size))
4506 ret = -EFAULT;
4507 goto out;
4508
4509unlock:
4510 mutex_unlock(&leader->child_mutex);
4511out:
4512 kfree(values);
4513 return ret;
4514}
4515
4516static int perf_read_one(struct perf_event *event,
4517 u64 read_format, char __user *buf)
4518{
4519 u64 enabled, running;
4520 u64 values[4];
4521 int n = 0;
4522
4523 values[n++] = perf_event_read_value(event, &enabled, &running);
4524 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4525 values[n++] = enabled;
4526 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4527 values[n++] = running;
4528 if (read_format & PERF_FORMAT_ID)
4529 values[n++] = primary_event_id(event);
4530
4531 if (copy_to_user(buf, values, n * sizeof(u64)))
4532 return -EFAULT;
4533
4534 return n * sizeof(u64);
4535}
4536
4537static bool is_event_hup(struct perf_event *event)
4538{
4539 bool no_children;
4540
4541 if (event->state > PERF_EVENT_STATE_EXIT)
4542 return false;
4543
4544 mutex_lock(&event->child_mutex);
4545 no_children = list_empty(&event->child_list);
4546 mutex_unlock(&event->child_mutex);
4547 return no_children;
4548}
4549
4550
4551
4552
4553static ssize_t
4554__perf_read(struct perf_event *event, char __user *buf, size_t count)
4555{
4556 u64 read_format = event->attr.read_format;
4557 int ret;
4558
4559
4560
4561
4562
4563
4564 if (event->state == PERF_EVENT_STATE_ERROR)
4565 return 0;
4566
4567 if (count < event->read_size)
4568 return -ENOSPC;
4569
4570 WARN_ON_ONCE(event->ctx->parent_ctx);
4571 if (read_format & PERF_FORMAT_GROUP)
4572 ret = perf_read_group(event, read_format, buf);
4573 else
4574 ret = perf_read_one(event, read_format, buf);
4575
4576 return ret;
4577}
4578
4579static ssize_t
4580perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4581{
4582 struct perf_event *event = file->private_data;
4583 struct perf_event_context *ctx;
4584 int ret;
4585
4586 ctx = perf_event_ctx_lock(event);
4587 ret = __perf_read(event, buf, count);
4588 perf_event_ctx_unlock(event, ctx);
4589
4590 return ret;
4591}
4592
4593static unsigned int perf_poll(struct file *file, poll_table *wait)
4594{
4595 struct perf_event *event = file->private_data;
4596 struct ring_buffer *rb;
4597 unsigned int events = POLLHUP;
4598
4599 poll_wait(file, &event->waitq, wait);
4600
4601 if (is_event_hup(event))
4602 return events;
4603
4604
4605
4606
4607
4608 mutex_lock(&event->mmap_mutex);
4609 rb = event->rb;
4610 if (rb)
4611 events = atomic_xchg(&rb->poll, 0);
4612 mutex_unlock(&event->mmap_mutex);
4613 return events;
4614}
4615
4616static void _perf_event_reset(struct perf_event *event)
4617{
4618 (void)perf_event_read(event, false);
4619 local64_set(&event->count, 0);
4620 perf_event_update_userpage(event);
4621}
4622
4623
4624
4625
4626
4627
4628
4629static void perf_event_for_each_child(struct perf_event *event,
4630 void (*func)(struct perf_event *))
4631{
4632 struct perf_event *child;
4633
4634 WARN_ON_ONCE(event->ctx->parent_ctx);
4635
4636 mutex_lock(&event->child_mutex);
4637 func(event);
4638 list_for_each_entry(child, &event->child_list, child_list)
4639 func(child);
4640 mutex_unlock(&event->child_mutex);
4641}
4642
4643static void perf_event_for_each(struct perf_event *event,
4644 void (*func)(struct perf_event *))
4645{
4646 struct perf_event_context *ctx = event->ctx;
4647 struct perf_event *sibling;
4648
4649 lockdep_assert_held(&ctx->mutex);
4650
4651 event = event->group_leader;
4652
4653 perf_event_for_each_child(event, func);
4654 list_for_each_entry(sibling, &event->sibling_list, group_entry)
4655 perf_event_for_each_child(sibling, func);
4656}
4657
4658static void __perf_event_period(struct perf_event *event,
4659 struct perf_cpu_context *cpuctx,
4660 struct perf_event_context *ctx,
4661 void *info)
4662{
4663 u64 value = *((u64 *)info);
4664 bool active;
4665
4666 if (event->attr.freq) {
4667 event->attr.sample_freq = value;
4668 } else {
4669 event->attr.sample_period = value;
4670 event->hw.sample_period = value;
4671 }
4672
4673 active = (event->state == PERF_EVENT_STATE_ACTIVE);
4674 if (active) {
4675 perf_pmu_disable(ctx->pmu);
4676
4677
4678
4679
4680 if (event->hw.interrupts == MAX_INTERRUPTS) {
4681 event->hw.interrupts = 0;
4682 perf_log_throttle(event, 1);
4683 }
4684 event->pmu->stop(event, PERF_EF_UPDATE);
4685 }
4686
4687 local64_set(&event->hw.period_left, 0);
4688
4689 if (active) {
4690 event->pmu->start(event, PERF_EF_RELOAD);
4691 perf_pmu_enable(ctx->pmu);
4692 }
4693}
4694
4695static int perf_event_period(struct perf_event *event, u64 __user *arg)
4696{
4697 u64 value;
4698
4699 if (!is_sampling_event(event))
4700 return -EINVAL;
4701
4702 if (copy_from_user(&value, arg, sizeof(value)))
4703 return -EFAULT;
4704
4705 if (!value)
4706 return -EINVAL;
4707
4708 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4709 return -EINVAL;
4710
4711 event_function_call(event, __perf_event_period, &value);
4712
4713 return 0;
4714}
4715
4716static const struct file_operations perf_fops;
4717
4718static inline int perf_fget_light(int fd, struct fd *p)
4719{
4720 struct fd f = fdget(fd);
4721 if (!f.file)
4722 return -EBADF;
4723
4724 if (f.file->f_op != &perf_fops) {
4725 fdput(f);
4726 return -EBADF;
4727 }
4728 *p = f;
4729 return 0;
4730}
4731
4732static int perf_event_set_output(struct perf_event *event,
4733 struct perf_event *output_event);
4734static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4735static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4736
4737static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4738{
4739 void (*func)(struct perf_event *);
4740 u32 flags = arg;
4741
4742 switch (cmd) {
4743 case PERF_EVENT_IOC_ENABLE:
4744 func = _perf_event_enable;
4745 break;
4746 case PERF_EVENT_IOC_DISABLE:
4747 func = _perf_event_disable;
4748 break;
4749 case PERF_EVENT_IOC_RESET:
4750 func = _perf_event_reset;
4751 break;
4752
4753 case PERF_EVENT_IOC_REFRESH:
4754 return _perf_event_refresh(event, arg);
4755
4756 case PERF_EVENT_IOC_PERIOD:
4757 return perf_event_period(event, (u64 __user *)arg);
4758
4759 case PERF_EVENT_IOC_ID:
4760 {
4761 u64 id = primary_event_id(event);
4762
4763 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4764 return -EFAULT;
4765 return 0;
4766 }
4767
4768 case PERF_EVENT_IOC_SET_OUTPUT:
4769 {
4770 int ret;
4771 if (arg != -1) {
4772 struct perf_event *output_event;
4773 struct fd output;
4774 ret = perf_fget_light(arg, &output);
4775 if (ret)
4776 return ret;
4777 output_event = output.file->private_data;
4778 ret = perf_event_set_output(event, output_event);
4779 fdput(output);
4780 } else {
4781 ret = perf_event_set_output(event, NULL);
4782 }
4783 return ret;
4784 }
4785
4786 case PERF_EVENT_IOC_SET_FILTER:
4787 return perf_event_set_filter(event, (void __user *)arg);
4788
4789 case PERF_EVENT_IOC_SET_BPF:
4790 return perf_event_set_bpf_prog(event, arg);
4791
4792 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
4793 struct ring_buffer *rb;
4794
4795 rcu_read_lock();
4796 rb = rcu_dereference(event->rb);
4797 if (!rb || !rb->nr_pages) {
4798 rcu_read_unlock();
4799 return -EINVAL;
4800 }
4801 rb_toggle_paused(rb, !!arg);
4802 rcu_read_unlock();
4803 return 0;
4804 }
4805 default:
4806 return -ENOTTY;
4807 }
4808
4809 if (flags & PERF_IOC_FLAG_GROUP)
4810 perf_event_for_each(event, func);
4811 else
4812 perf_event_for_each_child(event, func);
4813
4814 return 0;
4815}
4816
4817static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4818{
4819 struct perf_event *event = file->private_data;
4820 struct perf_event_context *ctx;
4821 long ret;
4822
4823 ctx = perf_event_ctx_lock(event);
4824 ret = _perf_ioctl(event, cmd, arg);
4825 perf_event_ctx_unlock(event, ctx);
4826
4827 return ret;
4828}
4829
4830#ifdef CONFIG_COMPAT
4831static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4832 unsigned long arg)
4833{
4834 switch (_IOC_NR(cmd)) {
4835 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4836 case _IOC_NR(PERF_EVENT_IOC_ID):
4837
4838 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4839 cmd &= ~IOCSIZE_MASK;
4840 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4841 }
4842 break;
4843 }
4844 return perf_ioctl(file, cmd, arg);
4845}
4846#else
4847# define perf_compat_ioctl NULL
4848#endif
4849
4850int perf_event_task_enable(void)
4851{
4852 struct perf_event_context *ctx;
4853 struct perf_event *event;
4854
4855	mutex_lock(&current->perf_event_mutex);
4856	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4857 ctx = perf_event_ctx_lock(event);
4858 perf_event_for_each_child(event, _perf_event_enable);
4859 perf_event_ctx_unlock(event, ctx);
4860 }
4861	mutex_unlock(&current->perf_event_mutex);
4862
4863 return 0;
4864}
4865
4866int perf_event_task_disable(void)
4867{
4868 struct perf_event_context *ctx;
4869 struct perf_event *event;
4870
4871	mutex_lock(&current->perf_event_mutex);
4872	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4873 ctx = perf_event_ctx_lock(event);
4874 perf_event_for_each_child(event, _perf_event_disable);
4875 perf_event_ctx_unlock(event, ctx);
4876 }
4877	mutex_unlock(&current->perf_event_mutex);
4878
4879 return 0;
4880}
4881
4882static int perf_event_index(struct perf_event *event)
4883{
4884 if (event->hw.state & PERF_HES_STOPPED)
4885 return 0;
4886
4887 if (event->state != PERF_EVENT_STATE_ACTIVE)
4888 return 0;
4889
4890 return event->pmu->event_idx(event);
4891}
4892
4893static void calc_timer_values(struct perf_event *event,
4894 u64 *now,
4895 u64 *enabled,
4896 u64 *running)
4897{
4898 u64 ctx_time;
4899
4900 *now = perf_clock();
4901 ctx_time = event->shadow_ctx_time + *now;
4902 *enabled = ctx_time - event->tstamp_enabled;
4903 *running = ctx_time - event->tstamp_running;
4904}
4905
4906static void perf_event_init_userpage(struct perf_event *event)
4907{
4908 struct perf_event_mmap_page *userpg;
4909 struct ring_buffer *rb;
4910
4911 rcu_read_lock();
4912 rb = rcu_dereference(event->rb);
4913 if (!rb)
4914 goto unlock;
4915
4916 userpg = rb->user_page;
4917
4918
4919 userpg->cap_bit0_is_deprecated = 1;
4920 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4921 userpg->data_offset = PAGE_SIZE;
4922 userpg->data_size = perf_data_size(rb);
4923
4924unlock:
4925 rcu_read_unlock();
4926}
4927
4928void __weak arch_perf_update_userpage(
4929 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4930{
4931}
4932
4933
4934
4935
4936
4937
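/*
 * Update the mmap()ed user page with the event's current index, count
 * and time deltas.  The lock sequence count is bumped before and after
 * the update, with preemption disabled, so userspace can detect a
 * concurrent update and retry its read.
 */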
4938void perf_event_update_userpage(struct perf_event *event)
4939{
4940 struct perf_event_mmap_page *userpg;
4941 struct ring_buffer *rb;
4942 u64 enabled, running, now;
4943
4944 rcu_read_lock();
4945 rb = rcu_dereference(event->rb);
4946 if (!rb)
4947 goto unlock;
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958 calc_timer_values(event, &now, &enabled, &running);
4959
4960 userpg = rb->user_page;
4961
4962
4963
4964
4965 preempt_disable();
4966 ++userpg->lock;
4967 barrier();
4968 userpg->index = perf_event_index(event);
4969 userpg->offset = perf_event_count(event);
4970 if (userpg->index)
4971 userpg->offset -= local64_read(&event->hw.prev_count);
4972
4973 userpg->time_enabled = enabled +
4974 atomic64_read(&event->child_total_time_enabled);
4975
4976 userpg->time_running = running +
4977 atomic64_read(&event->child_total_time_running);
4978
4979 arch_perf_update_userpage(event, userpg, now);
4980
4981 barrier();
4982 ++userpg->lock;
4983 preempt_enable();
4984unlock:
4985 rcu_read_unlock();
4986}
4987
4988static int perf_mmap_fault(struct vm_fault *vmf)
4989{
4990 struct perf_event *event = vmf->vma->vm_file->private_data;
4991 struct ring_buffer *rb;
4992 int ret = VM_FAULT_SIGBUS;
4993
4994 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4995 if (vmf->pgoff == 0)
4996 ret = 0;
4997 return ret;
4998 }
4999
5000 rcu_read_lock();
5001 rb = rcu_dereference(event->rb);
5002 if (!rb)
5003 goto unlock;
5004
5005 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5006 goto unlock;
5007
5008 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5009 if (!vmf->page)
5010 goto unlock;
5011
5012 get_page(vmf->page);
5013 vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5014 vmf->page->index = vmf->pgoff;
5015
5016 ret = 0;
5017unlock:
5018 rcu_read_unlock();
5019
5020 return ret;
5021}
5022
5023static void ring_buffer_attach(struct perf_event *event,
5024 struct ring_buffer *rb)
5025{
5026 struct ring_buffer *old_rb = NULL;
5027 unsigned long flags;
5028
5029 if (event->rb) {
5030
5031
5032
5033
5034 WARN_ON_ONCE(event->rcu_pending);
5035
5036 old_rb = event->rb;
5037 spin_lock_irqsave(&old_rb->event_lock, flags);
5038 list_del_rcu(&event->rb_entry);
5039 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5040
5041 event->rcu_batches = get_state_synchronize_rcu();
5042 event->rcu_pending = 1;
5043 }
5044
5045 if (rb) {
5046 if (event->rcu_pending) {
5047 cond_synchronize_rcu(event->rcu_batches);
5048 event->rcu_pending = 0;
5049 }
5050
5051 spin_lock_irqsave(&rb->event_lock, flags);
5052 list_add_rcu(&event->rb_entry, &rb->event_list);
5053 spin_unlock_irqrestore(&rb->event_lock, flags);
5054 }
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066 if (has_aux(event))
5067 perf_event_stop(event, 0);
5068
5069 rcu_assign_pointer(event->rb, rb);
5070
5071 if (old_rb) {
5072 ring_buffer_put(old_rb);
5073
5074
5075
5076
5077
5078 wake_up_all(&event->waitq);
5079 }
5080}
5081
5082static void ring_buffer_wakeup(struct perf_event *event)
5083{
5084 struct ring_buffer *rb;
5085
5086 rcu_read_lock();
5087 rb = rcu_dereference(event->rb);
5088 if (rb) {
5089 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5090 wake_up_all(&event->waitq);
5091 }
5092 rcu_read_unlock();
5093}
5094
5095struct ring_buffer *ring_buffer_get(struct perf_event *event)
5096{
5097 struct ring_buffer *rb;
5098
5099 rcu_read_lock();
5100 rb = rcu_dereference(event->rb);
5101 if (rb) {
5102 if (!atomic_inc_not_zero(&rb->refcount))
5103 rb = NULL;
5104 }
5105 rcu_read_unlock();
5106
5107 return rb;
5108}
5109
5110void ring_buffer_put(struct ring_buffer *rb)
5111{
5112 if (!atomic_dec_and_test(&rb->refcount))
5113 return;
5114
5115 WARN_ON_ONCE(!list_empty(&rb->event_list));
5116
5117 call_rcu(&rb->rcu_head, rb_free_rcu);
5118}
5119
5120static void perf_mmap_open(struct vm_area_struct *vma)
5121{
5122 struct perf_event *event = vma->vm_file->private_data;
5123
5124 atomic_inc(&event->mmap_count);
5125 atomic_inc(&event->rb->mmap_count);
5126
5127 if (vma->vm_pgoff)
5128 atomic_inc(&event->rb->aux_mmap_count);
5129
5130 if (event->pmu->event_mapped)
5131 event->pmu->event_mapped(event, vma->vm_mm);
5132}
5133
5134static void perf_pmu_output_stop(struct perf_event *event);
5135
5136
5137
5138
5139
5140
5141
5142
5143
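/*
 * A ring buffer can be mapped by multiple events (e.g. via
 * PERF_EVENT_IOC_SET_OUTPUT), so unmapping has to detach every event
 * that still points at this buffer before the locked-memory accounting
 * can be undone and the buffer itself released.
 */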
5144static void perf_mmap_close(struct vm_area_struct *vma)
5145{
5146 struct perf_event *event = vma->vm_file->private_data;
5147
5148 struct ring_buffer *rb = ring_buffer_get(event);
5149 struct user_struct *mmap_user = rb->mmap_user;
5150 int mmap_locked = rb->mmap_locked;
5151 unsigned long size = perf_data_size(rb);
5152
5153 if (event->pmu->event_unmapped)
5154 event->pmu->event_unmapped(event, vma->vm_mm);
5155
5156
5157
5158
5159
5160
5161 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5162 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5163
5164
5165
5166
5167
5168
5169 perf_pmu_output_stop(event);
5170
5171
5172 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5173 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5174
5175
5176 rb_free_aux(rb);
5177 WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5178
5179 mutex_unlock(&event->mmap_mutex);
5180 }
5181
5182 atomic_dec(&rb->mmap_count);
5183
5184 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5185 goto out_put;
5186
5187 ring_buffer_attach(event, NULL);
5188 mutex_unlock(&event->mmap_mutex);
5189
5190
5191 if (atomic_read(&rb->mmap_count))
5192 goto out_put;
5193
5194
5195
5196
5197
5198
5199again:
5200 rcu_read_lock();
5201 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5202 if (!atomic_long_inc_not_zero(&event->refcount)) {
5203
5204
5205
5206
5207 continue;
5208 }
5209 rcu_read_unlock();
5210
5211 mutex_lock(&event->mmap_mutex);
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222 if (event->rb == rb)
5223 ring_buffer_attach(event, NULL);
5224
5225 mutex_unlock(&event->mmap_mutex);
5226 put_event(event);
5227
5228
5229
5230
5231
5232 goto again;
5233 }
5234 rcu_read_unlock();
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5246 vma->vm_mm->pinned_vm -= mmap_locked;
5247 free_uid(mmap_user);
5248
5249out_put:
5250 ring_buffer_put(rb);
5251}
5252
5253static const struct vm_operations_struct perf_mmap_vmops = {
5254 .open = perf_mmap_open,
5255 .close = perf_mmap_close,
5256 .fault = perf_mmap_fault,
5257 .page_mkwrite = perf_mmap_fault,
5258};
5259
5260static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5261{
5262 struct perf_event *event = file->private_data;
5263 unsigned long user_locked, user_lock_limit;
5264 struct user_struct *user = current_user();
5265 unsigned long locked, lock_limit;
5266 struct ring_buffer *rb = NULL;
5267 unsigned long vma_size;
5268 unsigned long nr_pages;
5269 long user_extra = 0, extra = 0;
5270 int ret = 0, flags = 0;
5271
5272
5273
5274
5275
5276
5277 if (event->cpu == -1 && event->attr.inherit)
5278 return -EINVAL;
5279
5280 if (!(vma->vm_flags & VM_SHARED))
5281 return -EINVAL;
5282
5283 vma_size = vma->vm_end - vma->vm_start;
5284
5285 if (vma->vm_pgoff == 0) {
5286 nr_pages = (vma_size / PAGE_SIZE) - 1;
5287 } else {
5288
5289
5290
5291
5292
5293 u64 aux_offset, aux_size;
5294
5295 if (!event->rb)
5296 return -EINVAL;
5297
5298 nr_pages = vma_size / PAGE_SIZE;
5299
5300 mutex_lock(&event->mmap_mutex);
5301 ret = -EINVAL;
5302
5303 rb = event->rb;
5304 if (!rb)
5305 goto aux_unlock;
5306
5307 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
5308 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
5309
5310 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5311 goto aux_unlock;
5312
5313 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5314 goto aux_unlock;
5315
5316
5317 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5318 goto aux_unlock;
5319
5320 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5321 goto aux_unlock;
5322
5323
5324 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5325 goto aux_unlock;
5326
5327 if (!is_power_of_2(nr_pages))
5328 goto aux_unlock;
5329
5330 if (!atomic_inc_not_zero(&rb->mmap_count))
5331 goto aux_unlock;
5332
5333 if (rb_has_aux(rb)) {
5334 atomic_inc(&rb->aux_mmap_count);
5335 ret = 0;
5336 goto unlock;
5337 }
5338
5339 atomic_set(&rb->aux_mmap_count, 1);
5340 user_extra = nr_pages;
5341
5342 goto accounting;
5343 }
5344
5345
5346
5347
5348
5349 if (nr_pages != 0 && !is_power_of_2(nr_pages))
5350 return -EINVAL;
5351
5352 if (vma_size != PAGE_SIZE * (1 + nr_pages))
5353 return -EINVAL;
5354
5355 WARN_ON_ONCE(event->ctx->parent_ctx);
5356again:
5357 mutex_lock(&event->mmap_mutex);
5358 if (event->rb) {
5359 if (event->rb->nr_pages != nr_pages) {
5360 ret = -EINVAL;
5361 goto unlock;
5362 }
5363
5364 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5365
5366
5367
5368
5369
5370 mutex_unlock(&event->mmap_mutex);
5371 goto again;
5372 }
5373
5374 goto unlock;
5375 }
5376
5377 user_extra = nr_pages + 1;
5378
5379accounting:
5380 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5381
5382
5383
5384
5385 user_lock_limit *= num_online_cpus();
5386
5387 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5388
5389 if (user_locked > user_lock_limit)
5390 extra = user_locked - user_lock_limit;
5391
5392 lock_limit = rlimit(RLIMIT_MEMLOCK);
5393 lock_limit >>= PAGE_SHIFT;
5394 locked = vma->vm_mm->pinned_vm + extra;
5395
5396 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5397 !capable(CAP_IPC_LOCK)) {
5398 ret = -EPERM;
5399 goto unlock;
5400 }
5401
5402 WARN_ON(!rb && event->rb);
5403
5404 if (vma->vm_flags & VM_WRITE)
5405 flags |= RING_BUFFER_WRITABLE;
5406
5407 if (!rb) {
5408 rb = rb_alloc(nr_pages,
5409 event->attr.watermark ? event->attr.wakeup_watermark : 0,
5410 event->cpu, flags);
5411
5412 if (!rb) {
5413 ret = -ENOMEM;
5414 goto unlock;
5415 }
5416
5417 atomic_set(&rb->mmap_count, 1);
5418 rb->mmap_user = get_current_user();
5419 rb->mmap_locked = extra;
5420
5421 ring_buffer_attach(event, rb);
5422
5423 perf_event_init_userpage(event);
5424 perf_event_update_userpage(event);
5425 } else {
5426 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5427 event->attr.aux_watermark, flags);
5428 if (!ret)
5429 rb->aux_mmap_locked = extra;
5430 }
5431
5432unlock:
5433 if (!ret) {
5434 atomic_long_add(user_extra, &user->locked_vm);
5435 vma->vm_mm->pinned_vm += extra;
5436
5437 atomic_inc(&event->mmap_count);
5438 } else if (rb) {
5439 atomic_dec(&rb->mmap_count);
5440 }
5441aux_unlock:
5442 mutex_unlock(&event->mmap_mutex);
5443
5444
5445
5446
5447
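	/*
	 * Pinned-page accounting is per mm, so don't let fork() copy this
	 * vma into the child.
	 */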
5448 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5449 vma->vm_ops = &perf_mmap_vmops;
5450
5451 if (event->pmu->event_mapped)
5452 event->pmu->event_mapped(event, vma->vm_mm);
5453
5454 return ret;
5455}
5456
5457static int perf_fasync(int fd, struct file *filp, int on)
5458{
5459 struct inode *inode = file_inode(filp);
5460 struct perf_event *event = filp->private_data;
5461 int retval;
5462
5463 inode_lock(inode);
5464 retval = fasync_helper(fd, filp, on, &event->fasync);
5465 inode_unlock(inode);
5466
5467 if (retval < 0)
5468 return retval;
5469
5470 return 0;
5471}
5472
5473static const struct file_operations perf_fops = {
5474 .llseek = no_llseek,
5475 .release = perf_release,
5476 .read = perf_read,
5477 .poll = perf_poll,
5478 .unlocked_ioctl = perf_ioctl,
5479 .compat_ioctl = perf_compat_ioctl,
5480 .mmap = perf_mmap,
5481 .fasync = perf_fasync,
5482};
5483
5484
5485
5486
5487
5488
5489
5490
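/*
 * Perf event wakeup: publish poll() state and the user page before waking
 * up waiters. Only the parent event carries fasync state.
 */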
5491static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5492{
5493
5494 if (event->parent)
5495 event = event->parent;
5496 return &event->fasync;
5497}
5498
5499void perf_event_wakeup(struct perf_event *event)
5500{
5501 ring_buffer_wakeup(event);
5502
5503 if (event->pending_kill) {
5504 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5505 event->pending_kill = 0;
5506 }
5507}
5508
5509static void perf_pending_event(struct irq_work *entry)
5510{
5511 struct perf_event *event = container_of(entry,
5512 struct perf_event, pending);
5513 int rctx;
5514
5515 rctx = perf_swevent_get_recursion_context();
5516
5517
5518
5519
5520
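	/*
	 * A negative rctx just means recursion is already disabled at this
	 * context level; that is fine, we won't recurse further.
	 */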
5521 if (event->pending_disable) {
5522 event->pending_disable = 0;
5523 perf_event_disable_local(event);
5524 }
5525
5526 if (event->pending_wakeup) {
5527 event->pending_wakeup = 0;
5528 perf_event_wakeup(event);
5529 }
5530
5531 if (rctx >= 0)
5532 perf_swevent_put_recursion_context(rctx);
5533}
5534
5535
5536
5537
5538
5539
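/*
 * Guest (virtualization) callbacks; a single registration (KVM) is assumed.
 */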
5540struct perf_guest_info_callbacks *perf_guest_cbs;
5541
5542int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5543{
5544 perf_guest_cbs = cbs;
5545 return 0;
5546}
5547EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5548
5549int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5550{
5551 perf_guest_cbs = NULL;
5552 return 0;
5553}
5554EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5555
5556static void
5557perf_output_sample_regs(struct perf_output_handle *handle,
5558 struct pt_regs *regs, u64 mask)
5559{
5560 int bit;
5561 DECLARE_BITMAP(_mask, 64);
5562
5563 bitmap_from_u64(_mask, mask);
5564 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5565 u64 val;
5566
5567 val = perf_reg_value(regs, bit);
5568 perf_output_put(handle, val);
5569 }
5570}
5571
5572static void perf_sample_regs_user(struct perf_regs *regs_user,
5573 struct pt_regs *regs,
5574 struct pt_regs *regs_user_copy)
5575{
5576 if (user_mode(regs)) {
5577 regs_user->abi = perf_reg_abi(current);
5578 regs_user->regs = regs;
5579 } else if (current->mm) {
5580 perf_get_regs_user(regs_user, regs, regs_user_copy);
5581 } else {
5582 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5583 regs_user->regs = NULL;
5584 }
5585}
5586
5587static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5588 struct pt_regs *regs)
5589{
5590 regs_intr->regs = regs;
5591 regs_intr->abi = perf_reg_abi(current);
5592}
5593
5594
5595
5596
5597
5598
5599
5600
5601
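/*
 * Remaining user stack size: the distance from the user stack pointer up to
 * TASK_SIZE, or 0 if no user stack pointer is available.
 */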
5602static u64 perf_ustack_task_size(struct pt_regs *regs)
5603{
5604 unsigned long addr = perf_user_stack_pointer(regs);
5605
5606 if (!addr || addr >= TASK_SIZE)
5607 return 0;
5608
5609 return TASK_SIZE - addr;
5610}
5611
5612static u16
5613perf_sample_ustack_size(u16 stack_size, u16 header_size,
5614 struct pt_regs *regs)
5615{
5616 u64 task_size;
5617
5618
5619 if (!regs)
5620 return 0;
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
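	/*
	 * Clamp the dump to the space left on the user stack and to what
	 * still fits in the u16 sample size together with the two u64 size
	 * fields emitted below.
	 */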
5632 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5633 stack_size = min(stack_size, (u16) task_size);
5634
5635
5636 header_size += 2 * sizeof(u64);
5637
5638
5639 if ((u16) (header_size + stack_size) < header_size) {
5640
5641
5642
5643
5644 stack_size = USHRT_MAX - header_size - sizeof(u64);
5645 stack_size = round_up(stack_size, sizeof(u64));
5646 }
5647
5648 return stack_size;
5649}
5650
5651static void
5652perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5653 struct pt_regs *regs)
5654{
5655
5656 if (!regs) {
5657 u64 size = 0;
5658 perf_output_put(handle, size);
5659 } else {
5660 unsigned long sp;
5661 unsigned int rem;
5662 u64 dyn_size;
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
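		/*
		 * Layout: static dump size (u64), the stack data itself,
		 * then the dynamic size (u64) = bytes actually copied.
		 */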
5676 perf_output_put(handle, dump_size);
5677
5678
5679 sp = perf_user_stack_pointer(regs);
5680 rem = __output_copy_user(handle, (void *) sp, dump_size);
5681 dyn_size = dump_size - rem;
5682
5683 perf_output_skip(handle, rem);
5684
5685
5686 perf_output_put(handle, dyn_size);
5687 }
5688}
5689
5690static void __perf_event_header__init_id(struct perf_event_header *header,
5691 struct perf_sample_data *data,
5692 struct perf_event *event)
5693{
5694 u64 sample_type = event->attr.sample_type;
5695
5696 data->type = sample_type;
5697 header->size += event->id_header_size;
5698
5699 if (sample_type & PERF_SAMPLE_TID) {
5700
5701 data->tid_entry.pid = perf_event_pid(event, current);
5702 data->tid_entry.tid = perf_event_tid(event, current);
5703 }
5704
5705 if (sample_type & PERF_SAMPLE_TIME)
5706 data->time = perf_event_clock(event);
5707
5708 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5709 data->id = primary_event_id(event);
5710
5711 if (sample_type & PERF_SAMPLE_STREAM_ID)
5712 data->stream_id = event->id;
5713
5714 if (sample_type & PERF_SAMPLE_CPU) {
5715 data->cpu_entry.cpu = raw_smp_processor_id();
5716 data->cpu_entry.reserved = 0;
5717 }
5718}
5719
5720void perf_event_header__init_id(struct perf_event_header *header,
5721 struct perf_sample_data *data,
5722 struct perf_event *event)
5723{
5724 if (event->attr.sample_id_all)
5725 __perf_event_header__init_id(header, data, event);
5726}
5727
5728static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5729 struct perf_sample_data *data)
5730{
5731 u64 sample_type = data->type;
5732
5733 if (sample_type & PERF_SAMPLE_TID)
5734 perf_output_put(handle, data->tid_entry);
5735
5736 if (sample_type & PERF_SAMPLE_TIME)
5737 perf_output_put(handle, data->time);
5738
5739 if (sample_type & PERF_SAMPLE_ID)
5740 perf_output_put(handle, data->id);
5741
5742 if (sample_type & PERF_SAMPLE_STREAM_ID)
5743 perf_output_put(handle, data->stream_id);
5744
5745 if (sample_type & PERF_SAMPLE_CPU)
5746 perf_output_put(handle, data->cpu_entry);
5747
5748 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5749 perf_output_put(handle, data->id);
5750}
5751
5752void perf_event__output_id_sample(struct perf_event *event,
5753 struct perf_output_handle *handle,
5754 struct perf_sample_data *sample)
5755{
5756 if (event->attr.sample_id_all)
5757 __perf_event__output_id_sample(handle, sample);
5758}
5759
5760static void perf_output_read_one(struct perf_output_handle *handle,
5761 struct perf_event *event,
5762 u64 enabled, u64 running)
5763{
5764 u64 read_format = event->attr.read_format;
5765 u64 values[4];
5766 int n = 0;
5767
5768 values[n++] = perf_event_count(event);
5769 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5770 values[n++] = enabled +
5771 atomic64_read(&event->child_total_time_enabled);
5772 }
5773 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5774 values[n++] = running +
5775 atomic64_read(&event->child_total_time_running);
5776 }
5777 if (read_format & PERF_FORMAT_ID)
5778 values[n++] = primary_event_id(event);
5779
5780 __output_copy(handle, values, n * sizeof(u64));
5781}
5782
5783static void perf_output_read_group(struct perf_output_handle *handle,
5784 struct perf_event *event,
5785 u64 enabled, u64 running)
5786{
5787 struct perf_event *leader = event->group_leader, *sub;
5788 u64 read_format = event->attr.read_format;
5789 u64 values[5];
5790 int n = 0;
5791
5792 values[n++] = 1 + leader->nr_siblings;
5793
5794 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5795 values[n++] = enabled;
5796
5797 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5798 values[n++] = running;
5799
5800 if (leader != event)
5801 leader->pmu->read(leader);
5802
5803 values[n++] = perf_event_count(leader);
5804 if (read_format & PERF_FORMAT_ID)
5805 values[n++] = primary_event_id(leader);
5806
5807 __output_copy(handle, values, n * sizeof(u64));
5808
5809 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5810 n = 0;
5811
5812 if ((sub != event) &&
5813 (sub->state == PERF_EVENT_STATE_ACTIVE))
5814 sub->pmu->read(sub);
5815
5816 values[n++] = perf_event_count(sub);
5817 if (read_format & PERF_FORMAT_ID)
5818 values[n++] = primary_event_id(sub);
5819
5820 __output_copy(handle, values, n * sizeof(u64));
5821 }
5822}
5823
5824#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5825 PERF_FORMAT_TOTAL_TIME_RUNNING)
5826
5827
5828
5829
5830
5831
5832
5833
5834static void perf_output_read(struct perf_output_handle *handle,
5835 struct perf_event *event)
5836{
5837 u64 enabled = 0, running = 0, now;
5838 u64 read_format = event->attr.read_format;
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
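	/*
	 * Compute total_time_enabled/running from the last snapshot instead
	 * of updating the context time; this path can run from NMI context.
	 */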
5849 if (read_format & PERF_FORMAT_TOTAL_TIMES)
5850 calc_timer_values(event, &now, &enabled, &running);
5851
5852 if (event->attr.read_format & PERF_FORMAT_GROUP)
5853 perf_output_read_group(handle, event, enabled, running);
5854 else
5855 perf_output_read_one(handle, event, enabled, running);
5856}
5857
5858void perf_output_sample(struct perf_output_handle *handle,
5859 struct perf_event_header *header,
5860 struct perf_sample_data *data,
5861 struct perf_event *event)
5862{
5863 u64 sample_type = data->type;
5864
5865 perf_output_put(handle, *header);
5866
5867 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5868 perf_output_put(handle, data->id);
5869
5870 if (sample_type & PERF_SAMPLE_IP)
5871 perf_output_put(handle, data->ip);
5872
5873 if (sample_type & PERF_SAMPLE_TID)
5874 perf_output_put(handle, data->tid_entry);
5875
5876 if (sample_type & PERF_SAMPLE_TIME)
5877 perf_output_put(handle, data->time);
5878
5879 if (sample_type & PERF_SAMPLE_ADDR)
5880 perf_output_put(handle, data->addr);
5881
5882 if (sample_type & PERF_SAMPLE_ID)
5883 perf_output_put(handle, data->id);
5884
5885 if (sample_type & PERF_SAMPLE_STREAM_ID)
5886 perf_output_put(handle, data->stream_id);
5887
5888 if (sample_type & PERF_SAMPLE_CPU)
5889 perf_output_put(handle, data->cpu_entry);
5890
5891 if (sample_type & PERF_SAMPLE_PERIOD)
5892 perf_output_put(handle, data->period);
5893
5894 if (sample_type & PERF_SAMPLE_READ)
5895 perf_output_read(handle, event);
5896
5897 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5898 if (data->callchain) {
5899 int size = 1;
5900
5901 size += data->callchain->nr;
5902
5903
5904 size *= sizeof(u64);
5905
5906 __output_copy(handle, data->callchain, size);
5907 } else {
5908 u64 nr = 0;
5909 perf_output_put(handle, nr);
5910 }
5911 }
5912
5913 if (sample_type & PERF_SAMPLE_RAW) {
5914 struct perf_raw_record *raw = data->raw;
5915
5916 if (raw) {
5917 struct perf_raw_frag *frag = &raw->frag;
5918
5919 perf_output_put(handle, raw->size);
5920 do {
5921 if (frag->copy) {
5922 __output_custom(handle, frag->copy,
5923 frag->data, frag->size);
5924 } else {
5925 __output_copy(handle, frag->data,
5926 frag->size);
5927 }
5928 if (perf_raw_frag_last(frag))
5929 break;
5930 frag = frag->next;
5931 } while (1);
5932 if (frag->pad)
5933 __output_skip(handle, NULL, frag->pad);
5934 } else {
5935 struct {
5936 u32 size;
5937 u32 data;
5938 } raw = {
5939 .size = sizeof(u32),
5940 .data = 0,
5941 };
5942 perf_output_put(handle, raw);
5943 }
5944 }
5945
5946 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5947 if (data->br_stack) {
5948 size_t size;
5949
5950 size = data->br_stack->nr
5951 * sizeof(struct perf_branch_entry);
5952
5953 perf_output_put(handle, data->br_stack->nr);
5954 perf_output_copy(handle, data->br_stack->entries, size);
5955 } else {
5956
5957
5958
5959 u64 nr = 0;
5960 perf_output_put(handle, nr);
5961 }
5962 }
5963
5964 if (sample_type & PERF_SAMPLE_REGS_USER) {
5965 u64 abi = data->regs_user.abi;
5966
5967
5968
5969
5970
5971 perf_output_put(handle, abi);
5972
5973 if (abi) {
5974 u64 mask = event->attr.sample_regs_user;
5975 perf_output_sample_regs(handle,
5976 data->regs_user.regs,
5977 mask);
5978 }
5979 }
5980
5981 if (sample_type & PERF_SAMPLE_STACK_USER) {
5982 perf_output_sample_ustack(handle,
5983 data->stack_user_size,
5984 data->regs_user.regs);
5985 }
5986
5987 if (sample_type & PERF_SAMPLE_WEIGHT)
5988 perf_output_put(handle, data->weight);
5989
5990 if (sample_type & PERF_SAMPLE_DATA_SRC)
5991 perf_output_put(handle, data->data_src.val);
5992
5993 if (sample_type & PERF_SAMPLE_TRANSACTION)
5994 perf_output_put(handle, data->txn);
5995
5996 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5997 u64 abi = data->regs_intr.abi;
5998
5999
6000
6001
6002 perf_output_put(handle, abi);
6003
6004 if (abi) {
6005 u64 mask = event->attr.sample_regs_intr;
6006
6007 perf_output_sample_regs(handle,
6008 data->regs_intr.regs,
6009 mask);
6010 }
6011 }
6012
6013 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6014 perf_output_put(handle, data->phys_addr);
6015
6016 if (!event->attr.watermark) {
6017 int wakeup_events = event->attr.wakeup_events;
6018
6019 if (wakeup_events) {
6020 struct ring_buffer *rb = handle->rb;
6021 int events = local_inc_return(&rb->events);
6022
6023 if (events >= wakeup_events) {
6024 local_sub(wakeup_events, &rb->events);
6025 local_inc(&rb->wakeup);
6026 }
6027 }
6028 }
6029}
6030
6031static u64 perf_virt_to_phys(u64 virt)
6032{
6033 u64 phys_addr = 0;
6034 struct page *p = NULL;
6035
6036 if (!virt)
6037 return 0;
6038
6039 if (virt >= TASK_SIZE) {
6040
6041 if (virt_addr_valid((void *)(uintptr_t)virt) &&
6042 !(virt >= VMALLOC_START && virt < VMALLOC_END))
6043 phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
6044 } else {
6045
6046
6047
6048
6049
6050
6051
6052 if ((current->mm != NULL) &&
6053 (__get_user_pages_fast(virt, 1, 0, &p) == 1))
6054 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
6055
6056 if (p)
6057 put_page(p);
6058 }
6059
6060 return phys_addr;
6061}
6062
6063void perf_prepare_sample(struct perf_event_header *header,
6064 struct perf_sample_data *data,
6065 struct perf_event *event,
6066 struct pt_regs *regs)
6067{
6068 u64 sample_type = event->attr.sample_type;
6069
6070 header->type = PERF_RECORD_SAMPLE;
6071 header->size = sizeof(*header) + event->header_size;
6072
6073 header->misc = 0;
6074 header->misc |= perf_misc_flags(regs);
6075
6076 __perf_event_header__init_id(header, data, event);
6077
6078 if (sample_type & PERF_SAMPLE_IP)
6079 data->ip = perf_instruction_pointer(regs);
6080
6081 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6082 int size = 1;
6083
6084 data->callchain = perf_callchain(event, regs);
6085
6086 if (data->callchain)
6087 size += data->callchain->nr;
6088
6089 header->size += size * sizeof(u64);
6090 }
6091
6092 if (sample_type & PERF_SAMPLE_RAW) {
6093 struct perf_raw_record *raw = data->raw;
6094 int size;
6095
6096 if (raw) {
6097 struct perf_raw_frag *frag = &raw->frag;
6098 u32 sum = 0;
6099
6100 do {
6101 sum += frag->size;
6102 if (perf_raw_frag_last(frag))
6103 break;
6104 frag = frag->next;
6105 } while (1);
6106
6107 size = round_up(sum + sizeof(u32), sizeof(u64));
6108 raw->size = size - sizeof(u32);
6109 frag->pad = raw->size - sum;
6110 } else {
6111 size = sizeof(u64);
6112 }
6113
6114 header->size += size;
6115 }
6116
6117 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6118 int size = sizeof(u64);
6119 if (data->br_stack) {
6120 size += data->br_stack->nr
6121 * sizeof(struct perf_branch_entry);
6122 }
6123 header->size += size;
6124 }
6125
6126 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6127 perf_sample_regs_user(&data->regs_user, regs,
6128 &data->regs_user_copy);
6129
6130 if (sample_type & PERF_SAMPLE_REGS_USER) {
6131
6132 int size = sizeof(u64);
6133
6134 if (data->regs_user.regs) {
6135 u64 mask = event->attr.sample_regs_user;
6136 size += hweight64(mask) * sizeof(u64);
6137 }
6138
6139 header->size += size;
6140 }
6141
6142 if (sample_type & PERF_SAMPLE_STACK_USER) {
6143
6144
6145
6146
6147
6148
6149 u16 stack_size = event->attr.sample_stack_user;
6150 u16 size = sizeof(u64);
6151
6152 stack_size = perf_sample_ustack_size(stack_size, header->size,
6153 data->regs_user.regs);
6154
6155
6156
6157
6158
6159
6160 if (stack_size)
6161 size += sizeof(u64) + stack_size;
6162
6163 data->stack_user_size = stack_size;
6164 header->size += size;
6165 }
6166
6167 if (sample_type & PERF_SAMPLE_REGS_INTR) {
6168
6169 int size = sizeof(u64);
6170
6171 perf_sample_regs_intr(&data->regs_intr, regs);
6172
6173 if (data->regs_intr.regs) {
6174 u64 mask = event->attr.sample_regs_intr;
6175
6176 size += hweight64(mask) * sizeof(u64);
6177 }
6178
6179 header->size += size;
6180 }
6181
6182 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6183 data->phys_addr = perf_virt_to_phys(data->addr);
6184}
6185
6186static __always_inline void
6187__perf_event_output(struct perf_event *event,
6188 struct perf_sample_data *data,
6189 struct pt_regs *regs,
6190 int (*output_begin)(struct perf_output_handle *,
6191 struct perf_event *,
6192 unsigned int))
6193{
6194 struct perf_output_handle handle;
6195 struct perf_event_header header;
6196
6197
6198 rcu_read_lock();
6199
6200 perf_prepare_sample(&header, data, event, regs);
6201
6202 if (output_begin(&handle, event, header.size))
6203 goto exit;
6204
6205 perf_output_sample(&handle, &header, data, event);
6206
6207 perf_output_end(&handle);
6208
6209exit:
6210 rcu_read_unlock();
6211}
6212
6213void
6214perf_event_output_forward(struct perf_event *event,
6215 struct perf_sample_data *data,
6216 struct pt_regs *regs)
6217{
6218 __perf_event_output(event, data, regs, perf_output_begin_forward);
6219}
6220
6221void
6222perf_event_output_backward(struct perf_event *event,
6223 struct perf_sample_data *data,
6224 struct pt_regs *regs)
6225{
6226 __perf_event_output(event, data, regs, perf_output_begin_backward);
6227}
6228
6229void
6230perf_event_output(struct perf_event *event,
6231 struct perf_sample_data *data,
6232 struct pt_regs *regs)
6233{
6234 __perf_event_output(event, data, regs, perf_output_begin);
6235}
6236
6237
6238
6239
6240
6241struct perf_read_event {
6242 struct perf_event_header header;
6243
6244 u32 pid;
6245 u32 tid;
6246};
6247
6248static void
6249perf_event_read_event(struct perf_event *event,
6250 struct task_struct *task)
6251{
6252 struct perf_output_handle handle;
6253 struct perf_sample_data sample;
6254 struct perf_read_event read_event = {
6255 .header = {
6256 .type = PERF_RECORD_READ,
6257 .misc = 0,
6258 .size = sizeof(read_event) + event->read_size,
6259 },
6260 .pid = perf_event_pid(event, task),
6261 .tid = perf_event_tid(event, task),
6262 };
6263 int ret;
6264
6265 perf_event_header__init_id(&read_event.header, &sample, event);
6266 ret = perf_output_begin(&handle, event, read_event.header.size);
6267 if (ret)
6268 return;
6269
6270 perf_output_put(&handle, read_event);
6271 perf_output_read(&handle, event);
6272 perf_event__output_id_sample(event, &handle, &sample);
6273
6274 perf_output_end(&handle);
6275}
6276
6277typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6278
6279static void
6280perf_iterate_ctx(struct perf_event_context *ctx,
6281 perf_iterate_f output,
6282 void *data, bool all)
6283{
6284 struct perf_event *event;
6285
6286 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6287 if (!all) {
6288 if (event->state < PERF_EVENT_STATE_INACTIVE)
6289 continue;
6290 if (!event_filter_match(event))
6291 continue;
6292 }
6293
6294 output(event, data);
6295 }
6296}
6297
6298static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6299{
6300 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6301 struct perf_event *event;
6302
6303 list_for_each_entry_rcu(event, &pel->list, sb_list) {
6304
6305
6306
6307
6308
6309 if (!smp_load_acquire(&event->ctx))
6310 continue;
6311
6312 if (event->state < PERF_EVENT_STATE_INACTIVE)
6313 continue;
6314 if (!event_filter_match(event))
6315 continue;
6316 output(event, data);
6317 }
6318}
6319
6320
6321
6322
6323
6324
6325
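/*
 * Iterate all events that need side-band records: the given task context
 * only (if set), otherwise the per-CPU side-band list plus the current
 * task's contexts.
 */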
6326static void
6327perf_iterate_sb(perf_iterate_f output, void *data,
6328 struct perf_event_context *task_ctx)
6329{
6330 struct perf_event_context *ctx;
6331 int ctxn;
6332
6333 rcu_read_lock();
6334 preempt_disable();
6335
6336
6337
6338
6339
6340
6341 if (task_ctx) {
6342 perf_iterate_ctx(task_ctx, output, data, false);
6343 goto done;
6344 }
6345
6346 perf_iterate_sb_cpu(output, data);
6347
6348 for_each_task_context_nr(ctxn) {
6349 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6350 if (ctx)
6351 perf_iterate_ctx(ctx, output, data, false);
6352 }
6353done:
6354 preempt_enable();
6355 rcu_read_unlock();
6356}
6357
6358
6359
6360
6361
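/*
 * On exec, clear all file-based (inode) address filters; they get
 * re-resolved if the objects are mmapped again.
 */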
6362static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6363{
6364 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6365 struct perf_addr_filter *filter;
6366 unsigned int restart = 0, count = 0;
6367 unsigned long flags;
6368
6369 if (!has_addr_filter(event))
6370 return;
6371
6372 raw_spin_lock_irqsave(&ifh->lock, flags);
6373 list_for_each_entry(filter, &ifh->list, entry) {
6374 if (filter->inode) {
6375 event->addr_filters_offs[count] = 0;
6376 restart++;
6377 }
6378
6379 count++;
6380 }
6381
6382 if (restart)
6383 event->addr_filters_gen++;
6384 raw_spin_unlock_irqrestore(&ifh->lock, flags);
6385
6386 if (restart)
6387 perf_event_stop(event, 1);
6388}
6389
6390void perf_event_exec(void)
6391{
6392 struct perf_event_context *ctx;
6393 int ctxn;
6394
6395 rcu_read_lock();
6396 for_each_task_context_nr(ctxn) {
6397 ctx = current->perf_event_ctxp[ctxn];
6398 if (!ctx)
6399 continue;
6400
6401 perf_event_enable_on_exec(ctxn);
6402
6403 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6404 true);
6405 }
6406 rcu_read_unlock();
6407}
6408
6409struct remote_output {
6410 struct ring_buffer *rb;
6411 int err;
6412};
6413
6414static void __perf_event_output_stop(struct perf_event *event, void *data)
6415{
6416 struct perf_event *parent = event->parent;
6417 struct remote_output *ro = data;
6418 struct ring_buffer *rb = ro->rb;
6419 struct stop_event_data sd = {
6420 .event = event,
6421 };
6422
6423 if (!has_aux(event))
6424 return;
6425
6426 if (!parent)
6427 parent = event;
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
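	/*
	 * With inheritance the parent links to the ring buffer while a child
	 * may be the one writing to it, so compare the parent's rb against
	 * the buffer being torn down before stopping the event.
	 */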
6439 if (rcu_dereference(parent->rb) == rb)
6440 ro->err = __perf_event_stop(&sd);
6441}
6442
6443static int __perf_pmu_output_stop(void *info)
6444{
6445 struct perf_event *event = info;
6446 struct pmu *pmu = event->pmu;
6447 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6448 struct remote_output ro = {
6449 .rb = event->rb,
6450 };
6451
6452 rcu_read_lock();
6453 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6454 if (cpuctx->task_ctx)
6455 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6456 &ro, false);
6457 rcu_read_unlock();
6458
6459 return ro.err;
6460}
6461
6462static void perf_pmu_output_stop(struct perf_event *event)
6463{
6464 struct perf_event *iter;
6465 int err, cpu;
6466
6467restart:
6468 rcu_read_lock();
6469 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6470
6471
6472
6473
6474
6475
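		/*
		 * For per-task events fall back to the CPU they are currently
		 * running on; skip events that are not running anywhere.
		 */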
6476 cpu = iter->cpu;
6477 if (cpu == -1)
6478 cpu = READ_ONCE(iter->oncpu);
6479
6480 if (cpu == -1)
6481 continue;
6482
6483 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6484 if (err == -EAGAIN) {
6485 rcu_read_unlock();
6486 goto restart;
6487 }
6488 }
6489 rcu_read_unlock();
6490}
6491
6492
6493
6494
6495
6496
6497
6498struct perf_task_event {
6499 struct task_struct *task;
6500 struct perf_event_context *task_ctx;
6501
6502 struct {
6503 struct perf_event_header header;
6504
6505 u32 pid;
6506 u32 ppid;
6507 u32 tid;
6508 u32 ptid;
6509 u64 time;
6510 } event_id;
6511};
6512
6513static int perf_event_task_match(struct perf_event *event)
6514{
6515 return event->attr.comm || event->attr.mmap ||
6516 event->attr.mmap2 || event->attr.mmap_data ||
6517 event->attr.task;
6518}
6519
6520static void perf_event_task_output(struct perf_event *event,
6521 void *data)
6522{
6523 struct perf_task_event *task_event = data;
6524 struct perf_output_handle handle;
6525 struct perf_sample_data sample;
6526 struct task_struct *task = task_event->task;
6527 int ret, size = task_event->event_id.header.size;
6528
6529 if (!perf_event_task_match(event))
6530 return;
6531
6532 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6533
6534 ret = perf_output_begin(&handle, event,
6535 task_event->event_id.header.size);
6536 if (ret)
6537 goto out;
6538
6539 task_event->event_id.pid = perf_event_pid(event, task);
6540 task_event->event_id.ppid = perf_event_pid(event, current);
6541
6542 task_event->event_id.tid = perf_event_tid(event, task);
6543 task_event->event_id.ptid = perf_event_tid(event, current);
6544
6545 task_event->event_id.time = perf_event_clock(event);
6546
6547 perf_output_put(&handle, task_event->event_id);
6548
6549 perf_event__output_id_sample(event, &handle, &sample);
6550
6551 perf_output_end(&handle);
6552out:
6553 task_event->event_id.header.size = size;
6554}
6555
6556static void perf_event_task(struct task_struct *task,
6557 struct perf_event_context *task_ctx,
6558 int new)
6559{
6560 struct perf_task_event task_event;
6561
6562 if (!atomic_read(&nr_comm_events) &&
6563 !atomic_read(&nr_mmap_events) &&
6564 !atomic_read(&nr_task_events))
6565 return;
6566
6567 task_event = (struct perf_task_event){
6568 .task = task,
6569 .task_ctx = task_ctx,
6570 .event_id = {
6571 .header = {
6572 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6573 .misc = 0,
6574 .size = sizeof(task_event.event_id),
6575 },
6576
6577
6578
6579
6580
6581 },
6582 };
6583
6584 perf_iterate_sb(perf_event_task_output,
6585 &task_event,
6586 task_ctx);
6587}
6588
6589void perf_event_fork(struct task_struct *task)
6590{
6591 perf_event_task(task, NULL, 1);
6592 perf_event_namespaces(task);
6593}
6594
6595
6596
6597
6598
6599struct perf_comm_event {
6600 struct task_struct *task;
6601 char *comm;
6602 int comm_size;
6603
6604 struct {
6605 struct perf_event_header header;
6606
6607 u32 pid;
6608 u32 tid;
6609 } event_id;
6610};
6611
6612static int perf_event_comm_match(struct perf_event *event)
6613{
6614 return event->attr.comm;
6615}
6616
6617static void perf_event_comm_output(struct perf_event *event,
6618 void *data)
6619{
6620 struct perf_comm_event *comm_event = data;
6621 struct perf_output_handle handle;
6622 struct perf_sample_data sample;
6623 int size = comm_event->event_id.header.size;
6624 int ret;
6625
6626 if (!perf_event_comm_match(event))
6627 return;
6628
6629 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
6630 ret = perf_output_begin(&handle, event,
6631 comm_event->event_id.header.size);
6632
6633 if (ret)
6634 goto out;
6635
6636 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
6637 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
6638
6639 perf_output_put(&handle, comm_event->event_id);
6640 __output_copy(&handle, comm_event->comm,
6641 comm_event->comm_size);
6642
6643 perf_event__output_id_sample(event, &handle, &sample);
6644
6645 perf_output_end(&handle);
6646out:
6647 comm_event->event_id.header.size = size;
6648}
6649
6650static void perf_event_comm_event(struct perf_comm_event *comm_event)
6651{
6652 char comm[TASK_COMM_LEN];
6653 unsigned int size;
6654
6655 memset(comm, 0, sizeof(comm));
6656 strlcpy(comm, comm_event->task->comm, sizeof(comm));
6657 size = ALIGN(strlen(comm)+1, sizeof(u64));
6658
6659 comm_event->comm = comm;
6660 comm_event->comm_size = size;
6661
6662 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
6663
6664 perf_iterate_sb(perf_event_comm_output,
6665 comm_event,
6666 NULL);
6667}
6668
6669void perf_event_comm(struct task_struct *task, bool exec)
6670{
6671 struct perf_comm_event comm_event;
6672
6673 if (!atomic_read(&nr_comm_events))
6674 return;
6675
6676 comm_event = (struct perf_comm_event){
6677 .task = task,
6678
6679
6680 .event_id = {
6681 .header = {
6682 .type = PERF_RECORD_COMM,
6683 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
6684
6685 },
6686
6687
6688 },
6689 };
6690
6691 perf_event_comm_event(&comm_event);
6692}
6693
6694
6695
6696
6697
6698struct perf_namespaces_event {
6699 struct task_struct *task;
6700
6701 struct {
6702 struct perf_event_header header;
6703
6704 u32 pid;
6705 u32 tid;
6706 u64 nr_namespaces;
6707 struct perf_ns_link_info link_info[NR_NAMESPACES];
6708 } event_id;
6709};
6710
6711static int perf_event_namespaces_match(struct perf_event *event)
6712{
6713 return event->attr.namespaces;
6714}
6715
6716static void perf_event_namespaces_output(struct perf_event *event,
6717 void *data)
6718{
6719 struct perf_namespaces_event *namespaces_event = data;
6720 struct perf_output_handle handle;
6721 struct perf_sample_data sample;
6722 int ret;
6723
6724 if (!perf_event_namespaces_match(event))
6725 return;
6726
6727 perf_event_header__init_id(&namespaces_event->event_id.header,
6728 &sample, event);
6729 ret = perf_output_begin(&handle, event,
6730 namespaces_event->event_id.header.size);
6731 if (ret)
6732 return;
6733
6734 namespaces_event->event_id.pid = perf_event_pid(event,
6735 namespaces_event->task);
6736 namespaces_event->event_id.tid = perf_event_tid(event,
6737 namespaces_event->task);
6738
6739 perf_output_put(&handle, namespaces_event->event_id);
6740
6741 perf_event__output_id_sample(event, &handle, &sample);
6742
6743 perf_output_end(&handle);
6744}
6745
6746static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
6747 struct task_struct *task,
6748 const struct proc_ns_operations *ns_ops)
6749{
6750 struct path ns_path;
6751 struct inode *ns_inode;
6752 void *error;
6753
6754 error = ns_get_path(&ns_path, task, ns_ops);
6755 if (!error) {
6756 ns_inode = ns_path.dentry->d_inode;
6757 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
6758 ns_link_info->ino = ns_inode->i_ino;
6759 }
6760}
6761
6762void perf_event_namespaces(struct task_struct *task)
6763{
6764 struct perf_namespaces_event namespaces_event;
6765 struct perf_ns_link_info *ns_link_info;
6766
6767 if (!atomic_read(&nr_namespaces_events))
6768 return;
6769
6770 namespaces_event = (struct perf_namespaces_event){
6771 .task = task,
6772 .event_id = {
6773 .header = {
6774 .type = PERF_RECORD_NAMESPACES,
6775 .misc = 0,
6776 .size = sizeof(namespaces_event.event_id),
6777 },
6778
6779
6780 .nr_namespaces = NR_NAMESPACES,
6781
6782 },
6783 };
6784
6785 ns_link_info = namespaces_event.event_id.link_info;
6786
6787 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
6788 task, &mntns_operations);
6789
6790#ifdef CONFIG_USER_NS
6791 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
6792 task, &userns_operations);
6793#endif
6794#ifdef CONFIG_NET_NS
6795 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
6796 task, &netns_operations);
6797#endif
6798#ifdef CONFIG_UTS_NS
6799 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
6800 task, &utsns_operations);
6801#endif
6802#ifdef CONFIG_IPC_NS
6803 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
6804 task, &ipcns_operations);
6805#endif
6806#ifdef CONFIG_PID_NS
6807 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
6808 task, &pidns_operations);
6809#endif
6810#ifdef CONFIG_CGROUPS
6811 perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
6812 task, &cgroupns_operations);
6813#endif
6814
6815 perf_iterate_sb(perf_event_namespaces_output,
6816 &namespaces_event,
6817 NULL);
6818}
6819
6820
6821
6822
6823
6824struct perf_mmap_event {
6825 struct vm_area_struct *vma;
6826
6827 const char *file_name;
6828 int file_size;
6829 int maj, min;
6830 u64 ino;
6831 u64 ino_generation;
6832 u32 prot, flags;
6833
6834 struct {
6835 struct perf_event_header header;
6836
6837 u32 pid;
6838 u32 tid;
6839 u64 start;
6840 u64 len;
6841 u64 pgoff;
6842 } event_id;
6843};
6844
6845static int perf_event_mmap_match(struct perf_event *event,
6846 void *data)
6847{
6848 struct perf_mmap_event *mmap_event = data;
6849 struct vm_area_struct *vma = mmap_event->vma;
6850 int executable = vma->vm_flags & VM_EXEC;
6851
6852 return (!executable && event->attr.mmap_data) ||
6853 (executable && (event->attr.mmap || event->attr.mmap2));
6854}
6855
6856static void perf_event_mmap_output(struct perf_event *event,
6857 void *data)
6858{
6859 struct perf_mmap_event *mmap_event = data;
6860 struct perf_output_handle handle;
6861 struct perf_sample_data sample;
6862 int size = mmap_event->event_id.header.size;
6863 int ret;
6864
6865 if (!perf_event_mmap_match(event, data))
6866 return;
6867
6868 if (event->attr.mmap2) {
6869 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
6870 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
6871 mmap_event->event_id.header.size += sizeof(mmap_event->min);
6872 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
6873 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
6874 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
6875 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
6876 }
6877
6878 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
6879 ret = perf_output_begin(&handle, event,
6880 mmap_event->event_id.header.size);
6881 if (ret)
6882 goto out;
6883
6884 mmap_event->event_id.pid = perf_event_pid(event, current);
6885 mmap_event->event_id.tid = perf_event_tid(event, current);
6886
6887 perf_output_put(&handle, mmap_event->event_id);
6888
6889 if (event->attr.mmap2) {
6890 perf_output_put(&handle, mmap_event->maj);
6891 perf_output_put(&handle, mmap_event->min);
6892 perf_output_put(&handle, mmap_event->ino);
6893 perf_output_put(&handle, mmap_event->ino_generation);
6894 perf_output_put(&handle, mmap_event->prot);
6895 perf_output_put(&handle, mmap_event->flags);
6896 }
6897
6898 __output_copy(&handle, mmap_event->file_name,
6899 mmap_event->file_size);
6900
6901 perf_event__output_id_sample(event, &handle, &sample);
6902
6903 perf_output_end(&handle);
6904out:
6905 mmap_event->event_id.header.size = size;
6906}
6907
6908static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
6909{
6910 struct vm_area_struct *vma = mmap_event->vma;
6911 struct file *file = vma->vm_file;
6912 int maj = 0, min = 0;
6913 u64 ino = 0, gen = 0;
6914 u32 prot = 0, flags = 0;
6915 unsigned int size;
6916 char tmp[16];
6917 char *buf = NULL;
6918 char *name;
6919
6920 if (vma->vm_flags & VM_READ)
6921 prot |= PROT_READ;
6922 if (vma->vm_flags & VM_WRITE)
6923 prot |= PROT_WRITE;
6924 if (vma->vm_flags & VM_EXEC)
6925 prot |= PROT_EXEC;
6926
6927 if (vma->vm_flags & VM_MAYSHARE)
6928 flags = MAP_SHARED;
6929 else
6930 flags = MAP_PRIVATE;
6931
6932 if (vma->vm_flags & VM_DENYWRITE)
6933 flags |= MAP_DENYWRITE;
6934 if (vma->vm_flags & VM_MAYEXEC)
6935 flags |= MAP_EXECUTABLE;
6936 if (vma->vm_flags & VM_LOCKED)
6937 flags |= MAP_LOCKED;
6938 if (vma->vm_flags & VM_HUGETLB)
6939 flags |= MAP_HUGETLB;
6940
6941 if (file) {
6942 struct inode *inode;
6943 dev_t dev;
6944
6945 buf = kmalloc(PATH_MAX, GFP_KERNEL);
6946 if (!buf) {
6947 name = "//enomem";
6948 goto cpy_name;
6949 }
6950
6951
6952
6953
6954
6955 name = file_path(file, buf, PATH_MAX - sizeof(u64));
6956 if (IS_ERR(name)) {
6957 name = "//toolong";
6958 goto cpy_name;
6959 }
6960 inode = file_inode(vma->vm_file);
6961 dev = inode->i_sb->s_dev;
6962 ino = inode->i_ino;
6963 gen = inode->i_generation;
6964 maj = MAJOR(dev);
6965 min = MINOR(dev);
6966
6967 goto got_name;
6968 } else {
6969 if (vma->vm_ops && vma->vm_ops->name) {
6970 name = (char *) vma->vm_ops->name(vma);
6971 if (name)
6972 goto cpy_name;
6973 }
6974
6975 name = (char *)arch_vma_name(vma);
6976 if (name)
6977 goto cpy_name;
6978
6979 if (vma->vm_start <= vma->vm_mm->start_brk &&
6980 vma->vm_end >= vma->vm_mm->brk) {
6981 name = "[heap]";
6982 goto cpy_name;
6983 }
6984 if (vma->vm_start <= vma->vm_mm->start_stack &&
6985 vma->vm_end >= vma->vm_mm->start_stack) {
6986 name = "[stack]";
6987 goto cpy_name;
6988 }
6989
6990 name = "//anon";
6991 goto cpy_name;
6992 }
6993
6994cpy_name:
6995 strlcpy(tmp, name, sizeof(tmp));
6996 name = tmp;
6997got_name:
6998
6999
7000
7001
7002
7003 size = strlen(name)+1;
7004 while (!IS_ALIGNED(size, sizeof(u64)))
7005 name[size++] = '\0';
7006
7007 mmap_event->file_name = name;
7008 mmap_event->file_size = size;
7009 mmap_event->maj = maj;
7010 mmap_event->min = min;
7011 mmap_event->ino = ino;
7012 mmap_event->ino_generation = gen;
7013 mmap_event->prot = prot;
7014 mmap_event->flags = flags;
7015
7016 if (!(vma->vm_flags & VM_EXEC))
7017 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
7018
7019 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
7020
7021 perf_iterate_sb(perf_event_mmap_output,
7022 mmap_event,
7023 NULL);
7024
7025 kfree(buf);
7026}
7027
7028
7029
7030
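/*
 * An address filter matches a file mapping when the inode matches and the
 * filter's [offset, offset + size) range overlaps the mapped range.
 */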
7031static bool perf_addr_filter_match(struct perf_addr_filter *filter,
7032 struct file *file, unsigned long offset,
7033 unsigned long size)
7034{
7035 if (filter->inode != file_inode(file))
7036 return false;
7037
7038 if (filter->offset > offset + size)
7039 return false;
7040
7041 if (filter->offset + filter->size < offset)
7042 return false;
7043
7044 return true;
7045}
7046
7047static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
7048{
7049 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7050 struct vm_area_struct *vma = data;
7051 unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
7052 struct file *file = vma->vm_file;
7053 struct perf_addr_filter *filter;
7054 unsigned int restart = 0, count = 0;
7055
7056 if (!has_addr_filter(event))
7057 return;
7058
7059 if (!file)
7060 return;
7061
7062 raw_spin_lock_irqsave(&ifh->lock, flags);
7063 list_for_each_entry(filter, &ifh->list, entry) {
7064 if (perf_addr_filter_match(filter, file, off,
7065 vma->vm_end - vma->vm_start)) {
7066 event->addr_filters_offs[count] = vma->vm_start;
7067 restart++;
7068 }
7069
7070 count++;
7071 }
7072
7073 if (restart)
7074 event->addr_filters_gen++;
7075 raw_spin_unlock_irqrestore(&ifh->lock, flags);
7076
7077 if (restart)
7078 perf_event_stop(event, 1);
7079}
7080
7081
7082
7083
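/*
 * Adjust the current task's address filters to a newly mmapped executable
 * vma.
 */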
7084static void perf_addr_filters_adjust(struct vm_area_struct *vma)
7085{
7086 struct perf_event_context *ctx;
7087 int ctxn;
7088
7089
7090
7091
7092
7093 if (!(vma->vm_flags & VM_EXEC))
7094 return;
7095
7096 rcu_read_lock();
7097 for_each_task_context_nr(ctxn) {
7098 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7099 if (!ctx)
7100 continue;
7101
7102 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
7103 }
7104 rcu_read_unlock();
7105}
7106
7107void perf_event_mmap(struct vm_area_struct *vma)
7108{
7109 struct perf_mmap_event mmap_event;
7110
7111 if (!atomic_read(&nr_mmap_events))
7112 return;
7113
7114 mmap_event = (struct perf_mmap_event){
7115 .vma = vma,
7116
7117
7118 .event_id = {
7119 .header = {
7120 .type = PERF_RECORD_MMAP,
7121 .misc = PERF_RECORD_MISC_USER,
7122
7123 },
7124
7125
7126 .start = vma->vm_start,
7127 .len = vma->vm_end - vma->vm_start,
7128 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
7129 },
7130
7131
7132
7133
7134
7135
7136 };
7137
7138 perf_addr_filters_adjust(vma);
7139 perf_event_mmap_event(&mmap_event);
7140}
7141
7142void perf_event_aux_event(struct perf_event *event, unsigned long head,
7143 unsigned long size, u64 flags)
7144{
7145 struct perf_output_handle handle;
7146 struct perf_sample_data sample;
7147 struct perf_aux_event {
7148 struct perf_event_header header;
7149 u64 offset;
7150 u64 size;
7151 u64 flags;
7152 } rec = {
7153 .header = {
7154 .type = PERF_RECORD_AUX,
7155 .misc = 0,
7156 .size = sizeof(rec),
7157 },
7158 .offset = head,
7159 .size = size,
7160 .flags = flags,
7161 };
7162 int ret;
7163
7164 perf_event_header__init_id(&rec.header, &sample, event);
7165 ret = perf_output_begin(&handle, event, rec.header.size);
7166
7167 if (ret)
7168 return;
7169
7170 perf_output_put(&handle, rec);
7171 perf_event__output_id_sample(event, &handle, &sample);
7172
7173 perf_output_end(&handle);
7174}
7175
7176
7177
7178
7179void perf_log_lost_samples(struct perf_event *event, u64 lost)
7180{
7181 struct perf_output_handle handle;
7182 struct perf_sample_data sample;
7183 int ret;
7184
7185 struct {
7186 struct perf_event_header header;
7187 u64 lost;
7188 } lost_samples_event = {
7189 .header = {
7190 .type = PERF_RECORD_LOST_SAMPLES,
7191 .misc = 0,
7192 .size = sizeof(lost_samples_event),
7193 },
7194 .lost = lost,
7195 };
7196
7197 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
7198
7199 ret = perf_output_begin(&handle, event,
7200 lost_samples_event.header.size);
7201 if (ret)
7202 return;
7203
7204 perf_output_put(&handle, lost_samples_event);
7205 perf_event__output_id_sample(event, &handle, &sample);
7206 perf_output_end(&handle);
7207}
7208
7209
7210
7211
7212
7213struct perf_switch_event {
7214 struct task_struct *task;
7215 struct task_struct *next_prev;
7216
7217 struct {
7218 struct perf_event_header header;
7219 u32 next_prev_pid;
7220 u32 next_prev_tid;
7221 } event_id;
7222};
7223
7224static int perf_event_switch_match(struct perf_event *event)
7225{
7226 return event->attr.context_switch;
7227}
7228
7229static void perf_event_switch_output(struct perf_event *event, void *data)
7230{
7231 struct perf_switch_event *se = data;
7232 struct perf_output_handle handle;
7233 struct perf_sample_data sample;
7234 int ret;
7235
7236 if (!perf_event_switch_match(event))
7237 return;
7238
7239
7240 if (event->ctx->task) {
7241 se->event_id.header.type = PERF_RECORD_SWITCH;
7242 se->event_id.header.size = sizeof(se->event_id.header);
7243 } else {
7244 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
7245 se->event_id.header.size = sizeof(se->event_id);
7246 se->event_id.next_prev_pid =
7247 perf_event_pid(event, se->next_prev);
7248 se->event_id.next_prev_tid =
7249 perf_event_tid(event, se->next_prev);
7250 }
7251
7252 perf_event_header__init_id(&se->event_id.header, &sample, event);
7253
7254 ret = perf_output_begin(&handle, event, se->event_id.header.size);
7255 if (ret)
7256 return;
7257
7258 if (event->ctx->task)
7259 perf_output_put(&handle, se->event_id.header);
7260 else
7261 perf_output_put(&handle, se->event_id);
7262
7263 perf_event__output_id_sample(event, &handle, &sample);
7264
7265 perf_output_end(&handle);
7266}
7267
7268static void perf_event_switch(struct task_struct *task,
7269 struct task_struct *next_prev, bool sched_in)
7270{
7271 struct perf_switch_event switch_event;
7272
7273
7274
7275 switch_event = (struct perf_switch_event){
7276 .task = task,
7277 .next_prev = next_prev,
7278 .event_id = {
7279 .header = {
7280
7281 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
7282
7283 },
7284
7285
7286 },
7287 };
7288
7289 perf_iterate_sb(perf_event_switch_output,
7290 &switch_event,
7291 NULL);
7292}
7293
7294
7295
7296
7297
7298static void perf_log_throttle(struct perf_event *event, int enable)
7299{
7300 struct perf_output_handle handle;
7301 struct perf_sample_data sample;
7302 int ret;
7303
7304 struct {
7305 struct perf_event_header header;
7306 u64 time;
7307 u64 id;
7308 u64 stream_id;
7309 } throttle_event = {
7310 .header = {
7311 .type = PERF_RECORD_THROTTLE,
7312 .misc = 0,
7313 .size = sizeof(throttle_event),
7314 },
7315 .time = perf_event_clock(event),
7316 .id = primary_event_id(event),
7317 .stream_id = event->id,
7318 };
7319
7320 if (enable)
7321 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
7322
7323 perf_event_header__init_id(&throttle_event.header, &sample, event);
7324
7325 ret = perf_output_begin(&handle, event,
7326 throttle_event.header.size);
7327 if (ret)
7328 return;
7329
7330 perf_output_put(&handle, throttle_event);
7331 perf_event__output_id_sample(event, &handle, &sample);
7332 perf_output_end(&handle);
7333}
7334
7335void perf_event_itrace_started(struct perf_event *event)
7336{
7337 event->attach_state |= PERF_ATTACH_ITRACE;
7338}
7339
7340static void perf_log_itrace_start(struct perf_event *event)
7341{
7342 struct perf_output_handle handle;
7343 struct perf_sample_data sample;
7344 struct perf_aux_event {
7345 struct perf_event_header header;
7346 u32 pid;
7347 u32 tid;
7348 } rec;
7349 int ret;
7350
7351 if (event->parent)
7352 event = event->parent;
7353
7354 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7355 event->attach_state & PERF_ATTACH_ITRACE)
7356 return;
7357
7358 rec.header.type = PERF_RECORD_ITRACE_START;
7359 rec.header.misc = 0;
7360 rec.header.size = sizeof(rec);
7361 rec.pid = perf_event_pid(event, current);
7362 rec.tid = perf_event_tid(event, current);
7363
7364 perf_event_header__init_id(&rec.header, &sample, event);
7365 ret = perf_output_begin(&handle, event, rec.header.size);
7366
7367 if (ret)
7368 return;
7369
7370 perf_output_put(&handle, rec);
7371 perf_event__output_id_sample(event, &handle, &sample);
7372
7373 perf_output_end(&handle);
7374}
7375
7376static int
7377__perf_event_account_interrupt(struct perf_event *event, int throttle)
7378{
7379 struct hw_perf_event *hwc = &event->hw;
7380 int ret = 0;
7381 u64 seq;
7382
7383 seq = __this_cpu_read(perf_throttled_seq);
7384 if (seq != hwc->interrupts_seq) {
7385 hwc->interrupts_seq = seq;
7386 hwc->interrupts = 1;
7387 } else {
7388 hwc->interrupts++;
7389 if (unlikely(throttle
7390 && hwc->interrupts >= max_samples_per_tick)) {
7391 __this_cpu_inc(perf_throttled_count);
7392 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
7393 hwc->interrupts = MAX_INTERRUPTS;
7394 perf_log_throttle(event, 0);
7395 ret = 1;
7396 }
7397 }
7398
7399 if (event->attr.freq) {
7400 u64 now = perf_clock();
7401 s64 delta = now - hwc->freq_time_stamp;
7402
7403 hwc->freq_time_stamp = now;
7404
7405 if (delta > 0 && delta < 2*TICK_NSEC)
7406 perf_adjust_period(event, delta, hwc->last_period, true);
7407 }
7408
7409 return ret;
7410}
7411
7412int perf_event_account_interrupt(struct perf_event *event)
7413{
7414 return __perf_event_account_interrupt(event, 1);
7415}
7416
7417
7418
7419
7420
7421static int __perf_event_overflow(struct perf_event *event,
7422 int throttle, struct perf_sample_data *data,
7423 struct pt_regs *regs)
7424{
7425 int events = atomic_read(&event->event_limit);
7426 int ret = 0;
7427
7428
7429
7430
7431
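	/* Non-sampling events may still end up here via the PMI; ignore them. */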
7432 if (unlikely(!is_sampling_event(event)))
7433 return 0;
7434
7435 ret = __perf_event_account_interrupt(event, throttle);
7436
7437
7438
7439
7440
7441
7442 event->pending_kill = POLL_IN;
7443 if (events && atomic_dec_and_test(&event->event_limit)) {
7444 ret = 1;
7445 event->pending_kill = POLL_HUP;
7446
7447 perf_event_disable_inatomic(event);
7448 }
7449
7450 READ_ONCE(event->overflow_handler)(event, data, regs);
7451
7452 if (*perf_event_fasync(event) && event->pending_kill) {
7453 event->pending_wakeup = 1;
7454 irq_work_queue(&event->pending);
7455 }
7456
7457 return ret;
7458}
7459
7460int perf_event_overflow(struct perf_event *event,
7461 struct perf_sample_data *data,
7462 struct pt_regs *regs)
7463{
7464 return __perf_event_overflow(event, 1, data, regs);
7465}
7466
7467
7468
7469
7470
7471struct swevent_htable {
7472 struct swevent_hlist *swevent_hlist;
7473 struct mutex hlist_mutex;
7474 int hlist_refcount;
7475
7476
7477 int recursion[PERF_NR_CONTEXTS];
7478};
7479
7480static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
7481
7482
7483
7484
7485
7486
7487
7488
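/*
 * hwc->period_left counts down within the current sample period; return the
 * number of periods that have elapsed since the last overflow.
 */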
7489u64 perf_swevent_set_period(struct perf_event *event)
7490{
7491 struct hw_perf_event *hwc = &event->hw;
7492 u64 period = hwc->last_period;
7493 u64 nr, offset;
7494 s64 old, val;
7495
7496 hwc->last_period = hwc->sample_period;
7497
7498again:
7499 old = val = local64_read(&hwc->period_left);
7500 if (val < 0)
7501 return 0;
7502
7503 nr = div64_u64(period + val, period);
7504 offset = nr * period;
7505 val -= offset;
7506 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
7507 goto again;
7508
7509 return nr;
7510}
7511
7512static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
7513 struct perf_sample_data *data,
7514 struct pt_regs *regs)
7515{
7516 struct hw_perf_event *hwc = &event->hw;
7517 int throttle = 0;
7518
7519 if (!overflow)
7520 overflow = perf_swevent_set_period(event);
7521
7522 if (hwc->interrupts == MAX_INTERRUPTS)
7523 return;
7524
7525 for (; overflow; overflow--) {
7526 if (__perf_event_overflow(event, throttle,
7527 data, regs)) {
7528
7529
7530
7531
7532 break;
7533 }
7534 throttle = 1;
7535 }
7536}
7537
7538static void perf_swevent_event(struct perf_event *event, u64 nr,
7539 struct perf_sample_data *data,
7540 struct pt_regs *regs)
7541{
7542 struct hw_perf_event *hwc = &event->hw;
7543
7544 local64_add(nr, &event->count);
7545
7546 if (!regs)
7547 return;
7548
7549 if (!is_sampling_event(event))
7550 return;
7551
7552 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
7553 data->period = nr;
7554 return perf_swevent_overflow(event, 1, data, regs);
7555 } else
7556 data->period = event->hw.last_period;
7557
7558 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
7559 return perf_swevent_overflow(event, 1, data, regs);
7560
7561 if (local64_add_negative(nr, &hwc->period_left))
7562 return;
7563
7564 perf_swevent_overflow(event, 0, data, regs);
7565}
7566
7567static int perf_exclude_event(struct perf_event *event,
7568 struct pt_regs *regs)
7569{
7570 if (event->hw.state & PERF_HES_STOPPED)
7571 return 1;
7572
7573 if (regs) {
7574 if (event->attr.exclude_user && user_mode(regs))
7575 return 1;
7576
7577 if (event->attr.exclude_kernel && !user_mode(regs))
7578 return 1;
7579 }
7580
7581 return 0;
7582}
7583
7584static int perf_swevent_match(struct perf_event *event,
7585 enum perf_type_id type,
7586 u32 event_id,
7587 struct perf_sample_data *data,
7588 struct pt_regs *regs)
7589{
7590 if (event->attr.type != type)
7591 return 0;
7592
7593 if (event->attr.config != event_id)
7594 return 0;
7595
7596 if (perf_exclude_event(event, regs))
7597 return 0;
7598
7599 return 1;
7600}
7601
7602static inline u64 swevent_hash(u64 type, u32 event_id)
7603{
7604 u64 val = event_id | (type << 32);
7605
7606 return hash_64(val, SWEVENT_HLIST_BITS);
7607}
7608
7609static inline struct hlist_head *
7610__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
7611{
7612 u64 hash = swevent_hash(type, event_id);
7613
7614 return &hlist->heads[hash];
7615}
7616
7617
7618static inline struct hlist_head *
7619find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
7620{
7621 struct swevent_hlist *hlist;
7622
7623 hlist = rcu_dereference(swhash->swevent_hlist);
7624 if (!hlist)
7625 return NULL;
7626
7627 return __find_swevent_head(hlist, type, event_id);
7628}
7629
7630
7631static inline struct hlist_head *
7632find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
7633{
7634 struct swevent_hlist *hlist;
7635 u32 event_id = event->attr.config;
7636 u64 type = event->attr.type;
7637
7638
7639
7640
7641
7642
7643 hlist = rcu_dereference_protected(swhash->swevent_hlist,
7644 lockdep_is_held(&event->ctx->lock));
7645 if (!hlist)
7646 return NULL;
7647
7648 return __find_swevent_head(hlist, type, event_id);
7649}
7650
7651static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
7652 u64 nr,
7653 struct perf_sample_data *data,
7654 struct pt_regs *regs)
7655{
7656 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7657 struct perf_event *event;
7658 struct hlist_head *head;
7659
7660 rcu_read_lock();
7661 head = find_swevent_head_rcu(swhash, type, event_id);
7662 if (!head)
7663 goto end;
7664
7665 hlist_for_each_entry_rcu(event, head, hlist_entry) {
7666 if (perf_swevent_match(event, type, event_id, data, regs))
7667 perf_swevent_event(event, nr, data, regs);
7668 }
7669end:
7670 rcu_read_unlock();
7671}
7672
7673DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
7674
7675int perf_swevent_get_recursion_context(void)
7676{
7677 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7678
7679 return get_recursion_context(swhash->recursion);
7680}
7681EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
7682
7683void perf_swevent_put_recursion_context(int rctx)
7684{
7685 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7686
7687 put_recursion_context(swhash->recursion, rctx);
7688}
7689
7690void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7691{
7692 struct perf_sample_data data;
7693
7694 if (WARN_ON_ONCE(!regs))
7695 return;
7696
7697 perf_sample_data_init(&data, addr, 0);
7698 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
7699}
7700
7701void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7702{
7703 int rctx;
7704
7705 preempt_disable_notrace();
7706 rctx = perf_swevent_get_recursion_context();
7707 if (unlikely(rctx < 0))
7708 goto fail;
7709
7710 ___perf_sw_event(event_id, nr, regs, addr);
7711
7712 perf_swevent_put_recursion_context(rctx);
7713fail:
7714 preempt_enable_notrace();
7715}
7716
7717static void perf_swevent_read(struct perf_event *event)
7718{
7719}
7720
7721static int perf_swevent_add(struct perf_event *event, int flags)
7722{
7723 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7724 struct hw_perf_event *hwc = &event->hw;
7725 struct hlist_head *head;
7726
7727 if (is_sampling_event(event)) {
7728 hwc->last_period = hwc->sample_period;
7729 perf_swevent_set_period(event);
7730 }
7731
7732 hwc->state = !(flags & PERF_EF_START);
7733
7734 head = find_swevent_head(swhash, event);
7735 if (WARN_ON_ONCE(!head))
7736 return -EINVAL;
7737
7738 hlist_add_head_rcu(&event->hlist_entry, head);
7739 perf_event_update_userpage(event);
7740
7741 return 0;
7742}
7743
7744static void perf_swevent_del(struct perf_event *event, int flags)
7745{
7746 hlist_del_rcu(&event->hlist_entry);
7747}
7748
7749static void perf_swevent_start(struct perf_event *event, int flags)
7750{
7751 event->hw.state = 0;
7752}
7753
7754static void perf_swevent_stop(struct perf_event *event, int flags)
7755{
7756 event->hw.state = PERF_HES_STOPPED;
7757}
7758
7759
7760static inline struct swevent_hlist *
7761swevent_hlist_deref(struct swevent_htable *swhash)
7762{
7763 return rcu_dereference_protected(swhash->swevent_hlist,
7764 lockdep_is_held(&swhash->hlist_mutex));
7765}
7766
7767static void swevent_hlist_release(struct swevent_htable *swhash)
7768{
7769 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
7770
7771 if (!hlist)
7772 return;
7773
7774 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
7775 kfree_rcu(hlist, rcu_head);
7776}
7777
7778static void swevent_hlist_put_cpu(int cpu)
7779{
7780 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7781
7782 mutex_lock(&swhash->hlist_mutex);
7783
7784 if (!--swhash->hlist_refcount)
7785 swevent_hlist_release(swhash);
7786
7787 mutex_unlock(&swhash->hlist_mutex);
7788}
7789
7790static void swevent_hlist_put(void)
7791{
7792 int cpu;
7793
7794 for_each_possible_cpu(cpu)
7795 swevent_hlist_put_cpu(cpu);
7796}
7797
7798static int swevent_hlist_get_cpu(int cpu)
7799{
7800 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7801 int err = 0;
7802
7803 mutex_lock(&swhash->hlist_mutex);
7804 if (!swevent_hlist_deref(swhash) &&
7805 cpumask_test_cpu(cpu, perf_online_mask)) {
7806 struct swevent_hlist *hlist;
7807
7808 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
7809 if (!hlist) {
7810 err = -ENOMEM;
7811 goto exit;
7812 }
7813 rcu_assign_pointer(swhash->swevent_hlist, hlist);
7814 }
7815 swhash->hlist_refcount++;
7816exit:
7817 mutex_unlock(&swhash->hlist_mutex);
7818
7819 return err;
7820}
7821
7822static int swevent_hlist_get(void)
7823{
7824 int err, cpu, failed_cpu;
7825
7826 mutex_lock(&pmus_lock);
7827 for_each_possible_cpu(cpu) {
7828 err = swevent_hlist_get_cpu(cpu);
7829 if (err) {
7830 failed_cpu = cpu;
7831 goto fail;
7832 }
7833 }
7834 mutex_unlock(&pmus_lock);
7835 return 0;
7836fail:
7837 for_each_possible_cpu(cpu) {
7838 if (cpu == failed_cpu)
7839 break;
7840 swevent_hlist_put_cpu(cpu);
7841 }
7842 mutex_unlock(&pmus_lock);
7843 return err;
7844}
7845
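/*
 * One static key per software event ID: bumped in perf_swevent_init()
 * and dropped in sw_perf_event_destroy(), so callers can cheaply skip
 * IDs that have no active events.
 */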
7846struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
7847
7848static void sw_perf_event_destroy(struct perf_event *event)
7849{
7850 u64 event_id = event->attr.config;
7851
7852 WARN_ON(event->parent);
7853
7854 static_key_slow_dec(&perf_swevent_enabled[event_id]);
7855 swevent_hlist_put();
7856}
7857
7858static int perf_swevent_init(struct perf_event *event)
7859{
7860 u64 event_id = event->attr.config;
7861
7862 if (event->attr.type != PERF_TYPE_SOFTWARE)
7863 return -ENOENT;
7864
7865
7866
7867
7868 if (has_branch_stack(event))
7869 return -EOPNOTSUPP;
7870
7871 switch (event_id) {
7872 case PERF_COUNT_SW_CPU_CLOCK:
7873 case PERF_COUNT_SW_TASK_CLOCK:
7874 return -ENOENT;
7875
7876 default:
7877 break;
7878 }
7879
7880 if (event_id >= PERF_COUNT_SW_MAX)
7881 return -ENOENT;
7882
7883 if (!event->parent) {
7884 int err;
7885
7886 err = swevent_hlist_get();
7887 if (err)
7888 return err;
7889
7890 static_key_slow_inc(&perf_swevent_enabled[event_id]);
7891 event->destroy = sw_perf_event_destroy;
7892 }
7893
7894 return 0;
7895}
7896
7897static struct pmu perf_swevent = {
7898 .task_ctx_nr = perf_sw_context,
7899
7900 .capabilities = PERF_PMU_CAP_NO_NMI,
7901
7902 .event_init = perf_swevent_init,
7903 .add = perf_swevent_add,
7904 .del = perf_swevent_del,
7905 .start = perf_swevent_start,
7906 .stop = perf_swevent_stop,
7907 .read = perf_swevent_read,
7908};
7909
7910#ifdef CONFIG_EVENT_TRACING
7911
7912static int perf_tp_filter_match(struct perf_event *event,
7913 struct perf_sample_data *data)
7914{
7915 void *record = data->raw->frag.data;
7916
7917
7918 if (event->parent)
7919 event = event->parent;
7920
7921 if (likely(!event->filter) || filter_match_preds(event->filter, record))
7922 return 1;
7923 return 0;
7924}
7925
7926static int perf_tp_event_match(struct perf_event *event,
7927 struct perf_sample_data *data,
7928 struct pt_regs *regs)
7929{
7930 if (event->hw.state & PERF_HES_STOPPED)
7931 return 0;
7932
7933
7934
7935 if (event->attr.exclude_kernel)
7936 return 0;
7937
7938 if (!perf_tp_filter_match(event, data))
7939 return 0;
7940
7941 return 1;
7942}
7943
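/*
 * Tracepoint entry point when a BPF program may be attached: run the
 * program first and drop the sample if it returns 0 (or if no events
 * remain on @head); otherwise hand the record on to perf_tp_event().
 */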
7944void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
7945 struct trace_event_call *call, u64 count,
7946 struct pt_regs *regs, struct hlist_head *head,
7947 struct task_struct *task)
7948{
7949 struct bpf_prog *prog = call->prog;
7950
7951 if (prog) {
7952 *(struct pt_regs **)raw_data = regs;
7953 if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
7954 perf_swevent_put_recursion_context(rctx);
7955 return;
7956 }
7957 }
7958 perf_tp_event(call->event.type, count, raw_data, size, regs, head,
7959 rctx, task, NULL);
7960}
7961EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
7962
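/*
 * Deliver one tracepoint sample: either to the single @event passed in,
 * to every event hashed on @head, or - for events counting a remote
 * @task - by walking that task's software context.
 */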
7963void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
7964 struct pt_regs *regs, struct hlist_head *head, int rctx,
7965 struct task_struct *task, struct perf_event *event)
7966{
7967 struct perf_sample_data data;
7968
7969 struct perf_raw_record raw = {
7970 .frag = {
7971 .size = entry_size,
7972 .data = record,
7973 },
7974 };
7975
7976 perf_sample_data_init(&data, 0, 0);
7977 data.raw = &raw;
7978
7979 perf_trace_buf_update(record, event_type);
7980
7981
7982 if (event) {
7983 if (perf_tp_event_match(event, &data, regs))
7984 perf_swevent_event(event, count, &data, regs);
7985 } else {
7986 hlist_for_each_entry_rcu(event, head, hlist_entry) {
7987 if (perf_tp_event_match(event, &data, regs))
7988 perf_swevent_event(event, count, &data, regs);
7989 }
7990 }
7991
7992
7993
7994
7995
7996 if (task && task != current) {
7997 struct perf_event_context *ctx;
7998 struct trace_entry *entry = record;
7999
8000 rcu_read_lock();
8001 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
8002 if (!ctx)
8003 goto unlock;
8004
8005 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
8006 if (event->attr.type != PERF_TYPE_TRACEPOINT)
8007 continue;
8008 if (event->attr.config != entry->type)
8009 continue;
8010 if (perf_tp_event_match(event, &data, regs))
8011 perf_swevent_event(event, count, &data, regs);
8012 }
8013unlock:
8014 rcu_read_unlock();
8015 }
8016
8017 perf_swevent_put_recursion_context(rctx);
8018}
8019EXPORT_SYMBOL_GPL(perf_tp_event);
8020
8021static void tp_perf_event_destroy(struct perf_event *event)
8022{
8023 perf_trace_destroy(event);
8024}
8025
8026static int perf_tp_event_init(struct perf_event *event)
8027{
8028 int err;
8029
8030 if (event->attr.type != PERF_TYPE_TRACEPOINT)
8031 return -ENOENT;
8032
8033
8034
8035
8036 if (has_branch_stack(event))
8037 return -EOPNOTSUPP;
8038
8039 err = perf_trace_init(event);
8040 if (err)
8041 return err;
8042
8043 event->destroy = tp_perf_event_destroy;
8044
8045 return 0;
8046}
8047
8048static struct pmu perf_tracepoint = {
8049 .task_ctx_nr = perf_sw_context,
8050
8051 .event_init = perf_tp_event_init,
8052 .add = perf_trace_add,
8053 .del = perf_trace_del,
8054 .start = perf_swevent_start,
8055 .stop = perf_swevent_stop,
8056 .read = perf_swevent_read,
8057};
8058
8059static inline void perf_tp_register(void)
8060{
8061 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
8062}
8063
8064static void perf_event_free_filter(struct perf_event *event)
8065{
8066 ftrace_profile_free_filter(event);
8067}
8068
8069#ifdef CONFIG_BPF_SYSCALL
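/*
 * Overflow handler used while a BPF program is attached to the event:
 * the program runs first (guarded against recursion via bpf_prog_active)
 * and the original handler only runs if the program returns non-zero.
 */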
8070static void bpf_overflow_handler(struct perf_event *event,
8071 struct perf_sample_data *data,
8072 struct pt_regs *regs)
8073{
8074 struct bpf_perf_event_data_kern ctx = {
8075 .data = data,
8076 .regs = regs,
8077 };
8078 int ret = 0;
8079
8080 preempt_disable();
8081 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
8082 goto out;
8083 rcu_read_lock();
8084 ret = BPF_PROG_RUN(event->prog, &ctx);
8085 rcu_read_unlock();
8086out:
8087 __this_cpu_dec(bpf_prog_active);
8088 preempt_enable();
8089 if (!ret)
8090 return;
8091
8092 event->orig_overflow_handler(event, data, regs);
8093}
8094
8095static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8096{
8097 struct bpf_prog *prog;
8098
8099 if (event->overflow_handler_context)
8100
8101 return -EINVAL;
8102
8103 if (event->prog)
8104 return -EEXIST;
8105
8106 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
8107 if (IS_ERR(prog))
8108 return PTR_ERR(prog);
8109
8110 event->prog = prog;
8111 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
8112 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
8113 return 0;
8114}
8115
8116static void perf_event_free_bpf_handler(struct perf_event *event)
8117{
8118 struct bpf_prog *prog = event->prog;
8119
8120 if (!prog)
8121 return;
8122
8123 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
8124 event->prog = NULL;
8125 bpf_prog_put(prog);
8126}
8127#else
8128static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8129{
8130 return -EOPNOTSUPP;
8131}
8132static void perf_event_free_bpf_handler(struct perf_event *event)
8133{
8134}
8135#endif
8136
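/*
 * Attach a BPF program to the event. For non-tracepoint events the
 * program becomes an overflow handler; for (k/u)probe, tracepoint and
 * syscall events it is attached to the underlying trace_event_call,
 * after the program type and context access bounds are validated.
 */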
8137static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8138{
8139 bool is_kprobe, is_tracepoint, is_syscall_tp;
8140 struct bpf_prog *prog;
8141
8142 if (event->attr.type != PERF_TYPE_TRACEPOINT)
8143 return perf_event_set_bpf_handler(event, prog_fd);
8144
8145 if (event->tp_event->prog)
8146 return -EEXIST;
8147
8148 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
8149 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
8150 is_syscall_tp = is_syscall_trace_event(event->tp_event);
8151 if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
8152
8153 return -EINVAL;
8154
8155 prog = bpf_prog_get(prog_fd);
8156 if (IS_ERR(prog))
8157 return PTR_ERR(prog);
8158
8159 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
8160 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
8161 (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
8162
8163 bpf_prog_put(prog);
8164 return -EINVAL;
8165 }
8166
8167 if (is_tracepoint || is_syscall_tp) {
8168 int off = trace_event_get_offsets(event->tp_event);
8169
8170 if (prog->aux->max_ctx_offset > off) {
8171 bpf_prog_put(prog);
8172 return -EACCES;
8173 }
8174 }
8175 event->tp_event->prog = prog;
8176 event->tp_event->bpf_prog_owner = event;
8177
8178 return 0;
8179}
8180
8181static void perf_event_free_bpf_prog(struct perf_event *event)
8182{
8183 struct bpf_prog *prog;
8184
8185 perf_event_free_bpf_handler(event);
8186
8187 if (!event->tp_event)
8188 return;
8189
8190 prog = event->tp_event->prog;
8191 if (prog && event->tp_event->bpf_prog_owner == event) {
8192 event->tp_event->prog = NULL;
8193 bpf_prog_put(prog);
8194 }
8195}
8196
8197#else
8198
8199static inline void perf_tp_register(void)
8200{
8201}
8202
8203static void perf_event_free_filter(struct perf_event *event)
8204{
8205}
8206
8207static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8208{
8209 return -ENOENT;
8210}
8211
8212static void perf_event_free_bpf_prog(struct perf_event *event)
8213{
8214}
8215#endif
8216
8217#ifdef CONFIG_HAVE_HW_BREAKPOINT
8218void perf_bp_event(struct perf_event *bp, void *data)
8219{
8220 struct perf_sample_data sample;
8221 struct pt_regs *regs = data;
8222
8223 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
8224
8225 if (!bp->hw.state && !perf_exclude_event(bp, regs))
8226 perf_swevent_event(bp, 1, &sample, regs);
8227}
8228#endif
8229
8230
8231
8232
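/*
 * Allocate a new address filter and add it to the tail of @filters.
 */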
8233static struct perf_addr_filter *
8234perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
8235{
8236 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
8237 struct perf_addr_filter *filter;
8238
8239 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
8240 if (!filter)
8241 return NULL;
8242
8243 INIT_LIST_HEAD(&filter->entry);
8244 list_add_tail(&filter->entry, filters);
8245
8246 return filter;
8247}
8248
8249static void free_filters_list(struct list_head *filters)
8250{
8251 struct perf_addr_filter *filter, *iter;
8252
8253 list_for_each_entry_safe(filter, iter, filters, entry) {
8254 if (filter->inode)
8255 iput(filter->inode);
8256 list_del(&filter->entry);
8257 kfree(filter);
8258 }
8259}
8260
8261
8262
8263
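/*
 * Replace the event's address filters with @head (if any), freeing the
 * old ones. Children share the parent's filters, so only top level
 * events are touched.
 */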
8264static void perf_addr_filters_splice(struct perf_event *event,
8265 struct list_head *head)
8266{
8267 unsigned long flags;
8268 LIST_HEAD(list);
8269
8270 if (!has_addr_filter(event))
8271 return;
8272
8273
8274 if (event->parent)
8275 return;
8276
8277 raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
8278
8279 list_splice_init(&event->addr_filters.list, &list);
8280 if (head)
8281 list_splice(head, &event->addr_filters.list);
8282
8283 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
8284
8285 free_filters_list(&list);
8286}
8287
8288
8289
8290
8291
8292
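/*
 * Scan the task's file-backed VMAs for one that matches @filter and
 * return its load address, or 0 if the object isn't mapped.
 */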
8293static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
8294 struct mm_struct *mm)
8295{
8296 struct vm_area_struct *vma;
8297
8298 for (vma = mm->mmap; vma; vma = vma->vm_next) {
8299 struct file *file = vma->vm_file;
8300 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8301 unsigned long vma_size = vma->vm_end - vma->vm_start;
8302
8303 if (!file)
8304 continue;
8305
8306 if (!perf_addr_filter_match(filter, file, off, vma_size))
8307 continue;
8308
8309 return vma->vm_start;
8310 }
8311
8312 return 0;
8313}
8314
8315
8316
8317
8318
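/*
 * Recompute the base offset of every file-based filter against the
 * target task's current mappings, bump addr_filters_gen and restart
 * the event so the PMU reprograms its ranges.
 */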
8319static void perf_event_addr_filters_apply(struct perf_event *event)
8320{
8321 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8322 struct task_struct *task = READ_ONCE(event->ctx->task);
8323 struct perf_addr_filter *filter;
8324 struct mm_struct *mm = NULL;
8325 unsigned int count = 0;
8326 unsigned long flags;
8327
8328
8329
8330
8331
8332 if (task == TASK_TOMBSTONE)
8333 return;
8334
8335 if (!ifh->nr_file_filters)
8336 return;
8337
8338 mm = get_task_mm(event->ctx->task);
8339 if (!mm)
8340 goto restart;
8341
8342 down_read(&mm->mmap_sem);
8343
8344 raw_spin_lock_irqsave(&ifh->lock, flags);
8345 list_for_each_entry(filter, &ifh->list, entry) {
8346 event->addr_filters_offs[count] = 0;
8347
8348
8349
8350
8351
8352 if (filter->inode)
8353 event->addr_filters_offs[count] =
8354 perf_addr_filter_apply(filter, mm);
8355
8356 count++;
8357 }
8358
8359 event->addr_filters_gen++;
8360 raw_spin_unlock_irqrestore(&ifh->lock, flags);
8361
8362 up_read(&mm->mmap_sem);
8363
8364 mmput(mm);
8365
8366restart:
8367 perf_event_stop(event, 1);
8368}
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
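/*
 * Address range filtering: limiting the data to certain instruction
 * address ranges. Filters are ioctl()ed to us from userspace as ascii
 * strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region
 * and RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files: <start>[/<size>]@</path/to/object/file>
 *
 * e.g. "filter 0x42000/0x1000@/path/to/object"
 *
 * If <size> is not specified, the range is treated as a single address.
 */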
8388enum {
8389 IF_ACT_NONE = -1,
8390 IF_ACT_FILTER,
8391 IF_ACT_START,
8392 IF_ACT_STOP,
8393 IF_SRC_FILE,
8394 IF_SRC_KERNEL,
8395 IF_SRC_FILEADDR,
8396 IF_SRC_KERNELADDR,
8397};
8398
8399enum {
8400 IF_STATE_ACTION = 0,
8401 IF_STATE_SOURCE,
8402 IF_STATE_END,
8403};
8404
8405static const match_table_t if_tokens = {
8406 { IF_ACT_FILTER, "filter" },
8407 { IF_ACT_START, "start" },
8408 { IF_ACT_STOP, "stop" },
8409 { IF_SRC_FILE, "%u/%u@%s" },
8410 { IF_SRC_KERNEL, "%u/%u" },
8411 { IF_SRC_FILEADDR, "%u@%s" },
8412 { IF_SRC_KERNELADDR, "%u" },
8413 { IF_ACT_NONE, NULL },
8414};
8415
8416
8417
8418
8419static int
8420perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8421 struct list_head *filters)
8422{
8423 struct perf_addr_filter *filter = NULL;
8424 char *start, *orig, *filename = NULL;
8425 struct path path;
8426 substring_t args[MAX_OPT_ARGS];
8427 int state = IF_STATE_ACTION, token;
8428 unsigned int kernel = 0;
8429 int ret = -EINVAL;
8430
8431 orig = fstr = kstrdup(fstr, GFP_KERNEL);
8432 if (!fstr)
8433 return -ENOMEM;
8434
8435 while ((start = strsep(&fstr, " ,\n")) != NULL) {
8436 ret = -EINVAL;
8437
8438 if (!*start)
8439 continue;
8440
8441
8442 if (state == IF_STATE_ACTION) {
8443 filter = perf_addr_filter_new(event, filters);
8444 if (!filter)
8445 goto fail;
8446 }
8447
8448 token = match_token(start, if_tokens, args);
8449 switch (token) {
8450 case IF_ACT_FILTER:
8451 case IF_ACT_START:
8452 filter->filter = 1;
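/* fall through */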
8453
8454 case IF_ACT_STOP:
8455 if (state != IF_STATE_ACTION)
8456 goto fail;
8457
8458 state = IF_STATE_SOURCE;
8459 break;
8460
8461 case IF_SRC_KERNELADDR:
8462 case IF_SRC_KERNEL:
8463 kernel = 1;
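/* fall through */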
8464
8465 case IF_SRC_FILEADDR:
8466 case IF_SRC_FILE:
8467 if (state != IF_STATE_SOURCE)
8468 goto fail;
8469
8470 if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
8471 filter->range = 1;
8472
8473 *args[0].to = 0;
8474 ret = kstrtoul(args[0].from, 0, &filter->offset);
8475 if (ret)
8476 goto fail;
8477
8478 if (filter->range) {
8479 *args[1].to = 0;
8480 ret = kstrtoul(args[1].from, 0, &filter->size);
8481 if (ret)
8482 goto fail;
8483 }
8484
8485 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
8486 int fpos = filter->range ? 2 : 1;
8487
8488 filename = match_strdup(&args[fpos]);
8489 if (!filename) {
8490 ret = -ENOMEM;
8491 goto fail;
8492 }
8493 }
8494
8495 state = IF_STATE_END;
8496 break;
8497
8498 default:
8499 goto fail;
8500 }
8501
8502
8503
8504
8505
8506
8507 if (state == IF_STATE_END) {
8508 ret = -EINVAL;
8509 if (kernel && event->attr.exclude_kernel)
8510 goto fail;
8511
8512 if (!kernel) {
8513 if (!filename)
8514 goto fail;
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524 ret = -EOPNOTSUPP;
8525 if (!event->ctx->task)
8526 goto fail_free_name;
8527
8528
8529 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
8530 if (ret)
8531 goto fail_free_name;
8532
8533 filter->inode = igrab(d_inode(path.dentry));
8534 path_put(&path);
8535 kfree(filename);
8536 filename = NULL;
8537
8538 ret = -EINVAL;
8539 if (!filter->inode ||
8540 !S_ISREG(filter->inode->i_mode))
8541
8542 goto fail;
8543
8544 event->addr_filters.nr_file_filters++;
8545 }
8546
8547
8548 state = IF_STATE_ACTION;
8549 filter = NULL;
8550 }
8551 }
8552
8553 if (state != IF_STATE_ACTION)
8554 goto fail;
8555
8556 kfree(orig);
8557
8558 return 0;
8559
8560fail_free_name:
8561 kfree(filename);
8562fail:
8563 free_filters_list(filters);
8564 kfree(orig);
8565
8566 return ret;
8567}
8568
8569static int
8570perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
8571{
8572 LIST_HEAD(filters);
8573 int ret;
8574
8575
8576
8577
8578
8579 lockdep_assert_held(&event->ctx->mutex);
8580
8581 if (WARN_ON_ONCE(event->parent))
8582 return -EINVAL;
8583
8584 ret = perf_event_parse_addr_filter(event, filter_str, &filters);
8585 if (ret)
8586 goto fail_clear_files;
8587
8588 ret = event->pmu->addr_filters_validate(&filters);
8589 if (ret)
8590 goto fail_free_filters;
8591
8592
8593 perf_addr_filters_splice(event, &filters);
8594
8595
8596 perf_event_for_each_child(event, perf_event_addr_filters_apply);
8597
8598 return ret;
8599
8600fail_free_filters:
8601 free_filters_list(&filters);
8602
8603fail_clear_files:
8604 event->addr_filters.nr_file_filters = 0;
8605
8606 return ret;
8607}
8608
8609static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8610{
8611 char *filter_str;
8612 int ret = -EINVAL;
8613
8614 if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
8615 !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
8616 !has_addr_filter(event))
8617 return -EINVAL;
8618
8619 filter_str = strndup_user(arg, PAGE_SIZE);
8620 if (IS_ERR(filter_str))
8621 return PTR_ERR(filter_str);
8622
8623 if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
8624 event->attr.type == PERF_TYPE_TRACEPOINT)
8625 ret = ftrace_profile_set_filter(event, event->attr.config,
8626 filter_str);
8627 else if (has_addr_filter(event))
8628 ret = perf_event_set_addr_filter(event, filter_str);
8629
8630 kfree(filter_str);
8631 return ret;
8632}
8633
8634
8635
8636
8637
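/*
 * hrtimer based swevent callback: emulates sampling interrupts for
 * software PMUs by reading the event and feeding __perf_event_overflow()
 * on every period expiry.
 */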
8638static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
8639{
8640 enum hrtimer_restart ret = HRTIMER_RESTART;
8641 struct perf_sample_data data;
8642 struct pt_regs *regs;
8643 struct perf_event *event;
8644 u64 period;
8645
8646 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
8647
8648 if (event->state != PERF_EVENT_STATE_ACTIVE)
8649 return HRTIMER_NORESTART;
8650
8651 event->pmu->read(event);
8652
8653 perf_sample_data_init(&data, 0, event->hw.last_period);
8654 regs = get_irq_regs();
8655
8656 if (regs && !perf_exclude_event(event, regs)) {
8657 if (!(event->attr.exclude_idle && is_idle_task(current)))
8658 if (__perf_event_overflow(event, 1, &data, regs))
8659 ret = HRTIMER_NORESTART;
8660 }
8661
8662 period = max_t(u64, 10000, event->hw.sample_period);
8663 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
8664
8665 return ret;
8666}
8667
8668static void perf_swevent_start_hrtimer(struct perf_event *event)
8669{
8670 struct hw_perf_event *hwc = &event->hw;
8671 s64 period;
8672
8673 if (!is_sampling_event(event))
8674 return;
8675
8676 period = local64_read(&hwc->period_left);
8677 if (period) {
8678 if (period < 0)
8679 period = 10000;
8680
8681 local64_set(&hwc->period_left, 0);
8682 } else {
8683 period = max_t(u64, 10000, hwc->sample_period);
8684 }
8685 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
8686 HRTIMER_MODE_REL_PINNED);
8687}
8688
8689static void perf_swevent_cancel_hrtimer(struct perf_event *event)
8690{
8691 struct hw_perf_event *hwc = &event->hw;
8692
8693 if (is_sampling_event(event)) {
8694 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
8695 local64_set(&hwc->period_left, ktime_to_ns(remaining));
8696
8697 hrtimer_cancel(&hwc->hrtimer);
8698 }
8699}
8700
8701static void perf_swevent_init_hrtimer(struct perf_event *event)
8702{
8703 struct hw_perf_event *hwc = &event->hw;
8704
8705 if (!is_sampling_event(event))
8706 return;
8707
8708 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
8709 hwc->hrtimer.function = perf_swevent_hrtimer;
8710
8711
8712
8713
8714
8715 if (event->attr.freq) {
8716 long freq = event->attr.sample_freq;
8717
8718 event->attr.sample_period = NSEC_PER_SEC / freq;
8719 hwc->sample_period = event->attr.sample_period;
8720 local64_set(&hwc->period_left, hwc->sample_period);
8721 hwc->last_period = hwc->sample_period;
8722 event->attr.freq = 0;
8723 }
8724}
8725
8726
8727
8728
8729
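/*
 * Software event: cpu wall-time clock
 */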
8730static void cpu_clock_event_update(struct perf_event *event)
8731{
8732 s64 prev;
8733 u64 now;
8734
8735 now = local_clock();
8736 prev = local64_xchg(&event->hw.prev_count, now);
8737 local64_add(now - prev, &event->count);
8738}
8739
8740static void cpu_clock_event_start(struct perf_event *event, int flags)
8741{
8742 local64_set(&event->hw.prev_count, local_clock());
8743 perf_swevent_start_hrtimer(event);
8744}
8745
8746static void cpu_clock_event_stop(struct perf_event *event, int flags)
8747{
8748 perf_swevent_cancel_hrtimer(event);
8749 cpu_clock_event_update(event);
8750}
8751
8752static int cpu_clock_event_add(struct perf_event *event, int flags)
8753{
8754 if (flags & PERF_EF_START)
8755 cpu_clock_event_start(event, flags);
8756 perf_event_update_userpage(event);
8757
8758 return 0;
8759}
8760
8761static void cpu_clock_event_del(struct perf_event *event, int flags)
8762{
8763 cpu_clock_event_stop(event, flags);
8764}
8765
8766static void cpu_clock_event_read(struct perf_event *event)
8767{
8768 cpu_clock_event_update(event);
8769}
8770
8771static int cpu_clock_event_init(struct perf_event *event)
8772{
8773 if (event->attr.type != PERF_TYPE_SOFTWARE)
8774 return -ENOENT;
8775
8776 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
8777 return -ENOENT;
8778
8779
8780
8781
8782 if (has_branch_stack(event))
8783 return -EOPNOTSUPP;
8784
8785 perf_swevent_init_hrtimer(event);
8786
8787 return 0;
8788}
8789
8790static struct pmu perf_cpu_clock = {
8791 .task_ctx_nr = perf_sw_context,
8792
8793 .capabilities = PERF_PMU_CAP_NO_NMI,
8794
8795 .event_init = cpu_clock_event_init,
8796 .add = cpu_clock_event_add,
8797 .del = cpu_clock_event_del,
8798 .start = cpu_clock_event_start,
8799 .stop = cpu_clock_event_stop,
8800 .read = cpu_clock_event_read,
8801};
8802
8803
8804
8805
8806
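/*
 * Software event: task time clock
 */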
8807static void task_clock_event_update(struct perf_event *event, u64 now)
8808{
8809 u64 prev;
8810 s64 delta;
8811
8812 prev = local64_xchg(&event->hw.prev_count, now);
8813 delta = now - prev;
8814 local64_add(delta, &event->count);
8815}
8816
8817static void task_clock_event_start(struct perf_event *event, int flags)
8818{
8819 local64_set(&event->hw.prev_count, event->ctx->time);
8820 perf_swevent_start_hrtimer(event);
8821}
8822
8823static void task_clock_event_stop(struct perf_event *event, int flags)
8824{
8825 perf_swevent_cancel_hrtimer(event);
8826 task_clock_event_update(event, event->ctx->time);
8827}
8828
8829static int task_clock_event_add(struct perf_event *event, int flags)
8830{
8831 if (flags & PERF_EF_START)
8832 task_clock_event_start(event, flags);
8833 perf_event_update_userpage(event);
8834
8835 return 0;
8836}
8837
8838static void task_clock_event_del(struct perf_event *event, int flags)
8839{
8840 task_clock_event_stop(event, PERF_EF_UPDATE);
8841}
8842
8843static void task_clock_event_read(struct perf_event *event)
8844{
8845 u64 now = perf_clock();
8846 u64 delta = now - event->ctx->timestamp;
8847 u64 time = event->ctx->time + delta;
8848
8849 task_clock_event_update(event, time);
8850}
8851
8852static int task_clock_event_init(struct perf_event *event)
8853{
8854 if (event->attr.type != PERF_TYPE_SOFTWARE)
8855 return -ENOENT;
8856
8857 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
8858 return -ENOENT;
8859
8860
8861
8862
8863 if (has_branch_stack(event))
8864 return -EOPNOTSUPP;
8865
8866 perf_swevent_init_hrtimer(event);
8867
8868 return 0;
8869}
8870
8871static struct pmu perf_task_clock = {
8872 .task_ctx_nr = perf_sw_context,
8873
8874 .capabilities = PERF_PMU_CAP_NO_NMI,
8875
8876 .event_init = task_clock_event_init,
8877 .add = task_clock_event_add,
8878 .del = task_clock_event_del,
8879 .start = task_clock_event_start,
8880 .stop = task_clock_event_stop,
8881 .read = task_clock_event_read,
8882};
8883
8884static void perf_pmu_nop_void(struct pmu *pmu)
8885{
8886}
8887
8888static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
8889{
8890}
8891
8892static int perf_pmu_nop_int(struct pmu *pmu)
8893{
8894 return 0;
8895}
8896
8897static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
8898
8899static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
8900{
8901 __this_cpu_write(nop_txn_flags, flags);
8902
8903 if (flags & ~PERF_PMU_TXN_ADD)
8904 return;
8905
8906 perf_pmu_disable(pmu);
8907}
8908
8909static int perf_pmu_commit_txn(struct pmu *pmu)
8910{
8911 unsigned int flags = __this_cpu_read(nop_txn_flags);
8912
8913 __this_cpu_write(nop_txn_flags, 0);
8914
8915 if (flags & ~PERF_PMU_TXN_ADD)
8916 return 0;
8917
8918 perf_pmu_enable(pmu);
8919 return 0;
8920}
8921
8922static void perf_pmu_cancel_txn(struct pmu *pmu)
8923{
8924 unsigned int flags = __this_cpu_read(nop_txn_flags);
8925
8926 __this_cpu_write(nop_txn_flags, 0);
8927
8928 if (flags & ~PERF_PMU_TXN_ADD)
8929 return;
8930
8931 perf_pmu_enable(pmu);
8932}
8933
8934static int perf_event_idx_default(struct perf_event *event)
8935{
8936 return 0;
8937}
8938
8939
8940
8941
8942
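/*
 * Let PMUs that share a task_ctx_nr also share one pmu_cpu_context.
 */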
8943static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
8944{
8945 struct pmu *pmu;
8946
8947 if (ctxn < 0)
8948 return NULL;
8949
8950 list_for_each_entry(pmu, &pmus, entry) {
8951 if (pmu->task_ctx_nr == ctxn)
8952 return pmu->pmu_cpu_context;
8953 }
8954
8955 return NULL;
8956}
8957
8958static void free_pmu_context(struct pmu *pmu)
8959{
8960
8961
8962
8963
8964
8965 if (pmu->task_ctx_nr > perf_invalid_context)
8966 return;
8967
8968 mutex_lock(&pmus_lock);
8969 free_percpu(pmu->pmu_cpu_context);
8970 mutex_unlock(&pmus_lock);
8971}
8972
8973
8974
8975
8976static ssize_t nr_addr_filters_show(struct device *dev,
8977 struct device_attribute *attr,
8978 char *page)
8979{
8980 struct pmu *pmu = dev_get_drvdata(dev);
8981
8982 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
8983}
8984DEVICE_ATTR_RO(nr_addr_filters);
8985
8986static struct idr pmu_idr;
8987
8988static ssize_t
8989type_show(struct device *dev, struct device_attribute *attr, char *page)
8990{
8991 struct pmu *pmu = dev_get_drvdata(dev);
8992
8993 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
8994}
8995static DEVICE_ATTR_RO(type);
8996
8997static ssize_t
8998perf_event_mux_interval_ms_show(struct device *dev,
8999 struct device_attribute *attr,
9000 char *page)
9001{
9002 struct pmu *pmu = dev_get_drvdata(dev);
9003
9004 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
9005}
9006
9007static DEFINE_MUTEX(mux_interval_mutex);
9008
9009static ssize_t
9010perf_event_mux_interval_ms_store(struct device *dev,
9011 struct device_attribute *attr,
9012 const char *buf, size_t count)
9013{
9014 struct pmu *pmu = dev_get_drvdata(dev);
9015 int timer, cpu, ret;
9016
9017 ret = kstrtoint(buf, 0, &timer);
9018 if (ret)
9019 return ret;
9020
9021 if (timer < 1)
9022 return -EINVAL;
9023
9024
9025 if (timer == pmu->hrtimer_interval_ms)
9026 return count;
9027
9028 mutex_lock(&mux_interval_mutex);
9029 pmu->hrtimer_interval_ms = timer;
9030
9031
9032 cpus_read_lock();
9033 for_each_online_cpu(cpu) {
9034 struct perf_cpu_context *cpuctx;
9035 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9036 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
9037
9038 cpu_function_call(cpu,
9039 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
9040 }
9041 cpus_read_unlock();
9042 mutex_unlock(&mux_interval_mutex);
9043
9044 return count;
9045}
9046static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
9047
9048static struct attribute *pmu_dev_attrs[] = {
9049 &dev_attr_type.attr,
9050 &dev_attr_perf_event_mux_interval_ms.attr,
9051 NULL,
9052};
9053ATTRIBUTE_GROUPS(pmu_dev);
9054
9055static int pmu_bus_running;
9056static struct bus_type pmu_bus = {
9057 .name = "event_source",
9058 .dev_groups = pmu_dev_groups,
9059};
9060
9061static void pmu_dev_release(struct device *dev)
9062{
9063 kfree(dev);
9064}
9065
9066static int pmu_dev_alloc(struct pmu *pmu)
9067{
9068 int ret = -ENOMEM;
9069
9070 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
9071 if (!pmu->dev)
9072 goto out;
9073
9074 pmu->dev->groups = pmu->attr_groups;
9075 device_initialize(pmu->dev);
9076 ret = dev_set_name(pmu->dev, "%s", pmu->name);
9077 if (ret)
9078 goto free_dev;
9079
9080 dev_set_drvdata(pmu->dev, pmu);
9081 pmu->dev->bus = &pmu_bus;
9082 pmu->dev->release = pmu_dev_release;
9083 ret = device_add(pmu->dev);
9084 if (ret)
9085 goto free_dev;
9086
9087
9088 if (pmu->nr_addr_filters)
9089 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
9090
9091 if (ret)
9092 goto del_dev;
9093
9094out:
9095 return ret;
9096
9097del_dev:
9098 device_del(pmu->dev);
9099
9100free_dev:
9101 put_device(pmu->dev);
9102 goto out;
9103}
9104
9105static struct lock_class_key cpuctx_mutex;
9106static struct lock_class_key cpuctx_lock;
9107
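/*
 * Register a PMU: allocate its per-CPU contexts (or share existing ones),
 * hand out a type id from pmu_idr, expose it on the event_source bus and
 * fill in default transaction callbacks where the driver left them NULL.
 */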
9108int perf_pmu_register(struct pmu *pmu, const char *name, int type)
9109{
9110 int cpu, ret;
9111
9112 mutex_lock(&pmus_lock);
9113 ret = -ENOMEM;
9114 pmu->pmu_disable_count = alloc_percpu(int);
9115 if (!pmu->pmu_disable_count)
9116 goto unlock;
9117
9118 pmu->type = -1;
9119 if (!name)
9120 goto skip_type;
9121 pmu->name = name;
9122
9123 if (type < 0) {
9124 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9125 if (type < 0) {
9126 ret = type;
9127 goto free_pdc;
9128 }
9129 }
9130 pmu->type = type;
9131
9132 if (pmu_bus_running) {
9133 ret = pmu_dev_alloc(pmu);
9134 if (ret)
9135 goto free_idr;
9136 }
9137
9138skip_type:
9139 if (pmu->task_ctx_nr == perf_hw_context) {
9140 static int hw_context_taken = 0;
9141
9142
9143
9144
9145
9146
9147 if (WARN_ON_ONCE(hw_context_taken &&
9148 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
9149 pmu->task_ctx_nr = perf_invalid_context;
9150
9151 hw_context_taken = 1;
9152 }
9153
9154 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
9155 if (pmu->pmu_cpu_context)
9156 goto got_cpu_context;
9157
9158 ret = -ENOMEM;
9159 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
9160 if (!pmu->pmu_cpu_context)
9161 goto free_dev;
9162
9163 for_each_possible_cpu(cpu) {
9164 struct perf_cpu_context *cpuctx;
9165
9166 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9167 __perf_event_init_context(&cpuctx->ctx);
9168 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
9169 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
9170 cpuctx->ctx.pmu = pmu;
9171 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
9172
9173 __perf_mux_hrtimer_init(cpuctx, cpu);
9174 }
9175
9176got_cpu_context:
9177 if (!pmu->start_txn) {
9178 if (pmu->pmu_enable) {
9179
9180
9181
9182
9183
9184 pmu->start_txn = perf_pmu_start_txn;
9185 pmu->commit_txn = perf_pmu_commit_txn;
9186 pmu->cancel_txn = perf_pmu_cancel_txn;
9187 } else {
9188 pmu->start_txn = perf_pmu_nop_txn;
9189 pmu->commit_txn = perf_pmu_nop_int;
9190 pmu->cancel_txn = perf_pmu_nop_void;
9191 }
9192 }
9193
9194 if (!pmu->pmu_enable) {
9195 pmu->pmu_enable = perf_pmu_nop_void;
9196 pmu->pmu_disable = perf_pmu_nop_void;
9197 }
9198
9199 if (!pmu->event_idx)
9200 pmu->event_idx = perf_event_idx_default;
9201
9202 list_add_rcu(&pmu->entry, &pmus);
9203 atomic_set(&pmu->exclusive_cnt, 0);
9204 ret = 0;
9205unlock:
9206 mutex_unlock(&pmus_lock);
9207
9208 return ret;
9209
9210free_dev:
9211 device_del(pmu->dev);
9212 put_device(pmu->dev);
9213
9214free_idr:
9215 if (pmu->type >= PERF_TYPE_MAX)
9216 idr_remove(&pmu_idr, pmu->type);
9217
9218free_pdc:
9219 free_percpu(pmu->pmu_disable_count);
9220 goto unlock;
9221}
9222EXPORT_SYMBOL_GPL(perf_pmu_register);
9223
9224void perf_pmu_unregister(struct pmu *pmu)
9225{
9226 int remove_device;
9227
9228 mutex_lock(&pmus_lock);
9229 remove_device = pmu_bus_running;
9230 list_del_rcu(&pmu->entry);
9231 mutex_unlock(&pmus_lock);
9232
9233
9234
9235
9236
9237 synchronize_srcu(&pmus_srcu);
9238 synchronize_rcu();
9239
9240 free_percpu(pmu->pmu_disable_count);
9241 if (pmu->type >= PERF_TYPE_MAX)
9242 idr_remove(&pmu_idr, pmu->type);
9243 if (remove_device) {
9244 if (pmu->nr_addr_filters)
9245 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
9246 device_del(pmu->dev);
9247 put_device(pmu->dev);
9248 }
9249 free_pmu_context(pmu);
9250}
9251EXPORT_SYMBOL_GPL(perf_pmu_unregister);
9252
9253static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
9254{
9255 struct perf_event_context *ctx = NULL;
9256 int ret;
9257
9258 if (!try_module_get(pmu->module))
9259 return -ENODEV;
9260
9261 if (event->group_leader != event) {
9262
9263
9264
9265
9266 ctx = perf_event_ctx_lock_nested(event->group_leader,
9267 SINGLE_DEPTH_NESTING);
9268 BUG_ON(!ctx);
9269 }
9270
9271 event->pmu = pmu;
9272 ret = pmu->event_init(event);
9273
9274 if (ctx)
9275 perf_event_ctx_unlock(event->group_leader, ctx);
9276
9277 if (ret)
9278 module_put(pmu->module);
9279
9280 return ret;
9281}
9282
9283static struct pmu *perf_init_event(struct perf_event *event)
9284{
9285 struct pmu *pmu;
9286 int idx;
9287 int ret;
9288
9289 idx = srcu_read_lock(&pmus_srcu);
9290
9291
9292 if (event->parent && event->parent->pmu) {
9293 pmu = event->parent->pmu;
9294 ret = perf_try_init_event(pmu, event);
9295 if (!ret)
9296 goto unlock;
9297 }
9298
9299 rcu_read_lock();
9300 pmu = idr_find(&pmu_idr, event->attr.type);
9301 rcu_read_unlock();
9302 if (pmu) {
9303 ret = perf_try_init_event(pmu, event);
9304 if (ret)
9305 pmu = ERR_PTR(ret);
9306 goto unlock;
9307 }
9308
9309 list_for_each_entry_rcu(pmu, &pmus, entry) {
9310 ret = perf_try_init_event(pmu, event);
9311 if (!ret)
9312 goto unlock;
9313
9314 if (ret != -ENOENT) {
9315 pmu = ERR_PTR(ret);
9316 goto unlock;
9317 }
9318 }
9319 pmu = ERR_PTR(-ENOENT);
9320unlock:
9321 srcu_read_unlock(&pmus_srcu, idx);
9322
9323 return pmu;
9324}
9325
9326static void attach_sb_event(struct perf_event *event)
9327{
9328 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
9329
9330 raw_spin_lock(&pel->lock);
9331 list_add_rcu(&event->sb_list, &pel->list);
9332 raw_spin_unlock(&pel->lock);
9333}
9334
9335
9336
9337
9338
9339
9340
9341
9342static void account_pmu_sb_event(struct perf_event *event)
9343{
9344 if (is_sb_event(event))
9345 attach_sb_event(event);
9346}
9347
9348static void account_event_cpu(struct perf_event *event, int cpu)
9349{
9350 if (event->parent)
9351 return;
9352
9353 if (is_cgroup_event(event))
9354 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
9355}
9356
9357
9358static void account_freq_event_nohz(void)
9359{
9360#ifdef CONFIG_NO_HZ_FULL
9361
9362 spin_lock(&nr_freq_lock);
9363 if (atomic_inc_return(&nr_freq_events) == 1)
9364 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
9365 spin_unlock(&nr_freq_lock);
9366#endif
9367}
9368
9369static void account_freq_event(void)
9370{
9371 if (tick_nohz_full_enabled())
9372 account_freq_event_nohz();
9373 else
9374 atomic_inc(&nr_freq_events);
9375}
9376
9377
9378static void account_event(struct perf_event *event)
9379{
9380 bool inc = false;
9381
9382 if (event->parent)
9383 return;
9384
9385 if (event->attach_state & PERF_ATTACH_TASK)
9386 inc = true;
9387 if (event->attr.mmap || event->attr.mmap_data)
9388 atomic_inc(&nr_mmap_events);
9389 if (event->attr.comm)
9390 atomic_inc(&nr_comm_events);
9391 if (event->attr.namespaces)
9392 atomic_inc(&nr_namespaces_events);
9393 if (event->attr.task)
9394 atomic_inc(&nr_task_events);
9395 if (event->attr.freq)
9396 account_freq_event();
9397 if (event->attr.context_switch) {
9398 atomic_inc(&nr_switch_events);
9399 inc = true;
9400 }
9401 if (has_branch_stack(event))
9402 inc = true;
9403 if (is_cgroup_event(event))
9404 inc = true;
9405
9406 if (inc) {
9407 if (atomic_inc_not_zero(&perf_sched_count))
9408 goto enabled;
9409
9410 mutex_lock(&perf_sched_mutex);
9411 if (!atomic_read(&perf_sched_count)) {
9412 static_branch_enable(&perf_sched_events);
9413
9414
9415
9416
9417
9418 synchronize_sched();
9419 }
9420
9421
9422
9423
9424 atomic_inc(&perf_sched_count);
9425 mutex_unlock(&perf_sched_mutex);
9426 }
9427enabled:
9428
9429 account_event_cpu(event, event->cpu);
9430
9431 account_pmu_sb_event(event);
9432}
9433
9434
9435
9436
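/*
 * Allocate and initialize an event structure.
 */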
9437static struct perf_event *
9438perf_event_alloc(struct perf_event_attr *attr, int cpu,
9439 struct task_struct *task,
9440 struct perf_event *group_leader,
9441 struct perf_event *parent_event,
9442 perf_overflow_handler_t overflow_handler,
9443 void *context, int cgroup_fd)
9444{
9445 struct pmu *pmu;
9446 struct perf_event *event;
9447 struct hw_perf_event *hwc;
9448 long err = -EINVAL;
9449
9450 if ((unsigned)cpu >= nr_cpu_ids) {
9451 if (!task || cpu != -1)
9452 return ERR_PTR(-EINVAL);
9453 }
9454
9455 event = kzalloc(sizeof(*event), GFP_KERNEL);
9456 if (!event)
9457 return ERR_PTR(-ENOMEM);
9458
9459
9460
9461
9462
9463 if (!group_leader)
9464 group_leader = event;
9465
9466 mutex_init(&event->child_mutex);
9467 INIT_LIST_HEAD(&event->child_list);
9468
9469 INIT_LIST_HEAD(&event->group_entry);
9470 INIT_LIST_HEAD(&event->event_entry);
9471 INIT_LIST_HEAD(&event->sibling_list);
9472 INIT_LIST_HEAD(&event->rb_entry);
9473 INIT_LIST_HEAD(&event->active_entry);
9474 INIT_LIST_HEAD(&event->addr_filters.list);
9475 INIT_HLIST_NODE(&event->hlist_entry);
9476
9477
9478 init_waitqueue_head(&event->waitq);
9479 init_irq_work(&event->pending, perf_pending_event);
9480
9481 mutex_init(&event->mmap_mutex);
9482 raw_spin_lock_init(&event->addr_filters.lock);
9483
9484 atomic_long_set(&event->refcount, 1);
9485 event->cpu = cpu;
9486 event->attr = *attr;
9487 event->group_leader = group_leader;
9488 event->pmu = NULL;
9489 event->oncpu = -1;
9490
9491 event->parent = parent_event;
9492
9493 event->ns = get_pid_ns(task_active_pid_ns(current));
9494 event->id = atomic64_inc_return(&perf_event_id);
9495
9496 event->state = PERF_EVENT_STATE_INACTIVE;
9497
9498 if (task) {
9499 event->attach_state = PERF_ATTACH_TASK;
9500
9501
9502
9503
9504
9505 event->hw.target = task;
9506 }
9507
9508 event->clock = &local_clock;
9509 if (parent_event)
9510 event->clock = parent_event->clock;
9511
9512 if (!overflow_handler && parent_event) {
9513 overflow_handler = parent_event->overflow_handler;
9514 context = parent_event->overflow_handler_context;
9515#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
9516 if (overflow_handler == bpf_overflow_handler) {
9517 struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
9518
9519 if (IS_ERR(prog)) {
9520 err = PTR_ERR(prog);
9521 goto err_ns;
9522 }
9523 event->prog = prog;
9524 event->orig_overflow_handler =
9525 parent_event->orig_overflow_handler;
9526 }
9527#endif
9528 }
9529
9530 if (overflow_handler) {
9531 event->overflow_handler = overflow_handler;
9532 event->overflow_handler_context = context;
9533 } else if (is_write_backward(event)) {
9534 event->overflow_handler = perf_event_output_backward;
9535 event->overflow_handler_context = NULL;
9536 } else {
9537 event->overflow_handler = perf_event_output_forward;
9538 event->overflow_handler_context = NULL;
9539 }
9540
9541 perf_event__state_init(event);
9542
9543 pmu = NULL;
9544
9545 hwc = &event->hw;
9546 hwc->sample_period = attr->sample_period;
9547 if (attr->freq && attr->sample_freq)
9548 hwc->sample_period = 1;
9549 hwc->last_period = hwc->sample_period;
9550
9551 local64_set(&hwc->period_left, hwc->sample_period);
9552
9553
9554
9555
9556
9557 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
9558 goto err_ns;
9559
9560 if (!has_branch_stack(event))
9561 event->attr.branch_sample_type = 0;
9562
9563 if (cgroup_fd != -1) {
9564 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
9565 if (err)
9566 goto err_ns;
9567 }
9568
9569 pmu = perf_init_event(event);
9570 if (IS_ERR(pmu)) {
9571 err = PTR_ERR(pmu);
9572 goto err_ns;
9573 }
9574
9575 err = exclusive_event_init(event);
9576 if (err)
9577 goto err_pmu;
9578
9579 if (has_addr_filter(event)) {
9580 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
9581 sizeof(unsigned long),
9582 GFP_KERNEL);
9583 if (!event->addr_filters_offs) {
9584 err = -ENOMEM;
9585 goto err_per_task;
9586 }
9587
9588
9589 event->addr_filters_gen = 1;
9590 }
9591
9592 if (!event->parent) {
9593 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
9594 err = get_callchain_buffers(attr->sample_max_stack);
9595 if (err)
9596 goto err_addr_filters;
9597 }
9598 }
9599
9600
9601 account_event(event);
9602
9603 return event;
9604
9605err_addr_filters:
9606 kfree(event->addr_filters_offs);
9607
9608err_per_task:
9609 exclusive_event_destroy(event);
9610
9611err_pmu:
9612 if (event->destroy)
9613 event->destroy(event);
9614 module_put(pmu->module);
9615err_ns:
9616 if (is_cgroup_event(event))
9617 perf_detach_cgroup(event);
9618 if (event->ns)
9619 put_pid_ns(event->ns);
9620 kfree(event);
9621
9622 return ERR_PTR(err);
9623}
9624
9625static int perf_copy_attr(struct perf_event_attr __user *uattr,
9626 struct perf_event_attr *attr)
9627{
9628 u32 size;
9629 int ret;
9630
9631 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
9632 return -EFAULT;
9633
9634
9635
9636
9637 memset(attr, 0, sizeof(*attr));
9638
9639 ret = get_user(size, &uattr->size);
9640 if (ret)
9641 return ret;
9642
9643 if (size > PAGE_SIZE)
9644 goto err_size;
9645
9646 if (!size)
9647 size = PERF_ATTR_SIZE_VER0;
9648
9649 if (size < PERF_ATTR_SIZE_VER0)
9650 goto err_size;
9651
9652
9653
9654
9655
9656
9657
9658 if (size > sizeof(*attr)) {
9659 unsigned char __user *addr;
9660 unsigned char __user *end;
9661 unsigned char val;
9662
9663 addr = (void __user *)uattr + sizeof(*attr);
9664 end = (void __user *)uattr + size;
9665
9666 for (; addr < end; addr++) {
9667 ret = get_user(val, addr);
9668 if (ret)
9669 return ret;
9670 if (val)
9671 goto err_size;
9672 }
9673 size = sizeof(*attr);
9674 }
9675
9676 ret = copy_from_user(attr, uattr, size);
9677 if (ret)
9678 return -EFAULT;
9679
9680 attr->size = size;
9681
9682 if (attr->__reserved_1)
9683 return -EINVAL;
9684
9685 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
9686 return -EINVAL;
9687
9688 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
9689 return -EINVAL;
9690
9691 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
9692 u64 mask = attr->branch_sample_type;
9693
9694
9695 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
9696 return -EINVAL;
9697
9698
9699 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
9700 return -EINVAL;
9701
9702
9703 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
9704
9705
9706 if (!attr->exclude_kernel)
9707 mask |= PERF_SAMPLE_BRANCH_KERNEL;
9708
9709 if (!attr->exclude_user)
9710 mask |= PERF_SAMPLE_BRANCH_USER;
9711
9712 if (!attr->exclude_hv)
9713 mask |= PERF_SAMPLE_BRANCH_HV;
9714
9715
9716
9717 attr->branch_sample_type = mask;
9718 }
9719
9720 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
9721 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9722 return -EACCES;
9723 }
9724
9725 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
9726 ret = perf_reg_validate(attr->sample_regs_user);
9727 if (ret)
9728 return ret;
9729 }
9730
9731 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
9732 if (!arch_perf_have_user_stack_dump())
9733 return -ENOSYS;
9734
9735
9736
9737
9738
9739
9740 if (attr->sample_stack_user >= USHRT_MAX)
9741 ret = -EINVAL;
9742 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
9743 ret = -EINVAL;
9744 }
9745
9746 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
9747 ret = perf_reg_validate(attr->sample_regs_intr);
9748out:
9749 return ret;
9750
9751err_size:
9752 put_user(sizeof(*attr), &uattr->size);
9753 ret = -E2BIG;
9754 goto out;
9755}
9756
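/*
 * Redirect the event's output to @output_event's ring buffer, provided
 * the two are compatible (same CPU or same context, same clock and
 * write direction) and the event itself isn't currently mmap()ed.
 */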
9757static int
9758perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
9759{
9760 struct ring_buffer *rb = NULL;
9761 int ret = -EINVAL;
9762
9763 if (!output_event)
9764 goto set;
9765
9766
9767 if (event == output_event)
9768 goto out;
9769
9770
9771
9772
9773 if (output_event->cpu != event->cpu)
9774 goto out;
9775
9776
9777
9778
9779 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
9780 goto out;
9781
9782
9783
9784
9785 if (output_event->clock != event->clock)
9786 goto out;
9787
9788
9789
9790
9791
9792 if (is_write_backward(output_event) != is_write_backward(event))
9793 goto out;
9794
9795
9796
9797
9798 if (has_aux(event) && has_aux(output_event) &&
9799 event->pmu != output_event->pmu)
9800 goto out;
9801
9802set:
9803 mutex_lock(&event->mmap_mutex);
9804
9805 if (atomic_read(&event->mmap_count))
9806 goto unlock;
9807
9808 if (output_event) {
9809
9810 rb = ring_buffer_get(output_event);
9811 if (!rb)
9812 goto unlock;
9813 }
9814
9815 ring_buffer_attach(event, rb);
9816
9817 ret = 0;
9818unlock:
9819 mutex_unlock(&event->mmap_mutex);
9820
9821out:
9822 return ret;
9823}
9824
9825static void mutex_lock_double(struct mutex *a, struct mutex *b)
9826{
9827 if (b < a)
9828 swap(a, b);
9829
9830 mutex_lock(a);
9831 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
9832}
9833
9834static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
9835{
9836 bool nmi_safe = false;
9837
9838 switch (clk_id) {
9839 case CLOCK_MONOTONIC:
9840 event->clock = &ktime_get_mono_fast_ns;
9841 nmi_safe = true;
9842 break;
9843
9844 case CLOCK_MONOTONIC_RAW:
9845 event->clock = &ktime_get_raw_fast_ns;
9846 nmi_safe = true;
9847 break;
9848
9849 case CLOCK_REALTIME:
9850 event->clock = &ktime_get_real_ns;
9851 break;
9852
9853 case CLOCK_BOOTTIME:
9854 event->clock = &ktime_get_boot_ns;
9855 break;
9856
9857 case CLOCK_TAI:
9858 event->clock = &ktime_get_tai_ns;
9859 break;
9860
9861 default:
9862 return -EINVAL;
9863 }
9864
9865 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
9866 return -EINVAL;
9867
9868 return 0;
9869}
9870
9871
9872
9873
9874
9875static struct perf_event_context *
9876__perf_event_ctx_lock_double(struct perf_event *group_leader,
9877 struct perf_event_context *ctx)
9878{
9879 struct perf_event_context *gctx;
9880
9881again:
9882 rcu_read_lock();
9883 gctx = READ_ONCE(group_leader->ctx);
9884 if (!atomic_inc_not_zero(&gctx->refcount)) {
9885 rcu_read_unlock();
9886 goto again;
9887 }
9888 rcu_read_unlock();
9889
9890 mutex_lock_double(&gctx->mutex, &ctx->mutex);
9891
9892 if (group_leader->ctx != gctx) {
9893 mutex_unlock(&ctx->mutex);
9894 mutex_unlock(&gctx->mutex);
9895 put_ctx(gctx);
9896 goto again;
9897 }
9898
9899 return gctx;
9900}
9901
9902
9903
9904
9905
9906
9907
9908
9909
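/*
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 * @attr_uptr: event attributes for monitoring/sampling
 * @pid:       target pid
 * @cpu:       target cpu
 * @group_fd:  group leader event fd (-1 for a new group)
 * @flags:     PERF_FLAG_* modifiers
 */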
9910SYSCALL_DEFINE5(perf_event_open,
9911 struct perf_event_attr __user *, attr_uptr,
9912 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
9913{
9914 struct perf_event *group_leader = NULL, *output_event = NULL;
9915 struct perf_event *event, *sibling;
9916 struct perf_event_attr attr;
9917 struct perf_event_context *ctx, *uninitialized_var(gctx);
9918 struct file *event_file = NULL;
9919 struct fd group = {NULL, 0};
9920 struct task_struct *task = NULL;
9921 struct pmu *pmu;
9922 int event_fd;
9923 int move_group = 0;
9924 int err;
9925 int f_flags = O_RDWR;
9926 int cgroup_fd = -1;
9927
9928
9929 if (flags & ~PERF_FLAG_ALL)
9930 return -EINVAL;
9931
9932 err = perf_copy_attr(attr_uptr, &attr);
9933 if (err)
9934 return err;
9935
9936 if (!attr.exclude_kernel) {
9937 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9938 return -EACCES;
9939 }
9940
9941 if (attr.namespaces) {
9942 if (!capable(CAP_SYS_ADMIN))
9943 return -EACCES;
9944 }
9945
9946 if (attr.freq) {
9947 if (attr.sample_freq > sysctl_perf_event_sample_rate)
9948 return -EINVAL;
9949 } else {
9950 if (attr.sample_period & (1ULL << 63))
9951 return -EINVAL;
9952 }
9953
9954
9955 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
9956 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9957 return -EACCES;
9958
9959 if (!attr.sample_max_stack)
9960 attr.sample_max_stack = sysctl_perf_event_max_stack;
9961
9962
9963
9964
9965
9966
9967
9968 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
9969 return -EINVAL;
9970
9971 if (flags & PERF_FLAG_FD_CLOEXEC)
9972 f_flags |= O_CLOEXEC;
9973
9974 event_fd = get_unused_fd_flags(f_flags);
9975 if (event_fd < 0)
9976 return event_fd;
9977
9978 if (group_fd != -1) {
9979 err = perf_fget_light(group_fd, &group);
9980 if (err)
9981 goto err_fd;
9982 group_leader = group.file->private_data;
9983 if (flags & PERF_FLAG_FD_OUTPUT)
9984 output_event = group_leader;
9985 if (flags & PERF_FLAG_FD_NO_GROUP)
9986 group_leader = NULL;
9987 }
9988
9989 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
9990 task = find_lively_task_by_vpid(pid);
9991 if (IS_ERR(task)) {
9992 err = PTR_ERR(task);
9993 goto err_group_fd;
9994 }
9995 }
9996
9997 if (task && group_leader &&
9998 group_leader->attr.inherit != attr.inherit) {
9999 err = -EINVAL;
10000 goto err_task;
10001 }
10002
10003 if (task) {
10004 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10005 if (err)
10006 goto err_task;
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016 err = -EACCES;
10017 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
10018 goto err_cred;
10019 }
10020
10021 if (flags & PERF_FLAG_PID_CGROUP)
10022 cgroup_fd = pid;
10023
10024 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
10025 NULL, NULL, cgroup_fd);
10026 if (IS_ERR(event)) {
10027 err = PTR_ERR(event);
10028 goto err_cred;
10029 }
10030
10031 if (is_sampling_event(event)) {
10032 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
10033 err = -EOPNOTSUPP;
10034 goto err_alloc;
10035 }
10036 }
10037
10038
10039
10040
10041
10042 pmu = event->pmu;
10043
10044 if (attr.use_clockid) {
10045 err = perf_event_set_clock(event, attr.clockid);
10046 if (err)
10047 goto err_alloc;
10048 }
10049
10050 if (pmu->task_ctx_nr == perf_sw_context)
10051 event->event_caps |= PERF_EV_CAP_SOFTWARE;
10052
10053 if (group_leader &&
10054 (is_software_event(event) != is_software_event(group_leader))) {
10055 if (is_software_event(event)) {
10056
10057
10058
10059
10060
10061
10062
10063
10064 pmu = group_leader->pmu;
10065 } else if (is_software_event(group_leader) &&
10066 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10067
10068
10069
10070
10071
10072 move_group = 1;
10073 }
10074 }
10075
10076
10077
10078
10079 ctx = find_get_context(pmu, task, event);
10080 if (IS_ERR(ctx)) {
10081 err = PTR_ERR(ctx);
10082 goto err_alloc;
10083 }
10084
10085 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
10086 err = -EBUSY;
10087 goto err_context;
10088 }
10089
10090
10091
10092
10093 if (group_leader) {
10094 err = -EINVAL;
10095
10096
10097
10098
10099
10100 if (group_leader->group_leader != group_leader)
10101 goto err_context;
10102
10103
10104 if (group_leader->clock != event->clock)
10105 goto err_context;
10106
10107
10108
10109
10110
10111
10112 if (group_leader->cpu != event->cpu)
10113 goto err_context;
10114
10115
10116
10117
10118
10119 if (group_leader->ctx->task != ctx->task)
10120 goto err_context;
10121
10122
10123
10124
10125
10126
10127 if (!move_group && group_leader->ctx != ctx)
10128 goto err_context;
10129
10130
10131
10132
10133 if (attr.exclusive || attr.pinned)
10134 goto err_context;
10135 }
10136
10137 if (output_event) {
10138 err = perf_event_set_output(event, output_event);
10139 if (err)
10140 goto err_context;
10141 }
10142
10143 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
10144 f_flags);
10145 if (IS_ERR(event_file)) {
10146 err = PTR_ERR(event_file);
10147 event_file = NULL;
10148 goto err_context;
10149 }
10150
10151 if (move_group) {
10152 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
10153
10154 if (gctx->task == TASK_TOMBSTONE) {
10155 err = -ESRCH;
10156 goto err_locked;
10157 }
10158
10159
10160
10161
10162
10163 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10164
10165
10166
10167
10168
10169 if (gctx != ctx) {
10170 err = -EINVAL;
10171 goto err_locked;
10172 } else {
10173 perf_event_ctx_unlock(group_leader, gctx);
10174 move_group = 0;
10175 }
10176 }
10177 } else {
10178 mutex_lock(&ctx->mutex);
10179 }
10180
10181 if (ctx->task == TASK_TOMBSTONE) {
10182 err = -ESRCH;
10183 goto err_locked;
10184 }
10185
10186 if (!perf_event_validate_size(event)) {
10187 err = -E2BIG;
10188 goto err_locked;
10189 }
10190
10191 if (!task) {
10192
10193
10194
10195
10196
10197
10198 struct perf_cpu_context *cpuctx =
10199 container_of(ctx, struct perf_cpu_context, ctx);
10200
10201 if (!cpuctx->online) {
10202 err = -ENODEV;
10203 goto err_locked;
10204 }
10205 }
10206
10207
10208
10209
10210
10211
10212 if (!exclusive_event_installable(event, ctx)) {
10213
10214 WARN_ON_ONCE(move_group);
10215
10216 err = -EBUSY;
10217 goto err_locked;
10218 }
10219
10220 WARN_ON_ONCE(ctx->parent_ctx);
10221
10222
10223
10224
10225
10226
10227 if (move_group) {
10228
10229
10230
10231
10232 perf_remove_from_context(group_leader, 0);
10233 put_ctx(gctx);
10234
10235 list_for_each_entry(sibling, &group_leader->sibling_list,
10236 group_entry) {
10237 perf_remove_from_context(sibling, 0);
10238 put_ctx(gctx);
10239 }
10240
10241
10242
10243
10244
10245 synchronize_rcu();
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257 list_for_each_entry(sibling, &group_leader->sibling_list,
10258 group_entry) {
10259 perf_event__state_init(sibling);
10260 perf_install_in_context(ctx, sibling, sibling->cpu);
10261 get_ctx(ctx);
10262 }
10263
10264
10265
10266
10267
10268
10269 perf_event__state_init(group_leader);
10270 perf_install_in_context(ctx, group_leader, group_leader->cpu);
10271 get_ctx(ctx);
10272 }
10273
10274
10275
10276
10277
10278
10279
10280 perf_event__header_size(event);
10281 perf_event__id_header_size(event);
10282
10283 event->owner = current;
10284
10285 perf_install_in_context(ctx, event, event->cpu);
10286 perf_unpin_context(ctx);
10287
10288 if (move_group)
10289 perf_event_ctx_unlock(group_leader, gctx);
10290 mutex_unlock(&ctx->mutex);
10291
10292 if (task) {
10293 mutex_unlock(&task->signal->cred_guard_mutex);
10294 put_task_struct(task);
10295 }
10296
10297 mutex_lock(&current->perf_event_mutex);
10298 list_add_tail(&event->owner_entry, &current->perf_event_list);
10299 mutex_unlock(&current->perf_event_mutex);
10300
10301
10302
10303
10304
10305
10306
10307 fdput(group);
10308 fd_install(event_fd, event_file);
10309 return event_fd;
10310
10311err_locked:
10312 if (move_group)
10313 perf_event_ctx_unlock(group_leader, gctx);
10314 mutex_unlock(&ctx->mutex);
10315
10316 fput(event_file);
10317err_context:
10318 perf_unpin_context(ctx);
10319 put_ctx(ctx);
10320err_alloc:
10321
10322
10323
10324
10325 if (!event_file)
10326 free_event(event);
10327err_cred:
10328 if (task)
10329 mutex_unlock(&task->signal->cred_guard_mutex);
10330err_task:
10331 if (task)
10332 put_task_struct(task);
10333err_group_fd:
10334 fdput(group);
10335err_fd:
10336 put_unused_fd(event_fd);
10337 return err;
10338}
10339
10340
10341
10342
10343
10344
10345
10346
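/*
 * perf_event_create_kernel_counter - create an in-kernel event
 * @attr: attributes of the counter to create
 * @cpu:  cpu on which the counter is bound
 * @task: task to profile (NULL for per-cpu)
 */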
10347struct perf_event *
10348perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
10349 struct task_struct *task,
10350 perf_overflow_handler_t overflow_handler,
10351 void *context)
10352{
10353 struct perf_event_context *ctx;
10354 struct perf_event *event;
10355 int err;
10356
10357
10358
10359
10360
10361 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
10362 overflow_handler, context, -1);
10363 if (IS_ERR(event)) {
10364 err = PTR_ERR(event);
10365 goto err;
10366 }
10367
10368
10369 event->owner = TASK_TOMBSTONE;
10370
10371 ctx = find_get_context(event->pmu, task, event);
10372 if (IS_ERR(ctx)) {
10373 err = PTR_ERR(ctx);
10374 goto err_free;
10375 }
10376
10377 WARN_ON_ONCE(ctx->parent_ctx);
10378 mutex_lock(&ctx->mutex);
10379 if (ctx->task == TASK_TOMBSTONE) {
10380 err = -ESRCH;
10381 goto err_unlock;
10382 }
10383
10384 if (!task) {
10385
10386
10387
10388
10389
10390
10391 struct perf_cpu_context *cpuctx =
10392 container_of(ctx, struct perf_cpu_context, ctx);
10393 if (!cpuctx->online) {
10394 err = -ENODEV;
10395 goto err_unlock;
10396 }
10397 }
10398
10399 if (!exclusive_event_installable(event, ctx)) {
10400 err = -EBUSY;
10401 goto err_unlock;
10402 }
10403
10404 perf_install_in_context(ctx, event, cpu);
10405 perf_unpin_context(ctx);
10406 mutex_unlock(&ctx->mutex);
10407
10408 return event;
10409
10410err_unlock:
10411 mutex_unlock(&ctx->mutex);
10412 perf_unpin_context(ctx);
10413 put_ctx(ctx);
10414err_free:
10415 free_event(event);
10416err:
10417 return ERR_PTR(err);
10418}
10419EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
10420
10421void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
10422{
10423 struct perf_event_context *src_ctx;
10424 struct perf_event_context *dst_ctx;
10425 struct perf_event *event, *tmp;
10426 LIST_HEAD(events);
10427
10428 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
10429 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
10430
10431
10432
10433
10434
10435 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
10436 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
10437 event_entry) {
10438 perf_remove_from_context(event, 0);
10439 unaccount_event_cpu(event, src_cpu);
10440 put_ctx(src_ctx);
10441 list_add(&event->migrate_entry, &events);
10442 }
10443
10444
10445
10446
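 /*
  * Wait for the removed events to quiesce before re-installing them.
  */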
10447 synchronize_rcu();
10448
10449
10450
10451
10452
10453
10454
10455
10456
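 /*
  * Re-install in two passes: skip the group leaders here and install
  * only the siblings first; the leaders follow in the second pass so
  * that the whole group is present when each leader is installed.
  */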
10457 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10458 if (event->group_leader == event)
10459 continue;
10460
10461 list_del(&event->migrate_entry);
10462 if (event->state >= PERF_EVENT_STATE_OFF)
10463 event->state = PERF_EVENT_STATE_INACTIVE;
10464 account_event_cpu(event, dst_cpu);
10465 perf_install_in_context(dst_ctx, event, dst_cpu);
10466 get_ctx(dst_ctx);
10467 }
10468
10469
10470
10471
10472
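 /*
  * Second pass: install the group leaders.
  */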
10473 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10474 list_del(&event->migrate_entry);
10475 if (event->state >= PERF_EVENT_STATE_OFF)
10476 event->state = PERF_EVENT_STATE_INACTIVE;
10477 account_event_cpu(event, dst_cpu);
10478 perf_install_in_context(dst_ctx, event, dst_cpu);
10479 get_ctx(dst_ctx);
10480 }
10481 mutex_unlock(&dst_ctx->mutex);
10482 mutex_unlock(&src_ctx->mutex);
10483}
10484EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
10485
10486static void sync_child_event(struct perf_event *child_event,
10487 struct task_struct *child)
10488{
10489 struct perf_event *parent_event = child_event->parent;
10490 u64 child_val;
10491
10492 if (child_event->attr.inherit_stat)
10493 perf_event_read_event(child_event, child);
10494
10495 child_val = perf_event_count(child_event);
10496
10497
10498
10499
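 /*
  * Add back the child's counts to the parent's counts.
  */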
10500 atomic64_add(child_val, &parent_event->child_count);
10501 atomic64_add(child_event->total_time_enabled,
10502 &parent_event->child_total_time_enabled);
10503 atomic64_add(child_event->total_time_running,
10504 &parent_event->child_total_time_running);
10505}
10506
10507static void
10508perf_event_exit_event(struct perf_event *child_event,
10509 struct perf_event_context *child_ctx,
10510 struct task_struct *child)
10511{
10512 struct perf_event *parent_event = child_event->parent;
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
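 /*
  * Under ctx->lock, detach the child event from its group and context
  * and mark it exited.
  */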
10526 raw_spin_lock_irq(&child_ctx->lock);
10527 WARN_ON_ONCE(child_ctx->is_active);
10528
10529 if (parent_event)
10530 perf_group_detach(child_event);
10531 list_del_event(child_event, child_ctx);
10532 child_event->state = PERF_EVENT_STATE_EXIT;
10533 raw_spin_unlock_irq(&child_ctx->lock);
10534
10535
10536
10537
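 /*
  * No parent event to fold the counts into: just wake up any waiters
  * on this event and we're done.
  */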
10538 if (!parent_event) {
10539 perf_event_wakeup(child_event);
10540 return;
10541 }
10542
10543
10544
10545
10546 sync_child_event(child_event, child);
10547
10548
10549
10550
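 /*
  * Remove this event from the parent's child_list.
  */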
10551 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
10552 mutex_lock(&parent_event->child_mutex);
10553 list_del_init(&child_event->child_list);
10554 mutex_unlock(&parent_event->child_mutex);
10555
10556
10557
10558
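 /*
  * Wake up the parent's waiters, then free the child and drop the
  * reference it held on the parent.
  */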
10559 perf_event_wakeup(parent_event);
10560 free_event(child_event);
10561 put_event(parent_event);
10562}
10563
10564static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
10565{
10566 struct perf_event_context *child_ctx, *clone_ctx = NULL;
10567 struct perf_event *child_event, *next;
10568
10569 WARN_ON_ONCE(child != current);
10570
10571 child_ctx = perf_pin_task_context(child, ctxn);
10572 if (!child_ctx)
10573 return;
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
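 /*
  * Hold ctx->mutex over the entire teardown; it serializes against
  * everything else that wants to access this context.
  */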
10585 mutex_lock(&child_ctx->mutex);
10586
10587
10588
10589
10590
10591
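 /*
  * In a single ctx->lock section: de-schedule the events, detach the
  * context from the task and mark it dead with TASK_TOMBSTONE so it
  * can never be scheduled back in.
  */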
10592 raw_spin_lock_irq(&child_ctx->lock);
10593 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
10594
10595
10596
10597
10598
10599 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
10600 put_ctx(child_ctx);
10601 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
10602 put_task_struct(current);
10603
10604 clone_ctx = unclone_ctx(child_ctx);
10605 raw_spin_unlock_irq(&child_ctx->lock);
10606
10607 if (clone_ctx)
10608 put_ctx(clone_ctx);
10609
10610
10611
10612
10613
10614
10615 perf_event_task(child, child_ctx, 0);
10616
10617 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
10618 perf_event_exit_event(child_event, child_ctx, child);
10619
10620 mutex_unlock(&child_ctx->mutex);
10621
10622 put_ctx(child_ctx);
10623}
10624
10625
10626
10627
10628
10629
10630
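/*
 * Tear down the perf contexts of an exiting task and fold the child
 * event counts back into their parent events.
 */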
10631void perf_event_exit_task(struct task_struct *child)
10632{
10633 struct perf_event *event, *tmp;
10634 int ctxn;
10635
10636 mutex_lock(&child->perf_event_mutex);
10637 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
10638 owner_entry) {
10639 list_del_init(&event->owner_entry);
10640
10641
10642
10643
10644
10645
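 /*
  * Release store: make sure the list_del_init() above is visible
  * before anybody observes event->owner == NULL.
  */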
10646 smp_store_release(&event->owner, NULL);
10647 }
10648 mutex_unlock(&child->perf_event_mutex);
10649
10650 for_each_task_context_nr(ctxn)
10651 perf_event_exit_task_context(child, ctxn);
10652
10653
10654
10655
10656
10657
10658
10659 perf_event_task(child, NULL, 0);
10660}
10661
10662static void perf_free_event(struct perf_event *event,
10663 struct perf_event_context *ctx)
10664{
10665 struct perf_event *parent = event->parent;
10666
10667 if (WARN_ON_ONCE(!parent))
10668 return;
10669
10670 mutex_lock(&parent->child_mutex);
10671 list_del_init(&event->child_list);
10672 mutex_unlock(&parent->child_mutex);
10673
10674 put_event(parent);
10675
10676 raw_spin_lock_irq(&ctx->lock);
10677 perf_group_detach(event);
10678 list_del_event(event, ctx);
10679 raw_spin_unlock_irq(&ctx->lock);
10680 free_event(event);
10681}
10682
10683
10684
10685
10686
10687
10688
10689
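/*
 * Free the contexts created by inheritance when a new task is torn down
 * before it was ever exposed; see the error path in perf_event_init_task().
 */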
10690void perf_event_free_task(struct task_struct *task)
10691{
10692 struct perf_event_context *ctx;
10693 struct perf_event *event, *tmp;
10694 int ctxn;
10695
10696 for_each_task_context_nr(ctxn) {
10697 ctx = task->perf_event_ctxp[ctxn];
10698 if (!ctx)
10699 continue;
10700
10701 mutex_lock(&ctx->mutex);
10702 raw_spin_lock_irq(&ctx->lock);
10703
10704
10705
10706
10707
10708
10709 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
10710 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
10711 put_task_struct(task);
10712 raw_spin_unlock_irq(&ctx->lock);
10713
10714 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
10715 perf_free_event(event, ctx);
10716
10717 mutex_unlock(&ctx->mutex);
10718 put_ctx(ctx);
10719 }
10720}
10721
10722void perf_event_delayed_put(struct task_struct *task)
10723{
10724 int ctxn;
10725
10726 for_each_task_context_nr(ctxn)
10727 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
10728}
10729
10730struct file *perf_event_get(unsigned int fd)
10731{
10732 struct file *file;
10733
10734 file = fget_raw(fd);
10735 if (!file)
10736 return ERR_PTR(-EBADF);
10737
10738 if (file->f_op != &perf_fops) {
10739 fput(file);
10740 return ERR_PTR(-EBADF);
10741 }
10742
10743 return file;
10744}
10745
10746const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
10747{
10748 if (!event)
10749 return ERR_PTR(-EINVAL);
10750
10751 return &event->attr;
10752}
10753
10754
10755
10756
10757
10758
10759
10760
10761
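/*
 * Inherit an event from parent task to child task.
 *
 * Returns:
 *  - a valid pointer on success
 *  - NULL when the parent event is already orphaned (not an error)
 *  - an ERR_PTR() on failure
 */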
10762static struct perf_event *
10763inherit_event(struct perf_event *parent_event,
10764 struct task_struct *parent,
10765 struct perf_event_context *parent_ctx,
10766 struct task_struct *child,
10767 struct perf_event *group_leader,
10768 struct perf_event_context *child_ctx)
10769{
10770 enum perf_event_active_state parent_state = parent_event->state;
10771 struct perf_event *child_event;
10772 unsigned long flags;
10773
10774
10775
10776
10777
10778
10779
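 /*
  * Instead of building recursive hierarchies, always link inherited
  * events back to the original (top-level) parent event.
  */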
10780 if (parent_event->parent)
10781 parent_event = parent_event->parent;
10782
10783 child_event = perf_event_alloc(&parent_event->attr,
10784 parent_event->cpu,
10785 child,
10786 group_leader, parent_event,
10787 NULL, NULL, -1);
10788 if (IS_ERR(child_event))
10789 return child_event;
10790
10791
10792
10793
10794
10795
10796
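 /*
  * Take the parent's child_mutex so the orphan check, the refcount
  * grab and the child_list insertion below happen atomically with
  * respect to the parent event being released.
  */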
10797 mutex_lock(&parent_event->child_mutex);
10798 if (is_orphaned_event(parent_event) ||
10799 !atomic_long_inc_not_zero(&parent_event->refcount)) {
10800 mutex_unlock(&parent_event->child_mutex);
10801 free_event(child_event);
10802 return NULL;
10803 }
10804
10805 get_ctx(child_ctx);
10806
10807
10808
10809
10810
10811
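 /*
  * Make the child state follow the state of the parent event,
  * not its attr.disabled bit.
  */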
10812 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
10813 child_event->state = PERF_EVENT_STATE_INACTIVE;
10814 else
10815 child_event->state = PERF_EVENT_STATE_OFF;
10816
10817 if (parent_event->attr.freq) {
10818 u64 sample_period = parent_event->hw.sample_period;
10819 struct hw_perf_event *hwc = &child_event->hw;
10820
10821 hwc->sample_period = sample_period;
10822 hwc->last_period = sample_period;
10823
10824 local64_set(&hwc->period_left, sample_period);
10825 }
10826
10827 child_event->ctx = child_ctx;
10828 child_event->overflow_handler = parent_event->overflow_handler;
10829 child_event->overflow_handler_context
10830 = parent_event->overflow_handler_context;
10831
10832
10833
10834
10835 perf_event__header_size(child_event);
10836 perf_event__id_header_size(child_event);
10837
10838
10839
10840
10841 raw_spin_lock_irqsave(&child_ctx->lock, flags);
10842 add_event_to_ctx(child_event, child_ctx);
10843 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
10844
10845
10846
10847
10848 list_add_tail(&child_event->child_list, &parent_event->child_list);
10849 mutex_unlock(&parent_event->child_mutex);
10850
10851 return child_event;
10852}
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
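/*
 * Inherit a whole event group: the leader first, then every sibling with
 * that new leader as its group leader.  A NULL return from inherit_event()
 * (orphaned parent) is silently skipped; only real errors are propagated.
 */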
10864static int inherit_group(struct perf_event *parent_event,
10865 struct task_struct *parent,
10866 struct perf_event_context *parent_ctx,
10867 struct task_struct *child,
10868 struct perf_event_context *child_ctx)
10869{
10870 struct perf_event *leader;
10871 struct perf_event *sub;
10872 struct perf_event *child_ctr;
10873
10874 leader = inherit_event(parent_event, parent, parent_ctx,
10875 child, NULL, child_ctx);
10876 if (IS_ERR(leader))
10877 return PTR_ERR(leader);
10878
10879
10880
10881
10882
10883 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
10884 child_ctr = inherit_event(sub, parent, parent_ctx,
10885 child, leader, child_ctx);
10886 if (IS_ERR(child_ctr))
10887 return PTR_ERR(child_ctr);
10888 }
10889 return 0;
10890}
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
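/*
 * Inherit one event group into the child's context for the given ctxn,
 * allocating that context on first use.  Clears *inherited_all when the
 * event is not marked for inheritance or when inheritance fails.
 */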
10903static int
10904inherit_task_group(struct perf_event *event, struct task_struct *parent,
10905 struct perf_event_context *parent_ctx,
10906 struct task_struct *child, int ctxn,
10907 int *inherited_all)
10908{
10909 int ret;
10910 struct perf_event_context *child_ctx;
10911
10912 if (!event->attr.inherit) {
10913 *inherited_all = 0;
10914 return 0;
10915 }
10916
10917 child_ctx = child->perf_event_ctxp[ctxn];
10918 if (!child_ctx) {
10919
10920
10921
10922
10923
10924
10925 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
10926 if (!child_ctx)
10927 return -ENOMEM;
10928
10929 child->perf_event_ctxp[ctxn] = child_ctx;
10930 }
10931
10932 ret = inherit_group(event, parent, parent_ctx,
10933 child, child_ctx);
10934
10935 if (ret)
10936 *inherited_all = 0;
10937
10938 return ret;
10939}
10940
10941
10942
10943
10944static int perf_event_init_context(struct task_struct *child, int ctxn)
10945{
10946 struct perf_event_context *child_ctx, *parent_ctx;
10947 struct perf_event_context *cloned_ctx;
10948 struct perf_event *event;
10949 struct task_struct *parent = current;
10950 int inherited_all = 1;
10951 unsigned long flags;
10952 int ret = 0;
10953
10954 if (likely(!parent->perf_event_ctxp[ctxn]))
10955 return 0;
10956
10957
10958
10959
10960
10961 parent_ctx = perf_pin_task_context(parent, ctxn);
10962 if (!parent_ctx)
10963 return 0;
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976 mutex_lock(&parent_ctx->mutex);
10977
10978
10979
10980
10981
10982 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
10983 ret = inherit_task_group(event, parent, parent_ctx,
10984 child, ctxn, &inherited_all);
10985 if (ret)
10986 goto out_unlock;
10987 }
10988
10989
10990
10991
10992
10993
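 /*
  * We can't hold ctx->lock while walking the flexible groups because
  * inheritance allocates memory, so disable rotation instead to keep
  * the list stable.
  */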
10994 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
10995 parent_ctx->rotate_disable = 1;
10996 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
10997
10998 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
10999 ret = inherit_task_group(event, parent, parent_ctx,
11000 child, ctxn, &inherited_all);
11001 if (ret)
11002 goto out_unlock;
11003 }
11004
11005 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11006 parent_ctx->rotate_disable = 0;
11007
11008 child_ctx = child->perf_event_ctxp[ctxn];
11009
11010 if (child_ctx && inherited_all) {
11011
11012
11013
11014
11015
11016
11017
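 /*
  * All events were inherited: mark the child context as a clone of
  * the parent context, or of whatever the parent itself is a clone of.
  */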
11018 cloned_ctx = parent_ctx->parent_ctx;
11019 if (cloned_ctx) {
11020 child_ctx->parent_ctx = cloned_ctx;
11021 child_ctx->parent_gen = parent_ctx->parent_gen;
11022 } else {
11023 child_ctx->parent_ctx = parent_ctx;
11024 child_ctx->parent_gen = parent_ctx->generation;
11025 }
11026 get_ctx(child_ctx->parent_ctx);
11027 }
11028
11029 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11030out_unlock:
11031 mutex_unlock(&parent_ctx->mutex);
11032
11033 perf_unpin_context(parent_ctx);
11034 put_ctx(parent_ctx);
11035
11036 return ret;
11037}
11038
11039
11040
11041
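/*
 * Initialize the perf_event state of a newly forked task: set up its
 * contexts and inherit the parent's inheritable events.
 */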
11042int perf_event_init_task(struct task_struct *child)
11043{
11044 int ctxn, ret;
11045
11046 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
11047 mutex_init(&child->perf_event_mutex);
11048 INIT_LIST_HEAD(&child->perf_event_list);
11049
11050 for_each_task_context_nr(ctxn) {
11051 ret = perf_event_init_context(child, ctxn);
11052 if (ret) {
11053 perf_event_free_task(child);
11054 return ret;
11055 }
11056 }
11057
11058 return 0;
11059}
11060
11061static void __init perf_event_init_all_cpus(void)
11062{
11063 struct swevent_htable *swhash;
11064 int cpu;
11065
11066 zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
11067
11068 for_each_possible_cpu(cpu) {
11069 swhash = &per_cpu(swevent_htable, cpu);
11070 mutex_init(&swhash->hlist_mutex);
11071 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
11072
11073 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
11074 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
11075
11076#ifdef CONFIG_CGROUP_PERF
11077 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
11078#endif
11079 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
11080 }
11081}
11082
11083void perf_swevent_init_cpu(unsigned int cpu)
11084{
11085 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
11086
11087 mutex_lock(&swhash->hlist_mutex);
11088 if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
11089 struct swevent_hlist *hlist;
11090
11091 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
11092 WARN_ON(!hlist);
11093 rcu_assign_pointer(swhash->swevent_hlist, hlist);
11094 }
11095 mutex_unlock(&swhash->hlist_mutex);
11096}
11097
11098#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
11099static void __perf_event_exit_context(void *__info)
11100{
11101 struct perf_event_context *ctx = __info;
11102 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
11103 struct perf_event *event;
11104
11105 raw_spin_lock(&ctx->lock);
11106 list_for_each_entry(event, &ctx->event_list, event_entry)
11107 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
11108 raw_spin_unlock(&ctx->lock);
11109}
11110
11111static void perf_event_exit_cpu_context(int cpu)
11112{
11113 struct perf_cpu_context *cpuctx;
11114 struct perf_event_context *ctx;
11115 struct pmu *pmu;
11116
11117 mutex_lock(&pmus_lock);
11118 list_for_each_entry(pmu, &pmus, entry) {
11119 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11120 ctx = &cpuctx->ctx;
11121
11122 mutex_lock(&ctx->mutex);
11123 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
11124 cpuctx->online = 0;
11125 mutex_unlock(&ctx->mutex);
11126 }
11127 cpumask_clear_cpu(cpu, perf_online_mask);
11128 mutex_unlock(&pmus_lock);
11129}
11130#else
11131
11132static void perf_event_exit_cpu_context(int cpu) { }
11133
11134#endif
11135
11136int perf_event_init_cpu(unsigned int cpu)
11137{
11138 struct perf_cpu_context *cpuctx;
11139 struct perf_event_context *ctx;
11140 struct pmu *pmu;
11141
11142 perf_swevent_init_cpu(cpu);
11143
11144 mutex_lock(&pmus_lock);
11145 cpumask_set_cpu(cpu, perf_online_mask);
11146 list_for_each_entry(pmu, &pmus, entry) {
11147 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11148 ctx = &cpuctx->ctx;
11149
11150 mutex_lock(&ctx->mutex);
11151 cpuctx->online = 1;
11152 mutex_unlock(&ctx->mutex);
11153 }
11154 mutex_unlock(&pmus_lock);
11155
11156 return 0;
11157}
11158
11159int perf_event_exit_cpu(unsigned int cpu)
11160{
11161 perf_event_exit_cpu_context(cpu);
11162 return 0;
11163}
11164
11165static int
11166perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
11167{
11168 int cpu;
11169
11170 for_each_online_cpu(cpu)
11171 perf_event_exit_cpu(cpu);
11172
11173 return NOTIFY_OK;
11174}
11175
11176
11177
11178
11179
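/*
 * Run the reboot notifier at the lowest priority so it fires as late
 * as possible during shutdown.
 */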
11180static struct notifier_block perf_reboot_notifier = {
11181 .notifier_call = perf_reboot,
11182 .priority = INT_MIN,
11183};
11184
11185void __init perf_event_init(void)
11186{
11187 int ret;
11188
11189 idr_init(&pmu_idr);
11190
11191 perf_event_init_all_cpus();
11192 init_srcu_struct(&pmus_srcu);
11193 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
11194 perf_pmu_register(&perf_cpu_clock, NULL, -1);
11195 perf_pmu_register(&perf_task_clock, NULL, -1);
11196 perf_tp_register();
11197 perf_event_init_cpu(smp_processor_id());
11198 register_reboot_notifier(&perf_reboot_notifier);
11199
11200 ret = init_hw_breakpoint();
11201 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
11202
11203
11204
11205
11206
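 /*
  * Build-time check that data_head sits at the expected offset (1024)
  * in struct perf_event_mmap_page.
  */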
11207 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
11208 != 1024);
11209}
11210
11211ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
11212 char *page)
11213{
11214 struct perf_pmu_events_attr *pmu_attr =
11215 container_of(attr, struct perf_pmu_events_attr, attr);
11216
11217 if (pmu_attr->event_str)
11218 return sprintf(page, "%s\n", pmu_attr->event_str);
11219
11220 return 0;
11221}
11222EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
11223
11224static int __init perf_event_sysfs_init(void)
11225{
11226 struct pmu *pmu;
11227 int ret;
11228
11229 mutex_lock(&pmus_lock);
11230
11231 ret = bus_register(&pmu_bus);
11232 if (ret)
11233 goto unlock;
11234
11235 list_for_each_entry(pmu, &pmus, entry) {
11236 if (!pmu->name || pmu->type < 0)
11237 continue;
11238
11239 ret = pmu_dev_alloc(pmu);
11240 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
11241 }
11242 pmu_bus_running = 1;
11243 ret = 0;
11244
11245unlock:
11246 mutex_unlock(&pmus_lock);
11247
11248 return ret;
11249}
11250device_initcall(perf_event_sysfs_init);
11251
11252#ifdef CONFIG_CGROUP_PERF
11253static struct cgroup_subsys_state *
11254perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
11255{
11256 struct perf_cgroup *jc;
11257
11258 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
11259 if (!jc)
11260 return ERR_PTR(-ENOMEM);
11261
11262 jc->info = alloc_percpu(struct perf_cgroup_info);
11263 if (!jc->info) {
11264 kfree(jc);
11265 return ERR_PTR(-ENOMEM);
11266 }
11267
11268 return &jc->css;
11269}
11270
11271static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
11272{
11273 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
11274
11275 free_percpu(jc->info);
11276 kfree(jc);
11277}
11278
11279static int __perf_cgroup_move(void *info)
11280{
11281 struct task_struct *task = info;
11282 rcu_read_lock();
11283 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
11284 rcu_read_unlock();
11285 return 0;
11286}
11287
11288static void perf_cgroup_attach(struct cgroup_taskset *tset)
11289{
11290 struct task_struct *task;
11291 struct cgroup_subsys_state *css;
11292
11293 cgroup_taskset_for_each(task, css, tset)
11294 task_function_call(task, __perf_cgroup_move, task);
11295}
11296
11297struct cgroup_subsys perf_event_cgrp_subsys = {
11298 .css_alloc = perf_cgroup_css_alloc,
11299 .css_free = perf_cgroup_css_free,
11300 .attach = perf_cgroup_attach,
11301
11302
11303
11304
11305
11306 .implicit_on_dfl = true,
11307 .threaded = true,
11308};
11309#endif
11310