/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright (C) 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/idr.h>
17#include <linux/file.h>
18#include <linux/poll.h>
19#include <linux/slab.h>
20#include <linux/hash.h>
21#include <linux/tick.h>
22#include <linux/sysfs.h>
23#include <linux/dcache.h>
24#include <linux/percpu.h>
25#include <linux/ptrace.h>
26#include <linux/reboot.h>
27#include <linux/vmstat.h>
28#include <linux/device.h>
29#include <linux/export.h>
30#include <linux/vmalloc.h>
31#include <linux/hardirq.h>
32#include <linux/rculist.h>
33#include <linux/uaccess.h>
34#include <linux/syscalls.h>
35#include <linux/anon_inodes.h>
36#include <linux/kernel_stat.h>
37#include <linux/cgroup.h>
38#include <linux/perf_event.h>
39#include <linux/trace_events.h>
40#include <linux/hw_breakpoint.h>
41#include <linux/mm_types.h>
42#include <linux/module.h>
43#include <linux/mman.h>
44#include <linux/compat.h>
45#include <linux/bpf.h>
46#include <linux/filter.h>
47#include <linux/namei.h>
48#include <linux/parser.h>
49#include <linux/sched/clock.h>
50#include <linux/sched/mm.h>
51#include <linux/proc_ns.h>
52#include <linux/mount.h>
53
54#include "internal.h"
55
56#include <asm/irq_regs.h>
57
58typedef int (*remote_function_f)(void *);
59
60struct remote_function_call {
61 struct task_struct *p;
62 remote_function_f func;
63 void *info;
64 int ret;
65};
66
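/*
 * IPI handler: run tfc->func(tfc->info) on this CPU.  If a target task was
 * given, only do so when that task is current here; otherwise leave the
 * caller's -EAGAIN in place (wrong CPU) or report -ESRCH (task not running)
 * so the caller can retry or give up.
 */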
67static void remote_function(void *data)
68{
69 struct remote_function_call *tfc = data;
70 struct task_struct *p = tfc->p;
71
72 if (p) {
73
74 if (task_cpu(p) != smp_processor_id())
75 return;
76
77
78
79
80
81
82 tfc->ret = -ESRCH;
83 if (p != current)
84 return;
85 }
86
87 tfc->ret = tfc->func(tfc->info);
88}
89
/**
 * task_function_call - call a function on the CPU a task is currently running on
 * @p: the task to evaluate
 * @func: the function to be called
 * @info: the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.  Retries
 * on any failure from smp_call_function_single(), such as when task_cpu(p)
 * changes concurrently.
 *
 * returns @func return value or -ESRCH when the process isn't running
 */
103static int
104task_function_call(struct task_struct *p, remote_function_f func, void *info)
105{
106 struct remote_function_call data = {
107 .p = p,
108 .func = func,
109 .info = info,
110 .ret = -EAGAIN,
111 };
112 int ret;
113
114 do {
115 ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
116 if (!ret)
117 ret = data.ret;
118 } while (ret == -EAGAIN);
119
120 return ret;
121}
122
/**
 * cpu_function_call - call a function on a given CPU
 * @cpu: target cpu to run the function on
 * @func: the function to be called
 * @info: the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns @func return value or -ENXIO when the cpu is offline
 */
132static int cpu_function_call(int cpu, remote_function_f func, void *info)
133{
134 struct remote_function_call data = {
135 .p = NULL,
136 .func = func,
137 .info = info,
138 .ret = -ENXIO,
139 };
140
141 smp_call_function_single(cpu, remote_function, &data, 1);
142
143 return data.ret;
144}
145
146static inline struct perf_cpu_context *
147__get_cpu_context(struct perf_event_context *ctx)
148{
149 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
150}
151
152static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
153 struct perf_event_context *ctx)
154{
155 raw_spin_lock(&cpuctx->ctx.lock);
156 if (ctx)
157 raw_spin_lock(&ctx->lock);
158}
159
160static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
161 struct perf_event_context *ctx)
162{
163 if (ctx)
164 raw_spin_unlock(&ctx->lock);
165 raw_spin_unlock(&cpuctx->ctx.lock);
166}
167
168#define TASK_TOMBSTONE ((void *)-1L)
169
170static bool is_kernel_event(struct perf_event *event)
171{
172 return READ_ONCE(event->owner) == TASK_TOMBSTONE;
173}
174
/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context().
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call();
 *    see perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */
194typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
195 struct perf_event_context *, void *);
196
197struct event_function_struct {
198 struct perf_event *event;
199 event_f func;
200 void *data;
201};
202
203static int event_function(void *info)
204{
205 struct event_function_struct *efs = info;
206 struct perf_event *event = efs->event;
207 struct perf_event_context *ctx = event->ctx;
208 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
209 struct perf_event_context *task_ctx = cpuctx->task_ctx;
210 int ret = 0;
211
212 WARN_ON_ONCE(!irqs_disabled());
213
214 perf_ctx_lock(cpuctx, task_ctx);
 /*
  * Since we do the IPI call without holding ctx->lock, things can have
  * changed; double check we hit the task we set out to hit.
  */
219 if (ctx->task) {
220 if (ctx->task != current) {
221 ret = -ESRCH;
222 goto unlock;
223 }
 /*
  * We only use event_function_call() on established contexts,
  * and event_function() is only ever called when active (or
  * rather, we'll have bailed in task_function_call() or the
  * above ctx->task != current test), therefore we must have
  * ctx->is_active here.
  */
232 WARN_ON_ONCE(!ctx->is_active);
 /*
  * And since we have ctx->is_active, cpuctx->task_ctx must
  * match.
  */
237 WARN_ON_ONCE(task_ctx != ctx);
238 } else {
239 WARN_ON_ONCE(&cpuctx->ctx != ctx);
240 }
241
242 efs->func(event, cpuctx, ctx, efs->data);
243unlock:
244 perf_ctx_unlock(cpuctx, task_ctx);
245
246 return ret;
247}
248
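/*
 * Invoke @func with @event's context locked, either via IPI on the CPU/task
 * that currently owns the context or, if the task context is inactive,
 * directly under ctx->lock; retries when the task gets scheduled in between
 * the checks.
 */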
249static void event_function_call(struct perf_event *event, event_f func, void *data)
250{
251 struct perf_event_context *ctx = event->ctx;
252 struct task_struct *task = READ_ONCE(ctx->task);
253 struct event_function_struct efs = {
254 .event = event,
255 .func = func,
256 .data = data,
257 };
258
259 if (!event->parent) {
 /*
  * If this is a !child event, we must hold ctx::mutex to
  * stabilize the event->ctx relation. See
  * perf_event_ctx_lock().
  */
265 lockdep_assert_held(&ctx->mutex);
266 }
267
268 if (!task) {
269 cpu_function_call(event->cpu, event_function, &efs);
270 return;
271 }
272
273 if (task == TASK_TOMBSTONE)
274 return;
275
276again:
277 if (!task_function_call(task, event_function, &efs))
278 return;
279
280 raw_spin_lock_irq(&ctx->lock);
 /*
  * Reload the task pointer, it might have been changed by
  * a concurrent perf_event_context_sched_out().
  */
285 task = ctx->task;
286 if (task == TASK_TOMBSTONE) {
287 raw_spin_unlock_irq(&ctx->lock);
288 return;
289 }
290 if (ctx->is_active) {
291 raw_spin_unlock_irq(&ctx->lock);
292 goto again;
293 }
294 func(event, NULL, ctx, data);
295 raw_spin_unlock_irq(&ctx->lock);
296}
297
/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 */
302static void event_function_local(struct perf_event *event, event_f func, void *data)
303{
304 struct perf_event_context *ctx = event->ctx;
305 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
306 struct task_struct *task = READ_ONCE(ctx->task);
307 struct perf_event_context *task_ctx = NULL;
308
309 WARN_ON_ONCE(!irqs_disabled());
310
311 if (task) {
312 if (task == TASK_TOMBSTONE)
313 return;
314
315 task_ctx = ctx;
316 }
317
318 perf_ctx_lock(cpuctx, task_ctx);
319
320 task = ctx->task;
321 if (task == TASK_TOMBSTONE)
322 goto unlock;
323
324 if (task) {
325
326
327
328
329
330 if (ctx->is_active) {
331 if (WARN_ON_ONCE(task != current))
332 goto unlock;
333
334 if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
335 goto unlock;
336 }
337 } else {
338 WARN_ON_ONCE(&cpuctx->ctx != ctx);
339 }
340
341 func(event, cpuctx, ctx, data);
342unlock:
343 perf_ctx_unlock(cpuctx, task_ctx);
344}
345
346#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
347 PERF_FLAG_FD_OUTPUT |\
348 PERF_FLAG_PID_CGROUP |\
349 PERF_FLAG_FD_CLOEXEC)
350
351
352
353
354#define PERF_SAMPLE_BRANCH_PERM_PLM \
355 (PERF_SAMPLE_BRANCH_KERNEL |\
356 PERF_SAMPLE_BRANCH_HV)
357
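/*
 * Which classes of events a (re)schedule operation acts on; EVENT_CPU marks
 * CPU-context (as opposed to task-context) events, see ctx_resched().
 */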
358enum event_type_t {
359 EVENT_FLEXIBLE = 0x1,
360 EVENT_PINNED = 0x2,
361 EVENT_TIME = 0x4,
362
363 EVENT_CPU = 0x8,
364 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
365};
366
/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */

372static void perf_sched_delayed(struct work_struct *work);
373DEFINE_STATIC_KEY_FALSE(perf_sched_events);
374static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
375static DEFINE_MUTEX(perf_sched_mutex);
376static atomic_t perf_sched_count;
377
378static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
379static DEFINE_PER_CPU(int, perf_sched_cb_usages);
380static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
381
382static atomic_t nr_mmap_events __read_mostly;
383static atomic_t nr_comm_events __read_mostly;
384static atomic_t nr_namespaces_events __read_mostly;
385static atomic_t nr_task_events __read_mostly;
386static atomic_t nr_freq_events __read_mostly;
387static atomic_t nr_switch_events __read_mostly;
388
389static LIST_HEAD(pmus);
390static DEFINE_MUTEX(pmus_lock);
391static struct srcu_struct pmus_srcu;
392static cpumask_var_t perf_online_mask;
393
/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
401int sysctl_perf_event_paranoid __read_mostly = 2;
402
/* Minimum for 512 kiB + 1 user control page */
404int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
405
/*
 * max perf event sample rate
 */
409#define DEFAULT_MAX_SAMPLE_RATE 100000
410#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
411#define DEFAULT_CPU_TIME_MAX_PERCENT 25
412
413int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
414
415static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
416static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
417
418static int perf_sample_allowed_ns __read_mostly =
419 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
420
421static void update_perf_cpu_limits(void)
422{
423 u64 tmp = perf_sample_period_ns;
424
425 tmp *= sysctl_perf_cpu_time_max_percent;
426 tmp = div_u64(tmp, 100);
427 if (!tmp)
428 tmp = 1;
429
430 WRITE_ONCE(perf_sample_allowed_ns, tmp);
431}
432
433static int perf_rotate_context(struct perf_cpu_context *cpuctx);
434
435int perf_proc_update_handler(struct ctl_table *table, int write,
436 void __user *buffer, size_t *lenp,
437 loff_t *ppos)
438{
439 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
440
441 if (ret || !write)
442 return ret;
443
 /*
  * If throttling is disabled don't allow the write:
  */
447 if (sysctl_perf_cpu_time_max_percent == 100 ||
448 sysctl_perf_cpu_time_max_percent == 0)
449 return -EINVAL;
450
451 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
452 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
453 update_perf_cpu_limits();
454
455 return 0;
456}
457
458int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
459
460int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
461 void __user *buffer, size_t *lenp,
462 loff_t *ppos)
463{
464 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
465
466 if (ret || !write)
467 return ret;
468
469 if (sysctl_perf_cpu_time_max_percent == 100 ||
470 sysctl_perf_cpu_time_max_percent == 0) {
471 printk(KERN_WARNING
472 "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
473 WRITE_ONCE(perf_sample_allowed_ns, 0);
474 } else {
475 update_perf_cpu_limits();
476 }
477
478 return 0;
479}
480
/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
487#define NR_ACCUMULATED_SAMPLES 128
488static DEFINE_PER_CPU(u64, running_sample_length);
489
490static u64 __report_avg;
491static u64 __report_allowed;
492
493static void perf_duration_warn(struct irq_work *w)
494{
495 printk_ratelimited(KERN_INFO
496 "perf: interrupt took too long (%lld > %lld), lowering "
497 "kernel.perf_event_max_sample_rate to %d\n",
498 __report_avg, __report_allowed,
499 sysctl_perf_event_sample_rate);
500}
501
502static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
503
504void perf_sample_event_took(u64 sample_len_ns)
505{
506 u64 max_len = READ_ONCE(perf_sample_allowed_ns);
507 u64 running_len;
508 u64 avg_len;
509 u32 max;
510
511 if (max_len == 0)
512 return;
513
514
515 running_len = __this_cpu_read(running_sample_length);
516 running_len -= running_len/NR_ACCUMULATED_SAMPLES;
517 running_len += sample_len_ns;
518 __this_cpu_write(running_sample_length, running_len);
519
 /*
  * Note: this average will be biased artificially low until
  * NR_ACCUMULATED_SAMPLES samples have been seen, which keeps us
  * from having to maintain an explicit count.
  */
525 avg_len = running_len/NR_ACCUMULATED_SAMPLES;
526 if (avg_len <= max_len)
527 return;
528
529 __report_avg = avg_len;
530 __report_allowed = max_len;
531
 /*
  * Back off: allow 25% above the observed average sample length and
  * recompute how many samples fit in a tick at that rate.
  */
535 avg_len += avg_len / 4;
536 max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
537 if (avg_len < max)
538 max /= (u32)avg_len;
539 else
540 max = 1;
541
542 WRITE_ONCE(perf_sample_allowed_ns, avg_len);
543 WRITE_ONCE(max_samples_per_tick, max);
544
545 sysctl_perf_event_sample_rate = max * HZ;
546 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
547
548 if (!irq_work_queue(&perf_duration_work)) {
549 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
550 "kernel.perf_event_max_sample_rate to %d\n",
551 __report_avg, __report_allowed,
552 sysctl_perf_event_sample_rate);
553 }
554}
555
556static atomic64_t perf_event_id;
557
558static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
559 enum event_type_t event_type);
560
561static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
562 enum event_type_t event_type,
563 struct task_struct *task);
564
565static void update_context_time(struct perf_event_context *ctx);
566static u64 perf_event_time(struct perf_event *event);
567
568void __weak perf_event_print_debug(void) { }
569
570extern __weak const char *perf_pmu_name(void)
571{
572 return "pmu";
573}
574
575static inline u64 perf_clock(void)
576{
577 return local_clock();
578}
579
580static inline u64 perf_event_clock(struct perf_event *event)
581{
582 return event->clock();
583}
584
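/*
 * Cgroup events: such events only count while a task of the monitored cgroup
 * (or one of its descendants) is running on the event's CPU.
 */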
585#ifdef CONFIG_CGROUP_PERF
586
587static inline bool
588perf_cgroup_match(struct perf_event *event)
589{
590 struct perf_event_context *ctx = event->ctx;
591 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
592
593
594 if (!event->cgrp)
595 return true;
596
597
598 if (!cpuctx->cgrp)
599 return false;
600
601
602
603
604
605
606
607 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
608 event->cgrp->css.cgroup);
609}
610
611static inline void perf_detach_cgroup(struct perf_event *event)
612{
613 css_put(&event->cgrp->css);
614 event->cgrp = NULL;
615}
616
617static inline int is_cgroup_event(struct perf_event *event)
618{
619 return event->cgrp != NULL;
620}
621
622static inline u64 perf_cgroup_event_time(struct perf_event *event)
623{
624 struct perf_cgroup_info *t;
625
626 t = per_cpu_ptr(event->cgrp->info, event->cpu);
627 return t->time;
628}
629
630static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
631{
632 struct perf_cgroup_info *info;
633 u64 now;
634
635 now = perf_clock();
636
637 info = this_cpu_ptr(cgrp->info);
638
639 info->time += now - info->timestamp;
640 info->timestamp = now;
641}
642
643static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
644{
645 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
646 if (cgrp_out)
647 __update_cgrp_time(cgrp_out);
648}
649
650static inline void update_cgrp_time_from_event(struct perf_event *event)
651{
652 struct perf_cgroup *cgrp;
653
654
655
656
657
658 if (!is_cgroup_event(event))
659 return;
660
661 cgrp = perf_cgroup_from_task(current, event->ctx);
662
663
664
665 if (cgrp == event->cgrp)
666 __update_cgrp_time(event->cgrp);
667}
668
669static inline void
670perf_cgroup_set_timestamp(struct task_struct *task,
671 struct perf_event_context *ctx)
672{
673 struct perf_cgroup *cgrp;
674 struct perf_cgroup_info *info;
675
676
677
678
679
680
681 if (!task || !ctx->nr_cgroups)
682 return;
683
684 cgrp = perf_cgroup_from_task(task, ctx);
685 info = this_cpu_ptr(cgrp->info);
686 info->timestamp = ctx->timestamp;
687}
688
689static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
690
691#define PERF_CGROUP_SWOUT 0x1
692#define PERF_CGROUP_SWIN 0x2
693
/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN  : schedule in based on cgroup for next
 */
700static void perf_cgroup_switch(struct task_struct *task, int mode)
701{
702 struct perf_cpu_context *cpuctx;
703 struct list_head *list;
704 unsigned long flags;
705
706
707
708
709
710 local_irq_save(flags);
711
712 list = this_cpu_ptr(&cgrp_cpuctx_list);
713 list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
714 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
715
716 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
717 perf_pmu_disable(cpuctx->ctx.pmu);
718
719 if (mode & PERF_CGROUP_SWOUT) {
720 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
721
722
723
724
725 cpuctx->cgrp = NULL;
726 }
727
728 if (mode & PERF_CGROUP_SWIN) {
729 WARN_ON_ONCE(cpuctx->cgrp);
730
731
732
733
734
735
736
737 cpuctx->cgrp = perf_cgroup_from_task(task,
738 &cpuctx->ctx);
739 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
740 }
741 perf_pmu_enable(cpuctx->ctx.pmu);
742 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
743 }
744
745 local_irq_restore(flags);
746}
747
748static inline void perf_cgroup_sched_out(struct task_struct *task,
749 struct task_struct *next)
750{
751 struct perf_cgroup *cgrp1;
752 struct perf_cgroup *cgrp2 = NULL;
753
754 rcu_read_lock();
755
756
757
758
759
760 cgrp1 = perf_cgroup_from_task(task, NULL);
761 cgrp2 = perf_cgroup_from_task(next, NULL);
762
763
764
765
766
767
768 if (cgrp1 != cgrp2)
769 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
770
771 rcu_read_unlock();
772}
773
774static inline void perf_cgroup_sched_in(struct task_struct *prev,
775 struct task_struct *task)
776{
777 struct perf_cgroup *cgrp1;
778 struct perf_cgroup *cgrp2 = NULL;
779
780 rcu_read_lock();
781
782
783
784
785
786 cgrp1 = perf_cgroup_from_task(task, NULL);
787 cgrp2 = perf_cgroup_from_task(prev, NULL);
788
789
790
791
792
793
794 if (cgrp1 != cgrp2)
795 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
796
797 rcu_read_unlock();
798}
799
800static inline int perf_cgroup_connect(int fd, struct perf_event *event,
801 struct perf_event_attr *attr,
802 struct perf_event *group_leader)
803{
804 struct perf_cgroup *cgrp;
805 struct cgroup_subsys_state *css;
806 struct fd f = fdget(fd);
807 int ret = 0;
808
809 if (!f.file)
810 return -EBADF;
811
812 css = css_tryget_online_from_dir(f.file->f_path.dentry,
813 &perf_event_cgrp_subsys);
814 if (IS_ERR(css)) {
815 ret = PTR_ERR(css);
816 goto out;
817 }
818
819 cgrp = container_of(css, struct perf_cgroup, css);
820 event->cgrp = cgrp;
821
822
823
824
825
826
827 if (group_leader && group_leader->cgrp != cgrp) {
828 perf_detach_cgroup(event);
829 ret = -EINVAL;
830 }
831out:
832 fdput(f);
833 return ret;
834}
835
836static inline void
837perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
838{
839 struct perf_cgroup_info *t;
840 t = per_cpu_ptr(event->cgrp->info, event->cpu);
841 event->shadow_ctx_time = now - t->timestamp;
842}
843
844static inline void
845perf_cgroup_defer_enabled(struct perf_event *event)
846{
847
848
849
850
851
852
853 if (is_cgroup_event(event) && !perf_cgroup_match(event))
854 event->cgrp_defer_enabled = 1;
855}
856
857static inline void
858perf_cgroup_mark_enabled(struct perf_event *event,
859 struct perf_event_context *ctx)
860{
861 struct perf_event *sub;
862 u64 tstamp = perf_event_time(event);
863
864 if (!event->cgrp_defer_enabled)
865 return;
866
867 event->cgrp_defer_enabled = 0;
868
869 event->tstamp_enabled = tstamp - event->total_time_enabled;
870 list_for_each_entry(sub, &event->sibling_list, group_entry) {
871 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
872 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
873 sub->cgrp_defer_enabled = 0;
874 }
875 }
876}
877
878
879
880
881
882static inline void
883list_update_cgroup_event(struct perf_event *event,
884 struct perf_event_context *ctx, bool add)
885{
886 struct perf_cpu_context *cpuctx;
887 struct list_head *cpuctx_entry;
888
889 if (!is_cgroup_event(event))
890 return;
891
892 if (add && ctx->nr_cgroups++)
893 return;
894 else if (!add && --ctx->nr_cgroups)
895 return;
896
897
898
899
900 cpuctx = __get_cpu_context(ctx);
901 cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
902
903 if (add) {
904 list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
905 if (perf_cgroup_from_task(current, ctx) == event->cgrp)
906 cpuctx->cgrp = event->cgrp;
907 } else {
908 list_del(cpuctx_entry);
909 cpuctx->cgrp = NULL;
910 }
911}
912
913#else
914
915static inline bool
916perf_cgroup_match(struct perf_event *event)
917{
918 return true;
919}
920
921static inline void perf_detach_cgroup(struct perf_event *event)
922{}
923
924static inline int is_cgroup_event(struct perf_event *event)
925{
926 return 0;
927}
928
929static inline void update_cgrp_time_from_event(struct perf_event *event)
930{
931}
932
933static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
934{
935}
936
937static inline void perf_cgroup_sched_out(struct task_struct *task,
938 struct task_struct *next)
939{
940}
941
942static inline void perf_cgroup_sched_in(struct task_struct *prev,
943 struct task_struct *task)
944{
945}
946
947static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
948 struct perf_event_attr *attr,
949 struct perf_event *group_leader)
950{
951 return -EINVAL;
952}
953
954static inline void
955perf_cgroup_set_timestamp(struct task_struct *task,
956 struct perf_event_context *ctx)
957{
958}
959
960void
961perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
962{
963}
964
965static inline void
966perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
967{
968}
969
970static inline u64 perf_cgroup_event_time(struct perf_event *event)
971{
972 return 0;
973}
974
975static inline void
976perf_cgroup_defer_enabled(struct perf_event *event)
977{
978}
979
980static inline void
981perf_cgroup_mark_enabled(struct perf_event *event,
982 struct perf_event_context *ctx)
983{
984}
985
986static inline void
987list_update_cgroup_event(struct perf_event *event,
988 struct perf_event_context *ctx, bool add)
989{
990}
991
992#endif
993
/*
 * Default multiplexing interval: dependent on the timer tick, just like the
 * original per-tick rotation code.
 */
998#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * This function must be called with interrupts disabled.
 */
1002static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1003{
1004 struct perf_cpu_context *cpuctx;
1005 int rotations = 0;
1006
1007 WARN_ON(!irqs_disabled());
1008
1009 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1010 rotations = perf_rotate_context(cpuctx);
1011
1012 raw_spin_lock(&cpuctx->hrtimer_lock);
1013 if (rotations)
1014 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1015 else
1016 cpuctx->hrtimer_active = 0;
1017 raw_spin_unlock(&cpuctx->hrtimer_lock);
1018
1019 return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1020}
1021
1022static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1023{
1024 struct hrtimer *timer = &cpuctx->hrtimer;
1025 struct pmu *pmu = cpuctx->ctx.pmu;
1026 u64 interval;
1027
1028
1029 if (pmu->task_ctx_nr == perf_sw_context)
1030 return;
1031
1032
1033
1034
1035
1036 interval = pmu->hrtimer_interval_ms;
1037 if (interval < 1)
1038 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1039
1040 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1041
1042 raw_spin_lock_init(&cpuctx->hrtimer_lock);
1043 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1044 timer->function = perf_mux_hrtimer_handler;
1045}
1046
1047static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1048{
1049 struct hrtimer *timer = &cpuctx->hrtimer;
1050 struct pmu *pmu = cpuctx->ctx.pmu;
1051 unsigned long flags;
1052
1053
1054 if (pmu->task_ctx_nr == perf_sw_context)
1055 return 0;
1056
1057 raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1058 if (!cpuctx->hrtimer_active) {
1059 cpuctx->hrtimer_active = 1;
1060 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1061 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1062 }
1063 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1064
1065 return 0;
1066}
1067
1068void perf_pmu_disable(struct pmu *pmu)
1069{
1070 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1071 if (!(*count)++)
1072 pmu->pmu_disable(pmu);
1073}
1074
1075void perf_pmu_enable(struct pmu *pmu)
1076{
1077 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1078 if (!--(*count))
1079 pmu->pmu_enable(pmu);
1080}
1081
1082static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1083
/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate() and
 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx_{activate,deactivate} are called with IRQs
 * disabled, while perf_event_task_tick() is called from IRQ context.
 */
1090static void perf_event_ctx_activate(struct perf_event_context *ctx)
1091{
1092 struct list_head *head = this_cpu_ptr(&active_ctx_list);
1093
1094 WARN_ON(!irqs_disabled());
1095
1096 WARN_ON(!list_empty(&ctx->active_ctx_list));
1097
1098 list_add(&ctx->active_ctx_list, head);
1099}
1100
1101static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1102{
1103 WARN_ON(!irqs_disabled());
1104
1105 WARN_ON(list_empty(&ctx->active_ctx_list));
1106
1107 list_del_init(&ctx->active_ctx_list);
1108}
1109
1110static void get_ctx(struct perf_event_context *ctx)
1111{
1112 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1113}
1114
1115static void free_ctx(struct rcu_head *head)
1116{
1117 struct perf_event_context *ctx;
1118
1119 ctx = container_of(head, struct perf_event_context, rcu_head);
1120 kfree(ctx->task_ctx_data);
1121 kfree(ctx);
1122}
1123
1124static void put_ctx(struct perf_event_context *ctx)
1125{
1126 if (atomic_dec_and_test(&ctx->refcount)) {
1127 if (ctx->parent_ctx)
1128 put_ctx(ctx->parent_ctx);
1129 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1130 put_task_struct(ctx->task);
1131 call_rcu(&ctx->rcu_head, free_ctx);
1132 }
1133}
1134
/*
 * Lock order:
 *
 *    cred_guard_mutex
 *      task_struct::perf_event_mutex
 *        perf_event_context::mutex
 *          perf_event::child_mutex
 *            perf_event_context::lock
 *          perf_event::mmap_mutex
 *          mmap_sem
 *
 * perf_event_context::mutex serializes installation, removal and
 * enabling/disabling of events against each other and against
 * perf_event::ctx changing under us (move_group, inherited-event teardown).
 * Because event->ctx can change, holders must re-check event->ctx after
 * acquiring the mutex, which is what perf_event_ctx_lock_nested() below does.
 */
1196static struct perf_event_context *
1197perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1198{
1199 struct perf_event_context *ctx;
1200
1201again:
1202 rcu_read_lock();
1203 ctx = ACCESS_ONCE(event->ctx);
1204 if (!atomic_inc_not_zero(&ctx->refcount)) {
1205 rcu_read_unlock();
1206 goto again;
1207 }
1208 rcu_read_unlock();
1209
1210 mutex_lock_nested(&ctx->mutex, nesting);
1211 if (event->ctx != ctx) {
1212 mutex_unlock(&ctx->mutex);
1213 put_ctx(ctx);
1214 goto again;
1215 }
1216
1217 return ctx;
1218}
1219
1220static inline struct perf_event_context *
1221perf_event_ctx_lock(struct perf_event *event)
1222{
1223 return perf_event_ctx_lock_nested(event, 0);
1224}
1225
1226static void perf_event_ctx_unlock(struct perf_event *event,
1227 struct perf_event_context *ctx)
1228{
1229 mutex_unlock(&ctx->mutex);
1230 put_ctx(ctx);
1231}
1232
/*
 * This must be done under ctx->lock, to serialize against context_equiv();
 * we therefore cannot call put_ctx() here, since that might end up taking
 * scheduler related locks which ctx->lock nests inside.  The parent context
 * is handed back to the caller to drop instead.
 */
1238static __must_check struct perf_event_context *
1239unclone_ctx(struct perf_event_context *ctx)
1240{
1241 struct perf_event_context *parent_ctx = ctx->parent_ctx;
1242
1243 lockdep_assert_held(&ctx->lock);
1244
1245 if (parent_ctx)
1246 ctx->parent_ctx = NULL;
1247 ctx->generation++;
1248
1249 return parent_ctx;
1250}
1251
1252static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1253{
1254
1255
1256
1257 if (event->parent)
1258 event = event->parent;
1259
1260 return task_tgid_nr_ns(p, event->ns);
1261}
1262
1263static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1264{
1265
1266
1267
1268 if (event->parent)
1269 event = event->parent;
1270
1271 return task_pid_nr_ns(p, event->ns);
1272}
1273
/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
1278static u64 primary_event_id(struct perf_event *event)
1279{
1280 u64 id = event->id;
1281
1282 if (event->parent)
1283 id = event->parent->id;
1284
1285 return id;
1286}
1287
/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
1294static struct perf_event_context *
1295perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1296{
1297 struct perf_event_context *ctx;
1298
1299retry:
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309 local_irq_save(*flags);
1310 rcu_read_lock();
1311 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1312 if (ctx) {
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323 raw_spin_lock(&ctx->lock);
1324 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1325 raw_spin_unlock(&ctx->lock);
1326 rcu_read_unlock();
1327 local_irq_restore(*flags);
1328 goto retry;
1329 }
1330
1331 if (ctx->task == TASK_TOMBSTONE ||
1332 !atomic_inc_not_zero(&ctx->refcount)) {
1333 raw_spin_unlock(&ctx->lock);
1334 ctx = NULL;
1335 } else {
1336 WARN_ON_ONCE(ctx->task != task);
1337 }
1338 }
1339 rcu_read_unlock();
1340 if (!ctx)
1341 local_irq_restore(*flags);
1342 return ctx;
1343}
1344
/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
1350static struct perf_event_context *
1351perf_pin_task_context(struct task_struct *task, int ctxn)
1352{
1353 struct perf_event_context *ctx;
1354 unsigned long flags;
1355
1356 ctx = perf_lock_task_context(task, ctxn, &flags);
1357 if (ctx) {
1358 ++ctx->pin_count;
1359 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1360 }
1361 return ctx;
1362}
1363
1364static void perf_unpin_context(struct perf_event_context *ctx)
1365{
1366 unsigned long flags;
1367
1368 raw_spin_lock_irqsave(&ctx->lock, flags);
1369 --ctx->pin_count;
1370 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1371}
1372
/*
 * Update the record of the current time in a context.
 */
1376static void update_context_time(struct perf_event_context *ctx)
1377{
1378 u64 now = perf_clock();
1379
1380 ctx->time += now - ctx->timestamp;
1381 ctx->timestamp = now;
1382}
1383
1384static u64 perf_event_time(struct perf_event *event)
1385{
1386 struct perf_event_context *ctx = event->ctx;
1387
1388 if (is_cgroup_event(event))
1389 return perf_cgroup_event_time(event);
1390
1391 return ctx ? ctx->time : 0;
1392}
1393
/*
 * Update the total_time_enabled and total_time_running fields for an event.
 */
1397static void update_event_times(struct perf_event *event)
1398{
1399 struct perf_event_context *ctx = event->ctx;
1400 u64 run_end;
1401
1402 lockdep_assert_held(&ctx->lock);
1403
1404 if (event->state < PERF_EVENT_STATE_INACTIVE ||
1405 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1406 return;
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418 if (is_cgroup_event(event))
1419 run_end = perf_cgroup_event_time(event);
1420 else if (ctx->is_active)
1421 run_end = ctx->time;
1422 else
1423 run_end = event->tstamp_stopped;
1424
1425 event->total_time_enabled = run_end - event->tstamp_enabled;
1426
1427 if (event->state == PERF_EVENT_STATE_INACTIVE)
1428 run_end = event->tstamp_stopped;
1429 else
1430 run_end = perf_event_time(event);
1431
1432 event->total_time_running = run_end - event->tstamp_running;
1433
1434}
1435
/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
1439static void update_group_times(struct perf_event *leader)
1440{
1441 struct perf_event *event;
1442
1443 update_event_times(leader);
1444 list_for_each_entry(event, &leader->sibling_list, group_entry)
1445 update_event_times(event);
1446}
1447
1448static enum event_type_t get_event_type(struct perf_event *event)
1449{
1450 struct perf_event_context *ctx = event->ctx;
1451 enum event_type_t event_type;
1452
1453 lockdep_assert_held(&ctx->lock);
1454
1455
1456
1457
1458
1459 if (event->group_leader != event)
1460 event = event->group_leader;
1461
1462 event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1463 if (!ctx->task)
1464 event_type |= EVENT_CPU;
1465
1466 return event_type;
1467}
1468
1469static struct list_head *
1470ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1471{
1472 if (event->attr.pinned)
1473 return &ctx->pinned_groups;
1474 else
1475 return &ctx->flexible_groups;
1476}
1477
/*
 * Add an event to the lists for its context.
 * Always called with the ctx->lock held.
 */
1482static void
1483list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1484{
1485 lockdep_assert_held(&ctx->lock);
1486
1487 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1488 event->attach_state |= PERF_ATTACH_CONTEXT;
1489
1490
1491
1492
1493
1494
1495 if (event->group_leader == event) {
1496 struct list_head *list;
1497
1498 event->group_caps = event->event_caps;
1499
1500 list = ctx_group_list(event, ctx);
1501 list_add_tail(&event->group_entry, list);
1502 }
1503
1504 list_update_cgroup_event(event, ctx, true);
1505
1506 list_add_rcu(&event->event_entry, &ctx->event_list);
1507 ctx->nr_events++;
1508 if (event->attr.inherit_stat)
1509 ctx->nr_stat++;
1510
1511 ctx->generation++;
1512}
1513
1514
1515
1516
1517static inline void perf_event__state_init(struct perf_event *event)
1518{
1519 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1520 PERF_EVENT_STATE_INACTIVE;
1521}
1522
1523static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1524{
1525 int entry = sizeof(u64);
1526 int size = 0;
1527 int nr = 1;
1528
1529 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1530 size += sizeof(u64);
1531
1532 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1533 size += sizeof(u64);
1534
1535 if (event->attr.read_format & PERF_FORMAT_ID)
1536 entry += sizeof(u64);
1537
1538 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1539 nr += nr_siblings;
1540 size += sizeof(u64);
1541 }
1542
1543 size += entry * nr;
1544 event->read_size = size;
1545}
1546
1547static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1548{
1549 struct perf_sample_data *data;
1550 u16 size = 0;
1551
1552 if (sample_type & PERF_SAMPLE_IP)
1553 size += sizeof(data->ip);
1554
1555 if (sample_type & PERF_SAMPLE_ADDR)
1556 size += sizeof(data->addr);
1557
1558 if (sample_type & PERF_SAMPLE_PERIOD)
1559 size += sizeof(data->period);
1560
1561 if (sample_type & PERF_SAMPLE_WEIGHT)
1562 size += sizeof(data->weight);
1563
1564 if (sample_type & PERF_SAMPLE_READ)
1565 size += event->read_size;
1566
1567 if (sample_type & PERF_SAMPLE_DATA_SRC)
1568 size += sizeof(data->data_src.val);
1569
1570 if (sample_type & PERF_SAMPLE_TRANSACTION)
1571 size += sizeof(data->txn);
1572
1573 event->header_size = size;
1574}
1575
1576
1577
1578
1579
1580static void perf_event__header_size(struct perf_event *event)
1581{
1582 __perf_event_read_size(event,
1583 event->group_leader->nr_siblings);
1584 __perf_event_header_size(event, event->attr.sample_type);
1585}
1586
1587static void perf_event__id_header_size(struct perf_event *event)
1588{
1589 struct perf_sample_data *data;
1590 u64 sample_type = event->attr.sample_type;
1591 u16 size = 0;
1592
1593 if (sample_type & PERF_SAMPLE_TID)
1594 size += sizeof(data->tid_entry);
1595
1596 if (sample_type & PERF_SAMPLE_TIME)
1597 size += sizeof(data->time);
1598
1599 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1600 size += sizeof(data->id);
1601
1602 if (sample_type & PERF_SAMPLE_ID)
1603 size += sizeof(data->id);
1604
1605 if (sample_type & PERF_SAMPLE_STREAM_ID)
1606 size += sizeof(data->stream_id);
1607
1608 if (sample_type & PERF_SAMPLE_CPU)
1609 size += sizeof(data->cpu_entry);
1610
1611 event->id_header_size = size;
1612}
1613
1614static bool perf_event_validate_size(struct perf_event *event)
1615{
 /*
  * The values computed here will be over-written when we actually
  * attach the event.
  */
1620 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1621 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1622 perf_event__id_header_size(event);
1623
 /*
  * Sum the lot; should not exceed the 64k limit we have on records.
  * Conservative limit to allow for callchains and other variable fields.
  */
1628 if (event->read_size + event->header_size +
1629 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1630 return false;
1631
1632 return true;
1633}
1634
1635static void perf_group_attach(struct perf_event *event)
1636{
1637 struct perf_event *group_leader = event->group_leader, *pos;
1638
1639 lockdep_assert_held(&event->ctx->lock);
1640
1641
1642
1643
1644 if (event->attach_state & PERF_ATTACH_GROUP)
1645 return;
1646
1647 event->attach_state |= PERF_ATTACH_GROUP;
1648
1649 if (group_leader == event)
1650 return;
1651
1652 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1653
1654 group_leader->group_caps &= event->event_caps;
1655
1656 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1657 group_leader->nr_siblings++;
1658
1659 perf_event__header_size(group_leader);
1660
1661 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1662 perf_event__header_size(pos);
1663}
1664
/*
 * Remove an event from the lists for its context.
 * Always called with the ctx->lock held.
 */
1669static void
1670list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1671{
1672 WARN_ON_ONCE(event->ctx != ctx);
1673 lockdep_assert_held(&ctx->lock);
1674
1675
1676
1677
1678 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1679 return;
1680
1681 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1682
1683 list_update_cgroup_event(event, ctx, false);
1684
1685 ctx->nr_events--;
1686 if (event->attr.inherit_stat)
1687 ctx->nr_stat--;
1688
1689 list_del_rcu(&event->event_entry);
1690
1691 if (event->group_leader == event)
1692 list_del_init(&event->group_entry);
1693
1694 update_group_times(event);
1695
1696
1697
1698
1699
1700
1701
1702
1703 if (event->state > PERF_EVENT_STATE_OFF)
1704 event->state = PERF_EVENT_STATE_OFF;
1705
1706 ctx->generation++;
1707}
1708
1709static void perf_group_detach(struct perf_event *event)
1710{
1711 struct perf_event *sibling, *tmp;
1712 struct list_head *list = NULL;
1713
1714 lockdep_assert_held(&event->ctx->lock);
1715
1716
1717
1718
1719 if (!(event->attach_state & PERF_ATTACH_GROUP))
1720 return;
1721
1722 event->attach_state &= ~PERF_ATTACH_GROUP;
1723
1724
1725
1726
1727 if (event->group_leader != event) {
1728 list_del_init(&event->group_entry);
1729 event->group_leader->nr_siblings--;
1730 goto out;
1731 }
1732
1733 if (!list_empty(&event->group_entry))
1734 list = &event->group_entry;
1735
1736
1737
1738
1739
1740
1741 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1742 if (list)
1743 list_move_tail(&sibling->group_entry, list);
1744 sibling->group_leader = sibling;
1745
1746
1747 sibling->group_caps = event->group_caps;
1748
1749 WARN_ON_ONCE(sibling->ctx != event->ctx);
1750 }
1751
1752out:
1753 perf_event__header_size(event->group_leader);
1754
1755 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1756 perf_event__header_size(tmp);
1757}
1758
1759static bool is_orphaned_event(struct perf_event *event)
1760{
1761 return event->state == PERF_EVENT_STATE_DEAD;
1762}
1763
1764static inline int __pmu_filter_match(struct perf_event *event)
1765{
1766 struct pmu *pmu = event->pmu;
1767 return pmu->filter_match ? pmu->filter_match(event) : 1;
1768}
1769
/*
 * Check whether we should attempt to schedule an event group based on
 * PMU-specific filtering. An event group can consist of HW and SW events,
 * potentially with a SW leader, so we must check all the filters to
 * determine whether a group is schedulable.
 */
1776static inline int pmu_filter_match(struct perf_event *event)
1777{
1778 struct perf_event *child;
1779
1780 if (!__pmu_filter_match(event))
1781 return 0;
1782
1783 list_for_each_entry(child, &event->sibling_list, group_entry) {
1784 if (!__pmu_filter_match(child))
1785 return 0;
1786 }
1787
1788 return 1;
1789}
1790
1791static inline int
1792event_filter_match(struct perf_event *event)
1793{
1794 return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1795 perf_cgroup_match(event) && pmu_filter_match(event);
1796}
1797
1798static void
1799event_sched_out(struct perf_event *event,
1800 struct perf_cpu_context *cpuctx,
1801 struct perf_event_context *ctx)
1802{
1803 u64 tstamp = perf_event_time(event);
1804 u64 delta;
1805
1806 WARN_ON_ONCE(event->ctx != ctx);
1807 lockdep_assert_held(&ctx->lock);
1808
1809
1810
1811
1812
1813
1814
1815 if (event->state == PERF_EVENT_STATE_INACTIVE &&
1816 !event_filter_match(event)) {
1817 delta = tstamp - event->tstamp_stopped;
1818 event->tstamp_running += delta;
1819 event->tstamp_stopped = tstamp;
1820 }
1821
1822 if (event->state != PERF_EVENT_STATE_ACTIVE)
1823 return;
1824
1825 perf_pmu_disable(event->pmu);
1826
1827 event->tstamp_stopped = tstamp;
1828 event->pmu->del(event, 0);
1829 event->oncpu = -1;
1830 event->state = PERF_EVENT_STATE_INACTIVE;
1831 if (event->pending_disable) {
1832 event->pending_disable = 0;
1833 event->state = PERF_EVENT_STATE_OFF;
1834 }
1835
1836 if (!is_software_event(event))
1837 cpuctx->active_oncpu--;
1838 if (!--ctx->nr_active)
1839 perf_event_ctx_deactivate(ctx);
1840 if (event->attr.freq && event->attr.sample_freq)
1841 ctx->nr_freq--;
1842 if (event->attr.exclusive || !cpuctx->active_oncpu)
1843 cpuctx->exclusive = 0;
1844
1845 perf_pmu_enable(event->pmu);
1846}
1847
1848static void
1849group_sched_out(struct perf_event *group_event,
1850 struct perf_cpu_context *cpuctx,
1851 struct perf_event_context *ctx)
1852{
1853 struct perf_event *event;
1854 int state = group_event->state;
1855
1856 perf_pmu_disable(ctx->pmu);
1857
1858 event_sched_out(group_event, cpuctx, ctx);
1859
1860
1861
1862
1863 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1864 event_sched_out(event, cpuctx, ctx);
1865
1866 perf_pmu_enable(ctx->pmu);
1867
1868 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1869 cpuctx->exclusive = 0;
1870}
1871
1872#define DETACH_GROUP 0x01UL
1873
/*
 * Cross CPU call to remove a performance event.
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
1880static void
1881__perf_remove_from_context(struct perf_event *event,
1882 struct perf_cpu_context *cpuctx,
1883 struct perf_event_context *ctx,
1884 void *info)
1885{
1886 unsigned long flags = (unsigned long)info;
1887
1888 event_sched_out(event, cpuctx, ctx);
1889 if (flags & DETACH_GROUP)
1890 perf_group_detach(event);
1891 list_del_event(event, ctx);
1892
1893 if (!ctx->nr_events && ctx->is_active) {
1894 ctx->is_active = 0;
1895 if (ctx->task) {
1896 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
1897 cpuctx->task_ctx = NULL;
1898 }
1899 }
1900}
/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each, which hold the
 * top-level event's child_mutex, so any descendant that goes to exit
 * will block in perf_event_exit_event().
 */
1912static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
1913{
1914 struct perf_event_context *ctx = event->ctx;
1915
1916 lockdep_assert_held(&ctx->mutex);
1917
1918 event_function_call(event, __perf_remove_from_context, (void *)flags);
1919
1920
1921
1922
1923
1924
1925
1926 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1927 if ((flags & DETACH_GROUP) &&
1928 (event->attach_state & PERF_ATTACH_GROUP)) {
1929
1930
1931
1932
1933 raw_spin_lock_irq(&ctx->lock);
1934 perf_group_detach(event);
1935 raw_spin_unlock_irq(&ctx->lock);
1936 }
1937}
1938
/*
 * Cross CPU call to disable a performance event.
 */
1942static void __perf_event_disable(struct perf_event *event,
1943 struct perf_cpu_context *cpuctx,
1944 struct perf_event_context *ctx,
1945 void *info)
1946{
1947 if (event->state < PERF_EVENT_STATE_INACTIVE)
1948 return;
1949
1950 update_context_time(ctx);
1951 update_cgrp_time_from_event(event);
1952 update_group_times(event);
1953 if (event == event->group_leader)
1954 group_sched_out(event, cpuctx, ctx);
1955 else
1956 event_sched_out(event, cpuctx, ctx);
1957 event->state = PERF_EVENT_STATE_OFF;
1958}
1959
/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in perf_event_exit_event().
 *
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
1974static void _perf_event_disable(struct perf_event *event)
1975{
1976 struct perf_event_context *ctx = event->ctx;
1977
1978 raw_spin_lock_irq(&ctx->lock);
1979 if (event->state <= PERF_EVENT_STATE_OFF) {
1980 raw_spin_unlock_irq(&ctx->lock);
1981 return;
1982 }
1983 raw_spin_unlock_irq(&ctx->lock);
1984
1985 event_function_call(event, __perf_event_disable, NULL);
1986}
1987
1988void perf_event_disable_local(struct perf_event *event)
1989{
1990 event_function_local(event, __perf_event_disable, NULL);
1991}
1992
1993
1994
1995
1996
1997void perf_event_disable(struct perf_event *event)
1998{
1999 struct perf_event_context *ctx;
2000
2001 ctx = perf_event_ctx_lock(event);
2002 _perf_event_disable(event);
2003 perf_event_ctx_unlock(event, ctx);
2004}
2005EXPORT_SYMBOL_GPL(perf_event_disable);
2006
2007void perf_event_disable_inatomic(struct perf_event *event)
2008{
2009 event->pending_disable = 1;
2010 irq_work_queue(&event->pending);
2011}
2012
2013static void perf_set_shadow_time(struct perf_event *event,
2014 struct perf_event_context *ctx,
2015 u64 tstamp)
2016{
 /*
  * Remember how the event's notion of time (context time, or cgroup time
  * for cgroup events) relates to perf_clock() at schedule-in; this shadow
  * time is what the mmap()ed user page uses to report enabled/running
  * times.
  *
  * Cgroup events must use cgroup time here: a cgroup's time only advances
  * while one of its tasks runs on this CPU, so the context time would be
  * wrong for them.
  */
2042 if (is_cgroup_event(event))
2043 perf_cgroup_set_shadow_time(event, tstamp);
2044 else
2045 event->shadow_ctx_time = tstamp - ctx->timestamp;
2046}
2047
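/* Value of hwc->interrupts marking a throttled event; see event_sched_in(). */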
2048#define MAX_INTERRUPTS (~0ULL)
2049
2050static void perf_log_throttle(struct perf_event *event, int enable);
2051static void perf_log_itrace_start(struct perf_event *event);
2052
2053static int
2054event_sched_in(struct perf_event *event,
2055 struct perf_cpu_context *cpuctx,
2056 struct perf_event_context *ctx)
2057{
2058 u64 tstamp = perf_event_time(event);
2059 int ret = 0;
2060
2061 lockdep_assert_held(&ctx->lock);
2062
2063 if (event->state <= PERF_EVENT_STATE_OFF)
2064 return 0;
2065
2066 WRITE_ONCE(event->oncpu, smp_processor_id());
2067
2068
2069
2070
2071 smp_wmb();
2072 WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
2073
2074
2075
2076
2077
2078
2079 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2080 perf_log_throttle(event, 1);
2081 event->hw.interrupts = 0;
2082 }
2083
2084
2085
2086
2087 smp_wmb();
2088
2089 perf_pmu_disable(event->pmu);
2090
2091 perf_set_shadow_time(event, ctx, tstamp);
2092
2093 perf_log_itrace_start(event);
2094
2095 if (event->pmu->add(event, PERF_EF_START)) {
2096 event->state = PERF_EVENT_STATE_INACTIVE;
2097 event->oncpu = -1;
2098 ret = -EAGAIN;
2099 goto out;
2100 }
2101
2102 event->tstamp_running += tstamp - event->tstamp_stopped;
2103
2104 if (!is_software_event(event))
2105 cpuctx->active_oncpu++;
2106 if (!ctx->nr_active++)
2107 perf_event_ctx_activate(ctx);
2108 if (event->attr.freq && event->attr.sample_freq)
2109 ctx->nr_freq++;
2110
2111 if (event->attr.exclusive)
2112 cpuctx->exclusive = 1;
2113
2114out:
2115 perf_pmu_enable(event->pmu);
2116
2117 return ret;
2118}
2119
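/*
 * Schedule a whole event group in as a single PMU transaction; if any member
 * fails, roll the already-added members back out and report -EAGAIN.
 */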
2120static int
2121group_sched_in(struct perf_event *group_event,
2122 struct perf_cpu_context *cpuctx,
2123 struct perf_event_context *ctx)
2124{
2125 struct perf_event *event, *partial_group = NULL;
2126 struct pmu *pmu = ctx->pmu;
2127 u64 now = ctx->time;
2128 bool simulate = false;
2129
2130 if (group_event->state == PERF_EVENT_STATE_OFF)
2131 return 0;
2132
2133 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2134
2135 if (event_sched_in(group_event, cpuctx, ctx)) {
2136 pmu->cancel_txn(pmu);
2137 perf_mux_hrtimer_restart(cpuctx);
2138 return -EAGAIN;
2139 }
2140
2141
2142
2143
2144 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2145 if (event_sched_in(event, cpuctx, ctx)) {
2146 partial_group = event;
2147 goto group_error;
2148 }
2149 }
2150
2151 if (!pmu->commit_txn(pmu))
2152 return 0;
2153
2154group_error:
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2170 if (event == partial_group)
2171 simulate = true;
2172
2173 if (simulate) {
2174 event->tstamp_running += now - event->tstamp_stopped;
2175 event->tstamp_stopped = now;
2176 } else {
2177 event_sched_out(event, cpuctx, ctx);
2178 }
2179 }
2180 event_sched_out(group_event, cpuctx, ctx);
2181
2182 pmu->cancel_txn(pmu);
2183
2184 perf_mux_hrtimer_restart(cpuctx);
2185
2186 return -EAGAIN;
2187}
2188
/*
 * Work out whether we can put this event group on the CPU now.
 */
2192static int group_can_go_on(struct perf_event *event,
2193 struct perf_cpu_context *cpuctx,
2194 int can_add_hw)
2195{
2196
2197
2198
2199 if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2200 return 1;
2201
2202
2203
2204
2205 if (cpuctx->exclusive)
2206 return 0;
2207
2208
2209
2210
2211 if (event->attr.exclusive && cpuctx->active_oncpu)
2212 return 0;
2213
2214
2215
2216
2217 return can_add_hw;
2218}
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238static void __perf_event_enable_time(struct perf_event *event, u64 now)
2239{
2240 WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
2241
2242 event->tstamp_stopped = now;
2243 event->tstamp_enabled = now - event->total_time_enabled;
2244 event->tstamp_running = now - event->total_time_running;
2245}
2246
2247static void add_event_to_ctx(struct perf_event *event,
2248 struct perf_event_context *ctx)
2249{
2250 u64 tstamp = perf_event_time(event);
2251
2252 list_add_event(event, ctx);
2253 perf_group_attach(event);
2254
2255
2256
2257
2258 if (event->state == PERF_EVENT_STATE_INACTIVE)
2259 __perf_event_enable_time(event, tstamp);
2260}
2261
2262static void ctx_sched_out(struct perf_event_context *ctx,
2263 struct perf_cpu_context *cpuctx,
2264 enum event_type_t event_type);
2265static void
2266ctx_sched_in(struct perf_event_context *ctx,
2267 struct perf_cpu_context *cpuctx,
2268 enum event_type_t event_type,
2269 struct task_struct *task);
2270
2271static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2272 struct perf_event_context *ctx,
2273 enum event_type_t event_type)
2274{
2275 if (!cpuctx->task_ctx)
2276 return;
2277
2278 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2279 return;
2280
2281 ctx_sched_out(ctx, cpuctx, event_type);
2282}
2283
2284static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2285 struct perf_event_context *ctx,
2286 struct task_struct *task)
2287{
2288 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2289 if (ctx)
2290 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2291 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2292 if (ctx)
2293 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2294}
2295
/*
 * We want to maintain the following priority of scheduling:
 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
 *  - task pinned (EVENT_PINNED)
 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
 *  - task flexible (EVENT_FLEXIBLE).
 *
 * In order to avoid unscheduling and scheduling back in everything every
 * time an event is added, only do it for the groups of equal priority and
 * below.
 *
 * This can be called after a batch operation on task events, in which case
 * event_type is a bit mask of the types of events involved. For CPU events,
 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
 */
2311static void ctx_resched(struct perf_cpu_context *cpuctx,
2312 struct perf_event_context *task_ctx,
2313 enum event_type_t event_type)
2314{
2315 enum event_type_t ctx_event_type = event_type & EVENT_ALL;
2316 bool cpu_event = !!(event_type & EVENT_CPU);
2317
2318
2319
2320
2321
2322 if (event_type & EVENT_PINNED)
2323 event_type |= EVENT_FLEXIBLE;
2324
2325 perf_pmu_disable(cpuctx->ctx.pmu);
2326 if (task_ctx)
2327 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2328
2329
2330
2331
2332
2333
2334
2335
2336 if (cpu_event)
2337 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2338 else if (ctx_event_type & EVENT_PINNED)
2339 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2340
2341 perf_event_sched_in(cpuctx, task_ctx, current);
2342 perf_pmu_enable(cpuctx->ctx.pmu);
2343}
2344
/*
 * Cross CPU call to install and enable a performance event.
 *
 * Very similar to remote_function() + event_function() but cannot assume that
 * things like ctx->is_active and cpuctx->task_ctx are set.
 */
2351static int __perf_install_in_context(void *info)
2352{
2353 struct perf_event *event = info;
2354 struct perf_event_context *ctx = event->ctx;
2355 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2356 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2357 bool reprogram = true;
2358 int ret = 0;
2359
2360 raw_spin_lock(&cpuctx->ctx.lock);
2361 if (ctx->task) {
2362 raw_spin_lock(&ctx->lock);
2363 task_ctx = ctx;
2364
2365 reprogram = (ctx->task == current);
2366
2367
2368
2369
2370
2371
2372
2373
2374 if (task_curr(ctx->task) && !reprogram) {
2375 ret = -ESRCH;
2376 goto unlock;
2377 }
2378
2379 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2380 } else if (task_ctx) {
2381 raw_spin_lock(&task_ctx->lock);
2382 }
2383
2384 if (reprogram) {
2385 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2386 add_event_to_ctx(event, ctx);
2387 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2388 } else {
2389 add_event_to_ctx(event, ctx);
2390 }
2391
2392unlock:
2393 perf_ctx_unlock(cpuctx, task_ctx);
2394
2395 return ret;
2396}
2397
/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call(); see the comment there.
 */
2403static void
2404perf_install_in_context(struct perf_event_context *ctx,
2405 struct perf_event *event,
2406 int cpu)
2407{
2408 struct task_struct *task = READ_ONCE(ctx->task);
2409
2410 lockdep_assert_held(&ctx->mutex);
2411
2412 if (event->cpu != -1)
2413 event->cpu = cpu;
2414
2415
2416
2417
2418
2419 smp_store_release(&event->ctx, ctx);
2420
2421 if (!task) {
2422 cpu_function_call(cpu, __perf_install_in_context, event);
2423 return;
2424 }
2425
 /*
  * Should not happen, we validate the ctx is still alive before calling.
  */
2429 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2430 return;
2431
 /*
  * Installing an event in a task context is tricky: we cannot rely on
  * ctx->is_active being set (this may be the 0 -> 1 nr_events transition),
  * so we use task_curr() to decide whether the task is running.
  *
  * Since task_curr() is used outside of rq::lock it can race with the real
  * state: a false positive just makes us retry (harmless); a false
  * negative is only safe because any later context switch of the task must
  * observe our event list update and schedule the event in itself.
  *
  * The smp_mb() below orders the ctx/event publication against the
  * task_cpu()/task_curr() reads so that exactly that guarantee holds.
  */
2462 smp_mb();
2463again:
2464 if (!task_function_call(task, __perf_install_in_context, event))
2465 return;
2466
2467 raw_spin_lock_irq(&ctx->lock);
2468 task = ctx->task;
2469 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2470
2471
2472
2473
2474
2475 raw_spin_unlock_irq(&ctx->lock);
2476 return;
2477 }
2478
2479
2480
2481
2482 if (task_curr(task)) {
2483 raw_spin_unlock_irq(&ctx->lock);
2484 goto again;
2485 }
2486 add_event_to_ctx(event, ctx);
2487 raw_spin_unlock_irq(&ctx->lock);
2488}
2489
/*
 * Put an event into inactive state and update time fields.
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
2498static void __perf_event_mark_enabled(struct perf_event *event)
2499{
2500 struct perf_event *sub;
2501 u64 tstamp = perf_event_time(event);
2502
2503 event->state = PERF_EVENT_STATE_INACTIVE;
2504 __perf_event_enable_time(event, tstamp);
2505 list_for_each_entry(sub, &event->sibling_list, group_entry) {
2506
2507 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2508 __perf_event_enable_time(sub, tstamp);
2509 }
2510}
2511
/*
 * Cross CPU call to enable a performance event.
 */
2515static void __perf_event_enable(struct perf_event *event,
2516 struct perf_cpu_context *cpuctx,
2517 struct perf_event_context *ctx,
2518 void *info)
2519{
2520 struct perf_event *leader = event->group_leader;
2521 struct perf_event_context *task_ctx;
2522
2523 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2524 event->state <= PERF_EVENT_STATE_ERROR)
2525 return;
2526
2527 if (ctx->is_active)
2528 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2529
2530 __perf_event_mark_enabled(event);
2531
2532 if (!ctx->is_active)
2533 return;
2534
2535 if (!event_filter_match(event)) {
2536 if (is_cgroup_event(event))
2537 perf_cgroup_defer_enabled(event);
2538 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2539 return;
2540 }
2541
2542
2543
2544
2545
2546 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2547 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2548 return;
2549 }
2550
2551 task_ctx = cpuctx->task_ctx;
2552 if (ctx->task)
2553 WARN_ON_ONCE(task_ctx != ctx);
2554
2555 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2556}
2557
/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for _perf_event_disable().
 */
2567static void _perf_event_enable(struct perf_event *event)
2568{
2569 struct perf_event_context *ctx = event->ctx;
2570
2571 raw_spin_lock_irq(&ctx->lock);
2572 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2573 event->state < PERF_EVENT_STATE_ERROR) {
2574 raw_spin_unlock_irq(&ctx->lock);
2575 return;
2576 }
2577
2578
2579
2580
2581
2582
2583
2584
2585 if (event->state == PERF_EVENT_STATE_ERROR)
2586 event->state = PERF_EVENT_STATE_OFF;
2587 raw_spin_unlock_irq(&ctx->lock);
2588
2589 event_function_call(event, __perf_event_enable, NULL);
2590}
2591
2592
2593
2594
2595void perf_event_enable(struct perf_event *event)
2596{
2597 struct perf_event_context *ctx;
2598
2599 ctx = perf_event_ctx_lock(event);
2600 _perf_event_enable(event);
2601 perf_event_ctx_unlock(event, ctx);
2602}
2603EXPORT_SYMBOL_GPL(perf_event_enable);
2604
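/*
 * Stop @event on the CPU it is currently active on, optionally restarting it
 * afterwards; perf_event_stop() retries if the event migrates between the
 * state check and the cross-CPU call.
 */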
2605struct stop_event_data {
2606 struct perf_event *event;
2607 unsigned int restart;
2608};
2609
2610static int __perf_event_stop(void *info)
2611{
2612 struct stop_event_data *sd = info;
2613 struct perf_event *event = sd->event;
2614
2615
2616 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2617 return 0;
2618
2619
2620 smp_rmb();
2621
2622
2623
2624
2625
2626 if (READ_ONCE(event->oncpu) != smp_processor_id())
2627 return -EAGAIN;
2628
2629 event->pmu->stop(event, PERF_EF_UPDATE);
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640 if (sd->restart)
2641 event->pmu->start(event, 0);
2642
2643 return 0;
2644}
2645
2646static int perf_event_stop(struct perf_event *event, int restart)
2647{
2648 struct stop_event_data sd = {
2649 .event = event,
2650 .restart = restart,
2651 };
2652 int ret = 0;
2653
2654 do {
2655 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2656 return 0;
2657
2658
2659 smp_rmb();
2660
2661
2662
2663
2664
2665
2666 ret = cpu_function_call(READ_ONCE(event->oncpu),
2667 __perf_event_stop, &sd);
2668 } while (ret == -EAGAIN);
2669
2670 return ret;
2671}
2672
/*
 * Address filter configuration is kept manageable by splitting it in two:
 *
 * (p1) when userspace mappings change, the addresses of the corresponding
 *      filters in event::addr_filters_offs are updated and
 *      event::addr_filters_gen is bumped;
 * (p2) when an event is scheduled in (pmu::add), perf_event_addr_filters_sync()
 *      calls pmu::addr_filters_sync() if the generation changed since the
 *      previous call.
 *
 * If (p1) happens while the event is active, the event is restarted to force
 * (p2); see perf_event_stop() above.
 */
2695void perf_event_addr_filters_sync(struct perf_event *event)
2696{
2697 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2698
2699 if (!has_addr_filter(event))
2700 return;
2701
2702 raw_spin_lock(&ifh->lock);
2703 if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2704 event->pmu->addr_filters_sync(event);
2705 event->hw.addr_filters_gen = event->addr_filters_gen;
2706 }
2707 raw_spin_unlock(&ifh->lock);
2708}
2709EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2710
2711static int _perf_event_refresh(struct perf_event *event, int refresh)
2712{
2713
2714
2715
2716 if (event->attr.inherit || !is_sampling_event(event))
2717 return -EINVAL;
2718
2719 atomic_add(refresh, &event->event_limit);
2720 _perf_event_enable(event);
2721
2722 return 0;
2723}
2724
2725
2726
2727
2728int perf_event_refresh(struct perf_event *event, int refresh)
2729{
2730 struct perf_event_context *ctx;
2731 int ret;
2732
2733 ctx = perf_event_ctx_lock(event);
2734 ret = _perf_event_refresh(event, refresh);
2735 perf_event_ctx_unlock(event, ctx);
2736
2737 return ret;
2738}
2739EXPORT_SYMBOL_GPL(perf_event_refresh);
2740
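/*
 * Schedule out the requested classes of events from a context, updating the
 * context (and cgroup) time bookkeeping before the events stop counting.
 */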
2741static void ctx_sched_out(struct perf_event_context *ctx,
2742 struct perf_cpu_context *cpuctx,
2743 enum event_type_t event_type)
2744{
2745 int is_active = ctx->is_active;
2746 struct perf_event *event;
2747
2748 lockdep_assert_held(&ctx->lock);
2749
2750 if (likely(!ctx->nr_events)) {
2751
2752
2753
2754 WARN_ON_ONCE(ctx->is_active);
2755 if (ctx->task)
2756 WARN_ON_ONCE(cpuctx->task_ctx);
2757 return;
2758 }
2759
2760 ctx->is_active &= ~event_type;
2761 if (!(ctx->is_active & EVENT_ALL))
2762 ctx->is_active = 0;
2763
2764 if (ctx->task) {
2765 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2766 if (!ctx->is_active)
2767 cpuctx->task_ctx = NULL;
2768 }
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780 if (is_active & EVENT_TIME) {
2781
2782 update_context_time(ctx);
2783 update_cgrp_time_from_cpuctx(cpuctx);
2784 }
2785
2786 is_active ^= ctx->is_active;
2787
2788 if (!ctx->nr_active || !(is_active & EVENT_ALL))
2789 return;
2790
2791 perf_pmu_disable(ctx->pmu);
2792 if (is_active & EVENT_PINNED) {
2793 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2794 group_sched_out(event, cpuctx, ctx);
2795 }
2796
2797 if (is_active & EVENT_FLEXIBLE) {
2798 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2799 group_sched_out(event, cpuctx, ctx);
2800 }
2801 perf_pmu_enable(ctx->pmu);
2802}
2803
/*
 * Test whether two contexts are equivalent, i.e. whether they were cloned
 * from the same version of the same parent context; equivalence is tracked
 * with the contexts' generation counters and is what allows the cheap
 * context-swap optimization on context switch.
 */
2812static int context_equiv(struct perf_event_context *ctx1,
2813 struct perf_event_context *ctx2)
2814{
2815 lockdep_assert_held(&ctx1->lock);
2816 lockdep_assert_held(&ctx2->lock);
2817
2818
2819 if (ctx1->pin_count || ctx2->pin_count)
2820 return 0;
2821
2822
2823 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2824 return 1;
2825
2826
2827 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2828 return 1;
2829
2830
2831
2832
2833
2834 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2835 ctx1->parent_gen == ctx2->parent_gen)
2836 return 1;
2837
2838
2839 return 0;
2840}
2841
2842static void __perf_event_sync_stat(struct perf_event *event,
2843 struct perf_event *next_event)
2844{
2845 u64 value;
2846
2847 if (!event->attr.inherit_stat)
2848 return;
2849
2850
2851
2852
2853
2854
2855
2856
2857 switch (event->state) {
2858 case PERF_EVENT_STATE_ACTIVE:
2859 event->pmu->read(event);
2860
2861
2862 case PERF_EVENT_STATE_INACTIVE:
2863 update_event_times(event);
2864 break;
2865
2866 default:
2867 break;
2868 }
2869
2870
2871
2872
2873
2874 value = local64_read(&next_event->count);
2875 value = local64_xchg(&event->count, value);
2876 local64_set(&next_event->count, value);
2877
2878 swap(event->total_time_enabled, next_event->total_time_enabled);
2879 swap(event->total_time_running, next_event->total_time_running);
2880
2881
2882
2883
2884 perf_event_update_userpage(event);
2885 perf_event_update_userpage(next_event);
2886}
2887
2888static void perf_event_sync_stat(struct perf_event_context *ctx,
2889 struct perf_event_context *next_ctx)
2890{
2891 struct perf_event *event, *next_event;
2892
2893 if (!ctx->nr_stat)
2894 return;
2895
2896 update_context_time(ctx);
2897
2898 event = list_first_entry(&ctx->event_list,
2899 struct perf_event, event_entry);
2900
2901 next_event = list_first_entry(&next_ctx->event_list,
2902 struct perf_event, event_entry);
2903
2904 while (&event->event_entry != &ctx->event_list &&
2905 &next_event->event_entry != &next_ctx->event_list) {
2906
2907 __perf_event_sync_stat(event, next_event);
2908
2909 event = list_next_entry(event, event_entry);
2910 next_event = list_next_entry(next_event, event_entry);
2911 }
2912}
2913
2914static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2915 struct task_struct *next)
2916{
2917 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2918 struct perf_event_context *next_ctx;
2919 struct perf_event_context *parent, *next_parent;
2920 struct perf_cpu_context *cpuctx;
2921 int do_switch = 1;
2922
2923 if (likely(!ctx))
2924 return;
2925
2926 cpuctx = __get_cpu_context(ctx);
2927 if (!cpuctx->task_ctx)
2928 return;
2929
2930 rcu_read_lock();
2931 next_ctx = next->perf_event_ctxp[ctxn];
2932 if (!next_ctx)
2933 goto unlock;
2934
2935 parent = rcu_dereference(ctx->parent_ctx);
2936 next_parent = rcu_dereference(next_ctx->parent_ctx);
2937
2938
2939 if (!parent && !next_parent)
2940 goto unlock;
2941
2942 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952 raw_spin_lock(&ctx->lock);
2953 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2954 if (context_equiv(ctx, next_ctx)) {
2955 WRITE_ONCE(ctx->task, next);
2956 WRITE_ONCE(next_ctx->task, task);
2957
2958 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2959
2960
2961
2962
2963
2964
2965
2966
2967 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
2968 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
2969
2970 do_switch = 0;
2971
2972 perf_event_sync_stat(ctx, next_ctx);
2973 }
2974 raw_spin_unlock(&next_ctx->lock);
2975 raw_spin_unlock(&ctx->lock);
2976 }
2977unlock:
2978 rcu_read_unlock();
2979
2980 if (do_switch) {
2981 raw_spin_lock(&ctx->lock);
2982 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
2983 raw_spin_unlock(&ctx->lock);
2984 }
2985}
2986
2987static DEFINE_PER_CPU(struct list_head, sched_cb_list);
2988
2989void perf_sched_cb_dec(struct pmu *pmu)
2990{
2991 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2992
2993 this_cpu_dec(perf_sched_cb_usages);
2994
2995 if (!--cpuctx->sched_cb_usage)
2996 list_del(&cpuctx->sched_cb_entry);
2997}
2998
2999
3000void perf_sched_cb_inc(struct pmu *pmu)
3001{
3002 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3003
3004 if (!cpuctx->sched_cb_usage++)
3005 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3006
3007 this_cpu_inc(perf_sched_cb_usages);
3008}
3009
/*
 * Run the sched_task() callback of every PMU that registered interest on
 * this CPU (via perf_sched_cb_inc()), with the relevant contexts locked and
 * the PMU disabled around each callback.
 */
3018static void perf_pmu_sched_task(struct task_struct *prev,
3019 struct task_struct *next,
3020 bool sched_in)
3021{
3022 struct perf_cpu_context *cpuctx;
3023 struct pmu *pmu;
3024
3025 if (prev == next)
3026 return;
3027
3028 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3029 pmu = cpuctx->ctx.pmu;
3030
3031 if (WARN_ON_ONCE(!pmu->sched_task))
3032 continue;
3033
3034 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3035 perf_pmu_disable(pmu);
3036
3037 pmu->sched_task(cpuctx->task_ctx, sched_in);
3038
3039 perf_pmu_enable(pmu);
3040 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3041 }
3042}
3043
3044static void perf_event_switch(struct task_struct *task,
3045 struct task_struct *next_prev, bool sched_in);
3046
3047#define for_each_task_context_nr(ctxn) \
3048 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3049
/*
 * Called from the scheduler, with interrupts disabled, when @task is being
 * switched out in favour of @next: run the PMU sched_task callbacks, emit a
 * context-switch record if anyone asked for one, schedule out each of the
 * task's event contexts and finally switch cgroup events.
 */
3061void __perf_event_task_sched_out(struct task_struct *task,
3062 struct task_struct *next)
3063{
3064 int ctxn;
3065
3066 if (__this_cpu_read(perf_sched_cb_usages))
3067 perf_pmu_sched_task(task, next, false);
3068
3069 if (atomic_read(&nr_switch_events))
3070 perf_event_switch(task, next, false);
3071
3072 for_each_task_context_nr(ctxn)
3073 perf_event_context_sched_out(task, ctxn, next);
3074
3075
3076
3077
3078
3079
3080 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3081 perf_cgroup_sched_out(task, next);
3082}
3083
3084
3085
3086
3087static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3088 enum event_type_t event_type)
3089{
3090 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3091}
3092
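/*
 * Schedule in all pinned groups of @ctx; a pinned group that cannot get onto
 * the PMU is moved to ERROR state instead of being silently skipped.
 */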
3093static void
3094ctx_pinned_sched_in(struct perf_event_context *ctx,
3095 struct perf_cpu_context *cpuctx)
3096{
3097 struct perf_event *event;
3098
3099 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
3100 if (event->state <= PERF_EVENT_STATE_OFF)
3101 continue;
3102 if (!event_filter_match(event))
3103 continue;
3104
3105
3106 if (is_cgroup_event(event))
3107 perf_cgroup_mark_enabled(event, ctx);
3108
3109 if (group_can_go_on(event, cpuctx, 1))
3110 group_sched_in(event, cpuctx, ctx);
3111
3112
3113
3114
3115
3116 if (event->state == PERF_EVENT_STATE_INACTIVE) {
3117 update_group_times(event);
3118 event->state = PERF_EVENT_STATE_ERROR;
3119 }
3120 }
3121}
3122
3123static void
3124ctx_flexible_sched_in(struct perf_event_context *ctx,
3125 struct perf_cpu_context *cpuctx)
3126{
3127 struct perf_event *event;
3128 int can_add_hw = 1;
3129
3130 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
3131
3132 if (event->state <= PERF_EVENT_STATE_OFF)
3133 continue;
3134
3135
3136
3137
3138 if (!event_filter_match(event))
3139 continue;
3140
3141
3142 if (is_cgroup_event(event))
3143 perf_cgroup_mark_enabled(event, ctx);
3144
3145 if (group_can_go_on(event, cpuctx, can_add_hw)) {
3146 if (group_sched_in(event, cpuctx, ctx))
3147 can_add_hw = 0;
3148 }
3149 }
3150}
3151
3152static void
3153ctx_sched_in(struct perf_event_context *ctx,
3154 struct perf_cpu_context *cpuctx,
3155 enum event_type_t event_type,
3156 struct task_struct *task)
3157{
3158 int is_active = ctx->is_active;
3159 u64 now;
3160
3161 lockdep_assert_held(&ctx->lock);
3162
3163 if (likely(!ctx->nr_events))
3164 return;
3165
3166 ctx->is_active |= (event_type | EVENT_TIME);
3167 if (ctx->task) {
3168 if (!is_active)
3169 cpuctx->task_ctx = ctx;
3170 else
3171 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3172 }
3173
3174 is_active ^= ctx->is_active;
3175
3176 if (is_active & EVENT_TIME) {
3177
3178 now = perf_clock();
3179 ctx->timestamp = now;
3180 perf_cgroup_set_timestamp(task, ctx);
3181 }
3182
3183
3184
3185
3186
3187 if (is_active & EVENT_PINNED)
3188 ctx_pinned_sched_in(ctx, cpuctx);
3189
3190
3191 if (is_active & EVENT_FLEXIBLE)
3192 ctx_flexible_sched_in(ctx, cpuctx);
3193}
3194
3195static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3196 enum event_type_t event_type,
3197 struct task_struct *task)
3198{
3199 struct perf_event_context *ctx = &cpuctx->ctx;
3200
3201 ctx_sched_in(ctx, cpuctx, event_type, task);
3202}
3203
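/*
 * Install a task context on this CPU; if the task has pinned events, the
 * CPU-wide flexible events are scheduled out first so they cannot keep those
 * pinned events off the PMU.
 */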
3204static void perf_event_context_sched_in(struct perf_event_context *ctx,
3205 struct task_struct *task)
3206{
3207 struct perf_cpu_context *cpuctx;
3208
3209 cpuctx = __get_cpu_context(ctx);
3210 if (cpuctx->task_ctx == ctx)
3211 return;
3212
3213 perf_ctx_lock(cpuctx, ctx);
3214 perf_pmu_disable(ctx->pmu);
3215
3216
3217
3218
3219
3220
3221
3222
3223 if (!list_empty(&ctx->pinned_groups))
3224 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3225 perf_event_sched_in(cpuctx, ctx, task);
3226 perf_pmu_enable(ctx->pmu);
3227 perf_ctx_unlock(cpuctx, ctx);
3228}
3229
/*
 * Called from the scheduler, with interrupts disabled, when @task is being
 * switched in: schedule in cgroup events, then each of the task's event
 * contexts, then emit the context-switch record and run the PMU sched_task
 * callbacks.
 */
3241void __perf_event_task_sched_in(struct task_struct *prev,
3242 struct task_struct *task)
3243{
3244 struct perf_event_context *ctx;
3245 int ctxn;
3246
3247
3248
3249
3250
3251
3252
3253
3254 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3255 perf_cgroup_sched_in(prev, task);
3256
3257 for_each_task_context_nr(ctxn) {
3258 ctx = task->perf_event_ctxp[ctxn];
3259 if (likely(!ctx))
3260 continue;
3261
3262 perf_event_context_sched_in(ctx, task);
3263 }
3264
3265 if (atomic_read(&nr_switch_events))
3266 perf_event_switch(task, prev, true);
3267
3268 if (__this_cpu_read(perf_sched_cb_usages))
3269 perf_pmu_sched_task(prev, task, true);
3270}
3271
3272static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3273{
3274 u64 frequency = event->attr.sample_freq;
3275 u64 sec = NSEC_PER_SEC;
3276 u64 divisor, dividend;
3277
3278 int count_fls, nsec_fls, frequency_fls, sec_fls;
3279
3280 count_fls = fls64(count);
3281 nsec_fls = fls64(nsec);
3282 frequency_fls = fls64(frequency);
3283 sec_fls = 30;
3284
	/*
	 * We saw @count events in @nsec nanoseconds and want to hit
	 * attr.sample_freq samples per second, so the target period is:
	 *
	 *               count * NSEC_PER_SEC
	 *   period = ------------------------
	 *               nsec * sample_freq
	 *
	 * Both products can overflow 64 bits, so REDUCE_FLS() below shifts
	 * the operands right, trading precision for range, until they fit.
	 */
3299#define REDUCE_FLS(a, b) \
3300do { \
3301 if (a##_fls > b##_fls) { \
3302 a >>= 1; \
3303 a##_fls--; \
3304 } else { \
3305 b >>= 1; \
3306 b##_fls--; \
3307 } \
3308} while (0)
3309
3310
3311
3312
3313
3314 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3315 REDUCE_FLS(nsec, frequency);
3316 REDUCE_FLS(sec, count);
3317 }
3318
3319 if (count_fls + sec_fls > 64) {
3320 divisor = nsec * frequency;
3321
3322 while (count_fls + sec_fls > 64) {
3323 REDUCE_FLS(count, sec);
3324 divisor >>= 1;
3325 }
3326
3327 dividend = count * sec;
3328 } else {
3329 dividend = count * sec;
3330
3331 while (nsec_fls + frequency_fls > 64) {
3332 REDUCE_FLS(nsec, frequency);
3333 dividend >>= 1;
3334 }
3335
3336 divisor = nsec * frequency;
3337 }
3338
3339 if (!divisor)
3340 return dividend;
3341
3342 return div64_u64(dividend, divisor);
3343}
3344
3345static DEFINE_PER_CPU(int, perf_throttled_count);
3346static DEFINE_PER_CPU(u64, perf_throttled_seq);
3347
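/*
 * Nudge the sampling period towards the value that would produce the
 * requested sample frequency; the adjustment is damped (delta / 8) so the
 * period does not oscillate.
 */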
3348static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3349{
3350 struct hw_perf_event *hwc = &event->hw;
3351 s64 period, sample_period;
3352 s64 delta;
3353
3354 period = perf_calculate_period(event, nsec, count);
3355
3356 delta = (s64)(period - hwc->sample_period);
3357 delta = (delta + 7) / 8;
3358
3359 sample_period = hwc->sample_period + delta;
3360
3361 if (!sample_period)
3362 sample_period = 1;
3363
3364 hwc->sample_period = sample_period;
3365
3366 if (local64_read(&hwc->period_left) > 8*sample_period) {
3367 if (disable)
3368 event->pmu->stop(event, PERF_EF_UPDATE);
3369
3370 local64_set(&hwc->period_left, 0);
3371
3372 if (disable)
3373 event->pmu->start(event, PERF_EF_RELOAD);
3374 }
3375}
3376
/*
 * Combine frequency adjustment and unthrottling into a single pass over the
 * context's event list; called once per timer tick.
 */
3382static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3383 int needs_unthr)
3384{
3385 struct perf_event *event;
3386 struct hw_perf_event *hwc;
3387 u64 now, period = TICK_NSEC;
3388 s64 delta;
3389
3390
3391
3392
3393
3394
3395 if (!(ctx->nr_freq || needs_unthr))
3396 return;
3397
3398 raw_spin_lock(&ctx->lock);
3399 perf_pmu_disable(ctx->pmu);
3400
3401 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3402 if (event->state != PERF_EVENT_STATE_ACTIVE)
3403 continue;
3404
3405 if (!event_filter_match(event))
3406 continue;
3407
3408 perf_pmu_disable(event->pmu);
3409
3410 hwc = &event->hw;
3411
3412 if (hwc->interrupts == MAX_INTERRUPTS) {
3413 hwc->interrupts = 0;
3414 perf_log_throttle(event, 1);
3415 event->pmu->start(event, 0);
3416 }
3417
3418 if (!event->attr.freq || !event->attr.sample_freq)
3419 goto next;
3420
3421
3422
3423
3424 event->pmu->stop(event, PERF_EF_UPDATE);
3425
3426 now = local64_read(&event->count);
3427 delta = now - hwc->freq_count_stamp;
3428 hwc->freq_count_stamp = now;
3429
3430
3431
3432
3433
3434
3435
3436
3437 if (delta > 0)
3438 perf_adjust_period(event, period, delta, false);
3439
3440 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3441 next:
3442 perf_pmu_enable(event->pmu);
3443 }
3444
3445 perf_pmu_enable(ctx->pmu);
3446 raw_spin_unlock(&ctx->lock);
3447}
3448
/*
 * Round-robin the flexible groups so that each group periodically gets a
 * chance to run on an over-committed PMU.
 */
3452static void rotate_ctx(struct perf_event_context *ctx)
3453{
3454
3455
3456
3457
3458 if (!ctx->rotate_disable)
3459 list_rotate_left(&ctx->flexible_groups);
3460}
3461
3462static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3463{
3464 struct perf_event_context *ctx = NULL;
3465 int rotate = 0;
3466
3467 if (cpuctx->ctx.nr_events) {
3468 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3469 rotate = 1;
3470 }
3471
3472 ctx = cpuctx->task_ctx;
3473 if (ctx && ctx->nr_events) {
3474 if (ctx->nr_events != ctx->nr_active)
3475 rotate = 1;
3476 }
3477
3478 if (!rotate)
3479 goto done;
3480
3481 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3482 perf_pmu_disable(cpuctx->ctx.pmu);
3483
3484 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3485 if (ctx)
3486 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3487
3488 rotate_ctx(&cpuctx->ctx);
3489 if (ctx)
3490 rotate_ctx(ctx);
3491
3492 perf_event_sched_in(cpuctx, ctx, current);
3493
3494 perf_pmu_enable(cpuctx->ctx.pmu);
3495 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3496done:
3497
3498 return rotate;
3499}
3500
3501void perf_event_task_tick(void)
3502{
3503 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3504 struct perf_event_context *ctx, *tmp;
3505 int throttled;
3506
3507 WARN_ON(!irqs_disabled());
3508
3509 __this_cpu_inc(perf_throttled_seq);
3510 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3511 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3512
3513 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3514 perf_adjust_freq_unthr_context(ctx, throttled);
3515}
3516
3517static int event_enable_on_exec(struct perf_event *event,
3518 struct perf_event_context *ctx)
3519{
3520 if (!event->attr.enable_on_exec)
3521 return 0;
3522
3523 event->attr.enable_on_exec = 0;
3524 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3525 return 0;
3526
3527 __perf_event_mark_enabled(event);
3528
3529 return 1;
3530}
3531
/*
 * Enable all of current's events that have attr.enable_on_exec set; if any
 * were enabled, the context is uncloned and rescheduled so they start
 * counting in the new program.
 */
3536static void perf_event_enable_on_exec(int ctxn)
3537{
3538 struct perf_event_context *ctx, *clone_ctx = NULL;
3539 enum event_type_t event_type = 0;
3540 struct perf_cpu_context *cpuctx;
3541 struct perf_event *event;
3542 unsigned long flags;
3543 int enabled = 0;
3544
3545 local_irq_save(flags);
3546 ctx = current->perf_event_ctxp[ctxn];
3547 if (!ctx || !ctx->nr_events)
3548 goto out;
3549
3550 cpuctx = __get_cpu_context(ctx);
3551 perf_ctx_lock(cpuctx, ctx);
3552 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3553 list_for_each_entry(event, &ctx->event_list, event_entry) {
3554 enabled |= event_enable_on_exec(event, ctx);
3555 event_type |= get_event_type(event);
3556 }
3557
3558
3559
3560
3561 if (enabled) {
3562 clone_ctx = unclone_ctx(ctx);
3563 ctx_resched(cpuctx, ctx, event_type);
3564 } else {
3565 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
3566 }
3567 perf_ctx_unlock(cpuctx, ctx);
3568
3569out:
3570 local_irq_restore(flags);
3571
3572 if (clone_ctx)
3573 put_ctx(clone_ctx);
3574}
3575
3576struct perf_read_data {
3577 struct perf_event *event;
3578 bool group;
3579 int ret;
3580};
3581
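/*
 * Pick the CPU to read @event from: if the PMU can read the event from any
 * CPU in its package and the local CPU is in that package, read locally
 * rather than interrupting the event's own CPU.
 */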
3582static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3583{
3584 u16 local_pkg, event_pkg;
3585
3586 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3587 int local_cpu = smp_processor_id();
3588
3589 event_pkg = topology_physical_package_id(event_cpu);
3590 local_pkg = topology_physical_package_id(local_cpu);
3591
3592 if (event_pkg == local_pkg)
3593 return local_cpu;
3594 }
3595
3596 return event_cpu;
3597}
3598
/*
 * Cross-CPU call to read an event (and, for a group read, its siblings) on
 * the CPU it is currently scheduled on.
 */
3602static void __perf_event_read(void *info)
3603{
3604 struct perf_read_data *data = info;
3605 struct perf_event *sub, *event = data->event;
3606 struct perf_event_context *ctx = event->ctx;
3607 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3608 struct pmu *pmu = event->pmu;
3609
3610
3611
3612
3613
3614
3615
3616
3617 if (ctx->task && cpuctx->task_ctx != ctx)
3618 return;
3619
3620 raw_spin_lock(&ctx->lock);
3621 if (ctx->is_active) {
3622 update_context_time(ctx);
3623 update_cgrp_time_from_event(event);
3624 }
3625
3626 update_event_times(event);
3627 if (event->state != PERF_EVENT_STATE_ACTIVE)
3628 goto unlock;
3629
3630 if (!data->group) {
3631 pmu->read(event);
3632 data->ret = 0;
3633 goto unlock;
3634 }
3635
3636 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3637
3638 pmu->read(event);
3639
3640 list_for_each_entry(sub, &event->sibling_list, group_entry) {
3641 update_event_times(sub);
3642 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3643
3644
3645
3646
3647 sub->pmu->read(sub);
3648 }
3649 }
3650
3651 data->ret = pmu->commit_txn(pmu);
3652
3653unlock:
3654 raw_spin_unlock(&ctx->lock);
3655}
3656
3657static inline u64 perf_event_count(struct perf_event *event)
3658{
3659 if (event->pmu->count)
3660 return event->pmu->count(event);
3661
3662 return __perf_event_count(event);
3663}
3664
/*
 * NMI-safe method to read a local event, that is an event that:
 *  - is counting for the current task, or for this CPU;
 *  - does not have inherit set (inherited events are not guaranteed to be
 *    local and cannot be read atomically);
 *  - does not use a pmu::count method.
 */
3673int perf_event_read_local(struct perf_event *event, u64 *value)
3674{
3675 unsigned long flags;
3676 int ret = 0;
3677
3678
3679
3680
3681
3682 local_irq_save(flags);
3683
3684
3685
3686
3687
3688 if (event->attr.inherit) {
3689 ret = -EOPNOTSUPP;
3690 goto out;
3691 }
3692
3693
3694
3695
3696
3697 if (event->pmu->count) {
3698 ret = -EOPNOTSUPP;
3699 goto out;
3700 }
3701
3702
3703 if ((event->attach_state & PERF_ATTACH_TASK) &&
3704 event->hw.target != current) {
3705 ret = -EINVAL;
3706 goto out;
3707 }
3708
3709
3710 if (!(event->attach_state & PERF_ATTACH_TASK) &&
3711 event->cpu != smp_processor_id()) {
3712 ret = -EINVAL;
3713 goto out;
3714 }
3715
3716
3717
3718
3719
3720
3721 if (event->oncpu == smp_processor_id())
3722 event->pmu->read(event);
3723
3724 *value = local64_read(&event->count);
3725out:
3726 local_irq_restore(flags);
3727
3728 return ret;
3729}
3730
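/*
 * Read an event's (or, with @group, the whole group's) value: active events
 * are read via an IPI to the CPU they run on, inactive events merely get
 * their times brought up to date.
 */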
3731static int perf_event_read(struct perf_event *event, bool group)
3732{
3733 int event_cpu, ret = 0;
3734
3735
3736
3737
3738
3739 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3740 struct perf_read_data data = {
3741 .event = event,
3742 .group = group,
3743 .ret = 0,
3744 };
3745
3746 event_cpu = READ_ONCE(event->oncpu);
3747 if ((unsigned)event_cpu >= nr_cpu_ids)
3748 return 0;
3749
3750 preempt_disable();
3751 event_cpu = __perf_event_read_cpu(event, event_cpu);
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
3764 preempt_enable();
3765 ret = data.ret;
3766 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3767 struct perf_event_context *ctx = event->ctx;
3768 unsigned long flags;
3769
3770 raw_spin_lock_irqsave(&ctx->lock, flags);
3771
3772
3773
3774
3775
3776 if (ctx->is_active) {
3777 update_context_time(ctx);
3778 update_cgrp_time_from_event(event);
3779 }
3780 if (group)
3781 update_group_times(event);
3782 else
3783 update_event_times(event);
3784 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3785 }
3786
3787 return ret;
3788}
3789
/* Initialize a freshly allocated perf_event_context. */
3793static void __perf_event_init_context(struct perf_event_context *ctx)
3794{
3795 raw_spin_lock_init(&ctx->lock);
3796 mutex_init(&ctx->mutex);
3797 INIT_LIST_HEAD(&ctx->active_ctx_list);
3798 INIT_LIST_HEAD(&ctx->pinned_groups);
3799 INIT_LIST_HEAD(&ctx->flexible_groups);
3800 INIT_LIST_HEAD(&ctx->event_list);
3801 atomic_set(&ctx->refcount, 1);
3802}
3803
3804static struct perf_event_context *
3805alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3806{
3807 struct perf_event_context *ctx;
3808
3809 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3810 if (!ctx)
3811 return NULL;
3812
3813 __perf_event_init_context(ctx);
3814 if (task) {
3815 ctx->task = task;
3816 get_task_struct(task);
3817 }
3818 ctx->pmu = pmu;
3819
3820 return ctx;
3821}
3822
3823static struct task_struct *
3824find_lively_task_by_vpid(pid_t vpid)
3825{
3826 struct task_struct *task;
3827
3828 rcu_read_lock();
3829 if (!vpid)
3830 task = current;
3831 else
3832 task = find_task_by_vpid(vpid);
3833 if (task)
3834 get_task_struct(task);
3835 rcu_read_unlock();
3836
3837 if (!task)
3838 return ERR_PTR(-ESRCH);
3839
3840 return task;
3841}
3842
/*
 * Find (or allocate) the event context for @task, or the per-CPU context for
 * CPU-wide events, and return it with its pin_count raised so it cannot be
 * swapped out from under the caller.
 */
3846static struct perf_event_context *
3847find_get_context(struct pmu *pmu, struct task_struct *task,
3848 struct perf_event *event)
3849{
3850 struct perf_event_context *ctx, *clone_ctx = NULL;
3851 struct perf_cpu_context *cpuctx;
3852 void *task_ctx_data = NULL;
3853 unsigned long flags;
3854 int ctxn, err;
3855 int cpu = event->cpu;
3856
3857 if (!task) {
3858
3859 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3860 return ERR_PTR(-EACCES);
3861
3862 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3863 ctx = &cpuctx->ctx;
3864 get_ctx(ctx);
3865 ++ctx->pin_count;
3866
3867 return ctx;
3868 }
3869
3870 err = -EINVAL;
3871 ctxn = pmu->task_ctx_nr;
3872 if (ctxn < 0)
3873 goto errout;
3874
3875 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3876 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3877 if (!task_ctx_data) {
3878 err = -ENOMEM;
3879 goto errout;
3880 }
3881 }
3882
3883retry:
3884 ctx = perf_lock_task_context(task, ctxn, &flags);
3885 if (ctx) {
3886 clone_ctx = unclone_ctx(ctx);
3887 ++ctx->pin_count;
3888
3889 if (task_ctx_data && !ctx->task_ctx_data) {
3890 ctx->task_ctx_data = task_ctx_data;
3891 task_ctx_data = NULL;
3892 }
3893 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3894
3895 if (clone_ctx)
3896 put_ctx(clone_ctx);
3897 } else {
3898 ctx = alloc_perf_context(pmu, task);
3899 err = -ENOMEM;
3900 if (!ctx)
3901 goto errout;
3902
3903 if (task_ctx_data) {
3904 ctx->task_ctx_data = task_ctx_data;
3905 task_ctx_data = NULL;
3906 }
3907
3908 err = 0;
3909 mutex_lock(&task->perf_event_mutex);
3910
3911
3912
3913
3914 if (task->flags & PF_EXITING)
3915 err = -ESRCH;
3916 else if (task->perf_event_ctxp[ctxn])
3917 err = -EAGAIN;
3918 else {
3919 get_ctx(ctx);
3920 ++ctx->pin_count;
3921 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3922 }
3923 mutex_unlock(&task->perf_event_mutex);
3924
3925 if (unlikely(err)) {
3926 put_ctx(ctx);
3927
3928 if (err == -EAGAIN)
3929 goto retry;
3930 goto errout;
3931 }
3932 }
3933
3934 kfree(task_ctx_data);
3935 return ctx;
3936
3937errout:
3938 kfree(task_ctx_data);
3939 return ERR_PTR(err);
3940}
3941
3942static void perf_event_free_filter(struct perf_event *event);
3943static void perf_event_free_bpf_prog(struct perf_event *event);
3944
3945static void free_event_rcu(struct rcu_head *head)
3946{
3947 struct perf_event *event;
3948
3949 event = container_of(head, struct perf_event, rcu_head);
3950 if (event->ns)
3951 put_pid_ns(event->ns);
3952 perf_event_free_filter(event);
3953 kfree(event);
3954}
3955
3956static void ring_buffer_attach(struct perf_event *event,
3957 struct ring_buffer *rb);
3958
3959static void detach_sb_event(struct perf_event *event)
3960{
3961 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
3962
3963 raw_spin_lock(&pel->lock);
3964 list_del_rcu(&event->sb_list);
3965 raw_spin_unlock(&pel->lock);
3966}
3967
3968static bool is_sb_event(struct perf_event *event)
3969{
3970 struct perf_event_attr *attr = &event->attr;
3971
3972 if (event->parent)
3973 return false;
3974
3975 if (event->attach_state & PERF_ATTACH_TASK)
3976 return false;
3977
3978 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
3979 attr->comm || attr->comm_exec ||
3980 attr->task ||
3981 attr->context_switch)
3982 return true;
3983 return false;
3984}
3985
3986static void unaccount_pmu_sb_event(struct perf_event *event)
3987{
3988 if (is_sb_event(event))
3989 detach_sb_event(event);
3990}
3991
3992static void unaccount_event_cpu(struct perf_event *event, int cpu)
3993{
3994 if (event->parent)
3995 return;
3996
3997 if (is_cgroup_event(event))
3998 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3999}
4000
4001#ifdef CONFIG_NO_HZ_FULL
4002static DEFINE_SPINLOCK(nr_freq_lock);
4003#endif
4004
4005static void unaccount_freq_event_nohz(void)
4006{
4007#ifdef CONFIG_NO_HZ_FULL
4008 spin_lock(&nr_freq_lock);
4009 if (atomic_dec_and_test(&nr_freq_events))
4010 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4011 spin_unlock(&nr_freq_lock);
4012#endif
4013}
4014
4015static void unaccount_freq_event(void)
4016{
4017 if (tick_nohz_full_enabled())
4018 unaccount_freq_event_nohz();
4019 else
4020 atomic_dec(&nr_freq_events);
4021}
4022
4023static void unaccount_event(struct perf_event *event)
4024{
4025 bool dec = false;
4026
4027 if (event->parent)
4028 return;
4029
4030 if (event->attach_state & PERF_ATTACH_TASK)
4031 dec = true;
4032 if (event->attr.mmap || event->attr.mmap_data)
4033 atomic_dec(&nr_mmap_events);
4034 if (event->attr.comm)
4035 atomic_dec(&nr_comm_events);
4036 if (event->attr.namespaces)
4037 atomic_dec(&nr_namespaces_events);
4038 if (event->attr.task)
4039 atomic_dec(&nr_task_events);
4040 if (event->attr.freq)
4041 unaccount_freq_event();
4042 if (event->attr.context_switch) {
4043 dec = true;
4044 atomic_dec(&nr_switch_events);
4045 }
4046 if (is_cgroup_event(event))
4047 dec = true;
4048 if (has_branch_stack(event))
4049 dec = true;
4050
4051 if (dec) {
4052 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4053 schedule_delayed_work(&perf_sched_work, HZ);
4054 }
4055
4056 unaccount_event_cpu(event, event->cpu);
4057
4058 unaccount_pmu_sb_event(event);
4059}
4060
4061static void perf_sched_delayed(struct work_struct *work)
4062{
4063 mutex_lock(&perf_sched_mutex);
4064 if (atomic_dec_and_test(&perf_sched_count))
4065 static_branch_disable(&perf_sched_events);
4066 mutex_unlock(&perf_sched_mutex);
4067}
4068
/*
 * PMUs with PERF_PMU_CAP_EXCLUSIVE cannot mix per-task and per-CPU events:
 * pmu::exclusive_cnt counts per-task events as positive and per-CPU events
 * as negative, so whichever kind exists first keeps the other kind out.
 */
4081static int exclusive_event_init(struct perf_event *event)
4082{
4083 struct pmu *pmu = event->pmu;
4084
4085 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4086 return 0;
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101 if (event->attach_state & PERF_ATTACH_TASK) {
4102 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4103 return -EBUSY;
4104 } else {
4105 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4106 return -EBUSY;
4107 }
4108
4109 return 0;
4110}
4111
4112static void exclusive_event_destroy(struct perf_event *event)
4113{
4114 struct pmu *pmu = event->pmu;
4115
4116 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4117 return;
4118
4119
4120 if (event->attach_state & PERF_ATTACH_TASK)
4121 atomic_dec(&pmu->exclusive_cnt);
4122 else
4123 atomic_inc(&pmu->exclusive_cnt);
4124}
4125
4126static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4127{
4128 if ((e1->pmu == e2->pmu) &&
4129 (e1->cpu == e2->cpu ||
4130 e1->cpu == -1 ||
4131 e2->cpu == -1))
4132 return true;
4133 return false;
4134}
4135
4136
4137static bool exclusive_event_installable(struct perf_event *event,
4138 struct perf_event_context *ctx)
4139{
4140 struct perf_event *iter_event;
4141 struct pmu *pmu = event->pmu;
4142
4143 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4144 return true;
4145
4146 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4147 if (exclusive_event_match(iter_event, event))
4148 return false;
4149 }
4150
4151 return true;
4152}
4153
4154static void perf_addr_filters_splice(struct perf_event *event,
4155 struct list_head *head);
4156
4157static void _free_event(struct perf_event *event)
4158{
4159 irq_work_sync(&event->pending);
4160
4161 unaccount_event(event);
4162
4163 if (event->rb) {
4164
4165
4166
4167
4168
4169
4170 mutex_lock(&event->mmap_mutex);
4171 ring_buffer_attach(event, NULL);
4172 mutex_unlock(&event->mmap_mutex);
4173 }
4174
4175 if (is_cgroup_event(event))
4176 perf_detach_cgroup(event);
4177
4178 if (!event->parent) {
4179 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4180 put_callchain_buffers();
4181 }
4182
4183 perf_event_free_bpf_prog(event);
4184 perf_addr_filters_splice(event, NULL);
4185 kfree(event->addr_filters_offs);
4186
4187 if (event->destroy)
4188 event->destroy(event);
4189
4190 if (event->ctx)
4191 put_ctx(event->ctx);
4192
4193 exclusive_event_destroy(event);
4194 module_put(event->pmu->module);
4195
4196 call_rcu(&event->rcu_head, free_event_rcu);
4197}
4198
/*
 * Free an event that is known to hold exactly one reference, i.e. one that
 * has not yet been exposed through a file descriptor.
 */
4203static void free_event(struct perf_event *event)
4204{
4205 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4206 "unexpected event refcount: %ld; ptr=%p\n",
4207 atomic_long_read(&event->refcount), event)) {
4208
4209 return;
4210 }
4211
4212 _free_event(event);
4213}
4214
4215
4216
4217
4218static void perf_remove_from_owner(struct perf_event *event)
4219{
4220 struct task_struct *owner;
4221
4222 rcu_read_lock();
4223
4224
4225
4226
4227
4228
4229 owner = lockless_dereference(event->owner);
4230 if (owner) {
4231
4232
4233
4234
4235
4236 get_task_struct(owner);
4237 }
4238 rcu_read_unlock();
4239
4240 if (owner) {
4241
4242
4243
4244
4245
4246
4247
4248
4249 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4250
4251
4252
4253
4254
4255
4256
4257 if (event->owner) {
4258 list_del_init(&event->owner_entry);
4259 smp_store_release(&event->owner, NULL);
4260 }
4261 mutex_unlock(&owner->perf_event_mutex);
4262 put_task_struct(owner);
4263 }
4264}
4265
4266static void put_event(struct perf_event *event)
4267{
4268 if (!atomic_long_dec_and_test(&event->refcount))
4269 return;
4270
4271 _free_event(event);
4272}
4273
/*
 * Kill an event dead: detach it from its owner and context, mark it DEAD so
 * no new children can be cloned from it, then tear down all existing child
 * events before dropping the final reference.
 */
4279int perf_event_release_kernel(struct perf_event *event)
4280{
4281 struct perf_event_context *ctx = event->ctx;
4282 struct perf_event *child, *tmp;
4283
4284
4285
4286
4287
4288 if (!ctx) {
4289 WARN_ON_ONCE(event->attach_state &
4290 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4291 goto no_ctx;
4292 }
4293
4294 if (!is_kernel_event(event))
4295 perf_remove_from_owner(event);
4296
4297 ctx = perf_event_ctx_lock(event);
4298 WARN_ON_ONCE(ctx->parent_ctx);
4299 perf_remove_from_context(event, DETACH_GROUP);
4300
4301 raw_spin_lock_irq(&ctx->lock);
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313 event->state = PERF_EVENT_STATE_DEAD;
4314 raw_spin_unlock_irq(&ctx->lock);
4315
4316 perf_event_ctx_unlock(event, ctx);
4317
4318again:
4319 mutex_lock(&event->child_mutex);
4320 list_for_each_entry(child, &event->child_list, child_list) {
4321
4322
4323
4324
4325
4326 ctx = lockless_dereference(child->ctx);
4327
4328
4329
4330
4331
4332
4333
4334
4335 get_ctx(ctx);
4336
4337
4338
4339
4340
4341
4342 mutex_unlock(&event->child_mutex);
4343 mutex_lock(&ctx->mutex);
4344 mutex_lock(&event->child_mutex);
4345
4346
4347
4348
4349
4350
4351 tmp = list_first_entry_or_null(&event->child_list,
4352 struct perf_event, child_list);
4353 if (tmp == child) {
4354 perf_remove_from_context(child, DETACH_GROUP);
4355 list_del(&child->child_list);
4356 free_event(child);
4357
4358
4359
4360
4361 put_event(event);
4362 }
4363
4364 mutex_unlock(&event->child_mutex);
4365 mutex_unlock(&ctx->mutex);
4366 put_ctx(ctx);
4367 goto again;
4368 }
4369 mutex_unlock(&event->child_mutex);
4370
4371no_ctx:
4372 put_event(event);
4373 return 0;
4374}
4375EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4376
4377
4378
4379
4380static int perf_release(struct inode *inode, struct file *file)
4381{
4382 perf_event_release_kernel(file->private_data);
4383 return 0;
4384}
4385
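/*
 * Sum @event's count and enabled/running times with those of all of its
 * inherited child events, holding child_mutex across the walk.
 */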
4386u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4387{
4388 struct perf_event *child;
4389 u64 total = 0;
4390
4391 *enabled = 0;
4392 *running = 0;
4393
4394 mutex_lock(&event->child_mutex);
4395
4396 (void)perf_event_read(event, false);
4397 total += perf_event_count(event);
4398
4399 *enabled += event->total_time_enabled +
4400 atomic64_read(&event->child_total_time_enabled);
4401 *running += event->total_time_running +
4402 atomic64_read(&event->child_total_time_running);
4403
4404 list_for_each_entry(child, &event->child_list, child_list) {
4405 (void)perf_event_read(child, false);
4406 total += perf_event_count(child);
4407 *enabled += child->total_time_enabled;
4408 *running += child->total_time_running;
4409 }
4410 mutex_unlock(&event->child_mutex);
4411
4412 return total;
4413}
4414EXPORT_SYMBOL_GPL(perf_event_read_value);
4415
4416static int __perf_read_group_add(struct perf_event *leader,
4417 u64 read_format, u64 *values)
4418{
4419 struct perf_event_context *ctx = leader->ctx;
4420 struct perf_event *sub;
4421 unsigned long flags;
4422 int n = 1;
4423 int ret;
4424
4425 ret = perf_event_read(leader, true);
4426 if (ret)
4427 return ret;
4428
4429
4430
4431
4432
4433
4434 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4435 values[n++] += leader->total_time_enabled +
4436 atomic64_read(&leader->child_total_time_enabled);
4437 }
4438
4439 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4440 values[n++] += leader->total_time_running +
4441 atomic64_read(&leader->child_total_time_running);
4442 }
4443
4444
4445
4446
4447 values[n++] += perf_event_count(leader);
4448 if (read_format & PERF_FORMAT_ID)
4449 values[n++] = primary_event_id(leader);
4450
4451 raw_spin_lock_irqsave(&ctx->lock, flags);
4452
4453 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4454 values[n++] += perf_event_count(sub);
4455 if (read_format & PERF_FORMAT_ID)
4456 values[n++] = primary_event_id(sub);
4457 }
4458
4459 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4460 return 0;
4461}
4462
4463static int perf_read_group(struct perf_event *event,
4464 u64 read_format, char __user *buf)
4465{
4466 struct perf_event *leader = event->group_leader, *child;
4467 struct perf_event_context *ctx = leader->ctx;
4468 int ret;
4469 u64 *values;
4470
4471 lockdep_assert_held(&ctx->mutex);
4472
4473 values = kzalloc(event->read_size, GFP_KERNEL);
4474 if (!values)
4475 return -ENOMEM;
4476
4477 values[0] = 1 + leader->nr_siblings;
4478
4479
4480
4481
4482
4483 mutex_lock(&leader->child_mutex);
4484
4485 ret = __perf_read_group_add(leader, read_format, values);
4486 if (ret)
4487 goto unlock;
4488
4489 list_for_each_entry(child, &leader->child_list, child_list) {
4490 ret = __perf_read_group_add(child, read_format, values);
4491 if (ret)
4492 goto unlock;
4493 }
4494
4495 mutex_unlock(&leader->child_mutex);
4496
4497 ret = event->read_size;
4498 if (copy_to_user(buf, values, event->read_size))
4499 ret = -EFAULT;
4500 goto out;
4501
4502unlock:
4503 mutex_unlock(&leader->child_mutex);
4504out:
4505 kfree(values);
4506 return ret;
4507}
4508
4509static int perf_read_one(struct perf_event *event,
4510 u64 read_format, char __user *buf)
4511{
4512 u64 enabled, running;
4513 u64 values[4];
4514 int n = 0;
4515
4516 values[n++] = perf_event_read_value(event, &enabled, &running);
4517 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4518 values[n++] = enabled;
4519 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4520 values[n++] = running;
4521 if (read_format & PERF_FORMAT_ID)
4522 values[n++] = primary_event_id(event);
4523
4524 if (copy_to_user(buf, values, n * sizeof(u64)))
4525 return -EFAULT;
4526
4527 return n * sizeof(u64);
4528}
4529
4530static bool is_event_hup(struct perf_event *event)
4531{
4532 bool no_children;
4533
4534 if (event->state > PERF_EVENT_STATE_EXIT)
4535 return false;
4536
4537 mutex_lock(&event->child_mutex);
4538 no_children = list_empty(&event->child_list);
4539 mutex_unlock(&event->child_mutex);
4540 return no_children;
4541}
4542
4543
4544
4545
4546static ssize_t
4547__perf_read(struct perf_event *event, char __user *buf, size_t count)
4548{
4549 u64 read_format = event->attr.read_format;
4550 int ret;
4551
4552
4553
4554
4555
4556
4557 if (event->state == PERF_EVENT_STATE_ERROR)
4558 return 0;
4559
4560 if (count < event->read_size)
4561 return -ENOSPC;
4562
4563 WARN_ON_ONCE(event->ctx->parent_ctx);
4564 if (read_format & PERF_FORMAT_GROUP)
4565 ret = perf_read_group(event, read_format, buf);
4566 else
4567 ret = perf_read_one(event, read_format, buf);
4568
4569 return ret;
4570}
4571
4572static ssize_t
4573perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4574{
4575 struct perf_event *event = file->private_data;
4576 struct perf_event_context *ctx;
4577 int ret;
4578
4579 ctx = perf_event_ctx_lock(event);
4580 ret = __perf_read(event, buf, count);
4581 perf_event_ctx_unlock(event, ctx);
4582
4583 return ret;
4584}
4585
4586static unsigned int perf_poll(struct file *file, poll_table *wait)
4587{
4588 struct perf_event *event = file->private_data;
4589 struct ring_buffer *rb;
4590 unsigned int events = POLLHUP;
4591
4592 poll_wait(file, &event->waitq, wait);
4593
4594 if (is_event_hup(event))
4595 return events;
4596
4597
4598
4599
4600
4601 mutex_lock(&event->mmap_mutex);
4602 rb = event->rb;
4603 if (rb)
4604 events = atomic_xchg(&rb->poll, 0);
4605 mutex_unlock(&event->mmap_mutex);
4606 return events;
4607}
4608
4609static void _perf_event_reset(struct perf_event *event)
4610{
4611 (void)perf_event_read(event, false);
4612 local64_set(&event->count, 0);
4613 perf_event_update_userpage(event);
4614}
4615
4616
4617
4618
4619
4620
4621
4622static void perf_event_for_each_child(struct perf_event *event,
4623 void (*func)(struct perf_event *))
4624{
4625 struct perf_event *child;
4626
4627 WARN_ON_ONCE(event->ctx->parent_ctx);
4628
4629 mutex_lock(&event->child_mutex);
4630 func(event);
4631 list_for_each_entry(child, &event->child_list, child_list)
4632 func(child);
4633 mutex_unlock(&event->child_mutex);
4634}
4635
4636static void perf_event_for_each(struct perf_event *event,
4637 void (*func)(struct perf_event *))
4638{
4639 struct perf_event_context *ctx = event->ctx;
4640 struct perf_event *sibling;
4641
4642 lockdep_assert_held(&ctx->mutex);
4643
4644 event = event->group_leader;
4645
4646 perf_event_for_each_child(event, func);
4647 list_for_each_entry(sibling, &event->sibling_list, group_entry)
4648 perf_event_for_each_child(sibling, func);
4649}
4650
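/*
 * Update an event's sample period or frequency from the target context
 * (invoked through event_function_call()); an active event is briefly
 * stopped so the new period takes effect immediately.
 */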
4651static void __perf_event_period(struct perf_event *event,
4652 struct perf_cpu_context *cpuctx,
4653 struct perf_event_context *ctx,
4654 void *info)
4655{
4656 u64 value = *((u64 *)info);
4657 bool active;
4658
4659 if (event->attr.freq) {
4660 event->attr.sample_freq = value;
4661 } else {
4662 event->attr.sample_period = value;
4663 event->hw.sample_period = value;
4664 }
4665
4666 active = (event->state == PERF_EVENT_STATE_ACTIVE);
4667 if (active) {
4668 perf_pmu_disable(ctx->pmu);
4669
4670
4671
4672
4673 if (event->hw.interrupts == MAX_INTERRUPTS) {
4674 event->hw.interrupts = 0;
4675 perf_log_throttle(event, 1);
4676 }
4677 event->pmu->stop(event, PERF_EF_UPDATE);
4678 }
4679
4680 local64_set(&event->hw.period_left, 0);
4681
4682 if (active) {
4683 event->pmu->start(event, PERF_EF_RELOAD);
4684 perf_pmu_enable(ctx->pmu);
4685 }
4686}
4687
4688static int perf_event_period(struct perf_event *event, u64 __user *arg)
4689{
4690 u64 value;
4691
4692 if (!is_sampling_event(event))
4693 return -EINVAL;
4694
4695 if (copy_from_user(&value, arg, sizeof(value)))
4696 return -EFAULT;
4697
4698 if (!value)
4699 return -EINVAL;
4700
4701 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4702 return -EINVAL;
4703
4704 event_function_call(event, __perf_event_period, &value);
4705
4706 return 0;
4707}
4708
4709static const struct file_operations perf_fops;
4710
4711static inline int perf_fget_light(int fd, struct fd *p)
4712{
4713 struct fd f = fdget(fd);
4714 if (!f.file)
4715 return -EBADF;
4716
4717 if (f.file->f_op != &perf_fops) {
4718 fdput(f);
4719 return -EBADF;
4720 }
4721 *p = f;
4722 return 0;
4723}
4724
4725static int perf_event_set_output(struct perf_event *event,
4726 struct perf_event *output_event);
4727static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4728static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4729
4730static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4731{
4732 void (*func)(struct perf_event *);
4733 u32 flags = arg;
4734
4735 switch (cmd) {
4736 case PERF_EVENT_IOC_ENABLE:
4737 func = _perf_event_enable;
4738 break;
4739 case PERF_EVENT_IOC_DISABLE:
4740 func = _perf_event_disable;
4741 break;
4742 case PERF_EVENT_IOC_RESET:
4743 func = _perf_event_reset;
4744 break;
4745
4746 case PERF_EVENT_IOC_REFRESH:
4747 return _perf_event_refresh(event, arg);
4748
4749 case PERF_EVENT_IOC_PERIOD:
4750 return perf_event_period(event, (u64 __user *)arg);
4751
4752 case PERF_EVENT_IOC_ID:
4753 {
4754 u64 id = primary_event_id(event);
4755
4756 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4757 return -EFAULT;
4758 return 0;
4759 }
4760
4761 case PERF_EVENT_IOC_SET_OUTPUT:
4762 {
4763 int ret;
4764 if (arg != -1) {
4765 struct perf_event *output_event;
4766 struct fd output;
4767 ret = perf_fget_light(arg, &output);
4768 if (ret)
4769 return ret;
4770 output_event = output.file->private_data;
4771 ret = perf_event_set_output(event, output_event);
4772 fdput(output);
4773 } else {
4774 ret = perf_event_set_output(event, NULL);
4775 }
4776 return ret;
4777 }
4778
4779 case PERF_EVENT_IOC_SET_FILTER:
4780 return perf_event_set_filter(event, (void __user *)arg);
4781
4782 case PERF_EVENT_IOC_SET_BPF:
4783 return perf_event_set_bpf_prog(event, arg);
4784
4785 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
4786 struct ring_buffer *rb;
4787
4788 rcu_read_lock();
4789 rb = rcu_dereference(event->rb);
4790 if (!rb || !rb->nr_pages) {
4791 rcu_read_unlock();
4792 return -EINVAL;
4793 }
4794 rb_toggle_paused(rb, !!arg);
4795 rcu_read_unlock();
4796 return 0;
4797 }
4798 default:
4799 return -ENOTTY;
4800 }
4801
4802 if (flags & PERF_IOC_FLAG_GROUP)
4803 perf_event_for_each(event, func);
4804 else
4805 perf_event_for_each_child(event, func);
4806
4807 return 0;
4808}
4809
4810static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4811{
4812 struct perf_event *event = file->private_data;
4813 struct perf_event_context *ctx;
4814 long ret;
4815
4816 ctx = perf_event_ctx_lock(event);
4817 ret = _perf_ioctl(event, cmd, arg);
4818 perf_event_ctx_unlock(event, ctx);
4819
4820 return ret;
4821}
4822
4823#ifdef CONFIG_COMPAT
4824static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4825 unsigned long arg)
4826{
4827 switch (_IOC_NR(cmd)) {
4828 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4829 case _IOC_NR(PERF_EVENT_IOC_ID):
4830
4831 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4832 cmd &= ~IOCSIZE_MASK;
4833 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4834 }
4835 break;
4836 }
4837 return perf_ioctl(file, cmd, arg);
4838}
4839#else
4840# define perf_compat_ioctl NULL
4841#endif
4842
4843int perf_event_task_enable(void)
4844{
4845 struct perf_event_context *ctx;
4846 struct perf_event *event;
4847
	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4850 ctx = perf_event_ctx_lock(event);
4851 perf_event_for_each_child(event, _perf_event_enable);
4852 perf_event_ctx_unlock(event, ctx);
4853 }
	mutex_unlock(&current->perf_event_mutex);
4855
4856 return 0;
4857}
4858
4859int perf_event_task_disable(void)
4860{
4861 struct perf_event_context *ctx;
4862 struct perf_event *event;
4863
	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4866 ctx = perf_event_ctx_lock(event);
4867 perf_event_for_each_child(event, _perf_event_disable);
4868 perf_event_ctx_unlock(event, ctx);
4869 }
	mutex_unlock(&current->perf_event_mutex);
4871
4872 return 0;
4873}
4874
4875static int perf_event_index(struct perf_event *event)
4876{
4877 if (event->hw.state & PERF_HES_STOPPED)
4878 return 0;
4879
4880 if (event->state != PERF_EVENT_STATE_ACTIVE)
4881 return 0;
4882
4883 return event->pmu->event_idx(event);
4884}
4885
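/*
 * Derive current enabled/running times from the event's shadow context time;
 * used when refreshing the self-monitoring user page.
 */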
4886static void calc_timer_values(struct perf_event *event,
4887 u64 *now,
4888 u64 *enabled,
4889 u64 *running)
4890{
4891 u64 ctx_time;
4892
4893 *now = perf_clock();
4894 ctx_time = event->shadow_ctx_time + *now;
4895 *enabled = ctx_time - event->tstamp_enabled;
4896 *running = ctx_time - event->tstamp_running;
4897}
4898
4899static void perf_event_init_userpage(struct perf_event *event)
4900{
4901 struct perf_event_mmap_page *userpg;
4902 struct ring_buffer *rb;
4903
4904 rcu_read_lock();
4905 rb = rcu_dereference(event->rb);
4906 if (!rb)
4907 goto unlock;
4908
4909 userpg = rb->user_page;
4910
4911
4912 userpg->cap_bit0_is_deprecated = 1;
4913 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4914 userpg->data_offset = PAGE_SIZE;
4915 userpg->data_size = perf_data_size(rb);
4916
4917unlock:
4918 rcu_read_unlock();
4919}
4920
4921void __weak arch_perf_update_userpage(
4922 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4923{
4924}
4925
/*
 * Update the mmap()ed user page with the event's current index, count and
 * times so that self-monitoring tasks can read the counter without a system
 * call; the lock/seqcount-style increments let readers detect a concurrent
 * update.
 */
4931void perf_event_update_userpage(struct perf_event *event)
4932{
4933 struct perf_event_mmap_page *userpg;
4934 struct ring_buffer *rb;
4935 u64 enabled, running, now;
4936
4937 rcu_read_lock();
4938 rb = rcu_dereference(event->rb);
4939 if (!rb)
4940 goto unlock;
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951 calc_timer_values(event, &now, &enabled, &running);
4952
4953 userpg = rb->user_page;
4954
4955
4956
4957
4958 preempt_disable();
4959 ++userpg->lock;
4960 barrier();
4961 userpg->index = perf_event_index(event);
4962 userpg->offset = perf_event_count(event);
4963 if (userpg->index)
4964 userpg->offset -= local64_read(&event->hw.prev_count);
4965
4966 userpg->time_enabled = enabled +
4967 atomic64_read(&event->child_total_time_enabled);
4968
4969 userpg->time_running = running +
4970 atomic64_read(&event->child_total_time_running);
4971
4972 arch_perf_update_userpage(event, userpg, now);
4973
4974 barrier();
4975 ++userpg->lock;
4976 preempt_enable();
4977unlock:
4978 rcu_read_unlock();
4979}
4980
4981static int perf_mmap_fault(struct vm_fault *vmf)
4982{
4983 struct perf_event *event = vmf->vma->vm_file->private_data;
4984 struct ring_buffer *rb;
4985 int ret = VM_FAULT_SIGBUS;
4986
4987 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4988 if (vmf->pgoff == 0)
4989 ret = 0;
4990 return ret;
4991 }
4992
4993 rcu_read_lock();
4994 rb = rcu_dereference(event->rb);
4995 if (!rb)
4996 goto unlock;
4997
4998 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4999 goto unlock;
5000
5001 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5002 if (!vmf->page)
5003 goto unlock;
5004
5005 get_page(vmf->page);
5006 vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5007 vmf->page->index = vmf->pgoff;
5008
5009 ret = 0;
5010unlock:
5011 rcu_read_unlock();
5012
5013 return ret;
5014}
5015
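/*
 * Attach @rb to @event (or detach, when @rb is NULL): unlink the event from
 * its old buffer's event list, link it into the new one, and wake up anyone
 * still waiting on the old buffer.
 */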
5016static void ring_buffer_attach(struct perf_event *event,
5017 struct ring_buffer *rb)
5018{
5019 struct ring_buffer *old_rb = NULL;
5020 unsigned long flags;
5021
5022 if (event->rb) {
5023
5024
5025
5026
5027 WARN_ON_ONCE(event->rcu_pending);
5028
5029 old_rb = event->rb;
5030 spin_lock_irqsave(&old_rb->event_lock, flags);
5031 list_del_rcu(&event->rb_entry);
5032 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5033
5034 event->rcu_batches = get_state_synchronize_rcu();
5035 event->rcu_pending = 1;
5036 }
5037
5038 if (rb) {
5039 if (event->rcu_pending) {
5040 cond_synchronize_rcu(event->rcu_batches);
5041 event->rcu_pending = 0;
5042 }
5043
5044 spin_lock_irqsave(&rb->event_lock, flags);
5045 list_add_rcu(&event->rb_entry, &rb->event_list);
5046 spin_unlock_irqrestore(&rb->event_lock, flags);
5047 }
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059 if (has_aux(event))
5060 perf_event_stop(event, 0);
5061
5062 rcu_assign_pointer(event->rb, rb);
5063
5064 if (old_rb) {
5065 ring_buffer_put(old_rb);
5066
5067
5068
5069
5070
5071 wake_up_all(&event->waitq);
5072 }
5073}
5074
5075static void ring_buffer_wakeup(struct perf_event *event)
5076{
5077 struct ring_buffer *rb;
5078
5079 rcu_read_lock();
5080 rb = rcu_dereference(event->rb);
5081 if (rb) {
5082 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5083 wake_up_all(&event->waitq);
5084 }
5085 rcu_read_unlock();
5086}
5087
5088struct ring_buffer *ring_buffer_get(struct perf_event *event)
5089{
5090 struct ring_buffer *rb;
5091
5092 rcu_read_lock();
5093 rb = rcu_dereference(event->rb);
5094 if (rb) {
5095 if (!atomic_inc_not_zero(&rb->refcount))
5096 rb = NULL;
5097 }
5098 rcu_read_unlock();
5099
5100 return rb;
5101}
5102
5103void ring_buffer_put(struct ring_buffer *rb)
5104{
5105 if (!atomic_dec_and_test(&rb->refcount))
5106 return;
5107
5108 WARN_ON_ONCE(!list_empty(&rb->event_list));
5109
5110 call_rcu(&rb->rcu_head, rb_free_rcu);
5111}
5112
5113static void perf_mmap_open(struct vm_area_struct *vma)
5114{
5115 struct perf_event *event = vma->vm_file->private_data;
5116
5117 atomic_inc(&event->mmap_count);
5118 atomic_inc(&event->rb->mmap_count);
5119
5120 if (vma->vm_pgoff)
5121 atomic_inc(&event->rb->aux_mmap_count);
5122
5123 if (event->pmu->event_mapped)
5124 event->pmu->event_mapped(event, vma->vm_mm);
5125}
5126
5127static void perf_pmu_output_stop(struct perf_event *event);
5128
/*
 * A ring buffer can be mapped many times, either through the same event or
 * through other events via PERF_EVENT_IOC_SET_OUTPUT. The mlock and VM
 * accounting done in perf_mmap() can only be undone when the last mapping
 * disappears, so the final close has to detach the buffer from every event
 * that still points at it before releasing the accounting.
 */
5137static void perf_mmap_close(struct vm_area_struct *vma)
5138{
5139 struct perf_event *event = vma->vm_file->private_data;
5140
5141 struct ring_buffer *rb = ring_buffer_get(event);
5142 struct user_struct *mmap_user = rb->mmap_user;
5143 int mmap_locked = rb->mmap_locked;
5144 unsigned long size = perf_data_size(rb);
5145
5146 if (event->pmu->event_unmapped)
5147 event->pmu->event_unmapped(event, vma->vm_mm);
5148
5149
5150
5151
5152
5153
5154 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5155 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5156
5157
5158
5159
5160
5161
5162 perf_pmu_output_stop(event);
5163
5164
5165 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5166 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5167
5168
5169 rb_free_aux(rb);
5170 WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5171
5172 mutex_unlock(&event->mmap_mutex);
5173 }
5174
5175 atomic_dec(&rb->mmap_count);
5176
5177 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5178 goto out_put;
5179
5180 ring_buffer_attach(event, NULL);
5181 mutex_unlock(&event->mmap_mutex);
5182
5183
5184 if (atomic_read(&rb->mmap_count))
5185 goto out_put;
5186
5187
5188
5189
5190
5191
5192again:
5193 rcu_read_lock();
5194 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5195 if (!atomic_long_inc_not_zero(&event->refcount)) {
5196
5197
5198
5199
5200 continue;
5201 }
5202 rcu_read_unlock();
5203
5204 mutex_lock(&event->mmap_mutex);
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215 if (event->rb == rb)
5216 ring_buffer_attach(event, NULL);
5217
5218 mutex_unlock(&event->mmap_mutex);
5219 put_event(event);
5220
5221
5222
5223
5224
5225 goto again;
5226 }
5227 rcu_read_unlock();
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5239 vma->vm_mm->pinned_vm -= mmap_locked;
5240 free_uid(mmap_user);
5241
5242out_put:
5243 ring_buffer_put(rb);
5244}
5245
5246static const struct vm_operations_struct perf_mmap_vmops = {
5247 .open = perf_mmap_open,
5248 .close = perf_mmap_close,
5249 .fault = perf_mmap_fault,
5250 .page_mkwrite = perf_mmap_fault,
5251};
5252
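/*
 * mmap() handler: page offset 0 maps the user control page plus the data
 * ring buffer, a non-zero offset maps the AUX area; both allocations are
 * charged against the perf_event_mlock and RLIMIT_MEMLOCK budgets.
 */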
5253static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5254{
5255 struct perf_event *event = file->private_data;
5256 unsigned long user_locked, user_lock_limit;
5257 struct user_struct *user = current_user();
5258 unsigned long locked, lock_limit;
5259 struct ring_buffer *rb = NULL;
5260 unsigned long vma_size;
5261 unsigned long nr_pages;
5262 long user_extra = 0, extra = 0;
5263 int ret = 0, flags = 0;
5264
5265
5266
5267
5268
5269
5270 if (event->cpu == -1 && event->attr.inherit)
5271 return -EINVAL;
5272
5273 if (!(vma->vm_flags & VM_SHARED))
5274 return -EINVAL;
5275
5276 vma_size = vma->vm_end - vma->vm_start;
5277
5278 if (vma->vm_pgoff == 0) {
5279 nr_pages = (vma_size / PAGE_SIZE) - 1;
5280 } else {
5281
5282
5283
5284
5285
5286 u64 aux_offset, aux_size;
5287
5288 if (!event->rb)
5289 return -EINVAL;
5290
5291 nr_pages = vma_size / PAGE_SIZE;
5292
5293 mutex_lock(&event->mmap_mutex);
5294 ret = -EINVAL;
5295
5296 rb = event->rb;
5297 if (!rb)
5298 goto aux_unlock;
5299
5300 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
5301 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
5302
5303 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5304 goto aux_unlock;
5305
5306 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5307 goto aux_unlock;
5308
5309
5310 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5311 goto aux_unlock;
5312
5313 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5314 goto aux_unlock;
5315
5316
5317 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5318 goto aux_unlock;
5319
5320 if (!is_power_of_2(nr_pages))
5321 goto aux_unlock;
5322
5323 if (!atomic_inc_not_zero(&rb->mmap_count))
5324 goto aux_unlock;
5325
5326 if (rb_has_aux(rb)) {
5327 atomic_inc(&rb->aux_mmap_count);
5328 ret = 0;
5329 goto unlock;
5330 }
5331
5332 atomic_set(&rb->aux_mmap_count, 1);
5333 user_extra = nr_pages;
5334
5335 goto accounting;
5336 }
5337
5338
5339
5340
5341
5342 if (nr_pages != 0 && !is_power_of_2(nr_pages))
5343 return -EINVAL;
5344
5345 if (vma_size != PAGE_SIZE * (1 + nr_pages))
5346 return -EINVAL;
5347
5348 WARN_ON_ONCE(event->ctx->parent_ctx);
5349again:
5350 mutex_lock(&event->mmap_mutex);
5351 if (event->rb) {
5352 if (event->rb->nr_pages != nr_pages) {
5353 ret = -EINVAL;
5354 goto unlock;
5355 }
5356
5357 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5358
5359
5360
5361
5362
5363 mutex_unlock(&event->mmap_mutex);
5364 goto again;
5365 }
5366
5367 goto unlock;
5368 }
5369
5370 user_extra = nr_pages + 1;
5371
5372accounting:
5373 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5374
5375
5376
5377
5378 user_lock_limit *= num_online_cpus();
5379
5380 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5381
5382 if (user_locked > user_lock_limit)
5383 extra = user_locked - user_lock_limit;
5384
5385 lock_limit = rlimit(RLIMIT_MEMLOCK);
5386 lock_limit >>= PAGE_SHIFT;
5387 locked = vma->vm_mm->pinned_vm + extra;
5388
5389 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5390 !capable(CAP_IPC_LOCK)) {
5391 ret = -EPERM;
5392 goto unlock;
5393 }
5394
5395 WARN_ON(!rb && event->rb);
5396
5397 if (vma->vm_flags & VM_WRITE)
5398 flags |= RING_BUFFER_WRITABLE;
5399
5400 if (!rb) {
5401 rb = rb_alloc(nr_pages,
5402 event->attr.watermark ? event->attr.wakeup_watermark : 0,
5403 event->cpu, flags);
5404
5405 if (!rb) {
5406 ret = -ENOMEM;
5407 goto unlock;
5408 }
5409
5410 atomic_set(&rb->mmap_count, 1);
5411 rb->mmap_user = get_current_user();
5412 rb->mmap_locked = extra;
5413
5414 ring_buffer_attach(event, rb);
5415
5416 perf_event_init_userpage(event);
5417 perf_event_update_userpage(event);
5418 } else {
5419 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5420 event->attr.aux_watermark, flags);
5421 if (!ret)
5422 rb->aux_mmap_locked = extra;
5423 }
5424
5425unlock:
5426 if (!ret) {
5427 atomic_long_add(user_extra, &user->locked_vm);
5428 vma->vm_mm->pinned_vm += extra;
5429
5430 atomic_inc(&event->mmap_count);
5431 } else if (rb) {
5432 atomic_dec(&rb->mmap_count);
5433 }
5434aux_unlock:
5435 mutex_unlock(&event->mmap_mutex);
5436
5437
5438
5439
5440
5441 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5442 vma->vm_ops = &perf_mmap_vmops;
5443
5444 if (event->pmu->event_mapped)
5445 event->pmu->event_mapped(event, vma->vm_mm);
5446
5447 return ret;
5448}
5449
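/*
 * Set up SIGIO/fasync notification on the perf fd; fasync_helper()
 * is serialized against other fasync updates via the inode lock.
 */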
5450static int perf_fasync(int fd, struct file *filp, int on)
5451{
5452 struct inode *inode = file_inode(filp);
5453 struct perf_event *event = filp->private_data;
5454 int retval;
5455
5456 inode_lock(inode);
5457 retval = fasync_helper(fd, filp, on, &event->fasync);
5458 inode_unlock(inode);
5459
5460 if (retval < 0)
5461 return retval;
5462
5463 return 0;
5464}
5465
5466static const struct file_operations perf_fops = {
5467 .llseek = no_llseek,
5468 .release = perf_release,
5469 .read = perf_read,
5470 .poll = perf_poll,
5471 .unlocked_ioctl = perf_ioctl,
5472 .compat_ioctl = perf_compat_ioctl,
5473 .mmap = perf_mmap,
5474 .fasync = perf_fasync,
5475};
5476
/*
 * Perf event wakeup
 *
 * Wakeups and SIGIO are delivered through the parent event's fasync
 * state for inherited events.
 */
5484static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5485{
 /* only the parent has fasync state */
5487 if (event->parent)
5488 event = event->parent;
5489 return &event->fasync;
5490}
5491
5492void perf_event_wakeup(struct perf_event *event)
5493{
5494 ring_buffer_wakeup(event);
5495
5496 if (event->pending_kill) {
5497 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5498 event->pending_kill = 0;
5499 }
5500}
5501
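/*
 * irq_work handler for event->pending: performs the disable and/or
 * wakeup that was deferred from a context that could not do it
 * directly.
 */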
5502static void perf_pending_event(struct irq_work *entry)
5503{
5504 struct perf_event *event = container_of(entry,
5505 struct perf_event, pending);
5506 int rctx;
5507
5508 rctx = perf_swevent_get_recursion_context();
5509
 /*
  * If we failed to get a recursion context, that is fine: it means
  * recursion is already disabled on this CPU and we will not recurse
  * any further.
  */
5514 if (event->pending_disable) {
5515 event->pending_disable = 0;
5516 perf_event_disable_local(event);
5517 }
5518
5519 if (event->pending_wakeup) {
5520 event->pending_wakeup = 0;
5521 perf_event_wakeup(event);
5522 }
5523
5524 if (rctx >= 0)
5525 perf_swevent_put_recursion_context(rctx);
5526}
5527
5528
/*
 * Guest callbacks: only a single set of callbacks can be registered
 * at a time.
 */
5533struct perf_guest_info_callbacks *perf_guest_cbs;
5534
5535int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5536{
5537 perf_guest_cbs = cbs;
5538 return 0;
5539}
5540EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5541
5542int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5543{
5544 perf_guest_cbs = NULL;
5545 return 0;
5546}
5547EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5548
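/*
 * Emit the registers selected by @mask, in ascending bit order,
 * into the output handle.
 */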
5549static void
5550perf_output_sample_regs(struct perf_output_handle *handle,
5551 struct pt_regs *regs, u64 mask)
5552{
5553 int bit;
5554 DECLARE_BITMAP(_mask, 64);
5555
5556 bitmap_from_u64(_mask, mask);
5557 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5558 u64 val;
5559
5560 val = perf_reg_value(regs, bit);
5561 perf_output_put(handle, val);
5562 }
5563}
5564
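/*
 * Pick the user-level registers for a sample: use @regs directly if
 * the interrupt hit user mode, fall back to perf_get_regs_user() for
 * a kernel-mode hit in a task with an mm, otherwise report no regs.
 */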
5565static void perf_sample_regs_user(struct perf_regs *regs_user,
5566 struct pt_regs *regs,
5567 struct pt_regs *regs_user_copy)
5568{
5569 if (user_mode(regs)) {
5570 regs_user->abi = perf_reg_abi(current);
5571 regs_user->regs = regs;
5572 } else if (current->mm) {
5573 perf_get_regs_user(regs_user, regs, regs_user_copy);
5574 } else {
5575 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5576 regs_user->regs = NULL;
5577 }
5578}
5579
5580static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5581 struct pt_regs *regs)
5582{
5583 regs_intr->regs = regs;
5584 regs_intr->abi = perf_reg_abi(current);
5585}
5586
/*
 * Return the amount of user stack below the current user stack
 * pointer, or 0 when there is no usable stack pointer. This bounds
 * how much of the user stack can be dumped.
 */
5595static u64 perf_ustack_task_size(struct pt_regs *regs)
5596{
5597 unsigned long addr = perf_user_stack_pointer(regs);
5598
5599 if (!addr || addr >= TASK_SIZE)
5600 return 0;
5601
5602 return TASK_SIZE - addr;
5603}
5604
5605static u16
5606perf_sample_ustack_size(u16 stack_size, u16 header_size,
5607 struct pt_regs *regs)
5608{
5609 u64 task_size;
5610
 /* No regs, no stack pointer, no dump. */
5612 if (!regs)
5613 return 0;
5614
 /*
  * Clamp the requested dump size so that it fits both within the
  * remaining user stack (TASK_SIZE - sp) and within what a u16
  * sized sample field can describe.
  */
5625 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5626 stack_size = min(stack_size, (u16) task_size);
5627
 /* Account for the static-size and dynamic-size u64 fields. */
5629 header_size += 2 * sizeof(u64);
5630
 /* Does the whole dump still fit into a u16 sized header? */
5632 if ((u16) (header_size + stack_size) < header_size) {
 /*
  * The u16 header size would overflow; trim the dump so the
  * whole sample fits, keeping it u64 aligned.
  */
5637 stack_size = USHRT_MAX - header_size - sizeof(u64);
5638 stack_size = round_up(stack_size, sizeof(u64));
5639 }
5640
5641 return stack_size;
5642}
5643
5644static void
5645perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5646 struct pt_regs *regs)
5647{
 /* No user regs (e.g. a kernel thread): emit a zero-sized dump. */
5649 if (!regs) {
5650 u64 size = 0;
5651 perf_output_put(handle, size);
5652 } else {
5653 unsigned long sp;
5654 unsigned int rem;
5655 u64 dyn_size;
5656
 /*
  * The dump layout is:
  *   static size  - the size reserved in the sample
  *   data         - the user stack contents
  *   dynamic size - how many bytes were actually copied
  */

 /* Static size. */
5669 perf_output_put(handle, dump_size);
5670
 /* Data: copy the user stack; skip whatever could not be read. */
5672 sp = perf_user_stack_pointer(regs);
5673 rem = __output_copy_user(handle, (void *) sp, dump_size);
5674 dyn_size = dump_size - rem;
5675
5676 perf_output_skip(handle, rem);
5677
 /* Dynamic size: the amount actually dumped. */
5679 perf_output_put(handle, dyn_size);
5680 }
5681}
5682
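/*
 * Fill in the sample_id fields (TID, TIME, ID, STREAM_ID, CPU)
 * selected by the event's sample_type and grow the header size by
 * the id header size.
 */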
5683static void __perf_event_header__init_id(struct perf_event_header *header,
5684 struct perf_sample_data *data,
5685 struct perf_event *event)
5686{
5687 u64 sample_type = event->attr.sample_type;
5688
5689 data->type = sample_type;
5690 header->size += event->id_header_size;
5691
5692 if (sample_type & PERF_SAMPLE_TID) {
5693
5694 data->tid_entry.pid = perf_event_pid(event, current);
5695 data->tid_entry.tid = perf_event_tid(event, current);
5696 }
5697
5698 if (sample_type & PERF_SAMPLE_TIME)
5699 data->time = perf_event_clock(event);
5700
5701 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5702 data->id = primary_event_id(event);
5703
5704 if (sample_type & PERF_SAMPLE_STREAM_ID)
5705 data->stream_id = event->id;
5706
5707 if (sample_type & PERF_SAMPLE_CPU) {
5708 data->cpu_entry.cpu = raw_smp_processor_id();
5709 data->cpu_entry.reserved = 0;
5710 }
5711}
5712
5713void perf_event_header__init_id(struct perf_event_header *header,
5714 struct perf_sample_data *data,
5715 struct perf_event *event)
5716{
5717 if (event->attr.sample_id_all)
5718 __perf_event_header__init_id(header, data, event);
5719}
5720
5721static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5722 struct perf_sample_data *data)
5723{
5724 u64 sample_type = data->type;
5725
5726 if (sample_type & PERF_SAMPLE_TID)
5727 perf_output_put(handle, data->tid_entry);
5728
5729 if (sample_type & PERF_SAMPLE_TIME)
5730 perf_output_put(handle, data->time);
5731
5732 if (sample_type & PERF_SAMPLE_ID)
5733 perf_output_put(handle, data->id);
5734
5735 if (sample_type & PERF_SAMPLE_STREAM_ID)
5736 perf_output_put(handle, data->stream_id);
5737
5738 if (sample_type & PERF_SAMPLE_CPU)
5739 perf_output_put(handle, data->cpu_entry);
5740
5741 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5742 perf_output_put(handle, data->id);
5743}
5744
5745void perf_event__output_id_sample(struct perf_event *event,
5746 struct perf_output_handle *handle,
5747 struct perf_sample_data *sample)
5748{
5749 if (event->attr.sample_id_all)
5750 __perf_event__output_id_sample(handle, sample);
5751}
5752
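/*
 * Emit the read_format body for a single (non-group) event:
 * value, optional enabled/running times and optional id.
 */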
5753static void perf_output_read_one(struct perf_output_handle *handle,
5754 struct perf_event *event,
5755 u64 enabled, u64 running)
5756{
5757 u64 read_format = event->attr.read_format;
5758 u64 values[4];
5759 int n = 0;
5760
5761 values[n++] = perf_event_count(event);
5762 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5763 values[n++] = enabled +
5764 atomic64_read(&event->child_total_time_enabled);
5765 }
5766 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5767 values[n++] = running +
5768 atomic64_read(&event->child_total_time_running);
5769 }
5770 if (read_format & PERF_FORMAT_ID)
5771 values[n++] = primary_event_id(event);
5772
5773 __output_copy(handle, values, n * sizeof(u64));
5774}
5775
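/*
 * Emit the read_format body for a whole group: number of members,
 * optional enabled/running times, then value (and optional id) for
 * the leader and each sibling.
 */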
5776static void perf_output_read_group(struct perf_output_handle *handle,
5777 struct perf_event *event,
5778 u64 enabled, u64 running)
5779{
5780 struct perf_event *leader = event->group_leader, *sub;
5781 u64 read_format = event->attr.read_format;
5782 u64 values[5];
5783 int n = 0;
5784
5785 values[n++] = 1 + leader->nr_siblings;
5786
5787 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5788 values[n++] = enabled;
5789
5790 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5791 values[n++] = running;
5792
5793 if (leader != event)
5794 leader->pmu->read(leader);
5795
5796 values[n++] = perf_event_count(leader);
5797 if (read_format & PERF_FORMAT_ID)
5798 values[n++] = primary_event_id(leader);
5799
5800 __output_copy(handle, values, n * sizeof(u64));
5801
5802 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5803 n = 0;
5804
5805 if ((sub != event) &&
5806 (sub->state == PERF_EVENT_STATE_ACTIVE))
5807 sub->pmu->read(sub);
5808
5809 values[n++] = perf_event_count(sub);
5810 if (read_format & PERF_FORMAT_ID)
5811 values[n++] = primary_event_id(sub);
5812
5813 __output_copy(handle, values, n * sizeof(u64));
5814 }
5815}
5816
5817#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5818 PERF_FORMAT_TOTAL_TIME_RUNNING)
5819
5820
5821
5822
5823
5824
5825
5826
5827static void perf_output_read(struct perf_output_handle *handle,
5828 struct perf_event *event)
5829{
5830 u64 enabled = 0, running = 0, now;
5831 u64 read_format = event->attr.read_format;
5832
 /*
  * Compute total_time_enabled/total_time_running from snapshot
  * values. This path can run from NMI/IRQ context, so it must not
  * take the context lock to update the context time.
  */
5842 if (read_format & PERF_FORMAT_TOTAL_TIMES)
5843 calc_timer_values(event, &now, &enabled, &running);
5844
5845 if (event->attr.read_format & PERF_FORMAT_GROUP)
5846 perf_output_read_group(handle, event, enabled, running);
5847 else
5848 perf_output_read_one(handle, event, enabled, running);
5849}
5850
5851void perf_output_sample(struct perf_output_handle *handle,
5852 struct perf_event_header *header,
5853 struct perf_sample_data *data,
5854 struct perf_event *event)
5855{
5856 u64 sample_type = data->type;
5857
5858 perf_output_put(handle, *header);
5859
5860 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5861 perf_output_put(handle, data->id);
5862
5863 if (sample_type & PERF_SAMPLE_IP)
5864 perf_output_put(handle, data->ip);
5865
5866 if (sample_type & PERF_SAMPLE_TID)
5867 perf_output_put(handle, data->tid_entry);
5868
5869 if (sample_type & PERF_SAMPLE_TIME)
5870 perf_output_put(handle, data->time);
5871
5872 if (sample_type & PERF_SAMPLE_ADDR)
5873 perf_output_put(handle, data->addr);
5874
5875 if (sample_type & PERF_SAMPLE_ID)
5876 perf_output_put(handle, data->id);
5877
5878 if (sample_type & PERF_SAMPLE_STREAM_ID)
5879 perf_output_put(handle, data->stream_id);
5880
5881 if (sample_type & PERF_SAMPLE_CPU)
5882 perf_output_put(handle, data->cpu_entry);
5883
5884 if (sample_type & PERF_SAMPLE_PERIOD)
5885 perf_output_put(handle, data->period);
5886
5887 if (sample_type & PERF_SAMPLE_READ)
5888 perf_output_read(handle, event);
5889
5890 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5891 if (data->callchain) {
5892 int size = 1;
5893
 size += data->callchain->nr;
5896
5897 size *= sizeof(u64);
5898
5899 __output_copy(handle, data->callchain, size);
5900 } else {
5901 u64 nr = 0;
5902 perf_output_put(handle, nr);
5903 }
5904 }
5905
5906 if (sample_type & PERF_SAMPLE_RAW) {
5907 struct perf_raw_record *raw = data->raw;
5908
5909 if (raw) {
5910 struct perf_raw_frag *frag = &raw->frag;
5911
5912 perf_output_put(handle, raw->size);
5913 do {
5914 if (frag->copy) {
5915 __output_custom(handle, frag->copy,
5916 frag->data, frag->size);
5917 } else {
5918 __output_copy(handle, frag->data,
5919 frag->size);
5920 }
5921 if (perf_raw_frag_last(frag))
5922 break;
5923 frag = frag->next;
5924 } while (1);
5925 if (frag->pad)
5926 __output_skip(handle, NULL, frag->pad);
5927 } else {
5928 struct {
5929 u32 size;
5930 u32 data;
5931 } raw = {
5932 .size = sizeof(u32),
5933 .data = 0,
5934 };
5935 perf_output_put(handle, raw);
5936 }
5937 }
5938
5939 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5940 if (data->br_stack) {
5941 size_t size;
5942
5943 size = data->br_stack->nr
5944 * sizeof(struct perf_branch_entry);
5945
5946 perf_output_put(handle, data->br_stack->nr);
5947 perf_output_copy(handle, data->br_stack->entries, size);
5948 } else {
 /*
  * The branch-stack field always starts with nr; emit 0 when
  * there are no entries.
  */
5952 u64 nr = 0;
5953 perf_output_put(handle, nr);
5954 }
5955 }
5956
5957 if (sample_type & PERF_SAMPLE_REGS_USER) {
5958 u64 abi = data->regs_user.abi;
5959
 /*
  * An abi of PERF_SAMPLE_REGS_ABI_NONE (0) tells user space
  * that no user registers were sampled.
  */
5964 perf_output_put(handle, abi);
5965
5966 if (abi) {
5967 u64 mask = event->attr.sample_regs_user;
5968 perf_output_sample_regs(handle,
5969 data->regs_user.regs,
5970 mask);
5971 }
5972 }
5973
5974 if (sample_type & PERF_SAMPLE_STACK_USER) {
5975 perf_output_sample_ustack(handle,
5976 data->stack_user_size,
5977 data->regs_user.regs);
5978 }
5979
5980 if (sample_type & PERF_SAMPLE_WEIGHT)
5981 perf_output_put(handle, data->weight);
5982
5983 if (sample_type & PERF_SAMPLE_DATA_SRC)
5984 perf_output_put(handle, data->data_src.val);
5985
5986 if (sample_type & PERF_SAMPLE_TRANSACTION)
5987 perf_output_put(handle, data->txn);
5988
5989 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5990 u64 abi = data->regs_intr.abi;
5991
 /*
  * As above, abi == PERF_SAMPLE_REGS_ABI_NONE means no
  * interrupt registers were sampled.
  */
5995 perf_output_put(handle, abi);
5996
5997 if (abi) {
5998 u64 mask = event->attr.sample_regs_intr;
5999
6000 perf_output_sample_regs(handle,
6001 data->regs_intr.regs,
6002 mask);
6003 }
6004 }
6005
6006 if (!event->attr.watermark) {
6007 int wakeup_events = event->attr.wakeup_events;
6008
6009 if (wakeup_events) {
6010 struct ring_buffer *rb = handle->rb;
6011 int events = local_inc_return(&rb->events);
6012
6013 if (events >= wakeup_events) {
6014 local_sub(wakeup_events, &rb->events);
6015 local_inc(&rb->wakeup);
6016 }
6017 }
6018 }
6019}
6020
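/*
 * Gather the data for a sample and compute the final record size
 * before perf_output_sample() writes it out.
 */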
6021void perf_prepare_sample(struct perf_event_header *header,
6022 struct perf_sample_data *data,
6023 struct perf_event *event,
6024 struct pt_regs *regs)
6025{
6026 u64 sample_type = event->attr.sample_type;
6027
6028 header->type = PERF_RECORD_SAMPLE;
6029 header->size = sizeof(*header) + event->header_size;
6030
6031 header->misc = 0;
6032 header->misc |= perf_misc_flags(regs);
6033
6034 __perf_event_header__init_id(header, data, event);
6035
6036 if (sample_type & PERF_SAMPLE_IP)
6037 data->ip = perf_instruction_pointer(regs);
6038
6039 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6040 int size = 1;
6041
6042 data->callchain = perf_callchain(event, regs);
6043
6044 if (data->callchain)
6045 size += data->callchain->nr;
6046
6047 header->size += size * sizeof(u64);
6048 }
6049
6050 if (sample_type & PERF_SAMPLE_RAW) {
6051 struct perf_raw_record *raw = data->raw;
6052 int size;
6053
6054 if (raw) {
6055 struct perf_raw_frag *frag = &raw->frag;
6056 u32 sum = 0;
6057
6058 do {
6059 sum += frag->size;
6060 if (perf_raw_frag_last(frag))
6061 break;
6062 frag = frag->next;
6063 } while (1);
6064
6065 size = round_up(sum + sizeof(u32), sizeof(u64));
6066 raw->size = size - sizeof(u32);
6067 frag->pad = raw->size - sum;
6068 } else {
6069 size = sizeof(u64);
6070 }
6071
6072 header->size += size;
6073 }
6074
6075 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6076 int size = sizeof(u64);
6077 if (data->br_stack) {
6078 size += data->br_stack->nr
6079 * sizeof(struct perf_branch_entry);
6080 }
6081 header->size += size;
6082 }
6083
6084 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6085 perf_sample_regs_user(&data->regs_user, regs,
6086 &data->regs_user_copy);
6087
6088 if (sample_type & PERF_SAMPLE_REGS_USER) {
6089
6090 int size = sizeof(u64);
6091
6092 if (data->regs_user.regs) {
6093 u64 mask = event->attr.sample_regs_user;
6094 size += hweight64(mask) * sizeof(u64);
6095 }
6096
6097 header->size += size;
6098 }
6099
6100 if (sample_type & PERF_SAMPLE_STACK_USER) {
6101
6102
6103
6104
6105
6106
6107 u16 stack_size = event->attr.sample_stack_user;
6108 u16 size = sizeof(u64);
6109
6110 stack_size = perf_sample_ustack_size(stack_size, header->size,
6111 data->regs_user.regs);
6112
6113
6114
6115
6116
6117
6118 if (stack_size)
6119 size += sizeof(u64) + stack_size;
6120
6121 data->stack_user_size = stack_size;
6122 header->size += size;
6123 }
6124
6125 if (sample_type & PERF_SAMPLE_REGS_INTR) {
6126
6127 int size = sizeof(u64);
6128
6129 perf_sample_regs_intr(&data->regs_intr, regs);
6130
6131 if (data->regs_intr.regs) {
6132 u64 mask = event->attr.sample_regs_intr;
6133
6134 size += hweight64(mask) * sizeof(u64);
6135 }
6136
6137 header->size += size;
6138 }
6139}
6140
static __always_inline void
6142__perf_event_output(struct perf_event *event,
6143 struct perf_sample_data *data,
6144 struct pt_regs *regs,
6145 int (*output_begin)(struct perf_output_handle *,
6146 struct perf_event *,
6147 unsigned int))
6148{
6149 struct perf_output_handle handle;
6150 struct perf_event_header header;
6151
6152
6153 rcu_read_lock();
6154
6155 perf_prepare_sample(&header, data, event, regs);
6156
6157 if (output_begin(&handle, event, header.size))
6158 goto exit;
6159
6160 perf_output_sample(&handle, &header, data, event);
6161
6162 perf_output_end(&handle);
6163
6164exit:
6165 rcu_read_unlock();
6166}
6167
6168void
6169perf_event_output_forward(struct perf_event *event,
6170 struct perf_sample_data *data,
6171 struct pt_regs *regs)
6172{
6173 __perf_event_output(event, data, regs, perf_output_begin_forward);
6174}
6175
6176void
6177perf_event_output_backward(struct perf_event *event,
6178 struct perf_sample_data *data,
6179 struct pt_regs *regs)
6180{
6181 __perf_event_output(event, data, regs, perf_output_begin_backward);
6182}
6183
6184void
6185perf_event_output(struct perf_event *event,
6186 struct perf_sample_data *data,
6187 struct pt_regs *regs)
6188{
6189 __perf_event_output(event, data, regs, perf_output_begin);
6190}
6191
6192
6193
6194
6195
6196struct perf_read_event {
6197 struct perf_event_header header;
6198
6199 u32 pid;
6200 u32 tid;
6201};
6202
6203static void
6204perf_event_read_event(struct perf_event *event,
6205 struct task_struct *task)
6206{
6207 struct perf_output_handle handle;
6208 struct perf_sample_data sample;
6209 struct perf_read_event read_event = {
6210 .header = {
6211 .type = PERF_RECORD_READ,
6212 .misc = 0,
6213 .size = sizeof(read_event) + event->read_size,
6214 },
6215 .pid = perf_event_pid(event, task),
6216 .tid = perf_event_tid(event, task),
6217 };
6218 int ret;
6219
6220 perf_event_header__init_id(&read_event.header, &sample, event);
6221 ret = perf_output_begin(&handle, event, read_event.header.size);
6222 if (ret)
6223 return;
6224
6225 perf_output_put(&handle, read_event);
6226 perf_output_read(&handle, event);
6227 perf_event__output_id_sample(event, &handle, &sample);
6228
6229 perf_output_end(&handle);
6230}
6231
6232typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6233
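/*
 * Invoke @output for every event on @ctx's event list; unless @all
 * is set, skip events that are inactive or fail the filter match.
 */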
6234static void
6235perf_iterate_ctx(struct perf_event_context *ctx,
6236 perf_iterate_f output,
6237 void *data, bool all)
6238{
6239 struct perf_event *event;
6240
6241 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6242 if (!all) {
6243 if (event->state < PERF_EVENT_STATE_INACTIVE)
6244 continue;
6245 if (!event_filter_match(event))
6246 continue;
6247 }
6248
6249 output(event, data);
6250 }
6251}
6252
6253static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6254{
6255 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6256 struct perf_event *event;
6257
6258 list_for_each_entry_rcu(event, &pel->list, sb_list) {
6259
 /*
  * Skip events whose ctx is not yet visible: they are not fully
  * initialized for side-band delivery.
  */
6264 if (!smp_load_acquire(&event->ctx))
6265 continue;
6266
6267 if (event->state < PERF_EVENT_STATE_INACTIVE)
6268 continue;
6269 if (!event_filter_match(event))
6270 continue;
6271 output(event, data);
6272 }
6273}
6274
6275
/*
 * Iterate all events that need to receive side-band records:
 * either the events on the given task context, or the per-CPU
 * side-band list plus all of current's contexts.
 */
6281static void
6282perf_iterate_sb(perf_iterate_f output, void *data,
6283 struct perf_event_context *task_ctx)
6284{
6285 struct perf_event_context *ctx;
6286 int ctxn;
6287
6288 rcu_read_lock();
6289 preempt_disable();
6290
 /*
  * With a task_ctx the caller wants only that context notified,
  * so don't bother with the per-CPU list or current's contexts.
  */
6296 if (task_ctx) {
6297 perf_iterate_ctx(task_ctx, output, data, false);
6298 goto done;
6299 }
6300
6301 perf_iterate_sb_cpu(output, data);
6302
6303 for_each_task_context_nr(ctxn) {
6304 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6305 if (ctx)
6306 perf_iterate_ctx(ctx, output, data, false);
6307 }
6308done:
6309 preempt_enable();
6310 rcu_read_unlock();
6311}
6312
6313
/*
 * Clear all file-based address filters at exec; they will be
 * re-established when the corresponding objects are mmapped again.
 */
6317static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6318{
6319 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6320 struct perf_addr_filter *filter;
6321 unsigned int restart = 0, count = 0;
6322 unsigned long flags;
6323
6324 if (!has_addr_filter(event))
6325 return;
6326
6327 raw_spin_lock_irqsave(&ifh->lock, flags);
6328 list_for_each_entry(filter, &ifh->list, entry) {
6329 if (filter->inode) {
6330 event->addr_filters_offs[count] = 0;
6331 restart++;
6332 }
6333
6334 count++;
6335 }
6336
6337 if (restart)
6338 event->addr_filters_gen++;
6339 raw_spin_unlock_irqrestore(&ifh->lock, flags);
6340
6341 if (restart)
6342 perf_event_stop(event, 1);
6343}
6344
6345void perf_event_exec(void)
6346{
6347 struct perf_event_context *ctx;
6348 int ctxn;
6349
6350 rcu_read_lock();
6351 for_each_task_context_nr(ctxn) {
6352 ctx = current->perf_event_ctxp[ctxn];
6353 if (!ctx)
6354 continue;
6355
6356 perf_event_enable_on_exec(ctxn);
6357
6358 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6359 true);
6360 }
6361 rcu_read_unlock();
6362}
6363
6364struct remote_output {
6365 struct ring_buffer *rb;
6366 int err;
6367};
6368
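/*
 * Stop AUX-capable events whose (parent's) ring buffer matches the
 * one carried in @data.
 */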
6369static void __perf_event_output_stop(struct perf_event *event, void *data)
6370{
6371 struct perf_event *parent = event->parent;
6372 struct remote_output *ro = data;
6373 struct ring_buffer *rb = ro->rb;
6374 struct stop_event_data sd = {
6375 .event = event,
6376 };
6377
6378 if (!has_aux(event))
6379 return;
6380
6381 if (!parent)
6382 parent = event;
6383
6384
 /*
  * With inheritance the parent owns the ring buffer while the
  * child writes to it, so compare against the parent's rb to
  * decide whether this event needs stopping.
  */
6394 if (rcu_dereference(parent->rb) == rb)
6395 ro->err = __perf_event_stop(&sd);
6396}
6397
6398static int __perf_pmu_output_stop(void *info)
6399{
6400 struct perf_event *event = info;
6401 struct pmu *pmu = event->pmu;
6402 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6403 struct remote_output ro = {
6404 .rb = event->rb,
6405 };
6406
6407 rcu_read_lock();
6408 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6409 if (cpuctx->task_ctx)
6410 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6411 &ro, false);
6412 rcu_read_unlock();
6413
6414 return ro.err;
6415}
6416
6417static void perf_pmu_output_stop(struct perf_event *event)
6418{
6419 struct perf_event *iter;
6420 int err, cpu;
6421
6422restart:
6423 rcu_read_lock();
6424 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6425
 /*
  * Per-CPU events are stopped on their own CPU; for task
  * (cpu == -1) events, stop them on whichever CPU they are
  * currently running on, if any.
  */
6431 cpu = iter->cpu;
6432 if (cpu == -1)
6433 cpu = READ_ONCE(iter->oncpu);
6434
6435 if (cpu == -1)
6436 continue;
6437
6438 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6439 if (err == -EAGAIN) {
6440 rcu_read_unlock();
6441 goto restart;
6442 }
6443 }
6444 rcu_read_unlock();
6445}
6446
6447
/*
 * task tracking -- fork/exit
 *
 * Write fork/exit records for all events that requested them.
 */
6453struct perf_task_event {
6454 struct task_struct *task;
6455 struct perf_event_context *task_ctx;
6456
6457 struct {
6458 struct perf_event_header header;
6459
6460 u32 pid;
6461 u32 ppid;
6462 u32 tid;
6463 u32 ptid;
6464 u64 time;
6465 } event_id;
6466};
6467
6468static int perf_event_task_match(struct perf_event *event)
6469{
6470 return event->attr.comm || event->attr.mmap ||
6471 event->attr.mmap2 || event->attr.mmap_data ||
6472 event->attr.task;
6473}
6474
6475static void perf_event_task_output(struct perf_event *event,
6476 void *data)
6477{
6478 struct perf_task_event *task_event = data;
6479 struct perf_output_handle handle;
6480 struct perf_sample_data sample;
6481 struct task_struct *task = task_event->task;
6482 int ret, size = task_event->event_id.header.size;
6483
6484 if (!perf_event_task_match(event))
6485 return;
6486
6487 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6488
6489 ret = perf_output_begin(&handle, event,
6490 task_event->event_id.header.size);
6491 if (ret)
6492 goto out;
6493
6494 task_event->event_id.pid = perf_event_pid(event, task);
6495 task_event->event_id.ppid = perf_event_pid(event, current);
6496
6497 task_event->event_id.tid = perf_event_tid(event, task);
6498 task_event->event_id.ptid = perf_event_tid(event, current);
6499
6500 task_event->event_id.time = perf_event_clock(event);
6501
6502 perf_output_put(&handle, task_event->event_id);
6503
6504 perf_event__output_id_sample(event, &handle, &sample);
6505
6506 perf_output_end(&handle);
6507out:
6508 task_event->event_id.header.size = size;
6509}
6510
6511static void perf_event_task(struct task_struct *task,
6512 struct perf_event_context *task_ctx,
6513 int new)
6514{
6515 struct perf_task_event task_event;
6516
6517 if (!atomic_read(&nr_comm_events) &&
6518 !atomic_read(&nr_mmap_events) &&
6519 !atomic_read(&nr_task_events))
6520 return;
6521
6522 task_event = (struct perf_task_event){
6523 .task = task,
6524 .task_ctx = task_ctx,
6525 .event_id = {
6526 .header = {
6527 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6528 .misc = 0,
6529 .size = sizeof(task_event.event_id),
6530 },
6531
6532
6533
6534
6535
6536 },
6537 };
6538
6539 perf_iterate_sb(perf_event_task_output,
6540 &task_event,
6541 task_ctx);
6542}
6543
6544void perf_event_fork(struct task_struct *task)
6545{
6546 perf_event_task(task, NULL, 1);
6547 perf_event_namespaces(task);
6548}
6549
6550
/*
 * comm tracking -- PERF_RECORD_COMM
 */
6554struct perf_comm_event {
6555 struct task_struct *task;
6556 char *comm;
6557 int comm_size;
6558
6559 struct {
6560 struct perf_event_header header;
6561
6562 u32 pid;
6563 u32 tid;
6564 } event_id;
6565};
6566
6567static int perf_event_comm_match(struct perf_event *event)
6568{
6569 return event->attr.comm;
6570}
6571
6572static void perf_event_comm_output(struct perf_event *event,
6573 void *data)
6574{
6575 struct perf_comm_event *comm_event = data;
6576 struct perf_output_handle handle;
6577 struct perf_sample_data sample;
6578 int size = comm_event->event_id.header.size;
6579 int ret;
6580
6581 if (!perf_event_comm_match(event))
6582 return;
6583
6584 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
6585 ret = perf_output_begin(&handle, event,
6586 comm_event->event_id.header.size);
6587
6588 if (ret)
6589 goto out;
6590
6591 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
6592 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
6593
6594 perf_output_put(&handle, comm_event->event_id);
6595 __output_copy(&handle, comm_event->comm,
6596 comm_event->comm_size);
6597
6598 perf_event__output_id_sample(event, &handle, &sample);
6599
6600 perf_output_end(&handle);
6601out:
6602 comm_event->event_id.header.size = size;
6603}
6604
6605static void perf_event_comm_event(struct perf_comm_event *comm_event)
6606{
6607 char comm[TASK_COMM_LEN];
6608 unsigned int size;
6609
6610 memset(comm, 0, sizeof(comm));
6611 strlcpy(comm, comm_event->task->comm, sizeof(comm));
6612 size = ALIGN(strlen(comm)+1, sizeof(u64));
6613
6614 comm_event->comm = comm;
6615 comm_event->comm_size = size;
6616
6617 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
6618
6619 perf_iterate_sb(perf_event_comm_output,
6620 comm_event,
6621 NULL);
6622}
6623
6624void perf_event_comm(struct task_struct *task, bool exec)
6625{
6626 struct perf_comm_event comm_event;
6627
6628 if (!atomic_read(&nr_comm_events))
6629 return;
6630
6631 comm_event = (struct perf_comm_event){
6632 .task = task,
6633
6634
6635 .event_id = {
6636 .header = {
6637 .type = PERF_RECORD_COMM,
6638 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
6639
6640 },
6641
6642
6643 },
6644 };
6645
6646 perf_event_comm_event(&comm_event);
6647}
6648
6649
/*
 * namespaces tracking -- PERF_RECORD_NAMESPACES
 */
6653struct perf_namespaces_event {
6654 struct task_struct *task;
6655
6656 struct {
6657 struct perf_event_header header;
6658
6659 u32 pid;
6660 u32 tid;
6661 u64 nr_namespaces;
6662 struct perf_ns_link_info link_info[NR_NAMESPACES];
6663 } event_id;
6664};
6665
6666static int perf_event_namespaces_match(struct perf_event *event)
6667{
6668 return event->attr.namespaces;
6669}
6670
6671static void perf_event_namespaces_output(struct perf_event *event,
6672 void *data)
6673{
6674 struct perf_namespaces_event *namespaces_event = data;
6675 struct perf_output_handle handle;
6676 struct perf_sample_data sample;
6677 int ret;
6678
6679 if (!perf_event_namespaces_match(event))
6680 return;
6681
6682 perf_event_header__init_id(&namespaces_event->event_id.header,
6683 &sample, event);
6684 ret = perf_output_begin(&handle, event,
6685 namespaces_event->event_id.header.size);
6686 if (ret)
6687 return;
6688
6689 namespaces_event->event_id.pid = perf_event_pid(event,
6690 namespaces_event->task);
6691 namespaces_event->event_id.tid = perf_event_tid(event,
6692 namespaces_event->task);
6693
6694 perf_output_put(&handle, namespaces_event->event_id);
6695
6696 perf_event__output_id_sample(event, &handle, &sample);
6697
6698 perf_output_end(&handle);
6699}
6700
6701static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
6702 struct task_struct *task,
6703 const struct proc_ns_operations *ns_ops)
6704{
6705 struct path ns_path;
6706 struct inode *ns_inode;
6707 void *error;
6708
6709 error = ns_get_path(&ns_path, task, ns_ops);
6710 if (!error) {
6711 ns_inode = ns_path.dentry->d_inode;
6712 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
6713 ns_link_info->ino = ns_inode->i_ino;
6714 }
6715}
6716
6717void perf_event_namespaces(struct task_struct *task)
6718{
6719 struct perf_namespaces_event namespaces_event;
6720 struct perf_ns_link_info *ns_link_info;
6721
6722 if (!atomic_read(&nr_namespaces_events))
6723 return;
6724
6725 namespaces_event = (struct perf_namespaces_event){
6726 .task = task,
6727 .event_id = {
6728 .header = {
6729 .type = PERF_RECORD_NAMESPACES,
6730 .misc = 0,
6731 .size = sizeof(namespaces_event.event_id),
6732 },
6733
6734
6735 .nr_namespaces = NR_NAMESPACES,
6736
6737 },
6738 };
6739
6740 ns_link_info = namespaces_event.event_id.link_info;
6741
6742 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
6743 task, &mntns_operations);
6744
6745#ifdef CONFIG_USER_NS
6746 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
6747 task, &userns_operations);
6748#endif
6749#ifdef CONFIG_NET_NS
6750 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
6751 task, &netns_operations);
6752#endif
6753#ifdef CONFIG_UTS_NS
6754 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
6755 task, &utsns_operations);
6756#endif
6757#ifdef CONFIG_IPC_NS
6758 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
6759 task, &ipcns_operations);
6760#endif
6761#ifdef CONFIG_PID_NS
6762 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
6763 task, &pidns_operations);
6764#endif
6765#ifdef CONFIG_CGROUPS
6766 perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
6767 task, &cgroupns_operations);
6768#endif
6769
6770 perf_iterate_sb(perf_event_namespaces_output,
6771 &namespaces_event,
6772 NULL);
6773}
6774
6775
/*
 * mmap tracking -- PERF_RECORD_MMAP / PERF_RECORD_MMAP2
 */
6779struct perf_mmap_event {
6780 struct vm_area_struct *vma;
6781
6782 const char *file_name;
6783 int file_size;
6784 int maj, min;
6785 u64 ino;
6786 u64 ino_generation;
6787 u32 prot, flags;
6788
6789 struct {
6790 struct perf_event_header header;
6791
6792 u32 pid;
6793 u32 tid;
6794 u64 start;
6795 u64 len;
6796 u64 pgoff;
6797 } event_id;
6798};
6799
6800static int perf_event_mmap_match(struct perf_event *event,
6801 void *data)
6802{
6803 struct perf_mmap_event *mmap_event = data;
6804 struct vm_area_struct *vma = mmap_event->vma;
6805 int executable = vma->vm_flags & VM_EXEC;
6806
6807 return (!executable && event->attr.mmap_data) ||
6808 (executable && (event->attr.mmap || event->attr.mmap2));
6809}
6810
6811static void perf_event_mmap_output(struct perf_event *event,
6812 void *data)
6813{
6814 struct perf_mmap_event *mmap_event = data;
6815 struct perf_output_handle handle;
6816 struct perf_sample_data sample;
6817 int size = mmap_event->event_id.header.size;
6818 int ret;
6819
6820 if (!perf_event_mmap_match(event, data))
6821 return;
6822
6823 if (event->attr.mmap2) {
6824 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
6825 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
6826 mmap_event->event_id.header.size += sizeof(mmap_event->min);
6827 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
6828 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
6829 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
6830 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
6831 }
6832
6833 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
6834 ret = perf_output_begin(&handle, event,
6835 mmap_event->event_id.header.size);
6836 if (ret)
6837 goto out;
6838
6839 mmap_event->event_id.pid = perf_event_pid(event, current);
6840 mmap_event->event_id.tid = perf_event_tid(event, current);
6841
6842 perf_output_put(&handle, mmap_event->event_id);
6843
6844 if (event->attr.mmap2) {
6845 perf_output_put(&handle, mmap_event->maj);
6846 perf_output_put(&handle, mmap_event->min);
6847 perf_output_put(&handle, mmap_event->ino);
6848 perf_output_put(&handle, mmap_event->ino_generation);
6849 perf_output_put(&handle, mmap_event->prot);
6850 perf_output_put(&handle, mmap_event->flags);
6851 }
6852
6853 __output_copy(&handle, mmap_event->file_name,
6854 mmap_event->file_size);
6855
6856 perf_event__output_id_sample(event, &handle, &sample);
6857
6858 perf_output_end(&handle);
6859out:
6860 mmap_event->event_id.header.size = size;
6861}
6862
6863static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
6864{
6865 struct vm_area_struct *vma = mmap_event->vma;
6866 struct file *file = vma->vm_file;
6867 int maj = 0, min = 0;
6868 u64 ino = 0, gen = 0;
6869 u32 prot = 0, flags = 0;
6870 unsigned int size;
6871 char tmp[16];
6872 char *buf = NULL;
6873 char *name;
6874
6875 if (vma->vm_flags & VM_READ)
6876 prot |= PROT_READ;
6877 if (vma->vm_flags & VM_WRITE)
6878 prot |= PROT_WRITE;
6879 if (vma->vm_flags & VM_EXEC)
6880 prot |= PROT_EXEC;
6881
6882 if (vma->vm_flags & VM_MAYSHARE)
6883 flags = MAP_SHARED;
6884 else
6885 flags = MAP_PRIVATE;
6886
6887 if (vma->vm_flags & VM_DENYWRITE)
6888 flags |= MAP_DENYWRITE;
6889 if (vma->vm_flags & VM_MAYEXEC)
6890 flags |= MAP_EXECUTABLE;
6891 if (vma->vm_flags & VM_LOCKED)
6892 flags |= MAP_LOCKED;
6893 if (vma->vm_flags & VM_HUGETLB)
6894 flags |= MAP_HUGETLB;
6895
6896 if (file) {
6897 struct inode *inode;
6898 dev_t dev;
6899
6900 buf = kmalloc(PATH_MAX, GFP_KERNEL);
6901 if (!buf) {
6902 name = "//enomem";
6903 goto cpy_name;
6904 }
6905
6906
 /*
  * file_path() fills the buffer from the end backwards, so reserve
  * room for the zero padding added below for u64 alignment.
  */
6910 name = file_path(file, buf, PATH_MAX - sizeof(u64));
6911 if (IS_ERR(name)) {
6912 name = "//toolong";
6913 goto cpy_name;
6914 }
6915 inode = file_inode(vma->vm_file);
6916 dev = inode->i_sb->s_dev;
6917 ino = inode->i_ino;
6918 gen = inode->i_generation;
6919 maj = MAJOR(dev);
6920 min = MINOR(dev);
6921
6922 goto got_name;
6923 } else {
6924 if (vma->vm_ops && vma->vm_ops->name) {
6925 name = (char *) vma->vm_ops->name(vma);
6926 if (name)
6927 goto cpy_name;
6928 }
6929
6930 name = (char *)arch_vma_name(vma);
6931 if (name)
6932 goto cpy_name;
6933
6934 if (vma->vm_start <= vma->vm_mm->start_brk &&
6935 vma->vm_end >= vma->vm_mm->brk) {
6936 name = "[heap]";
6937 goto cpy_name;
6938 }
6939 if (vma->vm_start <= vma->vm_mm->start_stack &&
6940 vma->vm_end >= vma->vm_mm->start_stack) {
6941 name = "[stack]";
6942 goto cpy_name;
6943 }
6944
6945 name = "//anon";
6946 goto cpy_name;
6947 }
6948
6949cpy_name:
6950 strlcpy(tmp, name, sizeof(tmp));
6951 name = tmp;
6952got_name:
6953
 /*
  * The output buffer works in u64 units, so pad the name with
  * NULs up to a multiple of 8 bytes.
  */
6958 size = strlen(name)+1;
6959 while (!IS_ALIGNED(size, sizeof(u64)))
6960 name[size++] = '\0';
6961
6962 mmap_event->file_name = name;
6963 mmap_event->file_size = size;
6964 mmap_event->maj = maj;
6965 mmap_event->min = min;
6966 mmap_event->ino = ino;
6967 mmap_event->ino_generation = gen;
6968 mmap_event->prot = prot;
6969 mmap_event->flags = flags;
6970
6971 if (!(vma->vm_flags & VM_EXEC))
6972 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
6973
6974 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
6975
6976 perf_iterate_sb(perf_event_mmap_output,
6977 mmap_event,
6978 NULL);
6979
6980 kfree(buf);
6981}
6982
6983
/*
 * Check whether an address filter's inode and offset range overlap
 * the given file mapping.
 */
6986static bool perf_addr_filter_match(struct perf_addr_filter *filter,
6987 struct file *file, unsigned long offset,
6988 unsigned long size)
6989{
6990 if (filter->inode != file_inode(file))
6991 return false;
6992
6993 if (filter->offset > offset + size)
6994 return false;
6995
6996 if (filter->offset + filter->size < offset)
6997 return false;
6998
6999 return true;
7000}
7001
7002static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
7003{
7004 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7005 struct vm_area_struct *vma = data;
7006 unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
7007 struct file *file = vma->vm_file;
7008 struct perf_addr_filter *filter;
7009 unsigned int restart = 0, count = 0;
7010
7011 if (!has_addr_filter(event))
7012 return;
7013
7014 if (!file)
7015 return;
7016
7017 raw_spin_lock_irqsave(&ifh->lock, flags);
7018 list_for_each_entry(filter, &ifh->list, entry) {
7019 if (perf_addr_filter_match(filter, file, off,
7020 vma->vm_end - vma->vm_start)) {
7021 event->addr_filters_offs[count] = vma->vm_start;
7022 restart++;
7023 }
7024
7025 count++;
7026 }
7027
7028 if (restart)
7029 event->addr_filters_gen++;
7030 raw_spin_unlock_irqrestore(&ifh->lock, flags);
7031
7032 if (restart)
7033 perf_event_stop(event, 1);
7034}
7035
7036
/*
 * Adjust current's events' address filters to a newly mmapped vma.
 */
7039static void perf_addr_filters_adjust(struct vm_area_struct *vma)
7040{
7041 struct perf_event_context *ctx;
7042 int ctxn;
7043
7044
 /*
  * Address filters only track executable mappings, so ignore
  * anything without VM_EXEC.
  */
7048 if (!(vma->vm_flags & VM_EXEC))
7049 return;
7050
7051 rcu_read_lock();
7052 for_each_task_context_nr(ctxn) {
7053 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7054 if (!ctx)
7055 continue;
7056
7057 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
7058 }
7059 rcu_read_unlock();
7060}
7061
7062void perf_event_mmap(struct vm_area_struct *vma)
7063{
7064 struct perf_mmap_event mmap_event;
7065
7066 if (!atomic_read(&nr_mmap_events))
7067 return;
7068
7069 mmap_event = (struct perf_mmap_event){
7070 .vma = vma,
7071
7072
7073 .event_id = {
7074 .header = {
7075 .type = PERF_RECORD_MMAP,
7076 .misc = PERF_RECORD_MISC_USER,
7077
7078 },
7079
7080
7081 .start = vma->vm_start,
7082 .len = vma->vm_end - vma->vm_start,
7083 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
7084 },
7085
7086
7087
7088
7089
7090
7091 };
7092
7093 perf_addr_filters_adjust(vma);
7094 perf_event_mmap_event(&mmap_event);
7095}
7096
7097void perf_event_aux_event(struct perf_event *event, unsigned long head,
7098 unsigned long size, u64 flags)
7099{
7100 struct perf_output_handle handle;
7101 struct perf_sample_data sample;
7102 struct perf_aux_event {
7103 struct perf_event_header header;
7104 u64 offset;
7105 u64 size;
7106 u64 flags;
7107 } rec = {
7108 .header = {
7109 .type = PERF_RECORD_AUX,
7110 .misc = 0,
7111 .size = sizeof(rec),
7112 },
7113 .offset = head,
7114 .size = size,
7115 .flags = flags,
7116 };
7117 int ret;
7118
7119 perf_event_header__init_id(&rec.header, &sample, event);
7120 ret = perf_output_begin(&handle, event, rec.header.size);
7121
7122 if (ret)
7123 return;
7124
7125 perf_output_put(&handle, rec);
7126 perf_event__output_id_sample(event, &handle, &sample);
7127
7128 perf_output_end(&handle);
7129}
7130
7131
/*
 * Log dropped/lost samples as a PERF_RECORD_LOST_SAMPLES record.
 */
7134void perf_log_lost_samples(struct perf_event *event, u64 lost)
7135{
7136 struct perf_output_handle handle;
7137 struct perf_sample_data sample;
7138 int ret;
7139
7140 struct {
7141 struct perf_event_header header;
7142 u64 lost;
7143 } lost_samples_event = {
7144 .header = {
7145 .type = PERF_RECORD_LOST_SAMPLES,
7146 .misc = 0,
7147 .size = sizeof(lost_samples_event),
7148 },
7149 .lost = lost,
7150 };
7151
7152 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
7153
7154 ret = perf_output_begin(&handle, event,
7155 lost_samples_event.header.size);
7156 if (ret)
7157 return;
7158
7159 perf_output_put(&handle, lost_samples_event);
7160 perf_event__output_id_sample(event, &handle, &sample);
7161 perf_output_end(&handle);
7162}
7163
7164
/*
 * context_switch tracking -- PERF_RECORD_SWITCH{,_CPU_WIDE}
 */
7168struct perf_switch_event {
7169 struct task_struct *task;
7170 struct task_struct *next_prev;
7171
7172 struct {
7173 struct perf_event_header header;
7174 u32 next_prev_pid;
7175 u32 next_prev_tid;
7176 } event_id;
7177};
7178
7179static int perf_event_switch_match(struct perf_event *event)
7180{
7181 return event->attr.context_switch;
7182}
7183
7184static void perf_event_switch_output(struct perf_event *event, void *data)
7185{
7186 struct perf_switch_event *se = data;
7187 struct perf_output_handle handle;
7188 struct perf_sample_data sample;
7189 int ret;
7190
7191 if (!perf_event_switch_match(event))
7192 return;
7193
 /* Only CPU-wide events see the next/prev pid/tid fields. */
7195 if (event->ctx->task) {
7196 se->event_id.header.type = PERF_RECORD_SWITCH;
7197 se->event_id.header.size = sizeof(se->event_id.header);
7198 } else {
7199 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
7200 se->event_id.header.size = sizeof(se->event_id);
7201 se->event_id.next_prev_pid =
7202 perf_event_pid(event, se->next_prev);
7203 se->event_id.next_prev_tid =
7204 perf_event_tid(event, se->next_prev);
7205 }
7206
7207 perf_event_header__init_id(&se->event_id.header, &sample, event);
7208
7209 ret = perf_output_begin(&handle, event, se->event_id.header.size);
7210 if (ret)
7211 return;
7212
7213 if (event->ctx->task)
7214 perf_output_put(&handle, se->event_id.header);
7215 else
7216 perf_output_put(&handle, se->event_id);
7217
7218 perf_event__output_id_sample(event, &handle, &sample);
7219
7220 perf_output_end(&handle);
7221}
7222
7223static void perf_event_switch(struct task_struct *task,
7224 struct task_struct *next_prev, bool sched_in)
7225{
7226 struct perf_switch_event switch_event;
7227
7228
7229
7230 switch_event = (struct perf_switch_event){
7231 .task = task,
7232 .next_prev = next_prev,
7233 .event_id = {
7234 .header = {
7235
7236 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
7237
7238 },
7239
7240
7241 },
7242 };
7243
7244 perf_iterate_sb(perf_event_switch_output,
7245 &switch_event,
7246 NULL);
7247}
7248
7249
/*
 * IRQ throttle logging -- PERF_RECORD_THROTTLE / PERF_RECORD_UNTHROTTLE
 */
7253static void perf_log_throttle(struct perf_event *event, int enable)
7254{
7255 struct perf_output_handle handle;
7256 struct perf_sample_data sample;
7257 int ret;
7258
7259 struct {
7260 struct perf_event_header header;
7261 u64 time;
7262 u64 id;
7263 u64 stream_id;
7264 } throttle_event = {
7265 .header = {
7266 .type = PERF_RECORD_THROTTLE,
7267 .misc = 0,
7268 .size = sizeof(throttle_event),
7269 },
7270 .time = perf_event_clock(event),
7271 .id = primary_event_id(event),
7272 .stream_id = event->id,
7273 };
7274
7275 if (enable)
7276 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
7277
7278 perf_event_header__init_id(&throttle_event.header, &sample, event);
7279
7280 ret = perf_output_begin(&handle, event,
7281 throttle_event.header.size);
7282 if (ret)
7283 return;
7284
7285 perf_output_put(&handle, throttle_event);
7286 perf_event__output_id_sample(event, &handle, &sample);
7287 perf_output_end(&handle);
7288}
7289
7290static void perf_log_itrace_start(struct perf_event *event)
7291{
7292 struct perf_output_handle handle;
7293 struct perf_sample_data sample;
7294 struct perf_aux_event {
7295 struct perf_event_header header;
7296 u32 pid;
7297 u32 tid;
7298 } rec;
7299 int ret;
7300
7301 if (event->parent)
7302 event = event->parent;
7303
7304 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7305 event->hw.itrace_started)
7306 return;
7307
7308 rec.header.type = PERF_RECORD_ITRACE_START;
7309 rec.header.misc = 0;
7310 rec.header.size = sizeof(rec);
7311 rec.pid = perf_event_pid(event, current);
7312 rec.tid = perf_event_tid(event, current);
7313
7314 perf_event_header__init_id(&rec.header, &sample, event);
7315 ret = perf_output_begin(&handle, event, rec.header.size);
7316
7317 if (ret)
7318 return;
7319
7320 perf_output_put(&handle, rec);
7321 perf_event__output_id_sample(event, &handle, &sample);
7322
7323 perf_output_end(&handle);
7324}
7325
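/*
 * Account one PMI for the event: throttle it once it exceeds
 * max_samples_per_tick, and re-adjust the period for freq-based
 * sampling events.
 */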
7326static int
7327__perf_event_account_interrupt(struct perf_event *event, int throttle)
7328{
7329 struct hw_perf_event *hwc = &event->hw;
7330 int ret = 0;
7331 u64 seq;
7332
7333 seq = __this_cpu_read(perf_throttled_seq);
7334 if (seq != hwc->interrupts_seq) {
7335 hwc->interrupts_seq = seq;
7336 hwc->interrupts = 1;
7337 } else {
7338 hwc->interrupts++;
7339 if (unlikely(throttle
7340 && hwc->interrupts >= max_samples_per_tick)) {
7341 __this_cpu_inc(perf_throttled_count);
7342 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
7343 hwc->interrupts = MAX_INTERRUPTS;
7344 perf_log_throttle(event, 0);
7345 ret = 1;
7346 }
7347 }
7348
7349 if (event->attr.freq) {
7350 u64 now = perf_clock();
7351 s64 delta = now - hwc->freq_time_stamp;
7352
7353 hwc->freq_time_stamp = now;
7354
7355 if (delta > 0 && delta < 2*TICK_NSEC)
7356 perf_adjust_period(event, delta, hwc->last_period, true);
7357 }
7358
7359 return ret;
7360}
7361
7362int perf_event_account_interrupt(struct perf_event *event)
7363{
7364 return __perf_event_account_interrupt(event, 1);
7365}
7366
7367
/*
 * Generic event overflow handling, sampling.
 */
7371static int __perf_event_overflow(struct perf_event *event,
7372 int throttle, struct perf_sample_data *data,
7373 struct pt_regs *regs)
7374{
7375 int events = atomic_read(&event->event_limit);
7376 int ret = 0;
7377
7378
 /*
  * Non-sampling counters may still use the PMI to fold short
  * hardware counters; nothing to do for them here.
  */
7382 if (unlikely(!is_sampling_event(event)))
7383 return 0;
7384
7385 ret = __perf_event_account_interrupt(event, throttle);
7386
7387
 /*
  * Signal the fd owner; once the user-specified event_limit is
  * exhausted, switch to POLL_HUP and disable the event.
  */
7392 event->pending_kill = POLL_IN;
7393 if (events && atomic_dec_and_test(&event->event_limit)) {
7394 ret = 1;
7395 event->pending_kill = POLL_HUP;
7396
7397 perf_event_disable_inatomic(event);
7398 }
7399
7400 READ_ONCE(event->overflow_handler)(event, data, regs);
7401
7402 if (*perf_event_fasync(event) && event->pending_kill) {
7403 event->pending_wakeup = 1;
7404 irq_work_queue(&event->pending);
7405 }
7406
7407 return ret;
7408}
7409
7410int perf_event_overflow(struct perf_event *event,
7411 struct perf_sample_data *data,
7412 struct pt_regs *regs)
7413{
7414 return __perf_event_overflow(event, 1, data, regs);
7415}
7416
/*
 * Generic software event infrastructure.
 */
7421struct swevent_htable {
7422 struct swevent_hlist *swevent_hlist;
7423 struct mutex hlist_mutex;
7424 int hlist_refcount;
7425
 /* Recursion avoidance in each context */
7427 int recursion[PERF_NR_CONTEXTS];
7428};
7429
7430static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
7431
7432
/*
 * Software events count in event->count directly and keep the
 * remainder until the next overflow in hw.period_left; return how
 * many sample periods have elapsed since the last call.
 */
7439u64 perf_swevent_set_period(struct perf_event *event)
7440{
7441 struct hw_perf_event *hwc = &event->hw;
7442 u64 period = hwc->last_period;
7443 u64 nr, offset;
7444 s64 old, val;
7445
7446 hwc->last_period = hwc->sample_period;
7447
7448again:
7449 old = val = local64_read(&hwc->period_left);
7450 if (val < 0)
7451 return 0;
7452
7453 nr = div64_u64(period + val, period);
7454 offset = nr * period;
7455 val -= offset;
7456 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
7457 goto again;
7458
7459 return nr;
7460}
7461
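/*
 * Handle @overflow elapsed periods for a software event, stopping
 * early if the event gets throttled or hits its limit.
 */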
7462static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
7463 struct perf_sample_data *data,
7464 struct pt_regs *regs)
7465{
7466 struct hw_perf_event *hwc = &event->hw;
7467 int throttle = 0;
7468
7469 if (!overflow)
7470 overflow = perf_swevent_set_period(event);
7471
7472 if (hwc->interrupts == MAX_INTERRUPTS)
7473 return;
7474
7475 for (; overflow; overflow--) {
7476 if (__perf_event_overflow(event, throttle,
7477 data, regs)) {
7478
 /*
  * The event was throttled or hit its limit; stop
  * processing the remaining overflows.
  */
7482 break;
7483 }
7484 throttle = 1;
7485 }
7486}
7487
7488static void perf_swevent_event(struct perf_event *event, u64 nr,
7489 struct perf_sample_data *data,
7490 struct pt_regs *regs)
7491{
7492 struct hw_perf_event *hwc = &event->hw;
7493
7494 local64_add(nr, &event->count);
7495
7496 if (!regs)
7497 return;
7498
7499 if (!is_sampling_event(event))
7500 return;
7501
7502 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
7503 data->period = nr;
7504 return perf_swevent_overflow(event, 1, data, regs);
7505 } else
7506 data->period = event->hw.last_period;
7507
7508 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
7509 return perf_swevent_overflow(event, 1, data, regs);
7510
7511 if (local64_add_negative(nr, &hwc->period_left))
7512 return;
7513
7514 perf_swevent_overflow(event, 0, data, regs);
7515}
7516
7517static int perf_exclude_event(struct perf_event *event,
7518 struct pt_regs *regs)
7519{
7520 if (event->hw.state & PERF_HES_STOPPED)
7521 return 1;
7522
7523 if (regs) {
7524 if (event->attr.exclude_user && user_mode(regs))
7525 return 1;
7526
7527 if (event->attr.exclude_kernel && !user_mode(regs))
7528 return 1;
7529 }
7530
7531 return 0;
7532}
7533
7534static int perf_swevent_match(struct perf_event *event,
7535 enum perf_type_id type,
7536 u32 event_id,
7537 struct perf_sample_data *data,
7538 struct pt_regs *regs)
7539{
7540 if (event->attr.type != type)
7541 return 0;
7542
7543 if (event->attr.config != event_id)
7544 return 0;
7545
7546 if (perf_exclude_event(event, regs))
7547 return 0;
7548
7549 return 1;
7550}
7551
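/*
 * Hash (type, event_id) to pick a bucket in the software event
 * hash list.
 */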
7552static inline u64 swevent_hash(u64 type, u32 event_id)
7553{
7554 u64 val = event_id | (type << 32);
7555
7556 return hash_64(val, SWEVENT_HLIST_BITS);
7557}
7558
7559static inline struct hlist_head *
7560__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
7561{
7562 u64 hash = swevent_hash(type, event_id);
7563
7564 return &hlist->heads[hash];
7565}
7566
7567
7568static inline struct hlist_head *
7569find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
7570{
7571 struct swevent_hlist *hlist;
7572
7573 hlist = rcu_dereference(swhash->swevent_hlist);
7574 if (!hlist)
7575 return NULL;
7576
7577 return __find_swevent_head(hlist, type, event_id);
7578}
7579
7580
7581static inline struct hlist_head *
7582find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
7583{
7584 struct swevent_hlist *hlist;
7585 u32 event_id = event->attr.config;
7586 u64 type = event->attr.type;
7587
 /*
  * Holding ctx->lock serializes against hlist allocation and
  * release, so the lockdep-protected dereference is sufficient.
  */
7593 hlist = rcu_dereference_protected(swhash->swevent_hlist,
7594 lockdep_is_held(&event->ctx->lock));
7595 if (!hlist)
7596 return NULL;
7597
7598 return __find_swevent_head(hlist, type, event_id);
7599}
7600
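/*
 * Deliver a software event to every matching event hashed on this
 * CPU's swevent hlist.
 */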
7601static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
7602 u64 nr,
7603 struct perf_sample_data *data,
7604 struct pt_regs *regs)
7605{
7606 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7607 struct perf_event *event;
7608 struct hlist_head *head;
7609
7610 rcu_read_lock();
7611 head = find_swevent_head_rcu(swhash, type, event_id);
7612 if (!head)
7613 goto end;
7614
7615 hlist_for_each_entry_rcu(event, head, hlist_entry) {
7616 if (perf_swevent_match(event, type, event_id, data, regs))
7617 perf_swevent_event(event, nr, data, regs);
7618 }
7619end:
7620 rcu_read_unlock();
7621}
7622
7623DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
7624
7625int perf_swevent_get_recursion_context(void)
7626{
7627 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7628
7629 return get_recursion_context(swhash->recursion);
7630}
7631EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
7632
7633void perf_swevent_put_recursion_context(int rctx)
7634{
7635 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7636
7637 put_recursion_context(swhash->recursion, rctx);
7638}
7639
7640void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7641{
7642 struct perf_sample_data data;
7643
7644 if (WARN_ON_ONCE(!regs))
7645 return;
7646
7647 perf_sample_data_init(&data, addr, 0);
7648 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
7649}
7650
7651void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7652{
7653 int rctx;
7654
7655 preempt_disable_notrace();
7656 rctx = perf_swevent_get_recursion_context();
7657 if (unlikely(rctx < 0))
7658 goto fail;
7659
7660 ___perf_sw_event(event_id, nr, regs, addr);
7661
7662 perf_swevent_put_recursion_context(rctx);
7663fail:
7664 preempt_enable_notrace();
7665}
7666
7667static void perf_swevent_read(struct perf_event *event)
7668{
7669}
7670
7671static int perf_swevent_add(struct perf_event *event, int flags)
7672{
7673 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7674 struct hw_perf_event *hwc = &event->hw;
7675 struct hlist_head *head;
7676
7677 if (is_sampling_event(event)) {
7678 hwc->last_period = hwc->sample_period;
7679 perf_swevent_set_period(event);
7680 }
7681
7682 hwc->state = !(flags & PERF_EF_START);
7683
7684 head = find_swevent_head(swhash, event);
7685 if (WARN_ON_ONCE(!head))
7686 return -EINVAL;
7687
7688 hlist_add_head_rcu(&event->hlist_entry, head);
7689 perf_event_update_userpage(event);
7690
7691 return 0;
7692}
7693
7694static void perf_swevent_del(struct perf_event *event, int flags)
7695{
7696 hlist_del_rcu(&event->hlist_entry);
7697}
7698
7699static void perf_swevent_start(struct perf_event *event, int flags)
7700{
7701 event->hw.state = 0;
7702}
7703
7704static void perf_swevent_stop(struct perf_event *event, int flags)
7705{
7706 event->hw.state = PERF_HES_STOPPED;
7707}
7708
7709
7710static inline struct swevent_hlist *
7711swevent_hlist_deref(struct swevent_htable *swhash)
7712{
7713 return rcu_dereference_protected(swhash->swevent_hlist,
7714 lockdep_is_held(&swhash->hlist_mutex));
7715}
7716
7717static void swevent_hlist_release(struct swevent_htable *swhash)
7718{
7719 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
7720
7721 if (!hlist)
7722 return;
7723
7724 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
7725 kfree_rcu(hlist, rcu_head);
7726}
7727
7728static void swevent_hlist_put_cpu(int cpu)
7729{
7730 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7731
7732 mutex_lock(&swhash->hlist_mutex);
7733
7734 if (!--swhash->hlist_refcount)
7735 swevent_hlist_release(swhash);
7736
7737 mutex_unlock(&swhash->hlist_mutex);
7738}
7739
7740static void swevent_hlist_put(void)
7741{
7742 int cpu;
7743
7744 for_each_possible_cpu(cpu)
7745 swevent_hlist_put_cpu(cpu);
7746}
7747
7748static int swevent_hlist_get_cpu(int cpu)
7749{
7750 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7751 int err = 0;
7752
7753 mutex_lock(&swhash->hlist_mutex);
7754 if (!swevent_hlist_deref(swhash) &&
7755 cpumask_test_cpu(cpu, perf_online_mask)) {
7756 struct swevent_hlist *hlist;
7757
7758 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
7759 if (!hlist) {
7760 err = -ENOMEM;
7761 goto exit;
7762 }
7763 rcu_assign_pointer(swhash->swevent_hlist, hlist);
7764 }
7765 swhash->hlist_refcount++;
7766exit:
7767 mutex_unlock(&swhash->hlist_mutex);
7768
7769 return err;
7770}
7771
7772static int swevent_hlist_get(void)
7773{
7774 int err, cpu, failed_cpu;
7775
7776 mutex_lock(&pmus_lock);
7777 for_each_possible_cpu(cpu) {
7778 err = swevent_hlist_get_cpu(cpu);
7779 if (err) {
7780 failed_cpu = cpu;
7781 goto fail;
7782 }
7783 }
7784 mutex_unlock(&pmus_lock);
7785 return 0;
7786fail:
7787 for_each_possible_cpu(cpu) {
7788 if (cpu == failed_cpu)
7789 break;
7790 swevent_hlist_put_cpu(cpu);
7791 }
7792 mutex_unlock(&pmus_lock);
7793 return err;
7794}
7795
7796struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
7797
7798static void sw_perf_event_destroy(struct perf_event *event)
7799{
7800 u64 event_id = event->attr.config;
7801
7802 WARN_ON(event->parent);
7803
7804 static_key_slow_dec(&perf_swevent_enabled[event_id]);
7805 swevent_hlist_put();
7806}
7807
7808static int perf_swevent_init(struct perf_event *event)
7809{
7810 u64 event_id = event->attr.config;
7811
7812 if (event->attr.type != PERF_TYPE_SOFTWARE)
7813 return -ENOENT;
7814
7815
 /* no branch sampling for software events */
7818 if (has_branch_stack(event))
7819 return -EOPNOTSUPP;
7820
7821 switch (event_id) {
7822 case PERF_COUNT_SW_CPU_CLOCK:
7823 case PERF_COUNT_SW_TASK_CLOCK:
7824 return -ENOENT;
7825
7826 default:
7827 break;
7828 }
7829
7830 if (event_id >= PERF_COUNT_SW_MAX)
7831 return -ENOENT;
7832
7833 if (!event->parent) {
7834 int err;
7835
7836 err = swevent_hlist_get();
7837 if (err)
7838 return err;
7839
7840 static_key_slow_inc(&perf_swevent_enabled[event_id]);
7841 event->destroy = sw_perf_event_destroy;
7842 }
7843
7844 return 0;
7845}
7846
7847static struct pmu perf_swevent = {
7848 .task_ctx_nr = perf_sw_context,
7849
7850 .capabilities = PERF_PMU_CAP_NO_NMI,
7851
7852 .event_init = perf_swevent_init,
7853 .add = perf_swevent_add,
7854 .del = perf_swevent_del,
7855 .start = perf_swevent_start,
7856 .stop = perf_swevent_stop,
7857 .read = perf_swevent_read,
7858};
7859
7860#ifdef CONFIG_EVENT_TRACING
7861
7862static int perf_tp_filter_match(struct perf_event *event,
7863 struct perf_sample_data *data)
7864{
7865 void *record = data->raw->frag.data;
7866
	/* only top-level events have filters set */
7868 if (event->parent)
7869 event = event->parent;
7870
7871 if (likely(!event->filter) || filter_match_preds(event->filter, record))
7872 return 1;
7873 return 0;
7874}
7875
7876static int perf_tp_event_match(struct perf_event *event,
7877 struct perf_sample_data *data,
7878 struct pt_regs *regs)
7879{
7880 if (event->hw.state & PERF_HES_STOPPED)
7881 return 0;
7882
	/* all tracepoints fire from kernel context */
7885 if (event->attr.exclude_kernel)
7886 return 0;
7887
7888 if (!perf_tp_filter_match(event, data))
7889 return 0;
7890
7891 return 1;
7892}
7893
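/*
 * Entry point used by the tracepoint glue when a BPF program may be
 * attached: run the program first and bail out early if it filters the
 * event out (or if nobody is listening), otherwise hand off to
 * perf_tp_event().
 */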
7894void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
7895 struct trace_event_call *call, u64 count,
7896 struct pt_regs *regs, struct hlist_head *head,
7897 struct task_struct *task)
7898{
7899 struct bpf_prog *prog = call->prog;
7900
7901 if (prog) {
7902 *(struct pt_regs **)raw_data = regs;
7903 if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
7904 perf_swevent_put_recursion_context(rctx);
7905 return;
7906 }
7907 }
7908 perf_tp_event(call->event.type, count, raw_data, size, regs, head,
7909 rctx, task, NULL);
7910}
7911EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
7912
7913void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
7914 struct pt_regs *regs, struct hlist_head *head, int rctx,
7915 struct task_struct *task, struct perf_event *event)
7916{
7917 struct perf_sample_data data;
7918
7919 struct perf_raw_record raw = {
7920 .frag = {
7921 .size = entry_size,
7922 .data = record,
7923 },
7924 };
7925
7926 perf_sample_data_init(&data, 0, 0);
7927 data.raw = &raw;
7928
7929 perf_trace_buf_update(record, event_type);
7930
7931
7932 if (event) {
7933 if (perf_tp_event_match(event, &data, regs))
7934 perf_swevent_event(event, count, &data, regs);
7935 } else {
7936 hlist_for_each_entry_rcu(event, head, hlist_entry) {
7937 if (perf_tp_event_match(event, &data, regs))
7938 perf_swevent_event(event, count, &data, regs);
7939 }
7940 }
7941
	/*
	 * If a target task was given, also deliver this event to any
	 * matching tracepoint events in that task's context.
	 */
7946 if (task && task != current) {
7947 struct perf_event_context *ctx;
7948 struct trace_entry *entry = record;
7949
7950 rcu_read_lock();
7951 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
7952 if (!ctx)
7953 goto unlock;
7954
7955 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7956 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7957 continue;
7958 if (event->attr.config != entry->type)
7959 continue;
7960 if (perf_tp_event_match(event, &data, regs))
7961 perf_swevent_event(event, count, &data, regs);
7962 }
7963unlock:
7964 rcu_read_unlock();
7965 }
7966
7967 perf_swevent_put_recursion_context(rctx);
7968}
7969EXPORT_SYMBOL_GPL(perf_tp_event);
7970
7971static void tp_perf_event_destroy(struct perf_event *event)
7972{
7973 perf_trace_destroy(event);
7974}
7975
7976static int perf_tp_event_init(struct perf_event *event)
7977{
7978 int err;
7979
7980 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7981 return -ENOENT;
7982
7983
7984
7985
7986 if (has_branch_stack(event))
7987 return -EOPNOTSUPP;
7988
7989 err = perf_trace_init(event);
7990 if (err)
7991 return err;
7992
7993 event->destroy = tp_perf_event_destroy;
7994
7995 return 0;
7996}
7997
7998static struct pmu perf_tracepoint = {
7999 .task_ctx_nr = perf_sw_context,
8000
8001 .event_init = perf_tp_event_init,
8002 .add = perf_trace_add,
8003 .del = perf_trace_del,
8004 .start = perf_swevent_start,
8005 .stop = perf_swevent_stop,
8006 .read = perf_swevent_read,
8007};
8008
8009static inline void perf_tp_register(void)
8010{
8011 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
8012}
8013
8014static void perf_event_free_filter(struct perf_event *event)
8015{
8016 ftrace_profile_free_filter(event);
8017}
8018
8019#ifdef CONFIG_BPF_SYSCALL
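/*
 * Overflow handler wrapper used when a BPF program is attached to a
 * non-tracepoint event: the program runs on every overflow and its return
 * value decides whether the original overflow handler is invoked.
 */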
8020static void bpf_overflow_handler(struct perf_event *event,
8021 struct perf_sample_data *data,
8022 struct pt_regs *regs)
8023{
8024 struct bpf_perf_event_data_kern ctx = {
8025 .data = data,
8026 .regs = regs,
8027 };
8028 int ret = 0;
8029
8030 preempt_disable();
8031 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
8032 goto out;
8033 rcu_read_lock();
8034 ret = BPF_PROG_RUN(event->prog, &ctx);
8035 rcu_read_unlock();
8036out:
8037 __this_cpu_dec(bpf_prog_active);
8038 preempt_enable();
8039 if (!ret)
8040 return;
8041
8042 event->orig_overflow_handler(event, data, regs);
8043}
8044
8045static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8046{
8047 struct bpf_prog *prog;
8048
8049 if (event->overflow_handler_context)
		/* hw breakpoint or kernel counter */
8051 return -EINVAL;
8052
8053 if (event->prog)
8054 return -EEXIST;
8055
8056 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
8057 if (IS_ERR(prog))
8058 return PTR_ERR(prog);
8059
8060 event->prog = prog;
8061 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
8062 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
8063 return 0;
8064}
8065
8066static void perf_event_free_bpf_handler(struct perf_event *event)
8067{
8068 struct bpf_prog *prog = event->prog;
8069
8070 if (!prog)
8071 return;
8072
8073 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
8074 event->prog = NULL;
8075 bpf_prog_put(prog);
8076}
8077#else
8078static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8079{
8080 return -EOPNOTSUPP;
8081}
8082static void perf_event_free_bpf_handler(struct perf_event *event)
8083{
8084}
8085#endif
8086
8087static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8088{
8089 bool is_kprobe, is_tracepoint;
8090 struct bpf_prog *prog;
8091
8092 if (event->attr.type != PERF_TYPE_TRACEPOINT)
8093 return perf_event_set_bpf_handler(event, prog_fd);
8094
8095 if (event->tp_event->prog)
8096 return -EEXIST;
8097
8098 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
8099 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
8100 if (!is_kprobe && !is_tracepoint)
		/* bpf programs can only be attached to u/kprobes or tracepoints */
8102 return -EINVAL;
8103
8104 prog = bpf_prog_get(prog_fd);
8105 if (IS_ERR(prog))
8106 return PTR_ERR(prog);
8107
8108 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
8109 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
		/* valid fd, but invalid bpf program type */
8111 bpf_prog_put(prog);
8112 return -EINVAL;
8113 }
8114
8115 if (is_tracepoint) {
8116 int off = trace_event_get_offsets(event->tp_event);
8117
8118 if (prog->aux->max_ctx_offset > off) {
8119 bpf_prog_put(prog);
8120 return -EACCES;
8121 }
8122 }
8123 event->tp_event->prog = prog;
8124
8125 return 0;
8126}
8127
8128static void perf_event_free_bpf_prog(struct perf_event *event)
8129{
8130 struct bpf_prog *prog;
8131
8132 perf_event_free_bpf_handler(event);
8133
8134 if (!event->tp_event)
8135 return;
8136
8137 prog = event->tp_event->prog;
8138 if (prog) {
8139 event->tp_event->prog = NULL;
8140 bpf_prog_put(prog);
8141 }
8142}
8143
8144#else
8145
8146static inline void perf_tp_register(void)
8147{
8148}
8149
8150static void perf_event_free_filter(struct perf_event *event)
8151{
8152}
8153
8154static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8155{
8156 return -ENOENT;
8157}
8158
8159static void perf_event_free_bpf_prog(struct perf_event *event)
8160{
8161}
8162#endif
8163
8164#ifdef CONFIG_HAVE_HW_BREAKPOINT
8165void perf_bp_event(struct perf_event *bp, void *data)
8166{
8167 struct perf_sample_data sample;
8168 struct pt_regs *regs = data;
8169
8170 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
8171
8172 if (!bp->hw.state && !perf_exclude_event(bp, regs))
8173 perf_swevent_event(bp, 1, &sample, regs);
8174}
8175#endif
8176
/*
 * Allocate a new address filter and add it to the tail of @filters.
 */
8180static struct perf_addr_filter *
8181perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
8182{
8183 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
8184 struct perf_addr_filter *filter;
8185
8186 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
8187 if (!filter)
8188 return NULL;
8189
8190 INIT_LIST_HEAD(&filter->entry);
8191 list_add_tail(&filter->entry, filters);
8192
8193 return filter;
8194}
8195
8196static void free_filters_list(struct list_head *filters)
8197{
8198 struct perf_addr_filter *filter, *iter;
8199
8200 list_for_each_entry_safe(filter, iter, filters, entry) {
8201 if (filter->inode)
8202 iput(filter->inode);
8203 list_del(&filter->entry);
8204 kfree(filter);
8205 }
8206}
8207
/*
 * Free existing address filters and optionally install new ones.
 */
8211static void perf_addr_filters_splice(struct perf_event *event,
8212 struct list_head *head)
8213{
8214 unsigned long flags;
8215 LIST_HEAD(list);
8216
8217 if (!has_addr_filter(event))
8218 return;
8219
8220
8221 if (event->parent)
8222 return;
8223
8224 raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
8225
8226 list_splice_init(&event->addr_filters.list, &list);
8227 if (head)
8228 list_splice(head, &event->addr_filters.list);
8229
8230 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
8231
8232 free_filters_list(&list);
8233}
8234
/*
 * Scan through mm's vmas and see if one of them matches the
 * @filter; if so, return the matching vma's start address.
 */
8240static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
8241 struct mm_struct *mm)
8242{
8243 struct vm_area_struct *vma;
8244
8245 for (vma = mm->mmap; vma; vma = vma->vm_next) {
8246 struct file *file = vma->vm_file;
8247 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8248 unsigned long vma_size = vma->vm_end - vma->vm_start;
8249
8250 if (!file)
8251 continue;
8252
8253 if (!perf_addr_filter_match(filter, file, off, vma_size))
8254 continue;
8255
8256 return vma->vm_start;
8257 }
8258
8259 return 0;
8260}
8261
/*
 * Update event's address range filters based on the
 * task's existing mappings, if any.
 */
8266static void perf_event_addr_filters_apply(struct perf_event *event)
8267{
8268 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8269 struct task_struct *task = READ_ONCE(event->ctx->task);
8270 struct perf_addr_filter *filter;
8271 struct mm_struct *mm = NULL;
8272 unsigned int count = 0;
8273 unsigned long flags;
8274
	/*
	 * We may observe TASK_TOMBSTONE, which means the event is being
	 * torn down; there is nothing to update in that case.
	 */
8279 if (task == TASK_TOMBSTONE)
8280 return;
8281
8282 if (!ifh->nr_file_filters)
8283 return;
8284
8285 mm = get_task_mm(event->ctx->task);
8286 if (!mm)
8287 goto restart;
8288
8289 down_read(&mm->mmap_sem);
8290
8291 raw_spin_lock_irqsave(&ifh->lock, flags);
8292 list_for_each_entry(filter, &ifh->list, entry) {
8293 event->addr_filters_offs[count] = 0;
8294
8295
8296
8297
8298
8299 if (filter->inode)
8300 event->addr_filters_offs[count] =
8301 perf_addr_filter_apply(filter, mm);
8302
8303 count++;
8304 }
8305
8306 event->addr_filters_gen++;
8307 raw_spin_unlock_irqrestore(&ifh->lock, flags);
8308
8309 up_read(&mm->mmap_sem);
8310
8311 mmput(mm);
8312
8313restart:
8314 perf_event_stop(event, 1);
8315}
8316
/*
 * Address range filtering: limiting the data to certain
 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ascii strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region;
 * RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * if <size> is not specified, the range is treated as a single address.
 */
8335enum {
8336 IF_ACT_NONE = -1,
8337 IF_ACT_FILTER,
8338 IF_ACT_START,
8339 IF_ACT_STOP,
8340 IF_SRC_FILE,
8341 IF_SRC_KERNEL,
8342 IF_SRC_FILEADDR,
8343 IF_SRC_KERNELADDR,
8344};
8345
8346enum {
8347 IF_STATE_ACTION = 0,
8348 IF_STATE_SOURCE,
8349 IF_STATE_END,
8350};
8351
8352static const match_table_t if_tokens = {
8353 { IF_ACT_FILTER, "filter" },
8354 { IF_ACT_START, "start" },
8355 { IF_ACT_STOP, "stop" },
8356 { IF_SRC_FILE, "%u/%u@%s" },
8357 { IF_SRC_KERNEL, "%u/%u" },
8358 { IF_SRC_FILEADDR, "%u@%s" },
8359 { IF_SRC_KERNELADDR, "%u" },
8360 { IF_ACT_NONE, NULL },
8361};
8362
/*
 * Address filter string parser
 */
8366static int
8367perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8368 struct list_head *filters)
8369{
8370 struct perf_addr_filter *filter = NULL;
8371 char *start, *orig, *filename = NULL;
8372 struct path path;
8373 substring_t args[MAX_OPT_ARGS];
8374 int state = IF_STATE_ACTION, token;
8375 unsigned int kernel = 0;
8376 int ret = -EINVAL;
8377
8378 orig = fstr = kstrdup(fstr, GFP_KERNEL);
8379 if (!fstr)
8380 return -ENOMEM;
8381
8382 while ((start = strsep(&fstr, " ,\n")) != NULL) {
8383 ret = -EINVAL;
8384
8385 if (!*start)
8386 continue;
8387
		/* filter definition begins */
8389 if (state == IF_STATE_ACTION) {
8390 filter = perf_addr_filter_new(event, filters);
8391 if (!filter)
8392 goto fail;
8393 }
8394
8395 token = match_token(start, if_tokens, args);
8396 switch (token) {
8397 case IF_ACT_FILTER:
8398 case IF_ACT_START:
8399 filter->filter = 1;
			/* fall through */
8401 case IF_ACT_STOP:
8402 if (state != IF_STATE_ACTION)
8403 goto fail;
8404
8405 state = IF_STATE_SOURCE;
8406 break;
8407
8408 case IF_SRC_KERNELADDR:
8409 case IF_SRC_KERNEL:
8410 kernel = 1;
			/* fall through */
8412 case IF_SRC_FILEADDR:
8413 case IF_SRC_FILE:
8414 if (state != IF_STATE_SOURCE)
8415 goto fail;
8416
8417 if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
8418 filter->range = 1;
8419
8420 *args[0].to = 0;
8421 ret = kstrtoul(args[0].from, 0, &filter->offset);
8422 if (ret)
8423 goto fail;
8424
8425 if (filter->range) {
8426 *args[1].to = 0;
8427 ret = kstrtoul(args[1].from, 0, &filter->size);
8428 if (ret)
8429 goto fail;
8430 }
8431
8432 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
8433 int fpos = filter->range ? 2 : 1;
8434
8435 filename = match_strdup(&args[fpos]);
8436 if (!filename) {
8437 ret = -ENOMEM;
8438 goto fail;
8439 }
8440 }
8441
8442 state = IF_STATE_END;
8443 break;
8444
8445 default:
8446 goto fail;
8447 }
8448
		/*
		 * Filter definition is fully parsed; validate it and make
		 * sure it doesn't contradict itself or the event's attributes.
		 */
8454 if (state == IF_STATE_END) {
8455 ret = -EINVAL;
8456 if (kernel && event->attr.exclude_kernel)
8457 goto fail;
8458
8459 if (!kernel) {
8460 if (!filename)
8461 goto fail;
8462
				/*
				 * For now, file-based filters are only allowed
				 * for per-task events; CPU-wide events would
				 * need extra context-switch work, since the
				 * same object code is mapped at different
				 * virtual addresses in different processes.
				 */
8471 ret = -EOPNOTSUPP;
8472 if (!event->ctx->task)
8473 goto fail_free_name;
8474
8475
8476 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
8477 if (ret)
8478 goto fail_free_name;
8479
8480 filter->inode = igrab(d_inode(path.dentry));
8481 path_put(&path);
8482 kfree(filename);
8483 filename = NULL;
8484
8485 ret = -EINVAL;
8486 if (!filter->inode ||
8487 !S_ISREG(filter->inode->i_mode))
8488
8489 goto fail;
8490
8491 event->addr_filters.nr_file_filters++;
8492 }
8493
8494
8495 state = IF_STATE_ACTION;
8496 filter = NULL;
8497 }
8498 }
8499
8500 if (state != IF_STATE_ACTION)
8501 goto fail;
8502
8503 kfree(orig);
8504
8505 return 0;
8506
8507fail_free_name:
8508 kfree(filename);
8509fail:
8510 free_filters_list(filters);
8511 kfree(orig);
8512
8513 return ret;
8514}
8515
8516static int
8517perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
8518{
8519 LIST_HEAD(filters);
8520 int ret;
8521
	/*
	 * Since this is called from the perf_ioctl() path, we're already
	 * holding ctx::mutex.
	 */
8526 lockdep_assert_held(&event->ctx->mutex);
8527
8528 if (WARN_ON_ONCE(event->parent))
8529 return -EINVAL;
8530
8531 ret = perf_event_parse_addr_filter(event, filter_str, &filters);
8532 if (ret)
8533 goto fail_clear_files;
8534
8535 ret = event->pmu->addr_filters_validate(&filters);
8536 if (ret)
8537 goto fail_free_filters;
8538
8539
8540 perf_addr_filters_splice(event, &filters);
8541
8542
8543 perf_event_for_each_child(event, perf_event_addr_filters_apply);
8544
8545 return ret;
8546
8547fail_free_filters:
8548 free_filters_list(&filters);
8549
8550fail_clear_files:
8551 event->addr_filters.nr_file_filters = 0;
8552
8553 return ret;
8554}
8555
8556static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8557{
8558 char *filter_str;
8559 int ret = -EINVAL;
8560
8561 if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
8562 !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
8563 !has_addr_filter(event))
8564 return -EINVAL;
8565
8566 filter_str = strndup_user(arg, PAGE_SIZE);
8567 if (IS_ERR(filter_str))
8568 return PTR_ERR(filter_str);
8569
8570 if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
8571 event->attr.type == PERF_TYPE_TRACEPOINT)
8572 ret = ftrace_profile_set_filter(event, event->attr.config,
8573 filter_str);
8574 else if (has_addr_filter(event))
8575 ret = perf_event_set_addr_filter(event, filter_str);
8576
8577 kfree(filter_str);
8578 return ret;
8579}
8580
/*
 * hrtimer based swevent callback
 */
8585static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
8586{
8587 enum hrtimer_restart ret = HRTIMER_RESTART;
8588 struct perf_sample_data data;
8589 struct pt_regs *regs;
8590 struct perf_event *event;
8591 u64 period;
8592
8593 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
8594
8595 if (event->state != PERF_EVENT_STATE_ACTIVE)
8596 return HRTIMER_NORESTART;
8597
8598 event->pmu->read(event);
8599
8600 perf_sample_data_init(&data, 0, event->hw.last_period);
8601 regs = get_irq_regs();
8602
8603 if (regs && !perf_exclude_event(event, regs)) {
8604 if (!(event->attr.exclude_idle && is_idle_task(current)))
8605 if (__perf_event_overflow(event, 1, &data, regs))
8606 ret = HRTIMER_NORESTART;
8607 }
8608
8609 period = max_t(u64, 10000, event->hw.sample_period);
8610 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
8611
8612 return ret;
8613}
8614
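/*
 * Sampling software events use an hrtimer as their "PMU interrupt".
 * start/cancel preserve the remaining time in period_left so that a
 * stop/start cycle does not reset the distance to the next sample.
 */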
8615static void perf_swevent_start_hrtimer(struct perf_event *event)
8616{
8617 struct hw_perf_event *hwc = &event->hw;
8618 s64 period;
8619
8620 if (!is_sampling_event(event))
8621 return;
8622
8623 period = local64_read(&hwc->period_left);
8624 if (period) {
8625 if (period < 0)
8626 period = 10000;
8627
8628 local64_set(&hwc->period_left, 0);
8629 } else {
8630 period = max_t(u64, 10000, hwc->sample_period);
8631 }
8632 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
8633 HRTIMER_MODE_REL_PINNED);
8634}
8635
8636static void perf_swevent_cancel_hrtimer(struct perf_event *event)
8637{
8638 struct hw_perf_event *hwc = &event->hw;
8639
8640 if (is_sampling_event(event)) {
8641 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
8642 local64_set(&hwc->period_left, ktime_to_ns(remaining));
8643
8644 hrtimer_cancel(&hwc->hrtimer);
8645 }
8646}
8647
8648static void perf_swevent_init_hrtimer(struct perf_event *event)
8649{
8650 struct hw_perf_event *hwc = &event->hw;
8651
8652 if (!is_sampling_event(event))
8653 return;
8654
8655 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
8656 hwc->hrtimer.function = perf_swevent_hrtimer;
8657
	/*
	 * Since hrtimers have a fixed rate, we can do a static freq->period
	 * mapping and avoid the whole period adjust feedback stuff.
	 */
8662 if (event->attr.freq) {
8663 long freq = event->attr.sample_freq;
8664
8665 event->attr.sample_period = NSEC_PER_SEC / freq;
8666 hwc->sample_period = event->attr.sample_period;
8667 local64_set(&hwc->period_left, hwc->sample_period);
8668 hwc->last_period = hwc->sample_period;
8669 event->attr.freq = 0;
8670 }
8671}
8672
/*
 * Software event: cpu wall time clock
 */
8677static void cpu_clock_event_update(struct perf_event *event)
8678{
8679 s64 prev;
8680 u64 now;
8681
8682 now = local_clock();
8683 prev = local64_xchg(&event->hw.prev_count, now);
8684 local64_add(now - prev, &event->count);
8685}
8686
8687static void cpu_clock_event_start(struct perf_event *event, int flags)
8688{
8689 local64_set(&event->hw.prev_count, local_clock());
8690 perf_swevent_start_hrtimer(event);
8691}
8692
8693static void cpu_clock_event_stop(struct perf_event *event, int flags)
8694{
8695 perf_swevent_cancel_hrtimer(event);
8696 cpu_clock_event_update(event);
8697}
8698
8699static int cpu_clock_event_add(struct perf_event *event, int flags)
8700{
8701 if (flags & PERF_EF_START)
8702 cpu_clock_event_start(event, flags);
8703 perf_event_update_userpage(event);
8704
8705 return 0;
8706}
8707
8708static void cpu_clock_event_del(struct perf_event *event, int flags)
8709{
8710 cpu_clock_event_stop(event, flags);
8711}
8712
8713static void cpu_clock_event_read(struct perf_event *event)
8714{
8715 cpu_clock_event_update(event);
8716}
8717
8718static int cpu_clock_event_init(struct perf_event *event)
8719{
8720 if (event->attr.type != PERF_TYPE_SOFTWARE)
8721 return -ENOENT;
8722
8723 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
8724 return -ENOENT;
8725
	/*
	 * no branch sampling for software events
	 */
8729 if (has_branch_stack(event))
8730 return -EOPNOTSUPP;
8731
8732 perf_swevent_init_hrtimer(event);
8733
8734 return 0;
8735}
8736
8737static struct pmu perf_cpu_clock = {
8738 .task_ctx_nr = perf_sw_context,
8739
8740 .capabilities = PERF_PMU_CAP_NO_NMI,
8741
8742 .event_init = cpu_clock_event_init,
8743 .add = cpu_clock_event_add,
8744 .del = cpu_clock_event_del,
8745 .start = cpu_clock_event_start,
8746 .stop = cpu_clock_event_stop,
8747 .read = cpu_clock_event_read,
8748};
8749
/*
 * Software event: task time clock
 */
8754static void task_clock_event_update(struct perf_event *event, u64 now)
8755{
8756 u64 prev;
8757 s64 delta;
8758
8759 prev = local64_xchg(&event->hw.prev_count, now);
8760 delta = now - prev;
8761 local64_add(delta, &event->count);
8762}
8763
8764static void task_clock_event_start(struct perf_event *event, int flags)
8765{
8766 local64_set(&event->hw.prev_count, event->ctx->time);
8767 perf_swevent_start_hrtimer(event);
8768}
8769
8770static void task_clock_event_stop(struct perf_event *event, int flags)
8771{
8772 perf_swevent_cancel_hrtimer(event);
8773 task_clock_event_update(event, event->ctx->time);
8774}
8775
8776static int task_clock_event_add(struct perf_event *event, int flags)
8777{
8778 if (flags & PERF_EF_START)
8779 task_clock_event_start(event, flags);
8780 perf_event_update_userpage(event);
8781
8782 return 0;
8783}
8784
8785static void task_clock_event_del(struct perf_event *event, int flags)
8786{
8787 task_clock_event_stop(event, PERF_EF_UPDATE);
8788}
8789
8790static void task_clock_event_read(struct perf_event *event)
8791{
8792 u64 now = perf_clock();
8793 u64 delta = now - event->ctx->timestamp;
8794 u64 time = event->ctx->time + delta;
8795
8796 task_clock_event_update(event, time);
8797}
8798
8799static int task_clock_event_init(struct perf_event *event)
8800{
8801 if (event->attr.type != PERF_TYPE_SOFTWARE)
8802 return -ENOENT;
8803
8804 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
8805 return -ENOENT;
8806
	/*
	 * no branch sampling for software events
	 */
8810 if (has_branch_stack(event))
8811 return -EOPNOTSUPP;
8812
8813 perf_swevent_init_hrtimer(event);
8814
8815 return 0;
8816}
8817
8818static struct pmu perf_task_clock = {
8819 .task_ctx_nr = perf_sw_context,
8820
8821 .capabilities = PERF_PMU_CAP_NO_NMI,
8822
8823 .event_init = task_clock_event_init,
8824 .add = task_clock_event_add,
8825 .del = task_clock_event_del,
8826 .start = task_clock_event_start,
8827 .stop = task_clock_event_stop,
8828 .read = task_clock_event_read,
8829};
8830
8831static void perf_pmu_nop_void(struct pmu *pmu)
8832{
8833}
8834
8835static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
8836{
8837}
8838
8839static int perf_pmu_nop_int(struct pmu *pmu)
8840{
8841 return 0;
8842}
8843
8844static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
8845
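/*
 * Default transaction helpers for PMUs that provide pmu_enable()/
 * pmu_disable() but no transaction callbacks: remember the transaction
 * flags per CPU and only disable/re-enable the PMU around PERF_PMU_TXN_ADD
 * transactions.
 */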
8846static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
8847{
8848 __this_cpu_write(nop_txn_flags, flags);
8849
8850 if (flags & ~PERF_PMU_TXN_ADD)
8851 return;
8852
8853 perf_pmu_disable(pmu);
8854}
8855
8856static int perf_pmu_commit_txn(struct pmu *pmu)
8857{
8858 unsigned int flags = __this_cpu_read(nop_txn_flags);
8859
8860 __this_cpu_write(nop_txn_flags, 0);
8861
8862 if (flags & ~PERF_PMU_TXN_ADD)
8863 return 0;
8864
8865 perf_pmu_enable(pmu);
8866 return 0;
8867}
8868
8869static void perf_pmu_cancel_txn(struct pmu *pmu)
8870{
8871 unsigned int flags = __this_cpu_read(nop_txn_flags);
8872
8873 __this_cpu_write(nop_txn_flags, 0);
8874
8875 if (flags & ~PERF_PMU_TXN_ADD)
8876 return;
8877
8878 perf_pmu_enable(pmu);
8879}
8880
8881static int perf_event_idx_default(struct perf_event *event)
8882{
8883 return 0;
8884}
8885
/*
 * Ensures all contexts with the same task_ctx_nr have the same
 * pmu_cpu_context too.
 */
8890static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
8891{
8892 struct pmu *pmu;
8893
8894 if (ctxn < 0)
8895 return NULL;
8896
8897 list_for_each_entry(pmu, &pmus, entry) {
8898 if (pmu->task_ctx_nr == ctxn)
8899 return pmu->pmu_cpu_context;
8900 }
8901
8902 return NULL;
8903}
8904
8905static void free_pmu_context(struct pmu *pmu)
8906{
8907 mutex_lock(&pmus_lock);
8908 free_percpu(pmu->pmu_cpu_context);
8909 mutex_unlock(&pmus_lock);
8910}
8911
8912
8913
8914
8915static ssize_t nr_addr_filters_show(struct device *dev,
8916 struct device_attribute *attr,
8917 char *page)
8918{
8919 struct pmu *pmu = dev_get_drvdata(dev);
8920
8921 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
8922}
8923DEVICE_ATTR_RO(nr_addr_filters);
8924
8925static struct idr pmu_idr;
8926
8927static ssize_t
8928type_show(struct device *dev, struct device_attribute *attr, char *page)
8929{
8930 struct pmu *pmu = dev_get_drvdata(dev);
8931
8932 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
8933}
8934static DEVICE_ATTR_RO(type);
8935
8936static ssize_t
8937perf_event_mux_interval_ms_show(struct device *dev,
8938 struct device_attribute *attr,
8939 char *page)
8940{
8941 struct pmu *pmu = dev_get_drvdata(dev);
8942
8943 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
8944}
8945
8946static DEFINE_MUTEX(mux_interval_mutex);
8947
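/*
 * Writing perf_event_mux_interval_ms updates the multiplexing interval of
 * every online CPU's context for this PMU; the new value is pushed to each
 * CPU via cpu_function_call() so the running hrtimer picks it up.
 */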
8948static ssize_t
8949perf_event_mux_interval_ms_store(struct device *dev,
8950 struct device_attribute *attr,
8951 const char *buf, size_t count)
8952{
8953 struct pmu *pmu = dev_get_drvdata(dev);
8954 int timer, cpu, ret;
8955
8956 ret = kstrtoint(buf, 0, &timer);
8957 if (ret)
8958 return ret;
8959
8960 if (timer < 1)
8961 return -EINVAL;
8962
8963
8964 if (timer == pmu->hrtimer_interval_ms)
8965 return count;
8966
8967 mutex_lock(&mux_interval_mutex);
8968 pmu->hrtimer_interval_ms = timer;
8969
8970
8971 cpus_read_lock();
8972 for_each_online_cpu(cpu) {
8973 struct perf_cpu_context *cpuctx;
8974 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
8975 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
8976
8977 cpu_function_call(cpu,
8978 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
8979 }
8980 cpus_read_unlock();
8981 mutex_unlock(&mux_interval_mutex);
8982
8983 return count;
8984}
8985static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
8986
8987static struct attribute *pmu_dev_attrs[] = {
8988 &dev_attr_type.attr,
8989 &dev_attr_perf_event_mux_interval_ms.attr,
8990 NULL,
8991};
8992ATTRIBUTE_GROUPS(pmu_dev);
8993
8994static int pmu_bus_running;
8995static struct bus_type pmu_bus = {
8996 .name = "event_source",
8997 .dev_groups = pmu_dev_groups,
8998};
8999
9000static void pmu_dev_release(struct device *dev)
9001{
9002 kfree(dev);
9003}
9004
9005static int pmu_dev_alloc(struct pmu *pmu)
9006{
9007 int ret = -ENOMEM;
9008
9009 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
9010 if (!pmu->dev)
9011 goto out;
9012
9013 pmu->dev->groups = pmu->attr_groups;
9014 device_initialize(pmu->dev);
9015 ret = dev_set_name(pmu->dev, "%s", pmu->name);
9016 if (ret)
9017 goto free_dev;
9018
9019 dev_set_drvdata(pmu->dev, pmu);
9020 pmu->dev->bus = &pmu_bus;
9021 pmu->dev->release = pmu_dev_release;
9022 ret = device_add(pmu->dev);
9023 if (ret)
9024 goto free_dev;
9025
9026
9027 if (pmu->nr_addr_filters)
9028 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
9029
9030 if (ret)
9031 goto del_dev;
9032
9033out:
9034 return ret;
9035
9036del_dev:
9037 device_del(pmu->dev);
9038
9039free_dev:
9040 put_device(pmu->dev);
9041 goto out;
9042}
9043
9044static struct lock_class_key cpuctx_mutex;
9045static struct lock_class_key cpuctx_lock;
9046
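/*
 * Register a PMU: allocate a type id and sysfs device (when a name is
 * given), set up or share the per-CPU contexts, and fill in default
 * callbacks for any optional struct pmu methods before adding it to the
 * global pmus list.
 */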
9047int perf_pmu_register(struct pmu *pmu, const char *name, int type)
9048{
9049 int cpu, ret;
9050
9051 mutex_lock(&pmus_lock);
9052 ret = -ENOMEM;
9053 pmu->pmu_disable_count = alloc_percpu(int);
9054 if (!pmu->pmu_disable_count)
9055 goto unlock;
9056
9057 pmu->type = -1;
9058 if (!name)
9059 goto skip_type;
9060 pmu->name = name;
9061
9062 if (type < 0) {
9063 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9064 if (type < 0) {
9065 ret = type;
9066 goto free_pdc;
9067 }
9068 }
9069 pmu->type = type;
9070
9071 if (pmu_bus_running) {
9072 ret = pmu_dev_alloc(pmu);
9073 if (ret)
9074 goto free_idr;
9075 }
9076
9077skip_type:
9078 if (pmu->task_ctx_nr == perf_hw_context) {
9079 static int hw_context_taken = 0;
9080
		/*
		 * Other than on systems with heterogeneous CPUs, it never
		 * makes sense for two PMUs to share perf_hw_context; a later
		 * claimant is warned about and demoted to perf_invalid_context.
		 */
9086 if (WARN_ON_ONCE(hw_context_taken &&
9087 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
9088 pmu->task_ctx_nr = perf_invalid_context;
9089
9090 hw_context_taken = 1;
9091 }
9092
9093 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
9094 if (pmu->pmu_cpu_context)
9095 goto got_cpu_context;
9096
9097 ret = -ENOMEM;
9098 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
9099 if (!pmu->pmu_cpu_context)
9100 goto free_dev;
9101
9102 for_each_possible_cpu(cpu) {
9103 struct perf_cpu_context *cpuctx;
9104
9105 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9106 __perf_event_init_context(&cpuctx->ctx);
9107 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
9108 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
9109 cpuctx->ctx.pmu = pmu;
9110 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
9111
9112 __perf_mux_hrtimer_init(cpuctx, cpu);
9113 }
9114
9115got_cpu_context:
9116 if (!pmu->start_txn) {
9117 if (pmu->pmu_enable) {
			/*
			 * If we have pmu_enable/pmu_disable calls, install
			 * transaction stubs that use those to try and batch
			 * hardware accesses.
			 */
9123 pmu->start_txn = perf_pmu_start_txn;
9124 pmu->commit_txn = perf_pmu_commit_txn;
9125 pmu->cancel_txn = perf_pmu_cancel_txn;
9126 } else {
9127 pmu->start_txn = perf_pmu_nop_txn;
9128 pmu->commit_txn = perf_pmu_nop_int;
9129 pmu->cancel_txn = perf_pmu_nop_void;
9130 }
9131 }
9132
9133 if (!pmu->pmu_enable) {
9134 pmu->pmu_enable = perf_pmu_nop_void;
9135 pmu->pmu_disable = perf_pmu_nop_void;
9136 }
9137
9138 if (!pmu->event_idx)
9139 pmu->event_idx = perf_event_idx_default;
9140
9141 list_add_rcu(&pmu->entry, &pmus);
9142 atomic_set(&pmu->exclusive_cnt, 0);
9143 ret = 0;
9144unlock:
9145 mutex_unlock(&pmus_lock);
9146
9147 return ret;
9148
9149free_dev:
9150 device_del(pmu->dev);
9151 put_device(pmu->dev);
9152
9153free_idr:
9154 if (pmu->type >= PERF_TYPE_MAX)
9155 idr_remove(&pmu_idr, pmu->type);
9156
9157free_pdc:
9158 free_percpu(pmu->pmu_disable_count);
9159 goto unlock;
9160}
9161EXPORT_SYMBOL_GPL(perf_pmu_register);
9162
9163void perf_pmu_unregister(struct pmu *pmu)
9164{
9165 int remove_device;
9166
9167 mutex_lock(&pmus_lock);
9168 remove_device = pmu_bus_running;
9169 list_del_rcu(&pmu->entry);
9170 mutex_unlock(&pmus_lock);
9171
	/*
	 * We dereference the pmu list under both SRCU and regular RCU, so
	 * synchronize against both of those.
	 */
9176 synchronize_srcu(&pmus_srcu);
9177 synchronize_rcu();
9178
9179 free_percpu(pmu->pmu_disable_count);
9180 if (pmu->type >= PERF_TYPE_MAX)
9181 idr_remove(&pmu_idr, pmu->type);
9182 if (remove_device) {
9183 if (pmu->nr_addr_filters)
9184 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
9185 device_del(pmu->dev);
9186 put_device(pmu->dev);
9187 }
9188 free_pmu_context(pmu);
9189}
9190EXPORT_SYMBOL_GPL(perf_pmu_unregister);
9191
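/*
 * Try to initialize @event on @pmu: take a module reference and call
 * ->event_init(); for group siblings, hold the group leader's ctx mutex to
 * serialize against the group being moved to another context.
 */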
9192static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
9193{
9194 struct perf_event_context *ctx = NULL;
9195 int ret;
9196
9197 if (!try_module_get(pmu->module))
9198 return -ENODEV;
9199
9200 if (event->group_leader != event) {
9201
9202
9203
9204
9205 ctx = perf_event_ctx_lock_nested(event->group_leader,
9206 SINGLE_DEPTH_NESTING);
9207 BUG_ON(!ctx);
9208 }
9209
9210 event->pmu = pmu;
9211 ret = pmu->event_init(event);
9212
9213 if (ctx)
9214 perf_event_ctx_unlock(event->group_leader, ctx);
9215
9216 if (ret)
9217 module_put(pmu->module);
9218
9219 return ret;
9220}
9221
9222static struct pmu *perf_init_event(struct perf_event *event)
9223{
9224 struct pmu *pmu;
9225 int idx;
9226 int ret;
9227
9228 idx = srcu_read_lock(&pmus_srcu);
9229
9230
9231 if (event->parent && event->parent->pmu) {
9232 pmu = event->parent->pmu;
9233 ret = perf_try_init_event(pmu, event);
9234 if (!ret)
9235 goto unlock;
9236 }
9237
9238 rcu_read_lock();
9239 pmu = idr_find(&pmu_idr, event->attr.type);
9240 rcu_read_unlock();
9241 if (pmu) {
9242 ret = perf_try_init_event(pmu, event);
9243 if (ret)
9244 pmu = ERR_PTR(ret);
9245 goto unlock;
9246 }
9247
9248 list_for_each_entry_rcu(pmu, &pmus, entry) {
9249 ret = perf_try_init_event(pmu, event);
9250 if (!ret)
9251 goto unlock;
9252
9253 if (ret != -ENOENT) {
9254 pmu = ERR_PTR(ret);
9255 goto unlock;
9256 }
9257 }
9258 pmu = ERR_PTR(-ENOENT);
9259unlock:
9260 srcu_read_unlock(&pmus_srcu, idx);
9261
9262 return pmu;
9263}
9264
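/*
 * Side-band-only events are kept on a per-CPU list so that side-band
 * record emission can find them without walking every context.
 */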
9265static void attach_sb_event(struct perf_event *event)
9266{
9267 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
9268
9269 raw_spin_lock(&pel->lock);
9270 list_add_rcu(&event->sb_list, &pel->list);
9271 raw_spin_unlock(&pel->lock);
9272}
9273
9274
9275
9276
9277
9278
9279
9280
9281static void account_pmu_sb_event(struct perf_event *event)
9282{
9283 if (is_sb_event(event))
9284 attach_sb_event(event);
9285}
9286
9287static void account_event_cpu(struct perf_event *event, int cpu)
9288{
9289 if (event->parent)
9290 return;
9291
9292 if (is_cgroup_event(event))
9293 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
9294}
9295
9296
9297static void account_freq_event_nohz(void)
9298{
9299#ifdef CONFIG_NO_HZ_FULL
9300
9301 spin_lock(&nr_freq_lock);
9302 if (atomic_inc_return(&nr_freq_events) == 1)
9303 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
9304 spin_unlock(&nr_freq_lock);
9305#endif
9306}
9307
9308static void account_freq_event(void)
9309{
9310 if (tick_nohz_full_enabled())
9311 account_freq_event_nohz();
9312 else
9313 atomic_inc(&nr_freq_events);
9314}
9315
9316
9317static void account_event(struct perf_event *event)
9318{
9319 bool inc = false;
9320
9321 if (event->parent)
9322 return;
9323
9324 if (event->attach_state & PERF_ATTACH_TASK)
9325 inc = true;
9326 if (event->attr.mmap || event->attr.mmap_data)
9327 atomic_inc(&nr_mmap_events);
9328 if (event->attr.comm)
9329 atomic_inc(&nr_comm_events);
9330 if (event->attr.namespaces)
9331 atomic_inc(&nr_namespaces_events);
9332 if (event->attr.task)
9333 atomic_inc(&nr_task_events);
9334 if (event->attr.freq)
9335 account_freq_event();
9336 if (event->attr.context_switch) {
9337 atomic_inc(&nr_switch_events);
9338 inc = true;
9339 }
9340 if (has_branch_stack(event))
9341 inc = true;
9342 if (is_cgroup_event(event))
9343 inc = true;
9344
9345 if (inc) {
9346 if (atomic_inc_not_zero(&perf_sched_count))
9347 goto enabled;
9348
9349 mutex_lock(&perf_sched_mutex);
9350 if (!atomic_read(&perf_sched_count)) {
9351 static_branch_enable(&perf_sched_events);
9352
9353
9354
9355
9356
9357 synchronize_sched();
9358 }
9359
9360
9361
9362
9363 atomic_inc(&perf_sched_count);
9364 mutex_unlock(&perf_sched_mutex);
9365 }
9366enabled:
9367
9368 account_event_cpu(event, event->cpu);
9369
9370 account_pmu_sb_event(event);
9371}
9372
/*
 * Allocate and initialize an event structure
 */
9376static struct perf_event *
9377perf_event_alloc(struct perf_event_attr *attr, int cpu,
9378 struct task_struct *task,
9379 struct perf_event *group_leader,
9380 struct perf_event *parent_event,
9381 perf_overflow_handler_t overflow_handler,
9382 void *context, int cgroup_fd)
9383{
9384 struct pmu *pmu;
9385 struct perf_event *event;
9386 struct hw_perf_event *hwc;
9387 long err = -EINVAL;
9388
9389 if ((unsigned)cpu >= nr_cpu_ids) {
9390 if (!task || cpu != -1)
9391 return ERR_PTR(-EINVAL);
9392 }
9393
9394 event = kzalloc(sizeof(*event), GFP_KERNEL);
9395 if (!event)
9396 return ERR_PTR(-ENOMEM);
9397
	/*
	 * Single events are their own group leaders, with an
	 * empty sibling list:
	 */
9402 if (!group_leader)
9403 group_leader = event;
9404
9405 mutex_init(&event->child_mutex);
9406 INIT_LIST_HEAD(&event->child_list);
9407
9408 INIT_LIST_HEAD(&event->group_entry);
9409 INIT_LIST_HEAD(&event->event_entry);
9410 INIT_LIST_HEAD(&event->sibling_list);
9411 INIT_LIST_HEAD(&event->rb_entry);
9412 INIT_LIST_HEAD(&event->active_entry);
9413 INIT_LIST_HEAD(&event->addr_filters.list);
9414 INIT_HLIST_NODE(&event->hlist_entry);
9415
9416
9417 init_waitqueue_head(&event->waitq);
9418 init_irq_work(&event->pending, perf_pending_event);
9419
9420 mutex_init(&event->mmap_mutex);
9421 raw_spin_lock_init(&event->addr_filters.lock);
9422
9423 atomic_long_set(&event->refcount, 1);
9424 event->cpu = cpu;
9425 event->attr = *attr;
9426 event->group_leader = group_leader;
9427 event->pmu = NULL;
9428 event->oncpu = -1;
9429
9430 event->parent = parent_event;
9431
9432 event->ns = get_pid_ns(task_active_pid_ns(current));
9433 event->id = atomic64_inc_return(&perf_event_id);
9434
9435 event->state = PERF_EVENT_STATE_INACTIVE;
9436
9437 if (task) {
9438 event->attach_state = PERF_ATTACH_TASK;
9439
9440
9441
9442
9443
9444 event->hw.target = task;
9445 }
9446
9447 event->clock = &local_clock;
9448 if (parent_event)
9449 event->clock = parent_event->clock;
9450
9451 if (!overflow_handler && parent_event) {
9452 overflow_handler = parent_event->overflow_handler;
9453 context = parent_event->overflow_handler_context;
9454#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
9455 if (overflow_handler == bpf_overflow_handler) {
9456 struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
9457
9458 if (IS_ERR(prog)) {
9459 err = PTR_ERR(prog);
9460 goto err_ns;
9461 }
9462 event->prog = prog;
9463 event->orig_overflow_handler =
9464 parent_event->orig_overflow_handler;
9465 }
9466#endif
9467 }
9468
9469 if (overflow_handler) {
9470 event->overflow_handler = overflow_handler;
9471 event->overflow_handler_context = context;
9472 } else if (is_write_backward(event)){
9473 event->overflow_handler = perf_event_output_backward;
9474 event->overflow_handler_context = NULL;
9475 } else {
9476 event->overflow_handler = perf_event_output_forward;
9477 event->overflow_handler_context = NULL;
9478 }
9479
9480 perf_event__state_init(event);
9481
9482 pmu = NULL;
9483
9484 hwc = &event->hw;
9485 hwc->sample_period = attr->sample_period;
9486 if (attr->freq && attr->sample_freq)
9487 hwc->sample_period = 1;
9488 hwc->last_period = hwc->sample_period;
9489
9490 local64_set(&hwc->period_left, hwc->sample_period);
9491
	/*
	 * We currently do not support PERF_SAMPLE_READ on inherited events.
	 * See perf_output_read().
	 */
9496 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
9497 goto err_ns;
9498
9499 if (!has_branch_stack(event))
9500 event->attr.branch_sample_type = 0;
9501
9502 if (cgroup_fd != -1) {
9503 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
9504 if (err)
9505 goto err_ns;
9506 }
9507
9508 pmu = perf_init_event(event);
9509 if (IS_ERR(pmu)) {
9510 err = PTR_ERR(pmu);
9511 goto err_ns;
9512 }
9513
9514 err = exclusive_event_init(event);
9515 if (err)
9516 goto err_pmu;
9517
9518 if (has_addr_filter(event)) {
9519 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
9520 sizeof(unsigned long),
9521 GFP_KERNEL);
9522 if (!event->addr_filters_offs) {
9523 err = -ENOMEM;
9524 goto err_per_task;
9525 }
9526
9527
9528 event->addr_filters_gen = 1;
9529 }
9530
9531 if (!event->parent) {
9532 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
9533 err = get_callchain_buffers(attr->sample_max_stack);
9534 if (err)
9535 goto err_addr_filters;
9536 }
9537 }
9538
9539
9540 account_event(event);
9541
9542 return event;
9543
9544err_addr_filters:
9545 kfree(event->addr_filters_offs);
9546
9547err_per_task:
9548 exclusive_event_destroy(event);
9549
9550err_pmu:
9551 if (event->destroy)
9552 event->destroy(event);
9553 module_put(pmu->module);
9554err_ns:
9555 if (is_cgroup_event(event))
9556 perf_detach_cgroup(event);
9557 if (event->ns)
9558 put_pid_ns(event->ns);
9559 kfree(event);
9560
9561 return ERR_PTR(err);
9562}
9563
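/*
 * Copy the perf_event_attr from userspace, coping with both older (shorter)
 * and newer (larger) layouts: a larger struct is accepted only if all the
 * bits we do not know about are zero.
 */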
9564static int perf_copy_attr(struct perf_event_attr __user *uattr,
9565 struct perf_event_attr *attr)
9566{
9567 u32 size;
9568 int ret;
9569
9570 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
9571 return -EFAULT;
9572
	/*
	 * zero the full structure, so that a short copy will be nice.
	 */
9576 memset(attr, 0, sizeof(*attr));
9577
9578 ret = get_user(size, &uattr->size);
9579 if (ret)
9580 return ret;
9581
9582 if (size > PAGE_SIZE)
9583 goto err_size;
9584
9585 if (!size)
9586 size = PERF_ATTR_SIZE_VER0;
9587
9588 if (size < PERF_ATTR_SIZE_VER0)
9589 goto err_size;
9590
	/*
	 * If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
9597 if (size > sizeof(*attr)) {
9598 unsigned char __user *addr;
9599 unsigned char __user *end;
9600 unsigned char val;
9601
9602 addr = (void __user *)uattr + sizeof(*attr);
9603 end = (void __user *)uattr + size;
9604
9605 for (; addr < end; addr++) {
9606 ret = get_user(val, addr);
9607 if (ret)
9608 return ret;
9609 if (val)
9610 goto err_size;
9611 }
9612 size = sizeof(*attr);
9613 }
9614
9615 ret = copy_from_user(attr, uattr, size);
9616 if (ret)
9617 return -EFAULT;
9618
9619 attr->size = size;
9620
9621 if (attr->__reserved_1)
9622 return -EINVAL;
9623
9624 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
9625 return -EINVAL;
9626
9627 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
9628 return -EINVAL;
9629
9630 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
9631 u64 mask = attr->branch_sample_type;
9632
9633
9634 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
9635 return -EINVAL;
9636
9637
9638 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
9639 return -EINVAL;
9640
9641
9642 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
9643
9644
9645 if (!attr->exclude_kernel)
9646 mask |= PERF_SAMPLE_BRANCH_KERNEL;
9647
9648 if (!attr->exclude_user)
9649 mask |= PERF_SAMPLE_BRANCH_USER;
9650
9651 if (!attr->exclude_hv)
9652 mask |= PERF_SAMPLE_BRANCH_HV;
9653
9654
9655
9656 attr->branch_sample_type = mask;
9657 }
9658
9659 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
9660 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9661 return -EACCES;
9662 }
9663
9664 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
9665 ret = perf_reg_validate(attr->sample_regs_user);
9666 if (ret)
9667 return ret;
9668 }
9669
9670 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
9671 if (!arch_perf_have_user_stack_dump())
9672 return -ENOSYS;
9673
9674
9675
9676
9677
9678
9679 if (attr->sample_stack_user >= USHRT_MAX)
9680 ret = -EINVAL;
9681 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
9682 ret = -EINVAL;
9683 }
9684
9685 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
9686 ret = perf_reg_validate(attr->sample_regs_intr);
9687out:
9688 return ret;
9689
9690err_size:
9691 put_user(sizeof(*attr), &uattr->size);
9692 ret = -E2BIG;
9693 goto out;
9694}
9695
9696static int
9697perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
9698{
9699 struct ring_buffer *rb = NULL;
9700 int ret = -EINVAL;
9701
9702 if (!output_event)
9703 goto set;
9704
9705
9706 if (event == output_event)
9707 goto out;
9708
9709
9710
9711
9712 if (output_event->cpu != event->cpu)
9713 goto out;
9714
9715
9716
9717
9718 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
9719 goto out;
9720
9721
9722
9723
9724 if (output_event->clock != event->clock)
9725 goto out;
9726
9727
9728
9729
9730
9731 if (is_write_backward(output_event) != is_write_backward(event))
9732 goto out;
9733
9734
9735
9736
9737 if (has_aux(event) && has_aux(output_event) &&
9738 event->pmu != output_event->pmu)
9739 goto out;
9740
9741set:
9742 mutex_lock(&event->mmap_mutex);
9743
9744 if (atomic_read(&event->mmap_count))
9745 goto unlock;
9746
9747 if (output_event) {
9748
9749 rb = ring_buffer_get(output_event);
9750 if (!rb)
9751 goto unlock;
9752 }
9753
9754 ring_buffer_attach(event, rb);
9755
9756 ret = 0;
9757unlock:
9758 mutex_unlock(&event->mmap_mutex);
9759
9760out:
9761 return ret;
9762}
9763
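/*
 * Lock two context mutexes in address order so that concurrent callers
 * locking the same pair cannot deadlock against each other.
 */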
9764static void mutex_lock_double(struct mutex *a, struct mutex *b)
9765{
9766 if (b < a)
9767 swap(a, b);
9768
9769 mutex_lock(a);
9770 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
9771}
9772
9773static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
9774{
9775 bool nmi_safe = false;
9776
9777 switch (clk_id) {
9778 case CLOCK_MONOTONIC:
9779 event->clock = &ktime_get_mono_fast_ns;
9780 nmi_safe = true;
9781 break;
9782
9783 case CLOCK_MONOTONIC_RAW:
9784 event->clock = &ktime_get_raw_fast_ns;
9785 nmi_safe = true;
9786 break;
9787
9788 case CLOCK_REALTIME:
9789 event->clock = &ktime_get_real_ns;
9790 break;
9791
9792 case CLOCK_BOOTTIME:
9793 event->clock = &ktime_get_boot_ns;
9794 break;
9795
9796 case CLOCK_TAI:
9797 event->clock = &ktime_get_tai_ns;
9798 break;
9799
9800 default:
9801 return -EINVAL;
9802 }
9803
9804 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
9805 return -EINVAL;
9806
9807 return 0;
9808}
9809
/*
 * Variation on perf_event_ctx_lock_nested(), except we take two context
 * mutexes: the group leader's and the new event's.
 */
9814static struct perf_event_context *
9815__perf_event_ctx_lock_double(struct perf_event *group_leader,
9816 struct perf_event_context *ctx)
9817{
9818 struct perf_event_context *gctx;
9819
9820again:
9821 rcu_read_lock();
9822 gctx = READ_ONCE(group_leader->ctx);
9823 if (!atomic_inc_not_zero(&gctx->refcount)) {
9824 rcu_read_unlock();
9825 goto again;
9826 }
9827 rcu_read_unlock();
9828
9829 mutex_lock_double(&gctx->mutex, &ctx->mutex);
9830
9831 if (group_leader->ctx != gctx) {
9832 mutex_unlock(&ctx->mutex);
9833 mutex_unlock(&gctx->mutex);
9834 put_ctx(gctx);
9835 goto again;
9836 }
9837
9838 return gctx;
9839}
9840
/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader event fd
 */
9849SYSCALL_DEFINE5(perf_event_open,
9850 struct perf_event_attr __user *, attr_uptr,
9851 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
9852{
9853 struct perf_event *group_leader = NULL, *output_event = NULL;
9854 struct perf_event *event, *sibling;
9855 struct perf_event_attr attr;
9856 struct perf_event_context *ctx, *uninitialized_var(gctx);
9857 struct file *event_file = NULL;
9858 struct fd group = {NULL, 0};
9859 struct task_struct *task = NULL;
9860 struct pmu *pmu;
9861 int event_fd;
9862 int move_group = 0;
9863 int err;
9864 int f_flags = O_RDWR;
9865 int cgroup_fd = -1;
9866
9867
9868 if (flags & ~PERF_FLAG_ALL)
9869 return -EINVAL;
9870
9871 err = perf_copy_attr(attr_uptr, &attr);
9872 if (err)
9873 return err;
9874
9875 if (!attr.exclude_kernel) {
9876 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9877 return -EACCES;
9878 }
9879
9880 if (attr.namespaces) {
9881 if (!capable(CAP_SYS_ADMIN))
9882 return -EACCES;
9883 }
9884
9885 if (attr.freq) {
9886 if (attr.sample_freq > sysctl_perf_event_sample_rate)
9887 return -EINVAL;
9888 } else {
9889 if (attr.sample_period & (1ULL << 63))
9890 return -EINVAL;
9891 }
9892
9893 if (!attr.sample_max_stack)
9894 attr.sample_max_stack = sysctl_perf_event_max_stack;
9895
9896
9897
9898
9899
9900
9901
9902 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
9903 return -EINVAL;
9904
9905 if (flags & PERF_FLAG_FD_CLOEXEC)
9906 f_flags |= O_CLOEXEC;
9907
9908 event_fd = get_unused_fd_flags(f_flags);
9909 if (event_fd < 0)
9910 return event_fd;
9911
9912 if (group_fd != -1) {
9913 err = perf_fget_light(group_fd, &group);
9914 if (err)
9915 goto err_fd;
9916 group_leader = group.file->private_data;
9917 if (flags & PERF_FLAG_FD_OUTPUT)
9918 output_event = group_leader;
9919 if (flags & PERF_FLAG_FD_NO_GROUP)
9920 group_leader = NULL;
9921 }
9922
9923 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
9924 task = find_lively_task_by_vpid(pid);
9925 if (IS_ERR(task)) {
9926 err = PTR_ERR(task);
9927 goto err_group_fd;
9928 }
9929 }
9930
9931 if (task && group_leader &&
9932 group_leader->attr.inherit != attr.inherit) {
9933 err = -EINVAL;
9934 goto err_task;
9935 }
9936
9937 if (task) {
9938 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
9939 if (err)
9940 goto err_task;
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950 err = -EACCES;
9951 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
9952 goto err_cred;
9953 }
9954
9955 if (flags & PERF_FLAG_PID_CGROUP)
9956 cgroup_fd = pid;
9957
9958 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
9959 NULL, NULL, cgroup_fd);
9960 if (IS_ERR(event)) {
9961 err = PTR_ERR(event);
9962 goto err_cred;
9963 }
9964
9965 if (is_sampling_event(event)) {
9966 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
9967 err = -EOPNOTSUPP;
9968 goto err_alloc;
9969 }
9970 }
9971
	/*
	 * Special case software events and allow them to be part of
	 * any hardware group.
	 */
9976 pmu = event->pmu;
9977
9978 if (attr.use_clockid) {
9979 err = perf_event_set_clock(event, attr.clockid);
9980 if (err)
9981 goto err_alloc;
9982 }
9983
9984 if (pmu->task_ctx_nr == perf_sw_context)
9985 event->event_caps |= PERF_EV_CAP_SOFTWARE;
9986
9987 if (group_leader &&
9988 (is_software_event(event) != is_software_event(group_leader))) {
9989 if (is_software_event(event)) {
9990
9991
9992
9993
9994
9995
9996
9997
9998 pmu = group_leader->pmu;
9999 } else if (is_software_event(group_leader) &&
10000 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10001
10002
10003
10004
10005
10006 move_group = 1;
10007 }
10008 }
10009
10010
10011
10012
10013 ctx = find_get_context(pmu, task, event);
10014 if (IS_ERR(ctx)) {
10015 err = PTR_ERR(ctx);
10016 goto err_alloc;
10017 }
10018
10019 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
10020 err = -EBUSY;
10021 goto err_context;
10022 }
10023
	/*
	 * Look up the group leader (we will attach this event to it):
	 */
10027 if (group_leader) {
10028 err = -EINVAL;
10029
10030
10031
10032
10033
10034 if (group_leader->group_leader != group_leader)
10035 goto err_context;
10036
10037
10038 if (group_leader->clock != event->clock)
10039 goto err_context;
10040
10041
10042
10043
10044
10045
10046 if (group_leader->cpu != event->cpu)
10047 goto err_context;
10048
10049
10050
10051
10052
10053 if (group_leader->ctx->task != ctx->task)
10054 goto err_context;
10055
10056
10057
10058
10059
10060
10061 if (!move_group && group_leader->ctx != ctx)
10062 goto err_context;
10063
10064
10065
10066
10067 if (attr.exclusive || attr.pinned)
10068 goto err_context;
10069 }
10070
10071 if (output_event) {
10072 err = perf_event_set_output(event, output_event);
10073 if (err)
10074 goto err_context;
10075 }
10076
10077 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
10078 f_flags);
10079 if (IS_ERR(event_file)) {
10080 err = PTR_ERR(event_file);
10081 event_file = NULL;
10082 goto err_context;
10083 }
10084
10085 if (move_group) {
10086 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
10087
10088 if (gctx->task == TASK_TOMBSTONE) {
10089 err = -ESRCH;
10090 goto err_locked;
10091 }
10092
10093
10094
10095
10096
10097 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10098
10099
10100
10101
10102
10103 if (gctx != ctx) {
10104 err = -EINVAL;
10105 goto err_locked;
10106 } else {
10107 perf_event_ctx_unlock(group_leader, gctx);
10108 move_group = 0;
10109 }
10110 }
10111 } else {
10112 mutex_lock(&ctx->mutex);
10113 }
10114
10115 if (ctx->task == TASK_TOMBSTONE) {
10116 err = -ESRCH;
10117 goto err_locked;
10118 }
10119
10120 if (!perf_event_validate_size(event)) {
10121 err = -E2BIG;
10122 goto err_locked;
10123 }
10124
10125 if (!task) {
		/*
		 * Check if the @cpu we're creating an event for is online.
		 *
		 * We use the perf_cpu_context::ctx::mutex to serialize against
		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
		 */
10132 struct perf_cpu_context *cpuctx =
10133 container_of(ctx, struct perf_cpu_context, ctx);
10134
10135 if (!cpuctx->online) {
10136 err = -ENODEV;
10137 goto err_locked;
10138 }
10139 }
10140
10141
10142
10143
10144
10145
10146 if (!exclusive_event_installable(event, ctx)) {
10147
10148 WARN_ON_ONCE(move_group);
10149
10150 err = -EBUSY;
10151 goto err_locked;
10152 }
10153
10154 WARN_ON_ONCE(ctx->parent_ctx);
10155
	/*
	 * This is the point of no return; we cannot fail hereafter. This is
	 * where we start modifying current state.
	 */
10161 if (move_group) {
10162
10163
10164
10165
10166 perf_remove_from_context(group_leader, 0);
10167 put_ctx(gctx);
10168
10169 list_for_each_entry(sibling, &group_leader->sibling_list,
10170 group_entry) {
10171 perf_remove_from_context(sibling, 0);
10172 put_ctx(gctx);
10173 }
10174
		/*
		 * Wait for everybody to stop referencing the events through
		 * the old lists, before installing them on new lists.
		 */
10179 synchronize_rcu();
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191 list_for_each_entry(sibling, &group_leader->sibling_list,
10192 group_entry) {
10193 perf_event__state_init(sibling);
10194 perf_install_in_context(ctx, sibling, sibling->cpu);
10195 get_ctx(ctx);
10196 }
10197
10198
10199
10200
10201
10202
10203 perf_event__state_init(group_leader);
10204 perf_install_in_context(ctx, group_leader, group_leader->cpu);
10205 get_ctx(ctx);
10206 }
10207
10208
10209
10210
10211
10212
10213
10214 perf_event__header_size(event);
10215 perf_event__id_header_size(event);
10216
10217 event->owner = current;
10218
10219 perf_install_in_context(ctx, event, event->cpu);
10220 perf_unpin_context(ctx);
10221
10222 if (move_group)
10223 perf_event_ctx_unlock(group_leader, gctx);
10224 mutex_unlock(&ctx->mutex);
10225
10226 if (task) {
10227 mutex_unlock(&task->signal->cred_guard_mutex);
10228 put_task_struct(task);
10229 }
10230
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);
10234
10235
10236
10237
10238
10239
10240
10241 fdput(group);
10242 fd_install(event_fd, event_file);
10243 return event_fd;
10244
10245err_locked:
10246 if (move_group)
10247 perf_event_ctx_unlock(group_leader, gctx);
10248 mutex_unlock(&ctx->mutex);
10249
10250 fput(event_file);
10251err_context:
10252 perf_unpin_context(ctx);
10253 put_ctx(ctx);
10254err_alloc:
10255
10256
10257
10258
10259 if (!event_file)
10260 free_event(event);
10261err_cred:
10262 if (task)
10263 mutex_unlock(&task->signal->cred_guard_mutex);
10264err_task:
10265 if (task)
10266 put_task_struct(task);
10267err_group_fd:
10268 fdput(group);
10269err_fd:
10270 put_unused_fd(event_fd);
10271 return err;
10272}
10273
10274
10275
10276
10277
10278
10279
10280
10281struct perf_event *
10282perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
10283 struct task_struct *task,
10284 perf_overflow_handler_t overflow_handler,
10285 void *context)
10286{
10287 struct perf_event_context *ctx;
10288 struct perf_event *event;
10289 int err;
10290
10291
10292
10293
10294
10295 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
10296 overflow_handler, context, -1);
10297 if (IS_ERR(event)) {
10298 err = PTR_ERR(event);
10299 goto err;
10300 }
10301
10302
10303 event->owner = TASK_TOMBSTONE;
10304
10305 ctx = find_get_context(event->pmu, task, event);
10306 if (IS_ERR(ctx)) {
10307 err = PTR_ERR(ctx);
10308 goto err_free;
10309 }
10310
10311 WARN_ON_ONCE(ctx->parent_ctx);
10312 mutex_lock(&ctx->mutex);
10313 if (ctx->task == TASK_TOMBSTONE) {
10314 err = -ESRCH;
10315 goto err_unlock;
10316 }
10317
10318 if (!task) {
10319
10320
10321
10322
10323
10324
10325 struct perf_cpu_context *cpuctx =
10326 container_of(ctx, struct perf_cpu_context, ctx);
10327 if (!cpuctx->online) {
10328 err = -ENODEV;
10329 goto err_unlock;
10330 }
10331 }
10332
10333 if (!exclusive_event_installable(event, ctx)) {
10334 err = -EBUSY;
10335 goto err_unlock;
10336 }
10337
10338 perf_install_in_context(ctx, event, cpu);
10339 perf_unpin_context(ctx);
10340 mutex_unlock(&ctx->mutex);
10341
10342 return event;
10343
10344err_unlock:
10345 mutex_unlock(&ctx->mutex);
10346 perf_unpin_context(ctx);
10347 put_ctx(ctx);
10348err_free:
10349 free_event(event);
10350err:
10351 return ERR_PTR(err);
10352}
10353EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
10354
10355void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
10356{
10357 struct perf_event_context *src_ctx;
10358 struct perf_event_context *dst_ctx;
10359 struct perf_event *event, *tmp;
10360 LIST_HEAD(events);
10361
10362 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
10363 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
10364
10365
10366
10367
10368
10369 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
10370 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
10371 event_entry) {
10372 perf_remove_from_context(event, 0);
10373 unaccount_event_cpu(event, src_cpu);
10374 put_ctx(src_ctx);
10375 list_add(&event->migrate_entry, &events);
10376 }
10377
	/*
	 * Wait for the events to quiesce before re-instating them.
	 */
10381 synchronize_rcu();
10382
	/*
	 * Re-instate events in 2 passes.
	 *
	 * Skip over group leaders and only install siblings on this first
	 * pass; siblings will not get enabled without a leader, but a leader
	 * will enable its siblings, even if those are still on the old
	 * context.
	 */
10391 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10392 if (event->group_leader == event)
10393 continue;
10394
10395 list_del(&event->migrate_entry);
10396 if (event->state >= PERF_EVENT_STATE_OFF)
10397 event->state = PERF_EVENT_STATE_INACTIVE;
10398 account_event_cpu(event, dst_cpu);
10399 perf_install_in_context(dst_ctx, event, dst_cpu);
10400 get_ctx(dst_ctx);
10401 }
10402
	/*
	 * Once all the siblings are set up properly, install the group
	 * leaders to make it go.
	 */
10407 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10408 list_del(&event->migrate_entry);
10409 if (event->state >= PERF_EVENT_STATE_OFF)
10410 event->state = PERF_EVENT_STATE_INACTIVE;
10411 account_event_cpu(event, dst_cpu);
10412 perf_install_in_context(dst_ctx, event, dst_cpu);
10413 get_ctx(dst_ctx);
10414 }
10415 mutex_unlock(&dst_ctx->mutex);
10416 mutex_unlock(&src_ctx->mutex);
10417}
10418EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
10419
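/*
 * Fold a dead child event's count and enabled/running times back into its
 * parent event.
 */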
10420static void sync_child_event(struct perf_event *child_event,
10421 struct task_struct *child)
10422{
10423 struct perf_event *parent_event = child_event->parent;
10424 u64 child_val;
10425
10426 if (child_event->attr.inherit_stat)
10427 perf_event_read_event(child_event, child);
10428
10429 child_val = perf_event_count(child_event);
10430
10431
10432
10433
10434 atomic64_add(child_val, &parent_event->child_count);
10435 atomic64_add(child_event->total_time_enabled,
10436 &parent_event->child_total_time_enabled);
10437 atomic64_add(child_event->total_time_running,
10438 &parent_event->child_total_time_running);
10439}
10440
10441static void
10442perf_event_exit_event(struct perf_event *child_event,
10443 struct perf_event_context *child_ctx,
10444 struct task_struct *child)
10445{
10446 struct perf_event *parent_event = child_event->parent;
10447
	/*
	 * Do not destroy the 'original' grouping; because of the context
	 * switch optimization the original events could've ended up in a
	 * random child task.
	 *
	 * If we were to destroy the original group, all group related
	 * operations would cease to function properly after this random
	 * child dies.
	 *
	 * Do destroy all inherited groups, we don't care about those
	 * and being thorough is better.
	 */
10460 raw_spin_lock_irq(&child_ctx->lock);
10461 WARN_ON_ONCE(child_ctx->is_active);
10462
10463 if (parent_event)
10464 perf_group_detach(child_event);
10465 list_del_event(child_event, child_ctx);
10466 child_event->state = PERF_EVENT_STATE_EXIT;
10467 raw_spin_unlock_irq(&child_ctx->lock);
10468
	/*
	 * Parent events are governed by their filedesc, retain them.
	 */
10472 if (!parent_event) {
10473 perf_event_wakeup(child_event);
10474 return;
10475 }
10476
	/*
	 * Child events can be cleaned up.
	 */
10480 sync_child_event(child_event, child);
10481
	/*
	 * Remove this event from the parent's child_list.
	 */
10485 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
10486 mutex_lock(&parent_event->child_mutex);
10487 list_del_init(&child_event->child_list);
10488 mutex_unlock(&parent_event->child_mutex);
10489
	/*
	 * Kick perf_poll() for is_event_hup().
	 */
10493 perf_event_wakeup(parent_event);
10494 free_event(child_event);
10495 put_event(parent_event);
10496}
10497
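/*
 * Tear down the perf_event context @ctxn of the exiting task @child:
 * unschedule its events, sever the task <-> ctx relation and report the
 * task dead before handing each event to perf_event_exit_event().
 */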
10498static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
10499{
10500 struct perf_event_context *child_ctx, *clone_ctx = NULL;
10501 struct perf_event *child_event, *next;
10502
10503 WARN_ON_ONCE(child != current);
10504
10505 child_ctx = perf_pin_task_context(child, ctxn);
10506 if (!child_ctx)
10507 return;
10508
	/*
	 * In order to reduce the amount of trickery in ctx tear-down, we hold
	 * ctx::mutex over the entire thing. This serializes against almost
	 * everything that wants to access the ctx.
	 *
	 * The exception is sys_perf_event_open() /
	 * perf_event_create_kernel_counter() which does find_get_context()
	 * without ctx::mutex (it cannot because of the move_group double
	 * mutex lock).
	 */
10519 mutex_lock(&child_ctx->mutex);
10520
	/*
	 * In a single ctx::lock section, de-schedule the events and detach the
	 * context from the task such that we cannot ever get it scheduled back
	 * in.
	 */
10526 raw_spin_lock_irq(&child_ctx->lock);
10527 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
10528
	/*
	 * Now that the context is inactive, destroy the task <-> ctx relation
	 * and mark the context dead. Neither reference drop below can be the
	 * last one, since we still hold ctx::lock.
	 */
10533 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
10534 put_ctx(child_ctx);
10535 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
10536 put_task_struct(current);
10537
10538 clone_ctx = unclone_ctx(child_ctx);
10539 raw_spin_unlock_irq(&child_ctx->lock);
10540
10541 if (clone_ctx)
10542 put_ctx(clone_ctx);
10543
	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few samples from the child (see perf_event_exit_event()).
	 */
10549 perf_event_task(child, child_ctx, 0);
10550
10551 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
10552 perf_event_exit_event(child_event, child_ctx, child);
10553
10554 mutex_unlock(&child_ctx->mutex);
10555
10556 put_ctx(child_ctx);
10557}
10558
/*
 * When a child task exits, feed back event values to parent events.
 *
 * Can be called with cred_guard_mutex held when called from
 * install_exec_creds().
 */
10565void perf_event_exit_task(struct task_struct *child)
10566{
10567 struct perf_event *event, *tmp;
10568 int ctxn;
10569
10570 mutex_lock(&child->perf_event_mutex);
10571 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
10572 owner_entry) {
10573 list_del_init(&event->owner_entry);
10574
		/*
		 * Ensure the list deletion is visible before we clear
		 * the owner, closes a race against perf_release() where
		 * we need to serialize on the owner->perf_event_mutex.
		 */
10580 smp_store_release(&event->owner, NULL);
10581 }
10582 mutex_unlock(&child->perf_event_mutex);
10583
10584 for_each_task_context_nr(ctxn)
10585 perf_event_exit_task_context(child, ctxn);
10586
	/*
	 * The perf_event_exit_task_context() calls above run perf_event_task()
	 * with the child's task_ctx, which generates EXIT events for the child
	 * contexts and sets child->perf_event_ctxp[] to NULL. This final call
	 * reports the task exit for the remaining (CPU) contexts.
	 */
10593 perf_event_task(child, NULL, 0);
10594}
10595
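/*
 * Unlink an inherited event from its parent and its context and free it;
 * only used by perf_event_free_task() below on contexts that were never
 * exposed to user space.
 */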
10596static void perf_free_event(struct perf_event *event,
10597 struct perf_event_context *ctx)
10598{
10599 struct perf_event *parent = event->parent;
10600
10601 if (WARN_ON_ONCE(!parent))
10602 return;
10603
10604 mutex_lock(&parent->child_mutex);
10605 list_del_init(&event->child_list);
10606 mutex_unlock(&parent->child_mutex);
10607
10608 put_event(parent);
10609
10610 raw_spin_lock_irq(&ctx->lock);
10611 perf_group_detach(event);
10612 list_del_event(event, ctx);
10613 raw_spin_unlock_irq(&ctx->lock);
10614 free_event(event);
10615}
10616
/*
 * Free an unexposed, unused context as created by inheritance by
 * perf_event_init_task() below, used by fork() in case of fail.
 *
 * Even though the task has never lived, the context and events have been
 * exposed through the child_list, so we must take care tearing it all down.
 */
10624void perf_event_free_task(struct task_struct *task)
10625{
10626 struct perf_event_context *ctx;
10627 struct perf_event *event, *tmp;
10628 int ctxn;
10629
10630 for_each_task_context_nr(ctxn) {
10631 ctx = task->perf_event_ctxp[ctxn];
10632 if (!ctx)
10633 continue;
10634
10635 mutex_lock(&ctx->mutex);
10636 raw_spin_lock_irq(&ctx->lock);
		/*
		 * Destroy the task <-> ctx relation and mark the context dead.
		 *
		 * This is important because even though the task hasn't been
		 * exposed yet the context has been (through child_list).
		 */
10643 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
10644 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
10645 put_task_struct(task);
10646 raw_spin_unlock_irq(&ctx->lock);
10647
10648 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
10649 perf_free_event(event, ctx);
10650
10651 mutex_unlock(&ctx->mutex);
10652 put_ctx(ctx);
10653 }
10654}
10655
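/*
 * Called late in task teardown; by this point every context should have been
 * detached from the task, so warn if any perf_event_ctxp[] slot is still set.
 */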
10656void perf_event_delayed_put(struct task_struct *task)
10657{
10658 int ctxn;
10659
10660 for_each_task_context_nr(ctxn)
10661 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
10662}
10663
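/*
 * Look up the struct file behind a perf event fd; returns ERR_PTR(-EBADF)
 * if @fd does not refer to a perf event file.
 */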
10664struct file *perf_event_get(unsigned int fd)
10665{
10666 struct file *file;
10667
10668 file = fget_raw(fd);
10669 if (!file)
10670 return ERR_PTR(-EBADF);
10671
10672 if (file->f_op != &perf_fops) {
10673 fput(file);
10674 return ERR_PTR(-EBADF);
10675 }
10676
10677 return file;
10678}
10679
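/*
 * Return the attributes of an event, or ERR_PTR(-EINVAL) for a NULL event.
 */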
10680const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
10681{
10682 if (!event)
10683 return ERR_PTR(-EINVAL);
10684
10685 return &event->attr;
10686}
10687
/*
 * Inherit an event from parent task to child task.
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL for orphaned events
 *  - IS_ERR() on error
 */
10696static struct perf_event *
10697inherit_event(struct perf_event *parent_event,
10698 struct task_struct *parent,
10699 struct perf_event_context *parent_ctx,
10700 struct task_struct *child,
10701 struct perf_event *group_leader,
10702 struct perf_event_context *child_ctx)
10703{
10704 enum perf_event_active_state parent_state = parent_event->state;
10705 struct perf_event *child_event;
10706 unsigned long flags;
10707
	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
10714 if (parent_event->parent)
10715 parent_event = parent_event->parent;
10716
10717 child_event = perf_event_alloc(&parent_event->attr,
10718 parent_event->cpu,
10719 child,
10720 group_leader, parent_event,
10721 NULL, NULL, -1);
10722 if (IS_ERR(child_event))
10723 return child_event;
10724
	/*
	 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
	 * must both be done under the same child_mutex section, so that we
	 * serialize against perf_event_release_kernel(): either we observe
	 * an orphaned parent here, or the release path observes our new child.
	 */
10731 mutex_lock(&parent_event->child_mutex);
10732 if (is_orphaned_event(parent_event) ||
10733 !atomic_long_inc_not_zero(&parent_event->refcount)) {
10734 mutex_unlock(&parent_event->child_mutex);
10735 free_event(child_event);
10736 return NULL;
10737 }
10738
10739 get_ctx(child_ctx);
10740
	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
10746 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
10747 child_event->state = PERF_EVENT_STATE_INACTIVE;
10748 else
10749 child_event->state = PERF_EVENT_STATE_OFF;
10750
10751 if (parent_event->attr.freq) {
10752 u64 sample_period = parent_event->hw.sample_period;
10753 struct hw_perf_event *hwc = &child_event->hw;
10754
10755 hwc->sample_period = sample_period;
10756 hwc->last_period = sample_period;
10757
10758 local64_set(&hwc->period_left, sample_period);
10759 }
10760
10761 child_event->ctx = child_ctx;
10762 child_event->overflow_handler = parent_event->overflow_handler;
10763 child_event->overflow_handler_context
10764 = parent_event->overflow_handler_context;
10765
	/*
	 * Precalculate sample_data sizes:
	 */
10769 perf_event__header_size(child_event);
10770 perf_event__id_header_size(child_event);
10771
	/*
	 * Link it up in the child's context:
	 */
10775 raw_spin_lock_irqsave(&child_ctx->lock, flags);
10776 add_event_to_ctx(child_event, child_ctx);
10777 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
10778
	/*
	 * Link this into the parent event's child list:
	 */
10782 list_add_tail(&child_event->child_list, &parent_event->child_list);
10783 mutex_unlock(&parent_event->child_mutex);
10784
10785 return child_event;
10786}
10787
/*
 * Inherits an event group.
 *
 * This will quietly suppress orphaned events; !inherit_event() is not an error.
 * This matches with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
10798static int inherit_group(struct perf_event *parent_event,
10799 struct task_struct *parent,
10800 struct perf_event_context *parent_ctx,
10801 struct task_struct *child,
10802 struct perf_event_context *child_ctx)
10803{
10804 struct perf_event *leader;
10805 struct perf_event *sub;
10806 struct perf_event *child_ctr;
10807
10808 leader = inherit_event(parent_event, parent, parent_ctx,
10809 child, NULL, child_ctx);
10810 if (IS_ERR(leader))
10811 return PTR_ERR(leader);
10812
	/*
	 * @leader can be NULL here because of is_orphaned_event(). In this
	 * case inherit_event() will create individual events, similar to what
	 * perf_group_detach() would do anyway.
	 */
10817 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
10818 child_ctr = inherit_event(sub, parent, parent_ctx,
10819 child, leader, child_ctx);
10820 if (IS_ERR(child_ctr))
10821 return PTR_ERR(child_ctr);
10822 }
10823 return 0;
10824}
10825
/*
 * Creates the child task context and tries to inherit the event-group.
 *
 * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
 * inherited_all set when we 'fail' to inherit an orphaned event; this is
 * deliberate because it means that we are working on the right context.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
10837static int
10838inherit_task_group(struct perf_event *event, struct task_struct *parent,
10839 struct perf_event_context *parent_ctx,
10840 struct task_struct *child, int ctxn,
10841 int *inherited_all)
10842{
10843 int ret;
10844 struct perf_event_context *child_ctx;
10845
10846 if (!event->attr.inherit) {
10847 *inherited_all = 0;
10848 return 0;
10849 }
10850
10851 child_ctx = child->perf_event_ctxp[ctxn];
10852 if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
10859 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
10860 if (!child_ctx)
10861 return -ENOMEM;
10862
10863 child->perf_event_ctxp[ctxn] = child_ctx;
10864 }
10865
10866 ret = inherit_group(event, parent, parent_ctx,
10867 child, child_ctx);
10868
10869 if (ret)
10870 *inherited_all = 0;
10871
10872 return ret;
10873}
10874
/*
 * Initialize the perf_event context in task_struct for one context type.
 */
10878static int perf_event_init_context(struct task_struct *child, int ctxn)
10879{
10880 struct perf_event_context *child_ctx, *parent_ctx;
10881 struct perf_event_context *cloned_ctx;
10882 struct perf_event *event;
10883 struct task_struct *parent = current;
10884 int inherited_all = 1;
10885 unsigned long flags;
10886 int ret = 0;
10887
10888 if (likely(!parent->perf_event_ctxp[ctxn]))
10889 return 0;
10890
	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
10895 parent_ctx = perf_pin_task_context(parent, ctxn);
10896 if (!parent_ctx)
10897 return 0;
10898
	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
10910 mutex_lock(&parent_ctx->mutex);
10911
	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
10916 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
10917 ret = inherit_task_group(event, parent, parent_ctx,
10918 child, ctxn, &inherited_all);
10919 if (ret)
10920 goto out_unlock;
10921 }
10922
	/*
	 * We can't hold ctx->lock when iterating the ->flexible_groups list due
	 * to allocations, but we need to prevent rotation because
	 * rotate_ctx() will change the list from interrupt context.
	 */
10928 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
10929 parent_ctx->rotate_disable = 1;
10930 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
10931
10932 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
10933 ret = inherit_task_group(event, parent, parent_ctx,
10934 child, ctxn, &inherited_all);
10935 if (ret)
10936 goto out_unlock;
10937 }
10938
10939 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
10940 parent_ctx->rotate_disable = 0;
10941
10942 child_ctx = child->perf_event_ctxp[ctxn];
10943
10944 if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 *
		 * Note that if the parent is a clone, the holding of
		 * parent_ctx->lock avoids it from being uncloned.
		 */
10952 cloned_ctx = parent_ctx->parent_ctx;
10953 if (cloned_ctx) {
10954 child_ctx->parent_ctx = cloned_ctx;
10955 child_ctx->parent_gen = parent_ctx->parent_gen;
10956 } else {
10957 child_ctx->parent_ctx = parent_ctx;
10958 child_ctx->parent_gen = parent_ctx->generation;
10959 }
10960 get_ctx(child_ctx->parent_ctx);
10961 }
10962
10963 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
10964out_unlock:
10965 mutex_unlock(&parent_ctx->mutex);
10966
10967 perf_unpin_context(parent_ctx);
10968 put_ctx(parent_ctx);
10969
10970 return ret;
10971}
10972
/*
 * Initialize the perf_event context in task_struct.
 */
10976int perf_event_init_task(struct task_struct *child)
10977{
10978 int ctxn, ret;
10979
10980 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
10981 mutex_init(&child->perf_event_mutex);
10982 INIT_LIST_HEAD(&child->perf_event_list);
10983
10984 for_each_task_context_nr(ctxn) {
10985 ret = perf_event_init_context(child, ctxn);
10986 if (ret) {
10987 perf_event_free_task(child);
10988 return ret;
10989 }
10990 }
10991
10992 return 0;
10993}
10994
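/*
 * Boot-time initialization of the per-CPU state: the software-event hash
 * table mutexes, the active-context and side-band event lists, and (with
 * CONFIG_CGROUP_PERF) the per-CPU cgroup context list.
 */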
10995static void __init perf_event_init_all_cpus(void)
10996{
10997 struct swevent_htable *swhash;
10998 int cpu;
10999
11000 zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
11001
11002 for_each_possible_cpu(cpu) {
11003 swhash = &per_cpu(swevent_htable, cpu);
11004 mutex_init(&swhash->hlist_mutex);
11005 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
11006
11007 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
11008 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
11009
11010#ifdef CONFIG_CGROUP_PERF
11011 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
11012#endif
11013 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
11014 }
11015}
11016
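/*
 * Allocate the software-event hashlist for a CPU coming online if any
 * software events still hold a reference to it.
 */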
11017void perf_swevent_init_cpu(unsigned int cpu)
11018{
11019 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
11020
11021 mutex_lock(&swhash->hlist_mutex);
11022 if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
11023 struct swevent_hlist *hlist;
11024
11025 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
11026 WARN_ON(!hlist);
11027 rcu_assign_pointer(swhash->swevent_hlist, hlist);
11028 }
11029 mutex_unlock(&swhash->hlist_mutex);
11030}
11031
11032#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
11033static void __perf_event_exit_context(void *__info)
11034{
11035 struct perf_event_context *ctx = __info;
11036 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
11037 struct perf_event *event;
11038
11039 raw_spin_lock(&ctx->lock);
11040 list_for_each_entry(event, &ctx->event_list, event_entry)
11041 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
11042 raw_spin_unlock(&ctx->lock);
11043}
11044
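/*
 * Detach all events from every PMU's context on @cpu and mark those
 * contexts offline, so no new events can be installed on the dying CPU.
 */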
11045static void perf_event_exit_cpu_context(int cpu)
11046{
11047 struct perf_cpu_context *cpuctx;
11048 struct perf_event_context *ctx;
11049 struct pmu *pmu;
11050
11051 mutex_lock(&pmus_lock);
11052 list_for_each_entry(pmu, &pmus, entry) {
11053 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11054 ctx = &cpuctx->ctx;
11055
11056 mutex_lock(&ctx->mutex);
11057 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
11058 cpuctx->online = 0;
11059 mutex_unlock(&ctx->mutex);
11060 }
11061 cpumask_clear_cpu(cpu, perf_online_mask);
11062 mutex_unlock(&pmus_lock);
11063}
11064#else
11065
11066static void perf_event_exit_cpu_context(int cpu) { }
11067
11068#endif
11069
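/*
 * CPU hotplug online callback: set up the software-event state and mark
 * every PMU's context on @cpu as online so events may be installed there.
 */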
11070int perf_event_init_cpu(unsigned int cpu)
11071{
11072 struct perf_cpu_context *cpuctx;
11073 struct perf_event_context *ctx;
11074 struct pmu *pmu;
11075
11076 perf_swevent_init_cpu(cpu);
11077
11078 mutex_lock(&pmus_lock);
11079 cpumask_set_cpu(cpu, perf_online_mask);
11080 list_for_each_entry(pmu, &pmus, entry) {
11081 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11082 ctx = &cpuctx->ctx;
11083
11084 mutex_lock(&ctx->mutex);
11085 cpuctx->online = 1;
11086 mutex_unlock(&ctx->mutex);
11087 }
11088 mutex_unlock(&pmus_lock);
11089
11090 return 0;
11091}
11092
11093int perf_event_exit_cpu(unsigned int cpu)
11094{
11095 perf_event_exit_cpu_context(cpu);
11096 return 0;
11097}
11098
11099static int
11100perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
11101{
11102 int cpu;
11103
11104 for_each_online_cpu(cpu)
11105 perf_event_exit_cpu(cpu);
11106
11107 return NOTIFY_OK;
11108}
11109
/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
11114static struct notifier_block perf_reboot_notifier = {
11115 .notifier_call = perf_reboot,
11116 .priority = INT_MIN,
11117};
11118
11119void __init perf_event_init(void)
11120{
11121 int ret;
11122
11123 idr_init(&pmu_idr);
11124
11125 perf_event_init_all_cpus();
11126 init_srcu_struct(&pmus_srcu);
11127 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
11128 perf_pmu_register(&perf_cpu_clock, NULL, -1);
11129 perf_pmu_register(&perf_task_clock, NULL, -1);
11130 perf_tp_register();
11131 perf_event_init_cpu(smp_processor_id());
11132 register_reboot_notifier(&perf_reboot_notifier);
11133
11134 ret = init_hw_breakpoint();
11135 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
11136
	/*
	 * Build time assertion that we keep the data_head at the intended
	 * location.  IOW, validation we got the __reserved[] size right.
	 */
11141 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
11142 != 1024);
11143}
11144
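/*
 * sysfs 'show' helper for PMU event attributes that carry a constant
 * event string.
 */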
11145ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
11146 char *page)
11147{
11148 struct perf_pmu_events_attr *pmu_attr =
11149 container_of(attr, struct perf_pmu_events_attr, attr);
11150
11151 if (pmu_attr->event_str)
11152 return sprintf(page, "%s\n", pmu_attr->event_str);
11153
11154 return 0;
11155}
11156EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
11157
11158static int __init perf_event_sysfs_init(void)
11159{
11160 struct pmu *pmu;
11161 int ret;
11162
11163 mutex_lock(&pmus_lock);
11164
11165 ret = bus_register(&pmu_bus);
11166 if (ret)
11167 goto unlock;
11168
11169 list_for_each_entry(pmu, &pmus, entry) {
11170 if (!pmu->name || pmu->type < 0)
11171 continue;
11172
11173 ret = pmu_dev_alloc(pmu);
11174 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
11175 }
11176 pmu_bus_running = 1;
11177 ret = 0;
11178
11179unlock:
11180 mutex_unlock(&pmus_lock);
11181
11182 return ret;
11183}
11184device_initcall(perf_event_sysfs_init);
11185
11186#ifdef CONFIG_CGROUP_PERF
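/*
 * Allocate the per-cgroup perf state (struct perf_cgroup) together with its
 * per-CPU timing info when a new cgroup is created.
 */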
11187static struct cgroup_subsys_state *
11188perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
11189{
11190 struct perf_cgroup *jc;
11191
11192 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
11193 if (!jc)
11194 return ERR_PTR(-ENOMEM);
11195
11196 jc->info = alloc_percpu(struct perf_cgroup_info);
11197 if (!jc->info) {
11198 kfree(jc);
11199 return ERR_PTR(-ENOMEM);
11200 }
11201
11202 return &jc->css;
11203}
11204
11205static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
11206{
11207 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
11208
11209 free_percpu(jc->info);
11210 kfree(jc);
11211}
11212
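/*
 * Executed on the task's CPU via task_function_call(); switch cgroup events
 * out and back in so the task starts counting against its new cgroup.
 */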
11213static int __perf_cgroup_move(void *info)
11214{
11215 struct task_struct *task = info;
11216 rcu_read_lock();
11217 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
11218 rcu_read_unlock();
11219 return 0;
11220}
11221
11222static void perf_cgroup_attach(struct cgroup_taskset *tset)
11223{
11224 struct task_struct *task;
11225 struct cgroup_subsys_state *css;
11226
11227 cgroup_taskset_for_each(task, css, tset)
11228 task_function_call(task, __perf_cgroup_move, task);
11229}
11230
11231struct cgroup_subsys perf_event_cgrp_subsys = {
11232 .css_alloc = perf_cgroup_css_alloc,
11233 .css_free = perf_cgroup_css_free,
11234 .attach = perf_cgroup_attach,
	/*
	 * Implicitly enable on the default hierarchy so that perf events can
	 * always be filtered by cgroup2 path as long as the perf_event
	 * controller is not mounted on a legacy hierarchy.
	 */
11240 .implicit_on_dfl = true,
11241};
11242#endif
11243