// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code.
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>

#include "internal.h"

#include <asm/irq_regs.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		/* -EAGAIN */
		if (task_cpu(p) != smp_processor_id())
			return;

		/*
		 * Now that we're on the right CPU with IRQs disabled, we can
		 * test whether we hit the right task without races.
		 */
		tfc->ret = -ESRCH; /* No such (running) process */
		if (p != current)
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

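/*
 * task_function_call - call a function on the CPU on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly. This will
 * retry due to any failures in smp_call_function_single(), such as if the
 * task_cpu() goes offline concurrently.
 *
 * returns @func return value or -ESRCH when the process isn't running.
 */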
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,
	};
	int ret;

	for (;;) {
		ret = smp_call_function_single(task_cpu(p), remote_function,
					       &data, 1);
		if (!ret)
			ret = data.ret;

		if (ret != -EAGAIN)
			break;

		cond_resched();
	}

	return ret;
}

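/*
 * cpu_function_call - call a function on a given CPU
 * @cpu:	target CPU to queue this function on
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote CPU.
 *
 * returns @func return value or -ENXIO when the CPU is offline.
 */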
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx)
		raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		raw_spin_unlock(&ctx->lock);
	raw_spin_unlock(&cpuctx->ctx.lock);
}

/*
 * Marks a task (perf_event_context::task) or an event owner
 * (perf_event::owner) that is gone: kernel-owned events use it for ->owner,
 * and exited tasks leave it in ctx->task.
 */
#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

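/*
 * Event scheduling vs. the event_f callbacks:
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context().
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */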
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
			struct perf_event_context *, void *);

struct event_function_struct {
	struct perf_event *event;
	event_f func;
	void *data;
};

static int event_function(void *info)
{
	struct event_function_struct *efs = info;
	struct perf_event *event = efs->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	int ret = 0;

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, task_ctx);
	/*
	 * Since we do the IPI call without holding ctx->lock things can have
	 * changed, double check we hit the task we set out to hit.
	 */
	if (ctx->task) {
		if (ctx->task != current) {
			ret = -ESRCH;
			goto unlock;
		}

		/*
		 * We only use event_function_call() on established contexts,
		 * and event_function() is only ever called when active (or
		 * rather, we'll have bailed in task_function_call() or the
		 * above ctx->task != current test), therefore we must have
		 * ctx->is_active here.
		 */
		WARN_ON_ONCE(!ctx->is_active);
		/*
		 * And since we have ctx->is_active, cpuctx->task_ctx must
		 * match.
		 */
		WARN_ON_ONCE(task_ctx != ctx);
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	efs->func(event, cpuctx, ctx, efs->data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}

static void event_function_call(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = READ_ONCE(ctx->task);
	struct event_function_struct efs = {
		.event = event,
		.func = func,
		.data = data,
	};

	if (!event->parent) {
		/*
		 * If this is a !child event, we must hold ctx::mutex to
		 * stabilize the event->ctx relation. See
		 * perf_event_ctx_lock().
		 */
		lockdep_assert_held(&ctx->mutex);
	}

	if (!task) {
		cpu_function_call(event->cpu, event_function, &efs);
		return;
	}

	if (task == TASK_TOMBSTONE)
		return;

again:
	if (!task_function_call(task, event_function, &efs))
		return;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * Reload the task pointer, it might have been changed by
	 * a concurrent perf_event_context_sched_out().
	 */
	task = ctx->task;
	if (task == TASK_TOMBSTONE) {
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	if (ctx->is_active) {
		raw_spin_unlock_irq(&ctx->lock);
		goto again;
	}
	func(event, NULL, ctx, data);
	raw_spin_unlock_irq(&ctx->lock);
}

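/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 */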
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct task_struct *task = READ_ONCE(ctx->task);
	struct perf_event_context *task_ctx = NULL;

	lockdep_assert_irqs_disabled();

	if (task) {
		if (task == TASK_TOMBSTONE)
			return;

		task_ctx = ctx;
	}

	perf_ctx_lock(cpuctx, task_ctx);

	task = ctx->task;
	if (task == TASK_TOMBSTONE)
		goto unlock;

	if (task) {
		/*
		 * We must be either inactive or active and the right task,
		 * otherwise we're screwed, since we cannot IPI to somewhere
		 * else.
		 */
		if (ctx->is_active) {
			if (WARN_ON_ONCE(task != current))
				goto unlock;

			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
				goto unlock;
		}
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	func(event, cpuctx, ctx, data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);
}

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
		       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
	(PERF_SAMPLE_BRANCH_KERNEL |\
	 PERF_SAMPLE_BRANCH_HV)

enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_TIME = 0x4,
	/* see ctx_resched() for details */
	EVENT_CPU = 0x8,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

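/*
 * perf_sched_events: >0 when at least one event requires the scheduler
 * hooks. Disabling goes through perf_sched_delayed() so that short-lived
 * create/destroy cycles don't flip the (expensive) static branch at high
 * frequency.
 */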
static void perf_sched_delayed(struct work_struct *work);
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
static atomic_t nr_text_poke_events __read_mostly;
static atomic_t nr_build_id_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
static struct kmem_cache *perf_event_cache;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;

static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;

static int perf_sample_allowed_ns __read_mostly =
	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

static void update_perf_cpu_limits(void)
{
	u64 tmp = perf_sample_period_ns;

	tmp *= sysctl_perf_cpu_time_max_percent;
	tmp = div_u64(tmp, 100);
	if (!tmp)
		tmp = 1;

	WRITE_ONCE(perf_sample_allowed_ns, tmp);
}

static bool perf_rotate_context(struct perf_cpu_context *cpuctx);

int perf_proc_update_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;
	int perf_cpu = sysctl_perf_cpu_time_max_percent;
	/*
	 * If throttling is disabled don't allow the write:
	 */
	if (write && (perf_cpu == 100 || perf_cpu == 0))
		return -EINVAL;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret || !write)
		return ret;

	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
	update_perf_cpu_limits();

	return 0;
}

int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	if (sysctl_perf_cpu_time_max_percent == 100 ||
	    sysctl_perf_cpu_time_max_percent == 0) {
		printk(KERN_WARNING
		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
		WRITE_ONCE(perf_sample_allowed_ns, 0);
	} else {
		update_perf_cpu_limits();
	}

	return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);

static u64 __report_avg;
static u64 __report_allowed;

static void perf_duration_warn(struct irq_work *w)
{
	printk_ratelimited(KERN_INFO
		"perf: interrupt took too long (%lld > %lld), lowering "
		"kernel.perf_event_max_sample_rate to %d\n",
		__report_avg, __report_allowed,
		sysctl_perf_event_sample_rate);
}

static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

void perf_sample_event_took(u64 sample_len_ns)
{
	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
	u64 running_len;
	u64 avg_len;
	u32 max;

	if (max_len == 0)
		return;

	/* Decay the counter by 1 average sample. */
	running_len = __this_cpu_read(running_sample_length);
	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
	running_len += sample_len_ns;
	__this_cpu_write(running_sample_length, running_len);

	/*
	 * Note: this will be biased artificially low until we have
	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
	 * from having to maintain a count.
	 */
	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
	if (avg_len <= max_len)
		return;

	__report_avg = avg_len;
	__report_allowed = max_len;

	/*
	 * Compute a throttle threshold 25% below the current duration.
	 */
	avg_len += avg_len / 4;
	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
	if (avg_len < max)
		max /= (u32)avg_len;
	else
		max = 1;

	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
	WRITE_ONCE(max_samples_per_tick, max);

	sysctl_perf_event_sample_rate = max * HZ;
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

	if (!irq_work_queue(&perf_duration_work)) {
		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
			     "kernel.perf_event_max_sample_rate to %d\n",
			     __report_avg, __report_allowed,
			     sysctl_perf_event_sample_rate);
	}
}

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type,
			     struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void) { }

static inline u64 perf_clock(void)
{
	return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
	return event->clock();
}

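/*
 * State based event timekeeping.
 *
 * The enabled and running times are updated lazily: event->tstamp records
 * the last time the times were advanced, total_time_enabled accrues while
 * the effective state is at least INACTIVE, and total_time_running while it
 * is ACTIVE. An event's effective state folds in its group leader's state:
 * if the leader is OFF (or worse), so, effectively, are all its siblings.
 */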
static __always_inline enum perf_event_state
__perf_effective_state(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;

	if (leader->state <= PERF_EVENT_STATE_OFF)
		return leader->state;

	return event->state;
}

static __always_inline void
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
{
	enum perf_event_state state = __perf_effective_state(event);
	u64 delta = now - event->tstamp;

	*enabled = event->total_time_enabled;
	if (state >= PERF_EVENT_STATE_INACTIVE)
		*enabled += delta;

	*running = event->total_time_running;
	if (state >= PERF_EVENT_STATE_ACTIVE)
		*running += delta;
}

static void perf_event_update_time(struct perf_event *event)
{
	u64 now = perf_event_time(event);

	__perf_update_times(event, now, &event->total_time_enabled,
					&event->total_time_running);
	event->tstamp = now;
}

static void perf_event_update_sibling_time(struct perf_event *leader)
{
	struct perf_event *sibling;

	for_each_sibling_event(sibling, leader)
		perf_event_update_time(sibling);
}

static void
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
{
	if (event->state == state)
		return;

	perf_event_update_time(event);
	/*
	 * If a group leader gets enabled/disabled the state of its siblings
	 * changes too, so bring their times up to date first.
	 */
	if ((event->state < 0) ^ (state < 0))
		perf_event_update_sibling_time(event);

	WRITE_ONCE(event->state, state);
}

#ifdef CONFIG_CGROUP_PERF

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/* @event doesn't care about cgroup */
	if (!event->cgrp)
		return true;

	/* wants specific cgroup scope but @cpuctx isn't associated with any */
	if (!cpuctx->cgrp)
		return false;

	/*
	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
	 * also enabled for all its descendant cgroups.  If @cpuctx's
	 * cgroup is a descendant of @event's (the test covers identity
	 * case), it's a match.
	 */
	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
				    event->cgrp->css.cgroup);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
	css_put(&event->cgrp->css);
	event->cgrp = NULL;
}

static inline int is_cgroup_event(struct perf_event *event)
{
	return event->cgrp != NULL;
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	struct perf_cgroup_info *t;

	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	return t->time;
}

static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
	struct perf_cgroup_info *info;
	u64 now;

	now = perf_clock();

	info = this_cpu_ptr(cgrp->info);

	info->time += now - info->timestamp;
	info->timestamp = now;
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
	struct perf_cgroup *cgrp = cpuctx->cgrp;
	struct cgroup_subsys_state *css;

	if (cgrp) {
		for (css = &cgrp->css; css; css = css->parent) {
			cgrp = container_of(css, struct perf_cgroup, css);
			__update_cgrp_time(cgrp);
		}
	}
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
	struct perf_cgroup *cgrp;

	/*
	 * ensure we access cgroup data only when needed and
	 * when we know the cgroup is pinned (css_get)
	 */
	if (!is_cgroup_event(event))
		return;

	cgrp = perf_cgroup_from_task(current, event->ctx);
	/*
	 * Do not update time when cgroup is not active
	 */
	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
		__update_cgrp_time(event->cgrp);
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
	struct perf_cgroup *cgrp;
	struct perf_cgroup_info *info;
	struct cgroup_subsys_state *css;

	/*
	 * ctx->lock held by caller
	 * ensure we do not access cgroup data
	 * unless we have the cgroup pinned (css_get)
	 */
	if (!task || !ctx->nr_cgroups)
		return;

	cgrp = perf_cgroup_from_task(task, ctx);

	for (css = &cgrp->css; css; css = css->parent) {
		cgrp = container_of(css, struct perf_cgroup, css);
		info = this_cpu_ptr(cgrp->info);
		info->timestamp = ctx->timestamp;
	}
}

static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);

#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */

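/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN : schedule in based on cgroup for next
 */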
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
	struct perf_cpu_context *cpuctx;
	struct list_head *list;
	unsigned long flags;

	/*
	 * Disable interrupts and preemption to avoid this CPU's
	 * cgrp_cpuctx_list changing underneath us.
	 */
	local_irq_save(flags);

	list = this_cpu_ptr(&cgrp_cpuctx_list);
	list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
		perf_pmu_disable(cpuctx->ctx.pmu);

		if (mode & PERF_CGROUP_SWOUT) {
			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
			/*
			 * must not be done before ctxswout due
			 * to event_filter_match() in event_sched_out()
			 */
			cpuctx->cgrp = NULL;
		}

		if (mode & PERF_CGROUP_SWIN) {
			WARN_ON_ONCE(cpuctx->cgrp);
			/*
			 * set cgrp before ctxsw in to allow
			 * event_filter_match() to not have to pass
			 * task around
			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
			 * because cgroup events are CPU bound events
			 */
			cpuctx->cgrp = perf_cgroup_from_task(task,
							     &cpuctx->ctx);
			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
		}
		perf_pmu_enable(cpuctx->ctx.pmu);
		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
	}

	local_irq_restore(flags);
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);
	cgrp2 = perf_cgroup_from_task(next, NULL);

	/*
	 * only schedule out current cgroup events if we know
	 * that we are switching to a different cgroup. Otherwise,
	 * do not touch the cgroup events.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);

	rcu_read_unlock();
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);
	cgrp2 = perf_cgroup_from_task(prev, NULL);

	/*
	 * only need to schedule in cgroup events if we are changing
	 * cgroup during ctxsw. Cgroup events were not scheduled
	 * out of ctxsw out if that was not the case.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWIN);

	rcu_read_unlock();
}

static int perf_cgroup_ensure_storage(struct perf_event *event,
				      struct cgroup_subsys_state *css)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event **storage;
	int cpu, heap_size, ret = 0;

	/*
	 * Allow storage to have sufficient space for an iterator for each
	 * possibly nested cgroup plus an iterator for events with no cgroup.
	 */
	for (heap_size = 1; css; css = css->parent)
		heap_size++;

	for_each_possible_cpu(cpu) {
		cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
		if (heap_size <= cpuctx->heap_size)
			continue;

		storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
				       GFP_KERNEL, cpu_to_node(cpu));
		if (!storage) {
			ret = -ENOMEM;
			break;
		}

		raw_spin_lock_irq(&cpuctx->ctx.lock);
		if (cpuctx->heap_size < heap_size) {
			swap(cpuctx->heap, storage);
			if (storage == cpuctx->heap_default)
				storage = NULL;
			cpuctx->heap_size = heap_size;
		}
		raw_spin_unlock_irq(&cpuctx->ctx.lock);

		kfree(storage);
	}

	return ret;
}

static inline int perf_cgroup_connect(int fd, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	struct perf_cgroup *cgrp;
	struct cgroup_subsys_state *css;
	struct fd f = fdget(fd);
	int ret = 0;

	if (!f.file)
		return -EBADF;

	css = css_tryget_online_from_dir(f.file->f_path.dentry,
					 &perf_event_cgrp_subsys);
	if (IS_ERR(css)) {
		ret = PTR_ERR(css);
		goto out;
	}

	ret = perf_cgroup_ensure_storage(event, css);
	if (ret)
		goto out;

	cgrp = container_of(css, struct perf_cgroup, css);
	event->cgrp = cgrp;

	/*
	 * all events in a group must monitor
	 * the same cgroup because a task belongs
	 * to only one perf cgroup at a time
	 */
	if (group_leader && group_leader->cgrp != cgrp) {
		perf_detach_cgroup(event);
		ret = -EINVAL;
	}
out:
	fdput(f);
	return ret;
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
	struct perf_cgroup_info *t;
	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	event->shadow_ctx_time = now - t->timestamp;
}

static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_cpu_context *cpuctx;

	if (!is_cgroup_event(event))
		return;

	/*
	 * Because cgroup events are always per-cpu events,
	 * @ctx == &cpuctx->ctx.
	 */
	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);

	/*
	 * Since setting cpuctx->cgrp is conditional on the current @cgrp
	 * matching the event's cgroup, we must do this for every new event,
	 * because if the first would mismatch, the second would not try again
	 * and we would leave cpuctx->cgrp unset.
	 */
	if (ctx->is_active && !cpuctx->cgrp) {
		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);

		if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
			cpuctx->cgrp = cgrp;
	}

	if (ctx->nr_cgroups++)
		return;

	list_add(&cpuctx->cgrp_cpuctx_entry,
			per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
}

static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_cpu_context *cpuctx;

	if (!is_cgroup_event(event))
		return;

	/*
	 * Because cgroup events are always per-cpu events,
	 * @ctx == &cpuctx->ctx.
	 */
	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);

	if (--ctx->nr_cgroups)
		return;

	if (ctx->is_active && cpuctx->cgrp)
		cpuctx->cgrp = NULL;

	list_del(&cpuctx->cgrp_cpuctx_entry);
}

#else /* !CONFIG_CGROUP_PERF */

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

static inline int is_cgroup_event(struct perf_event *event)
{
	return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
}

static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
}

static inline void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	return 0;
}

static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
}

static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
#endif /* CONFIG_CGROUP_PERF */

/*
 * set default to be dependent on timer tick just
 * like original code
 */
#define PERF_CPU_HRTIMER (1000 / HZ)

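/*
 * function must be called with interrupts disabled
 */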
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
	struct perf_cpu_context *cpuctx;
	bool rotations;

	lockdep_assert_irqs_disabled();

	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
	rotations = perf_rotate_context(cpuctx);

	raw_spin_lock(&cpuctx->hrtimer_lock);
	if (rotations)
		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
	else
		cpuctx->hrtimer_active = 0;
	raw_spin_unlock(&cpuctx->hrtimer_lock);

	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}

static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	u64 interval;

	/* no multiplexing needed for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return;

	/*
	 * check default is sane, if not set then force to
	 * default interval (1/tick)
	 */
	interval = pmu->hrtimer_interval_ms;
	if (interval < 1)
		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;

	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);

	raw_spin_lock_init(&cpuctx->hrtimer_lock);
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
	timer->function = perf_mux_hrtimer_handler;
}

static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	unsigned long flags;

	/* not for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return 0;

	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
	if (!cpuctx->hrtimer_active) {
		cpuctx->hrtimer_active = 1;
		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
	}
	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);

	return 0;
}

void perf_pmu_disable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!(*count)++)
		pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!--(*count))
		pmu->pmu_enable(pmu);
}
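
/*
 * perf_pmu_disable()/perf_pmu_enable() nest via a per-cpu count, so only the
 * outermost pair actually touches the hardware. Typical usage (a sketch):
 *
 *	perf_pmu_disable(pmu);
 *	... reprogram or tear down events on this PMU ...
 *	perf_pmu_enable(pmu);
 */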

static DEFINE_PER_CPU(struct list_head, active_ctx_list);

/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 * disabled, while perf_event_task_tick() is called from IRQ context.
 */
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
	struct list_head *head = this_cpu_ptr(&active_ctx_list);

	lockdep_assert_irqs_disabled();

	WARN_ON(!list_empty(&ctx->active_ctx_list));

	list_add(&ctx->active_ctx_list, head);
}

static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
	lockdep_assert_irqs_disabled();

	WARN_ON(list_empty(&ctx->active_ctx_list));

	list_del_init(&ctx->active_ctx_list);
}

static void get_ctx(struct perf_event_context *ctx)
{
	refcount_inc(&ctx->refcount);
}

static void *alloc_task_ctx_data(struct pmu *pmu)
{
	if (pmu->task_ctx_cache)
		return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);

	return NULL;
}

static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
{
	if (pmu->task_ctx_cache && task_ctx_data)
		kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task && ctx->task != TASK_TOMBSTONE)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

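/*
 * Cross-CPU access to an event's context is serialized by ctx::mutex.
 * Because a context can be cloned and swapped (inherited events), the
 * event->ctx pointer itself is not stable until we hold the mutex of the
 * context it currently points at: take a reference, lock the mutex, and
 * retry if event->ctx changed underneath us (see also event_function_call()).
 */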
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
	struct perf_event_context *ctx;

again:
	rcu_read_lock();
	ctx = READ_ONCE(event->ctx);
	if (!refcount_inc_not_zero(&ctx->refcount)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	mutex_lock_nested(&ctx->mutex, nesting);
	if (event->ctx != ctx) {
		mutex_unlock(&ctx->mutex);
		put_ctx(ctx);
		goto again;
	}

	return ctx;
}

static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
	return perf_event_ctx_lock_nested(event, 0);
}

static void perf_event_ctx_unlock(struct perf_event *event,
				  struct perf_event_context *ctx)
{
	mutex_unlock(&ctx->mutex);
	put_ctx(ctx);
}

/*
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(), therefore we cannot call put_ctx(), since that might end up
 * calling scheduler related locks and ctx->lock nests inside those.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
	struct perf_event_context *parent_ctx = ctx->parent_ctx;

	lockdep_assert_held(&ctx->lock);

	if (parent_ctx)
		ctx->parent_ctx = NULL;
	ctx->generation++;

	return parent_ctx;
}

static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
			       enum pid_type type)
{
	u32 nr;
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	nr = __task_pid_nr_ns(p, type, event->ns);
	/* report -1 instead of 0 when the task is already dead */
	if (!nr && !pid_alive(p))
		nr = -1;
	return nr;
}

static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
	return perf_event_pid_type(event, p, PIDTYPE_TGID);
}

static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
	return perf_event_pid_type(event, p, PIDTYPE_PID);
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
	struct perf_event_context *ctx;

retry:
	/*
	 * One of the few rules of preemptible RCU is that one cannot do
	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
	 * part of the read side critical section was irqs-enabled -- see
	 * rcu_read_unlock_special().
	 *
	 * Since ctx->lock nests under rq->lock we must ensure the entire read
	 * side critical section has interrupts disabled.
	 */
	local_irq_save(*flags);
	rcu_read_lock();
	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		raw_spin_lock(&ctx->lock);
		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
			raw_spin_unlock(&ctx->lock);
			rcu_read_unlock();
			local_irq_restore(*flags);
			goto retry;
		}

		if (ctx->task == TASK_TOMBSTONE ||
		    !refcount_inc_not_zero(&ctx->refcount)) {
			raw_spin_unlock(&ctx->lock);
			ctx = NULL;
		} else {
			WARN_ON_ONCE(ctx->task != task);
		}
	}
	rcu_read_unlock();
	if (!ctx)
		local_irq_restore(*flags);
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, ctxn, &flags);
	if (ctx) {
		++ctx->pin_count;
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

static u64 perf_event_time(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	if (is_cgroup_event(event))
		return perf_cgroup_event_time(event);

	return ctx ? ctx->time : 0;
}

static enum event_type_t get_event_type(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	enum event_type_t event_type;

	lockdep_assert_held(&ctx->lock);

	/*
	 * It's 'group type', really, because if our group leader is
	 * pinned, so are we.
	 */
	if (event->group_leader != event)
		event = event->group_leader;

	event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
	if (!ctx->task)
		event_type |= EVENT_CPU;

	return event_type;
}

/*
 * Helper function to initialize event group nodes.
 */
static void init_event_group(struct perf_event *event)
{
	RB_CLEAR_NODE(&event->group_node);
	event->group_index = 0;
}

/*
 * Extract pinned or flexible groups from the context
 * based on event attrs bits.
 */
static struct perf_event_groups *
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
	if (event->attr.pinned)
		return &ctx->pinned_groups;
	else
		return &ctx->flexible_groups;
}

/*
 * Helper function to initialize perf_event_groups.
 */
static void perf_event_groups_init(struct perf_event_groups *groups)
{
	groups->tree = RB_ROOT;
	groups->index = 0;
}

static inline struct cgroup *event_cgroup(const struct perf_event *event)
{
	struct cgroup *cgroup = NULL;

#ifdef CONFIG_CGROUP_PERF
	if (event->cgrp)
		cgroup = event->cgrp->css.cgroup;
#endif

	return cgroup;
}

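/*
 * Compare function for event groups;
 *
 * Implements a composite key that first sorts by CPU, then by cgroup id,
 * and finally by the monotonically increasing group_index, which provides
 * insertion ordering when rotating groups within the same {cpu, cgroup}.
 */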
static __always_inline int
perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
		      const u64 left_group_index, const struct perf_event *right)
{
	if (left_cpu < right->cpu)
		return -1;
	if (left_cpu > right->cpu)
		return 1;

#ifdef CONFIG_CGROUP_PERF
	{
		const struct cgroup *right_cgroup = event_cgroup(right);

		if (left_cgroup != right_cgroup) {
			if (!left_cgroup) {
				/*
				 * Left has no cgroup but right does, no
				 * cgroups come first.
				 */
				return -1;
			}
			if (!right_cgroup) {
				/*
				 * Right has no cgroup but left does, no
				 * cgroups come first.
				 */
				return 1;
			}

			if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
				return -1;

			return 1;
		}
	}
#endif

	if (left_group_index < right->group_index)
		return -1;
	if (left_group_index > right->group_index)
		return 1;

	return 0;
}

#define __node_2_pe(node) \
	rb_entry((node), struct perf_event, group_node)

static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
{
	struct perf_event *e = __node_2_pe(a);
	return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
				     __node_2_pe(b)) < 0;
}

struct __group_key {
	int cpu;
	struct cgroup *cgroup;
};

static inline int __group_cmp(const void *key, const struct rb_node *node)
{
	const struct __group_key *a = key;
	const struct perf_event *b = __node_2_pe(node);

	/* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
	return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
}

/*
 * Insert @event into @groups' tree; using
 * {@event->cpu, @event's cgroup, ++@groups->index} as the sort key
 * (see __group_less()), which places it last within its {cpu, cgroup}
 * subtree.
 */
static void
perf_event_groups_insert(struct perf_event_groups *groups,
			 struct perf_event *event)
{
	event->group_index = ++groups->index;

	rb_add(&event->group_node, &groups->tree, __group_less);
}

/*
 * Helper function to insert event into the pinned or flexible groups.
 */
static void
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event_groups *groups;

	groups = get_event_groups(event, ctx);
	perf_event_groups_insert(groups, event);
}

/*
 * Delete a group from a tree.
 */
static void
perf_event_groups_delete(struct perf_event_groups *groups,
			 struct perf_event *event)
{
	WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
		     RB_EMPTY_ROOT(&groups->tree));

	rb_erase(&event->group_node, &groups->tree);
	init_event_group(event);
}

/*
 * Helper function to delete event from its groups.
 */
static void
del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event_groups *groups;

	groups = get_event_groups(event, ctx);
	perf_event_groups_delete(groups, event);
}

/*
 * Get the leftmost event in the {cpu, cgroup} subtree.
 */
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
			struct cgroup *cgrp)
{
	struct __group_key key = {
		.cpu = cpu,
		.cgroup = cgrp,
	};
	struct rb_node *node;

	node = rb_find_first(&key, &groups->tree, __group_cmp);
	if (node)
		return __node_2_pe(node);

	return NULL;
}

/*
 * Get the next event in the same {cpu, cgroup} subtree.
 */
static struct perf_event *
perf_event_groups_next(struct perf_event *event)
{
	struct __group_key key = {
		.cpu = event->cpu,
		.cgroup = event_cgroup(event),
	};
	struct rb_node *next;

	next = rb_next_match(&key, &event->group_node, __group_cmp);
	if (next)
		return __node_2_pe(next);

	return NULL;
}

/*
 * Iterate through the whole groups tree.
 */
#define perf_event_groups_for_each(event, groups)			\
	for (event = rb_entry_safe(rb_first(&((groups)->tree)),	\
				typeof(*event), group_node); event;	\
		event = rb_entry_safe(rb_next(&event->group_node),	\
				typeof(*event), group_node))

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	lockdep_assert_held(&ctx->lock);

	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
	event->attach_state |= PERF_ATTACH_CONTEXT;

	event->tstamp = perf_event_time(event);

	/*
	 * If we're a stand alone event or group leader, we go to the context
	 * list, group events are kept attached to the group so that
	 * perf_group_detach can, at all times, locate all siblings.
	 */
	if (event->group_leader == event) {
		event->group_caps = event->event_caps;
		add_event_to_groups(event, ctx);
	}

	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;

	if (event->state > PERF_EVENT_STATE_OFF)
		perf_cgroup_event_enable(event, ctx);

	ctx->generation++;
}

/*
 * Initialize event state based on the perf_event_attr::disabled.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
					      PERF_EVENT_STATE_INACTIVE;
}

static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_ID)
		entry += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_GROUP) {
		nr += nr_siblings;
		size += sizeof(u64);
	}

	size += entry * nr;
	event->read_size = size;
}

static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
	struct perf_sample_data *data;
	u16 size = 0;

	if (sample_type & PERF_SAMPLE_IP)
		size += sizeof(data->ip);

	if (sample_type & PERF_SAMPLE_ADDR)
		size += sizeof(data->addr);

	if (sample_type & PERF_SAMPLE_PERIOD)
		size += sizeof(data->period);

	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
		size += sizeof(data->weight.full);

	if (sample_type & PERF_SAMPLE_READ)
		size += event->read_size;

	if (sample_type & PERF_SAMPLE_DATA_SRC)
		size += sizeof(data->data_src.val);

	if (sample_type & PERF_SAMPLE_TRANSACTION)
		size += sizeof(data->txn);

	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
		size += sizeof(data->phys_addr);

	if (sample_type & PERF_SAMPLE_CGROUP)
		size += sizeof(data->cgroup);

	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
		size += sizeof(data->data_page_size);

	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
		size += sizeof(data->code_page_size);

	event->header_size = size;
}

/*
 * Called at perf_event creation and when events are attached/detached from a
 * group.
 */
static void perf_event__header_size(struct perf_event *event)
{
	__perf_event_read_size(event,
			       event->group_leader->nr_siblings);
	__perf_event_header_size(event, event->attr.sample_type);
}

static void perf_event__id_header_size(struct perf_event *event)
{
	struct perf_sample_data *data;
	u64 sample_type = event->attr.sample_type;
	u16 size = 0;

	if (sample_type & PERF_SAMPLE_TID)
		size += sizeof(data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		size += sizeof(data->time);

	if (sample_type & PERF_SAMPLE_IDENTIFIER)
		size += sizeof(data->id);

	if (sample_type & PERF_SAMPLE_ID)
		size += sizeof(data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		size += sizeof(data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		size += sizeof(data->cpu_entry);

	event->id_header_size = size;
}

static bool perf_event_validate_size(struct perf_event *event)
{
	/*
	 * The values computed below are over-written when we actually
	 * attach the event; compute as if the event had one more sibling.
	 */
	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
	perf_event__id_header_size(event);

	/*
	 * Sum the lot; should not exceed the 64k limit we have on records.
	 * Conservative limit to allow for callchains and other variable fields.
	 */
	if (event->read_size + event->header_size +
	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
		return false;

	return true;
}

static void perf_group_attach(struct perf_event *event)
{
	struct perf_event *group_leader = event->group_leader, *pos;

	lockdep_assert_held(&event->ctx->lock);

	/*
	 * We can have double attach due to group movement in perf_event_open.
	 */
	if (event->attach_state & PERF_ATTACH_GROUP)
		return;

	event->attach_state |= PERF_ATTACH_GROUP;

	if (group_leader == event)
		return;

	WARN_ON_ONCE(group_leader->ctx != event->ctx);

	group_leader->group_caps &= event->event_caps;

	list_add_tail(&event->sibling_list, &group_leader->sibling_list);
	group_leader->nr_siblings++;

	perf_event__header_size(group_leader);

	for_each_sibling_event(pos, group_leader)
		perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	WARN_ON_ONCE(event->ctx != ctx);
	lockdep_assert_held(&ctx->lock);

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
		return;

	event->attach_state &= ~PERF_ATTACH_CONTEXT;

	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_rcu(&event->event_entry);

	if (event->group_leader == event)
		del_event_from_groups(event, ctx);

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF) {
		perf_cgroup_event_disable(event, ctx);
		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
	}

	ctx->generation++;
}

static int
perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
{
	if (!has_aux(aux_event))
		return 0;

	if (!event->pmu->aux_output_match)
		return 0;

	return event->pmu->aux_output_match(aux_event);
}

static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
			    struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx);

static void perf_put_aux_event(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event *iter;

	/*
	 * If event uses aux_event tear down the link
	 */
	if (event->aux_event) {
		iter = event->aux_event;
		event->aux_event = NULL;
		put_event(iter);
		return;
	}

	/*
	 * If the event is an aux_event, tear down all links to
	 * it from other events.
	 */
	for_each_sibling_event(iter, event->group_leader) {
		if (iter->aux_event != event)
			continue;

		iter->aux_event = NULL;
		put_event(event);

		/*
		 * If it's ACTIVE, schedule it out and put it into ERROR
		 * state so that we don't try to schedule it again. Note
		 * that perf_event_enable() will clear the ERROR status.
		 */
		event_sched_out(iter, cpuctx, ctx);
		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
	}
}

static bool perf_need_aux_event(struct perf_event *event)
{
	return !!event->attr.aux_output || !!event->attr.aux_sample_size;
}

static int perf_get_aux_event(struct perf_event *event,
			      struct perf_event *group_leader)
{
	/*
	 * Our group leader must be an aux event if we want to be
	 * an aux_output. This way, the aux event will precede its
	 * aux_output events in the group, and therefore will always
	 * schedule first.
	 */
	if (!group_leader)
		return 0;

	/*
	 * aux_output and aux_sample_size are mutually exclusive.
	 */
	if (event->attr.aux_output && event->attr.aux_sample_size)
		return 0;

	if (event->attr.aux_output &&
	    !perf_aux_output_match(event, group_leader))
		return 0;

	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
		return 0;

	if (!atomic_long_inc_not_zero(&group_leader->refcount))
		return 0;

	/*
	 * Link aux_outputs to their aux event; this is undone in
	 * perf_group_detach() by perf_put_aux_event(). When the
	 * group is torn down, the aux_output events lose their
	 * link to the aux_event and can't schedule any more.
	 */
	event->aux_event = group_leader;

	return 1;
}

static inline struct list_head *get_event_list(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
}

/*
 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
 * cannot exist on their own, schedule them out and move them into the ERROR
 * state. Also see _perf_event_enable(), it will not be able to recover
 * this ERROR state.
 */
static inline void perf_remove_sibling_event(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	event_sched_out(event, cpuctx, ctx);
	perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}

static void perf_group_detach(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct perf_event *sibling, *tmp;
	struct perf_event_context *ctx = event->ctx;

	lockdep_assert_held(&ctx->lock);

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_GROUP))
		return;

	event->attach_state &= ~PERF_ATTACH_GROUP;

	perf_put_aux_event(event);

	/*
	 * If this is a sibling, remove it from its group.
	 */
	if (leader != event) {
		list_del_init(&event->sibling_list);
		event->group_leader->nr_siblings--;
		goto out;
	}

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * back to the context's group lists.
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {

		if (sibling->event_caps & PERF_EV_CAP_SIBLING)
			perf_remove_sibling_event(sibling);

		sibling->group_leader = sibling;
		list_del_init(&sibling->sibling_list);

		/* Inherit group flags from the previous leader */
		sibling->group_caps = event->group_caps;

		if (!RB_EMPTY_NODE(&event->group_node)) {
			add_event_to_groups(sibling, event->ctx);

			if (sibling->state == PERF_EVENT_STATE_ACTIVE)
				list_add_tail(&sibling->active_list, get_event_list(sibling));
		}

		WARN_ON_ONCE(sibling->ctx != event->ctx);
	}

out:
	for_each_sibling_event(tmp, leader)
		perf_event__header_size(tmp);

	perf_event__header_size(leader);
}

static void sync_child_event(struct perf_event *child_event);

static void perf_child_detach(struct perf_event *event)
{
	struct perf_event *parent_event = event->parent;

	if (!(event->attach_state & PERF_ATTACH_CHILD))
		return;

	event->attach_state &= ~PERF_ATTACH_CHILD;

	if (WARN_ON_ONCE(!parent_event))
		return;

	lockdep_assert_held(&parent_event->child_mutex);

	sync_child_event(event);
	list_del_init(&event->child_list);
}

static bool is_orphaned_event(struct perf_event *event)
{
	return event->state == PERF_EVENT_STATE_DEAD;
}

static inline int __pmu_filter_match(struct perf_event *event)
{
	struct pmu *pmu = event->pmu;
	return pmu->filter_match ? pmu->filter_match(event) : 1;
}

/*
 * Check whether we should attempt to schedule an event group based on
 * PMU-specific filtering. An event group can consist of HW and SW events,
 * potentially with a SW leader, so we must check all the filters, to
 * determine whether a group is schedulable:
 */
static inline int pmu_filter_match(struct perf_event *event)
{
	struct perf_event *sibling;

	if (!__pmu_filter_match(event))
		return 0;

	for_each_sibling_event(sibling, event) {
		if (!__pmu_filter_match(sibling))
			return 0;
	}

	return 1;
}

static inline int
event_filter_match(struct perf_event *event)
{
	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
	       perf_cgroup_match(event) && pmu_filter_match(event);
}

static void
event_sched_out(struct perf_event *event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;

	WARN_ON_ONCE(event->ctx != ctx);
	lockdep_assert_held(&ctx->lock);

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	/*
	 * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
	 * we can schedule events _OUT_ individually through things like
	 * __perf_remove_from_context().
	 */
	list_del_init(&event->active_list);

	perf_pmu_disable(event->pmu);

	event->pmu->del(event, 0);
	event->oncpu = -1;

	if (READ_ONCE(event->pending_disable) >= 0) {
		WRITE_ONCE(event->pending_disable, -1);
		perf_cgroup_event_disable(event, ctx);
		state = PERF_EVENT_STATE_OFF;
	}
	perf_event_set_state(event, state);

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	if (!--ctx->nr_active)
		perf_event_ctx_deactivate(ctx);
	if (event->attr.freq && event->attr.sample_freq)
		ctx->nr_freq--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;

	perf_pmu_enable(event->pmu);
}

static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;

	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	perf_pmu_disable(ctx->pmu);

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	for_each_sibling_event(event, group_event)
		event_sched_out(event, cpuctx, ctx);

	perf_pmu_enable(ctx->pmu);
}

#define DETACH_GROUP	0x01UL
#define DETACH_CHILD	0x02UL

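/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */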
2339static void
2340__perf_remove_from_context(struct perf_event *event,
2341 struct perf_cpu_context *cpuctx,
2342 struct perf_event_context *ctx,
2343 void *info)
2344{
2345 unsigned long flags = (unsigned long)info;
2346
2347 if (ctx->is_active & EVENT_TIME) {
2348 update_context_time(ctx);
2349 update_cgrp_time_from_cpuctx(cpuctx);
2350 }
2351
2352 event_sched_out(event, cpuctx, ctx);
2353 if (flags & DETACH_GROUP)
2354 perf_group_detach(event);
2355 if (flags & DETACH_CHILD)
2356 perf_child_detach(event);
2357 list_del_event(event, ctx);
2358
2359 if (!ctx->nr_events && ctx->is_active) {
2360 ctx->is_active = 0;
2361 ctx->rotate_necessary = 0;
2362 if (ctx->task) {
2363 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2364 cpuctx->task_ctx = NULL;
2365 }
2366 }
2367}
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2380{
2381 struct perf_event_context *ctx = event->ctx;
2382
2383 lockdep_assert_held(&ctx->mutex);
2384
2385
2386
2387
2388
2389
2390 raw_spin_lock_irq(&ctx->lock);
2391 if (!ctx->is_active) {
2392 __perf_remove_from_context(event, __get_cpu_context(ctx),
2393 ctx, (void *)flags);
2394 raw_spin_unlock_irq(&ctx->lock);
2395 return;
2396 }
2397 raw_spin_unlock_irq(&ctx->lock);
2398
2399 event_function_call(event, __perf_remove_from_context, (void *)flags);
2400}
2401
2402
2403
2404
2405static void __perf_event_disable(struct perf_event *event,
2406 struct perf_cpu_context *cpuctx,
2407 struct perf_event_context *ctx,
2408 void *info)
2409{
2410 if (event->state < PERF_EVENT_STATE_INACTIVE)
2411 return;
2412
2413 if (ctx->is_active & EVENT_TIME) {
2414 update_context_time(ctx);
2415 update_cgrp_time_from_event(event);
2416 }
2417
2418 if (event == event->group_leader)
2419 group_sched_out(event, cpuctx, ctx);
2420 else
2421 event_sched_out(event, cpuctx, ctx);
2422
2423 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2424 perf_cgroup_event_disable(event, ctx);
2425}
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441static void _perf_event_disable(struct perf_event *event)
2442{
2443 struct perf_event_context *ctx = event->ctx;
2444
2445 raw_spin_lock_irq(&ctx->lock);
2446 if (event->state <= PERF_EVENT_STATE_OFF) {
2447 raw_spin_unlock_irq(&ctx->lock);
2448 return;
2449 }
2450 raw_spin_unlock_irq(&ctx->lock);
2451
2452 event_function_call(event, __perf_event_disable, NULL);
2453}
2454
2455void perf_event_disable_local(struct perf_event *event)
2456{
2457 event_function_local(event, __perf_event_disable, NULL);
2458}
2459
2460
2461
2462
2463
2464void perf_event_disable(struct perf_event *event)
2465{
2466 struct perf_event_context *ctx;
2467
2468 ctx = perf_event_ctx_lock(event);
2469 _perf_event_disable(event);
2470 perf_event_ctx_unlock(event, ctx);
2471}
2472EXPORT_SYMBOL_GPL(perf_event_disable);
2473
2474void perf_event_disable_inatomic(struct perf_event *event)
2475{
2476 WRITE_ONCE(event->pending_disable, smp_processor_id());
2477
2478 irq_work_queue(&event->pending);
2479}
2480
2481static void perf_set_shadow_time(struct perf_event *event,
2482 struct perf_event_context *ctx)
2483{
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509 if (is_cgroup_event(event))
2510 perf_cgroup_set_shadow_time(event, event->tstamp);
2511 else
2512 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2513}
2514
2515#define MAX_INTERRUPTS (~0ULL)
2516
2517static void perf_log_throttle(struct perf_event *event, int enable);
2518static void perf_log_itrace_start(struct perf_event *event);
2519
2520static int
2521event_sched_in(struct perf_event *event,
2522 struct perf_cpu_context *cpuctx,
2523 struct perf_event_context *ctx)
2524{
2525 int ret = 0;
2526
2527 WARN_ON_ONCE(event->ctx != ctx);
2528
2529 lockdep_assert_held(&ctx->lock);
2530
2531 if (event->state <= PERF_EVENT_STATE_OFF)
2532 return 0;
2533
2534 WRITE_ONCE(event->oncpu, smp_processor_id());
2535
2536
2537
2538
2539
2540 smp_wmb();
2541 perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2542
2543
2544
2545
2546
2547
2548 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2549 perf_log_throttle(event, 1);
2550 event->hw.interrupts = 0;
2551 }
2552
2553 perf_pmu_disable(event->pmu);
2554
2555 perf_set_shadow_time(event, ctx);
2556
2557 perf_log_itrace_start(event);
2558
2559 if (event->pmu->add(event, PERF_EF_START)) {
2560 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2561 event->oncpu = -1;
2562 ret = -EAGAIN;
2563 goto out;
2564 }
2565
2566 if (!is_software_event(event))
2567 cpuctx->active_oncpu++;
2568 if (!ctx->nr_active++)
2569 perf_event_ctx_activate(ctx);
2570 if (event->attr.freq && event->attr.sample_freq)
2571 ctx->nr_freq++;
2572
2573 if (event->attr.exclusive)
2574 cpuctx->exclusive = 1;
2575
2576out:
2577 perf_pmu_enable(event->pmu);
2578
2579 return ret;
2580}
2581
2582static int
2583group_sched_in(struct perf_event *group_event,
2584 struct perf_cpu_context *cpuctx,
2585 struct perf_event_context *ctx)
2586{
2587 struct perf_event *event, *partial_group = NULL;
2588 struct pmu *pmu = ctx->pmu;
2589
2590 if (group_event->state == PERF_EVENT_STATE_OFF)
2591 return 0;
2592
2593 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2594
2595 if (event_sched_in(group_event, cpuctx, ctx))
2596 goto error;
2597
2598
2599
2600
2601 for_each_sibling_event(event, group_event) {
2602 if (event_sched_in(event, cpuctx, ctx)) {
2603 partial_group = event;
2604 goto group_error;
2605 }
2606 }
2607
2608 if (!pmu->commit_txn(pmu))
2609 return 0;
2610
2611group_error:
2612
2613
2614
2615
2616
2617 for_each_sibling_event(event, group_event) {
2618 if (event == partial_group)
2619 break;
2620
2621 event_sched_out(event, cpuctx, ctx);
2622 }
2623 event_sched_out(group_event, cpuctx, ctx);
2624
2625error:
2626 pmu->cancel_txn(pmu);
2627 return -EAGAIN;
2628}
2629
2630
2631
2632
2633static int group_can_go_on(struct perf_event *event,
2634 struct perf_cpu_context *cpuctx,
2635 int can_add_hw)
2636{
2637
2638
2639
2640 if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2641 return 1;
2642
2643
2644
2645
2646 if (cpuctx->exclusive)
2647 return 0;
2648
2649
2650
2651
2652 if (event->attr.exclusive && !list_empty(get_event_list(event)))
2653 return 0;
2654
2655
2656
2657
2658 return can_add_hw;
2659}
2660
2661static void add_event_to_ctx(struct perf_event *event,
2662 struct perf_event_context *ctx)
2663{
2664 list_add_event(event, ctx);
2665 perf_group_attach(event);
2666}
2667
2668static void ctx_sched_out(struct perf_event_context *ctx,
2669 struct perf_cpu_context *cpuctx,
2670 enum event_type_t event_type);
2671static void
2672ctx_sched_in(struct perf_event_context *ctx,
2673 struct perf_cpu_context *cpuctx,
2674 enum event_type_t event_type,
2675 struct task_struct *task);
2676
2677static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2678 struct perf_event_context *ctx,
2679 enum event_type_t event_type)
2680{
2681 if (!cpuctx->task_ctx)
2682 return;
2683
2684 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2685 return;
2686
2687 ctx_sched_out(ctx, cpuctx, event_type);
2688}
2689
2690static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2691 struct perf_event_context *ctx,
2692 struct task_struct *task)
2693{
2694 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2695 if (ctx)
2696 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2697 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2698 if (ctx)
2699 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2700}
2701
/*
 * We want to maintain the following priority of scheduling:
 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
 *  - task pinned (EVENT_PINNED)
 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
 *  - task flexible (EVENT_FLEXIBLE).
 *
 * In order to avoid unscheduling and scheduling back in everything every
 * time an event is added, only do it for the groups of equal priority and
 * below.
 *
 * This can be called after a batch operation on task events, in which case
 * event_type is a bit mask of the types of events involved. For CPU events,
 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
 */
2717static void ctx_resched(struct perf_cpu_context *cpuctx,
2718 struct perf_event_context *task_ctx,
2719 enum event_type_t event_type)
2720{
2721 enum event_type_t ctx_event_type;
2722 bool cpu_event = !!(event_type & EVENT_CPU);
2723
	/*
	 * If pinned groups are involved, flexible groups also need to be
	 * scheduled out.
	 */
2728 if (event_type & EVENT_PINNED)
2729 event_type |= EVENT_FLEXIBLE;
2730
2731 ctx_event_type = event_type & EVENT_ALL;
2732
2733 perf_pmu_disable(cpuctx->ctx.pmu);
2734 if (task_ctx)
2735 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2736
	/*
	 * Decide which cpu ctx groups to schedule out based on the types
	 * of events that caused rescheduling:
	 *  - EVENT_CPU: schedule out corresponding groups;
	 *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
	 *  - otherwise, do nothing more.
	 */
2744 if (cpu_event)
2745 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2746 else if (ctx_event_type & EVENT_PINNED)
2747 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2748
2749 perf_event_sched_in(cpuctx, task_ctx, current);
2750 perf_pmu_enable(cpuctx->ctx.pmu);
2751}
2752
2753void perf_pmu_resched(struct pmu *pmu)
2754{
2755 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2756 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2757
2758 perf_ctx_lock(cpuctx, task_ctx);
2759 ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2760 perf_ctx_unlock(cpuctx, task_ctx);
2761}
2762
/*
 * Cross CPU call to install and enable a performance event.
 *
 * Very similar to remote_function() + event_function() but cannot assume that
 * things like ctx->is_active and cpuctx->task_ctx are set.
 */
2769static int __perf_install_in_context(void *info)
2770{
2771 struct perf_event *event = info;
2772 struct perf_event_context *ctx = event->ctx;
2773 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2774 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2775 bool reprogram = true;
2776 int ret = 0;
2777
2778 raw_spin_lock(&cpuctx->ctx.lock);
2779 if (ctx->task) {
2780 raw_spin_lock(&ctx->lock);
2781 task_ctx = ctx;
2782
2783 reprogram = (ctx->task == current);

		/*
		 * If the task is running, it must be running on this CPU,
		 * otherwise we cannot reprogram things.
		 *
		 * If its not running, we don't care, ctx->lock will
		 * serialize against it becoming runnable.
		 */
2792 if (task_curr(ctx->task) && !reprogram) {
2793 ret = -ESRCH;
2794 goto unlock;
2795 }
2796
2797 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2798 } else if (task_ctx) {
2799 raw_spin_lock(&task_ctx->lock);
2800 }
2801
2802#ifdef CONFIG_CGROUP_PERF
2803 if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
		/*
		 * If the current cgroup doesn't match the event's
		 * cgroup, we should not try to schedule it.
		 */
2808 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2809 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2810 event->cgrp->css.cgroup);
2811 }
2812#endif
2813
2814 if (reprogram) {
2815 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2816 add_event_to_ctx(event, ctx);
2817 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2818 } else {
2819 add_event_to_ctx(event, ctx);
2820 }
2821
2822unlock:
2823 perf_ctx_unlock(cpuctx, task_ctx);
2824
2825 return ret;
2826}
2827
2828static bool exclusive_event_installable(struct perf_event *event,
2829 struct perf_event_context *ctx);
2830
/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call, see comment there.
 */
2836static void
2837perf_install_in_context(struct perf_event_context *ctx,
2838 struct perf_event *event,
2839 int cpu)
2840{
2841 struct task_struct *task = READ_ONCE(ctx->task);
2842
2843 lockdep_assert_held(&ctx->mutex);
2844
2845 WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2846
2847 if (event->cpu != -1)
2848 event->cpu = cpu;
2849
	/*
	 * Ensures that if we can observe event->ctx, both the event and ctx
	 * will be 'complete'. See perf_iterate_sb_cpu().
	 */
	smp_store_release(&event->ctx, ctx);

	/*
	 * perf_event_attr::disabled events will not run and can be initialized
	 * without IPI. Except when this is the first event for the context, in
	 * that case we need the magic of the IPI to set ctx->is_active.
	 *
	 * The IOC_ENABLE that is sure to follow the creation of a disabled
	 * event will issue the IPI and reprogram the hardware.
	 */
2864 if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2865 raw_spin_lock_irq(&ctx->lock);
2866 if (ctx->task == TASK_TOMBSTONE) {
2867 raw_spin_unlock_irq(&ctx->lock);
2868 return;
2869 }
2870 add_event_to_ctx(event, ctx);
2871 raw_spin_unlock_irq(&ctx->lock);
2872 return;
2873 }
2874
2875 if (!task) {
2876 cpu_function_call(cpu, __perf_install_in_context, event);
2877 return;
2878 }
2879
	/*
	 * Should not happen, we validate the ctx is still alive before calling.
	 */
2883 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2884 return;
2885
	/*
	 * Installing events is tricky because we cannot rely on ctx->is_active
	 * to be set in case this is the nr_events 0 -> 1 transition.
	 *
	 * Instead we use task_curr(), which tells us if the task is running.
	 * However, since we use task_curr() outside of rq::lock, we can race
	 * against the actual state. This means the result can be wrong.
	 *
	 * If we get a false positive, we retry, this is harmless.
	 *
	 * If we get a false negative, things are complicated. If we are after
	 * perf_event_context_sched_in() ctx::lock will serialize us, and the
	 * value must be correct. If we're before, it doesn't matter since
	 * perf_event_context_sched_in() will program the counter.
	 *
	 * However, this hinges on the remote context switch having observed
	 * our task->perf_event_ctxp[] store, such that it will in fact take
	 * ctx::lock in perf_event_context_sched_in().
	 *
	 * We do this by task_function_call(), if the IPI fails to hit the task
	 * we know any future context switch of task must see the
	 * perf_event_ctxp[] store.
	 */

	/*
	 * This smp_mb() orders the task->perf_event_ctxp[] store with the
	 * task_cpu() load, such that if the IPI then does not find the task
	 * running, a future context switch of that task must observe the
	 * store.
	 */
	smp_mb();
2917again:
2918 if (!task_function_call(task, __perf_install_in_context, event))
2919 return;
2920
2921 raw_spin_lock_irq(&ctx->lock);
2922 task = ctx->task;
2923 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
		/*
		 * Cannot happen because we already checked above (which also
		 * cannot happen), and we hold ctx->mutex, which serializes us
		 * against perf_event_exit_task_context().
		 */
2929 raw_spin_unlock_irq(&ctx->lock);
2930 return;
2931 }
2932
	/*
	 * If the task is not running, ctx->lock will avoid it becoming so,
	 * thus we can safely install the event.
	 */
2936 if (task_curr(task)) {
2937 raw_spin_unlock_irq(&ctx->lock);
2938 goto again;
2939 }
2940 add_event_to_ctx(event, ctx);
2941 raw_spin_unlock_irq(&ctx->lock);
2942}
2943
/*
 * Cross CPU call to enable a performance event
 */
2947static void __perf_event_enable(struct perf_event *event,
2948 struct perf_cpu_context *cpuctx,
2949 struct perf_event_context *ctx,
2950 void *info)
2951{
2952 struct perf_event *leader = event->group_leader;
2953 struct perf_event_context *task_ctx;
2954
2955 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2956 event->state <= PERF_EVENT_STATE_ERROR)
2957 return;
2958
2959 if (ctx->is_active)
2960 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2961
2962 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2963 perf_cgroup_event_enable(event, ctx);
2964
2965 if (!ctx->is_active)
2966 return;
2967
2968 if (!event_filter_match(event)) {
2969 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2970 return;
2971 }
2972
	/*
	 * If the event is in a group and isn't the group leader,
	 * then don't put it on unless the group is on.
	 */
2977 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2978 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2979 return;
2980 }
2981
2982 task_ctx = cpuctx->task_ctx;
2983 if (ctx->task)
2984 WARN_ON_ONCE(task_ctx != ctx);
2985
2986 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2987}
2988
/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_remove_from_context.
 */
2998static void _perf_event_enable(struct perf_event *event)
2999{
3000 struct perf_event_context *ctx = event->ctx;
3001
3002 raw_spin_lock_irq(&ctx->lock);
3003 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
3004 event->state < PERF_EVENT_STATE_ERROR) {
3005out:
3006 raw_spin_unlock_irq(&ctx->lock);
3007 return;
3008 }
3009
	/*
	 * If the event is in error state, clear that first.
	 *
	 * That way, if we see the event in error state below, we know that it
	 * has gone back into error state, as distinct from the task having
	 * been scheduled away before the cross-call arrived.
	 */
	if (event->state == PERF_EVENT_STATE_ERROR) {
		/*
		 * Detached SIBLING events cannot leave ERROR state.
		 */
3021 if (event->event_caps & PERF_EV_CAP_SIBLING &&
3022 event->group_leader == event)
3023 goto out;
3024
3025 event->state = PERF_EVENT_STATE_OFF;
3026 }
3027 raw_spin_unlock_irq(&ctx->lock);
3028
3029 event_function_call(event, __perf_event_enable, NULL);
3030}
3031
/*
 * See perf_event_disable();
 */
3035void perf_event_enable(struct perf_event *event)
3036{
3037 struct perf_event_context *ctx;
3038
3039 ctx = perf_event_ctx_lock(event);
3040 _perf_event_enable(event);
3041 perf_event_ctx_unlock(event, ctx);
3042}
3043EXPORT_SYMBOL_GPL(perf_event_enable);
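
/*
 * A minimal in-kernel usage sketch (not part of the original file): create a
 * disabled cycle counter on the current CPU, then enable it through the IPI
 * path above. Error handling is abbreviated; example_count_cycles() is a
 * hypothetical caller, the perf APIs themselves are real.
 */
static struct perf_event *example_count_cycles(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.disabled	= 1,	/* start off, enable explicitly below */
	};
	struct perf_event *event;

	event = perf_event_create_kernel_counter(&attr, smp_processor_id(),
						 NULL, NULL, NULL);
	if (IS_ERR(event))
		return event;

	/* Takes the cross-call path through __perf_event_enable() above. */
	perf_event_enable(event);

	/* ... use the counter; release with perf_event_release_kernel(). */
	return event;
}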
3044
3045struct stop_event_data {
3046 struct perf_event *event;
3047 unsigned int restart;
3048};
3049
3050static int __perf_event_stop(void *info)
3051{
3052 struct stop_event_data *sd = info;
3053 struct perf_event *event = sd->event;
3054
	/* if it's already INACTIVE, do nothing */
	if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
		return 0;

	/* matches smp_wmb() in event_sched_in() */
	smp_rmb();

	/*
	 * There is a window with interrupts enabled before we get here,
	 * so we need to check again lest we try to stop another CPU's event.
	 */
	if (READ_ONCE(event->oncpu) != smp_processor_id())
		return -EAGAIN;
3068
3069 event->pmu->stop(event, PERF_EF_UPDATE);
3070
	/*
	 * May race with the actual stop (through perf_pmu_resched()),
	 * but it is only used for events with AUX ring buffer, and such
	 * events will refuse to restart because of rb::aux_mmap_count==0,
	 * see comments in perf_aux_output_begin().
	 *
	 * Since this is happening on an event-local CPU, no trace is lost
	 * while restarting.
	 */
3080 if (sd->restart)
3081 event->pmu->start(event, 0);
3082
3083 return 0;
3084}
3085
3086static int perf_event_stop(struct perf_event *event, int restart)
3087{
3088 struct stop_event_data sd = {
3089 .event = event,
3090 .restart = restart,
3091 };
3092 int ret = 0;
3093
3094 do {
3095 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3096 return 0;
3097
		/* matches smp_wmb() in event_sched_in() */
		smp_rmb();

		/*
		 * We only want to restart ACTIVE events, so if the event goes
		 * inactive here (event->oncpu==-1), there's nothing more to do;
		 * fall through with ret==-ENXIO.
		 */
3106 ret = cpu_function_call(READ_ONCE(event->oncpu),
3107 __perf_event_stop, &sd);
3108 } while (ret == -EAGAIN);
3109
3110 return ret;
3111}
3112
/*
 * In order to contain the amount of racy and tricky in the address filter
 * configuration management, it is a two part process:
 *
 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
 *      we update the addresses of corresponding vmas in
 *	event::addr_filter_ranges array and bump the event::addr_filters_gen;
 *
 * (p2) when an event is scheduled in (pmu::add), it calls
 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
 *      if (p1) happened while the event was ACTIVE, then
 *
 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
 *     ioctl;
 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
 *     registered mappings, called for every new mmap(), with mm::mmap_lock down
 *     for reading;
 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
 *     of exec.
 */
3135void perf_event_addr_filters_sync(struct perf_event *event)
3136{
3137 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3138
3139 if (!has_addr_filter(event))
3140 return;
3141
3142 raw_spin_lock(&ifh->lock);
3143 if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3144 event->pmu->addr_filters_sync(event);
3145 event->hw.addr_filters_gen = event->addr_filters_gen;
3146 }
3147 raw_spin_unlock(&ifh->lock);
3148}
3149EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
3150
3151static int _perf_event_refresh(struct perf_event *event, int refresh)
3152{
	/*
	 * not supported on inherited events
	 */
3156 if (event->attr.inherit || !is_sampling_event(event))
3157 return -EINVAL;
3158
3159 atomic_add(refresh, &event->event_limit);
3160 _perf_event_enable(event);
3161
3162 return 0;
3163}
3164
/*
 * See perf_event_disable()
 */
3168int perf_event_refresh(struct perf_event *event, int refresh)
3169{
3170 struct perf_event_context *ctx;
3171 int ret;
3172
3173 ctx = perf_event_ctx_lock(event);
3174 ret = _perf_event_refresh(event, refresh);
3175 perf_event_ctx_unlock(event, ctx);
3176
3177 return ret;
3178}
3179EXPORT_SYMBOL_GPL(perf_event_refresh);
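
/*
 * Usage sketch: perf_event_refresh() arms a sampling event for @refresh more
 * overflows by bumping event->event_limit and enabling the event. A caller
 * that wants exactly one more sample could do (hypothetical example):
 *
 *	err = perf_event_refresh(event, 1);
 *
 * after which the event disables itself again once the overflow fires and
 * the limit drops back to zero.
 */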
3180
3181static int perf_event_modify_breakpoint(struct perf_event *bp,
3182 struct perf_event_attr *attr)
3183{
3184 int err;
3185
3186 _perf_event_disable(bp);
3187
3188 err = modify_user_hw_breakpoint_check(bp, attr, true);
3189
3190 if (!bp->attr.disabled)
3191 _perf_event_enable(bp);
3192
3193 return err;
3194}
3195
3196static int perf_event_modify_attr(struct perf_event *event,
3197 struct perf_event_attr *attr)
3198{
3199 int (*func)(struct perf_event *, struct perf_event_attr *);
3200 struct perf_event *child;
3201 int err;
3202
3203 if (event->attr.type != attr->type)
3204 return -EINVAL;
3205
3206 switch (event->attr.type) {
3207 case PERF_TYPE_BREAKPOINT:
3208 func = perf_event_modify_breakpoint;
3209 break;
3210 default:
		/* Place holder for future additions. */
3212 return -EOPNOTSUPP;
3213 }
3214
3215 WARN_ON_ONCE(event->ctx->parent_ctx);
3216
3217 mutex_lock(&event->child_mutex);
3218 err = func(event, attr);
3219 if (err)
3220 goto out;
3221 list_for_each_entry(child, &event->child_list, child_list) {
3222 err = func(child, attr);
3223 if (err)
3224 goto out;
3225 }
3226out:
3227 mutex_unlock(&event->child_mutex);
3228 return err;
3229}
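
/*
 * Userspace reaches perf_event_modify_attr() through the
 * PERF_EVENT_IOC_MODIFY_ATTRIBUTES ioctl. A minimal sketch (userspace side,
 * error handling elided) that moves an existing hardware breakpoint to a new
 * address might look like:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_BREAKPOINT,
 *		.size		= sizeof(attr),
 *		.bp_type	= HW_BREAKPOINT_W,
 *		.bp_addr	= new_addr,	// only bp_* fields may change
 *		.bp_len		= HW_BREAKPOINT_LEN_4,
 *	};
 *	ioctl(event_fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &attr);
 */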
3230
3231static void ctx_sched_out(struct perf_event_context *ctx,
3232 struct perf_cpu_context *cpuctx,
3233 enum event_type_t event_type)
3234{
3235 struct perf_event *event, *tmp;
3236 int is_active = ctx->is_active;
3237
3238 lockdep_assert_held(&ctx->lock);
3239
3240 if (likely(!ctx->nr_events)) {
		/*
		 * See __perf_remove_from_context().
		 */
3244 WARN_ON_ONCE(ctx->is_active);
3245 if (ctx->task)
3246 WARN_ON_ONCE(cpuctx->task_ctx);
3247 return;
3248 }
3249
3250 ctx->is_active &= ~event_type;
3251 if (!(ctx->is_active & EVENT_ALL))
3252 ctx->is_active = 0;
3253
3254 if (ctx->task) {
3255 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3256 if (!ctx->is_active)
3257 cpuctx->task_ctx = NULL;
3258 }
3259
	/*
	 * Always update time if it was set; not only when it changes.
	 * Otherwise we can 'forget' to update time for any but the last
	 * context we sched out. For example:
	 *
	 *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
	 *   ctx_sched_out(.event_type = EVENT_PINNED)
	 *
	 * would only update time for the pinned events.
	 */
3270 if (is_active & EVENT_TIME) {
3271
3272 update_context_time(ctx);
3273 update_cgrp_time_from_cpuctx(cpuctx);
3274 }
3275
3276 is_active ^= ctx->is_active;
3277
3278 if (!ctx->nr_active || !(is_active & EVENT_ALL))
3279 return;
3280
3281 perf_pmu_disable(ctx->pmu);
3282 if (is_active & EVENT_PINNED) {
3283 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3284 group_sched_out(event, cpuctx, ctx);
3285 }
3286
3287 if (is_active & EVENT_FLEXIBLE) {
3288 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3289 group_sched_out(event, cpuctx, ctx);
3290
		/*
		 * Since we cleared EVENT_FLEXIBLE, also clear
		 * rotate_necessary, it will be reset by
		 * ctx_flexible_sched_in() when needed.
		 */
3296 ctx->rotate_necessary = 0;
3297 }
3298 perf_pmu_enable(ctx->pmu);
3299}
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309static int context_equiv(struct perf_event_context *ctx1,
3310 struct perf_event_context *ctx2)
3311{
3312 lockdep_assert_held(&ctx1->lock);
3313 lockdep_assert_held(&ctx2->lock);
3314
3315
3316 if (ctx1->pin_count || ctx2->pin_count)
3317 return 0;
3318
3319
3320 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3321 return 1;
3322
3323
3324 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3325 return 1;
3326
3327
3328
3329
3330
3331 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3332 ctx1->parent_gen == ctx2->parent_gen)
3333 return 1;
3334
3335
3336 return 0;
3337}
3338
3339static void __perf_event_sync_stat(struct perf_event *event,
3340 struct perf_event *next_event)
3341{
3342 u64 value;
3343
3344 if (!event->attr.inherit_stat)
3345 return;
3346
3347
3348
3349
3350
3351
3352
3353
3354 if (event->state == PERF_EVENT_STATE_ACTIVE)
3355 event->pmu->read(event);
3356
3357 perf_event_update_time(event);
3358
3359
3360
3361
3362
3363 value = local64_read(&next_event->count);
3364 value = local64_xchg(&event->count, value);
3365 local64_set(&next_event->count, value);
3366
3367 swap(event->total_time_enabled, next_event->total_time_enabled);
3368 swap(event->total_time_running, next_event->total_time_running);
3369
3370
3371
3372
3373 perf_event_update_userpage(event);
3374 perf_event_update_userpage(next_event);
3375}
3376
3377static void perf_event_sync_stat(struct perf_event_context *ctx,
3378 struct perf_event_context *next_ctx)
3379{
3380 struct perf_event *event, *next_event;
3381
3382 if (!ctx->nr_stat)
3383 return;
3384
3385 update_context_time(ctx);
3386
3387 event = list_first_entry(&ctx->event_list,
3388 struct perf_event, event_entry);
3389
3390 next_event = list_first_entry(&next_ctx->event_list,
3391 struct perf_event, event_entry);
3392
3393 while (&event->event_entry != &ctx->event_list &&
3394 &next_event->event_entry != &next_ctx->event_list) {
3395
3396 __perf_event_sync_stat(event, next_event);
3397
3398 event = list_next_entry(event, event_entry);
3399 next_event = list_next_entry(next_event, event_entry);
3400 }
3401}
3402
3403static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3404 struct task_struct *next)
3405{
3406 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3407 struct perf_event_context *next_ctx;
3408 struct perf_event_context *parent, *next_parent;
3409 struct perf_cpu_context *cpuctx;
3410 int do_switch = 1;
3411 struct pmu *pmu;
3412
3413 if (likely(!ctx))
3414 return;
3415
3416 pmu = ctx->pmu;
3417 cpuctx = __get_cpu_context(ctx);
3418 if (!cpuctx->task_ctx)
3419 return;
3420
3421 rcu_read_lock();
3422 next_ctx = next->perf_event_ctxp[ctxn];
3423 if (!next_ctx)
3424 goto unlock;
3425
3426 parent = rcu_dereference(ctx->parent_ctx);
3427 next_parent = rcu_dereference(next_ctx->parent_ctx);
3428
3429
3430 if (!parent && !next_parent)
3431 goto unlock;
3432
3433 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
		/*
		 * Looks like the two contexts are clones, so we might be
		 * able to optimize the context switch.  We lock both
		 * contexts and check that they are clones under the
		 * lock (including re-checking that neither context has any
		 * events by the time of the lock).
		 */
3443 raw_spin_lock(&ctx->lock);
3444 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3445 if (context_equiv(ctx, next_ctx)) {
3446
3447 WRITE_ONCE(ctx->task, next);
3448 WRITE_ONCE(next_ctx->task, task);
3449
3450 perf_pmu_disable(pmu);
3451
3452 if (cpuctx->sched_cb_usage && pmu->sched_task)
3453 pmu->sched_task(ctx, false);
3454
			/*
			 * PMU specific parts of task perf context can require
			 * additional synchronization. As an example of such
			 * synchronization see implementation details of Intel
			 * LBR call stack data profiling;
			 */
3461 if (pmu->swap_task_ctx)
3462 pmu->swap_task_ctx(ctx, next_ctx);
3463 else
3464 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3465
3466 perf_pmu_enable(pmu);
3467
			/*
			 * RCU_INIT_POINTER here is safe because we've not
			 * modified the ctx and the above modification of
			 * ctx->task and ctx->task_ctx_data are both done
			 * under ctx->lock which we're now holding.
			 */
3475 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3476 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3477
3478 do_switch = 0;
3479
3480 perf_event_sync_stat(ctx, next_ctx);
3481 }
3482 raw_spin_unlock(&next_ctx->lock);
3483 raw_spin_unlock(&ctx->lock);
3484 }
3485unlock:
3486 rcu_read_unlock();
3487
3488 if (do_switch) {
3489 raw_spin_lock(&ctx->lock);
3490 perf_pmu_disable(pmu);
3491
3492 if (cpuctx->sched_cb_usage && pmu->sched_task)
3493 pmu->sched_task(ctx, false);
3494 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3495
3496 perf_pmu_enable(pmu);
3497 raw_spin_unlock(&ctx->lock);
3498 }
3499}
3500
3501static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3502
3503void perf_sched_cb_dec(struct pmu *pmu)
3504{
3505 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3506
3507 this_cpu_dec(perf_sched_cb_usages);
3508
3509 if (!--cpuctx->sched_cb_usage)
3510 list_del(&cpuctx->sched_cb_entry);
3511}
3512
3513
3514void perf_sched_cb_inc(struct pmu *pmu)
3515{
3516 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3517
3518 if (!cpuctx->sched_cb_usage++)
3519 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3520
3521 this_cpu_inc(perf_sched_cb_usages);
3522}
3523
/*
 * This function provides the context switch callback to the lower code
 * layer. It is invoked ONLY when the context switch callback is enabled.
 *
 * This callback is relevant even to per-cpu events; for example multi event
 * PEBS requires this to provide PID/TID information. This requires we flush
 * all queued PEBS records before we context switch to a new task.
 */
3532static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3533{
3534 struct pmu *pmu;
3535
3536 pmu = cpuctx->ctx.pmu;
3537
3538 if (WARN_ON_ONCE(!pmu->sched_task))
3539 return;
3540
3541 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3542 perf_pmu_disable(pmu);
3543
3544 pmu->sched_task(cpuctx->task_ctx, sched_in);
3545
3546 perf_pmu_enable(pmu);
3547 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3548}
3549
3550static void perf_pmu_sched_task(struct task_struct *prev,
3551 struct task_struct *next,
3552 bool sched_in)
3553{
3554 struct perf_cpu_context *cpuctx;
3555
3556 if (prev == next)
3557 return;
3558
3559 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3560
3561 if (cpuctx->task_ctx)
3562 continue;
3563
3564 __perf_pmu_sched_task(cpuctx, sched_in);
3565 }
3566}
3567
3568static void perf_event_switch(struct task_struct *task,
3569 struct task_struct *next_prev, bool sched_in);
3570
3571#define for_each_task_context_nr(ctxn) \
3572 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3573
/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * not restart the event.
 */
3585void __perf_event_task_sched_out(struct task_struct *task,
3586 struct task_struct *next)
3587{
3588 int ctxn;
3589
3590 if (__this_cpu_read(perf_sched_cb_usages))
3591 perf_pmu_sched_task(task, next, false);
3592
3593 if (atomic_read(&nr_switch_events))
3594 perf_event_switch(task, next, false);
3595
3596 for_each_task_context_nr(ctxn)
3597 perf_event_context_sched_out(task, ctxn, next);
3598
	/*
	 * if cgroup events exist on this CPU, then we need
	 * to check if we have to switch out PMU state.
	 * cgroup event are system-wide mode only
	 */
3604 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3605 perf_cgroup_sched_out(task, next);
3606}
3607
3608
3609
3610
3611static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3612 enum event_type_t event_type)
3613{
3614 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3615}
3616
3617static bool perf_less_group_idx(const void *l, const void *r)
3618{
3619 const struct perf_event *le = *(const struct perf_event **)l;
3620 const struct perf_event *re = *(const struct perf_event **)r;
3621
3622 return le->group_index < re->group_index;
3623}
3624
3625static void swap_ptr(void *l, void *r)
3626{
3627 void **lp = l, **rp = r;
3628
3629 swap(*lp, *rp);
3630}
3631
3632static const struct min_heap_callbacks perf_min_heap = {
3633 .elem_size = sizeof(struct perf_event *),
3634 .less = perf_less_group_idx,
3635 .swp = swap_ptr,
3636};
3637
3638static void __heap_add(struct min_heap *heap, struct perf_event *event)
3639{
3640 struct perf_event **itrs = heap->data;
3641
3642 if (event) {
3643 itrs[heap->nr] = event;
3644 heap->nr++;
3645 }
3646}
3647
3648static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3649 struct perf_event_groups *groups, int cpu,
3650 int (*func)(struct perf_event *, void *),
3651 void *data)
3652{
3653#ifdef CONFIG_CGROUP_PERF
3654 struct cgroup_subsys_state *css = NULL;
3655#endif
3656
3657 struct perf_event *itrs[2];
3658 struct min_heap event_heap;
3659 struct perf_event **evt;
3660 int ret;
3661
3662 if (cpuctx) {
3663 event_heap = (struct min_heap){
3664 .data = cpuctx->heap,
3665 .nr = 0,
3666 .size = cpuctx->heap_size,
3667 };
3668
3669 lockdep_assert_held(&cpuctx->ctx.lock);
3670
3671#ifdef CONFIG_CGROUP_PERF
3672 if (cpuctx->cgrp)
3673 css = &cpuctx->cgrp->css;
3674#endif
3675 } else {
3676 event_heap = (struct min_heap){
3677 .data = itrs,
3678 .nr = 0,
3679 .size = ARRAY_SIZE(itrs),
3680 };
3681
3682 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3683 }
3684 evt = event_heap.data;
3685
3686 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3687
3688#ifdef CONFIG_CGROUP_PERF
3689 for (; css; css = css->parent)
3690 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3691#endif
3692
3693 min_heapify_all(&event_heap, &perf_min_heap);
3694
3695 while (event_heap.nr) {
3696 ret = func(*evt, data);
3697 if (ret)
3698 return ret;
3699
3700 *evt = perf_event_groups_next(*evt);
3701 if (*evt)
3702 min_heapify(&event_heap, 0, &perf_min_heap);
3703 else
3704 min_heap_pop(&event_heap, &perf_min_heap);
3705 }
3706
3707 return 0;
3708}
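
/*
 * Illustrative only (hypothetical "example_*" names): the same min_heap
 * machinery from linux/min_heap.h, already included above, applied to plain
 * integers. visit_groups_merge() uses exactly this heapify/pop pattern to do
 * a k-way merge of the per-cpu, any-cpu and per-cgroup group trees.
 */
static bool example_less_int(const void *l, const void *r)
{
	return *(const int *)l < *(const int *)r;
}

static void example_swap_int(void *l, void *r)
{
	swap(*(int *)l, *(int *)r);
}

static void example_consume_in_order(void)
{
	int storage[3] = { 3, 1, 2 };
	struct min_heap heap = {
		.data = storage,
		.nr = 3,
		.size = 3,
	};
	const struct min_heap_callbacks cb = {
		.elem_size = sizeof(int),
		.less = example_less_int,
		.swp = example_swap_int,
	};

	min_heapify_all(&heap, &cb);
	/* storage[0] is now the smallest element; pop to visit in order. */
	while (heap.nr)
		min_heap_pop(&heap, &cb);
}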
3709
3710static inline bool event_update_userpage(struct perf_event *event)
3711{
3712 if (likely(!atomic_read(&event->mmap_count)))
3713 return false;
3714
3715 perf_event_update_time(event);
3716 perf_set_shadow_time(event, event->ctx);
3717 perf_event_update_userpage(event);
3718
3719 return true;
3720}
3721
3722static inline void group_update_userpage(struct perf_event *group_event)
3723{
3724 struct perf_event *event;
3725
3726 if (!event_update_userpage(group_event))
3727 return;
3728
3729 for_each_sibling_event(event, group_event)
3730 event_update_userpage(event);
3731}
3732
3733static int merge_sched_in(struct perf_event *event, void *data)
3734{
3735 struct perf_event_context *ctx = event->ctx;
3736 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3737 int *can_add_hw = data;
3738
3739 if (event->state <= PERF_EVENT_STATE_OFF)
3740 return 0;
3741
3742 if (!event_filter_match(event))
3743 return 0;
3744
3745 if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3746 if (!group_sched_in(event, cpuctx, ctx))
3747 list_add_tail(&event->active_list, get_event_list(event));
3748 }
3749
3750 if (event->state == PERF_EVENT_STATE_INACTIVE) {
3751 *can_add_hw = 0;
3752 if (event->attr.pinned) {
3753 perf_cgroup_event_disable(event, ctx);
3754 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3755 } else {
3756 ctx->rotate_necessary = 1;
3757 perf_mux_hrtimer_restart(cpuctx);
3758 group_update_userpage(event);
3759 }
3760 }
3761
3762 return 0;
3763}
3764
3765static void
3766ctx_pinned_sched_in(struct perf_event_context *ctx,
3767 struct perf_cpu_context *cpuctx)
3768{
3769 int can_add_hw = 1;
3770
3771 if (ctx != &cpuctx->ctx)
3772 cpuctx = NULL;
3773
3774 visit_groups_merge(cpuctx, &ctx->pinned_groups,
3775 smp_processor_id(),
3776 merge_sched_in, &can_add_hw);
3777}
3778
3779static void
3780ctx_flexible_sched_in(struct perf_event_context *ctx,
3781 struct perf_cpu_context *cpuctx)
3782{
3783 int can_add_hw = 1;
3784
3785 if (ctx != &cpuctx->ctx)
3786 cpuctx = NULL;
3787
3788 visit_groups_merge(cpuctx, &ctx->flexible_groups,
3789 smp_processor_id(),
3790 merge_sched_in, &can_add_hw);
3791}
3792
3793static void
3794ctx_sched_in(struct perf_event_context *ctx,
3795 struct perf_cpu_context *cpuctx,
3796 enum event_type_t event_type,
3797 struct task_struct *task)
3798{
3799 int is_active = ctx->is_active;
3800 u64 now;
3801
3802 lockdep_assert_held(&ctx->lock);
3803
3804 if (likely(!ctx->nr_events))
3805 return;
3806
3807 ctx->is_active |= (event_type | EVENT_TIME);
3808 if (ctx->task) {
3809 if (!is_active)
3810 cpuctx->task_ctx = ctx;
3811 else
3812 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3813 }
3814
3815 is_active ^= ctx->is_active;
3816
3817 if (is_active & EVENT_TIME) {
3818
3819 now = perf_clock();
3820 ctx->timestamp = now;
3821 perf_cgroup_set_timestamp(task, ctx);
3822 }
3823
	/*
	 * First go through the list and put on any pinned groups
	 * in order to give them the best chance of going on.
	 */
	if (is_active & EVENT_PINNED)
		ctx_pinned_sched_in(ctx, cpuctx);

	/* Then walk through the lower prio flexible groups */
	if (is_active & EVENT_FLEXIBLE)
		ctx_flexible_sched_in(ctx, cpuctx);
3834}
3835
3836static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3837 enum event_type_t event_type,
3838 struct task_struct *task)
3839{
3840 struct perf_event_context *ctx = &cpuctx->ctx;
3841
3842 ctx_sched_in(ctx, cpuctx, event_type, task);
3843}
3844
3845static void perf_event_context_sched_in(struct perf_event_context *ctx,
3846 struct task_struct *task)
3847{
3848 struct perf_cpu_context *cpuctx;
3849 struct pmu *pmu;
3850
3851 cpuctx = __get_cpu_context(ctx);
3852
	/*
	 * HACK: for HETEROGENEOUS the task context might have switched to a
	 * different PMU, force (re)set the context,
	 */
3857 pmu = ctx->pmu = cpuctx->ctx.pmu;
3858
3859 if (cpuctx->task_ctx == ctx) {
3860 if (cpuctx->sched_cb_usage)
3861 __perf_pmu_sched_task(cpuctx, true);
3862 return;
3863 }
3864
3865 perf_ctx_lock(cpuctx, ctx);
	/*
	 * We must check ctx->nr_events while holding ctx->lock, such
	 * that we serialize against perf_install_in_context().
	 */
3870 if (!ctx->nr_events)
3871 goto unlock;
3872
3873 perf_pmu_disable(pmu);
3874
	/*
	 * We want to keep the following priority order:
	 * cpu pinned (that don't need to move), task pinned,
	 * cpu flexible, task flexible.
	 *
	 * However, if task's ctx is not carrying any pinned
	 * events, no need to flip the cpuctx's events around.
	 */
3882 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3883 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3884 perf_event_sched_in(cpuctx, ctx, task);
3885
3886 if (cpuctx->sched_cb_usage && pmu->sched_task)
3887 pmu->sched_task(cpuctx->task_ctx, true);
3888
3889 perf_pmu_enable(pmu);
3890
3891unlock:
3892 perf_ctx_unlock(cpuctx, ctx);
3893}
3894
/*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * keep the event running.
 */
3906void __perf_event_task_sched_in(struct task_struct *prev,
3907 struct task_struct *task)
3908{
3909 struct perf_event_context *ctx;
3910 int ctxn;
3911
	/*
	 * If cgroup events exist on this CPU, then we need to check if we have
	 * to switch in PMU state; cgroup event are system-wide mode only.
	 *
	 * Since cgroup events are CPU events, we must schedule these in before
	 * we schedule in the task events.
	 */
3919 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3920 perf_cgroup_sched_in(prev, task);
3921
3922 for_each_task_context_nr(ctxn) {
3923 ctx = task->perf_event_ctxp[ctxn];
3924 if (likely(!ctx))
3925 continue;
3926
3927 perf_event_context_sched_in(ctx, task);
3928 }
3929
3930 if (atomic_read(&nr_switch_events))
3931 perf_event_switch(task, prev, true);
3932
3933 if (__this_cpu_read(perf_sched_cb_usages))
3934 perf_pmu_sched_task(prev, task, true);
3935}
3936
3937static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3938{
3939 u64 frequency = event->attr.sample_freq;
3940 u64 sec = NSEC_PER_SEC;
3941 u64 divisor, dividend;
3942
3943 int count_fls, nsec_fls, frequency_fls, sec_fls;
3944
3945 count_fls = fls64(count);
3946 nsec_fls = fls64(nsec);
3947 frequency_fls = fls64(frequency);
3948 sec_fls = 30;
3949
	/*
	 * We got @count in @nsec, with a target of sample_freq HZ
	 * the target period becomes:
	 *
	 *             @count * 10^9
	 * period = -------------------
	 *          @nsec * sample_freq
	 *
	 * To avoid 64-bit math we use simple shifts: the fls64() values
	 * above bound each term, and REDUCE_FLS() below halves terms
	 * until the intermediate products fit in a u64.
	 */
3964#define REDUCE_FLS(a, b) \
3965do { \
3966 if (a##_fls > b##_fls) { \
3967 a >>= 1; \
3968 a##_fls--; \
3969 } else { \
3970 b >>= 1; \
3971 b##_fls--; \
3972 } \
3973} while (0)
3974
	/*
	 * Reduce accuracy until either term fits in a u64, then proceed with
	 * the larger of the two.
	 */
3979 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3980 REDUCE_FLS(nsec, frequency);
3981 REDUCE_FLS(sec, count);
3982 }
3983
3984 if (count_fls + sec_fls > 64) {
3985 divisor = nsec * frequency;
3986
3987 while (count_fls + sec_fls > 64) {
3988 REDUCE_FLS(count, sec);
3989 divisor >>= 1;
3990 }
3991
3992 dividend = count * sec;
3993 } else {
3994 dividend = count * sec;
3995
3996 while (nsec_fls + frequency_fls > 64) {
3997 REDUCE_FLS(nsec, frequency);
3998 dividend >>= 1;
3999 }
4000
4001 divisor = nsec * frequency;
4002 }
4003
4004 if (!divisor)
4005 return dividend;
4006
4007 return div64_u64(dividend, divisor);
4008}
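
/*
 * Worked example for perf_calculate_period(): suppose sample_freq = 1000 Hz
 * and the event counted 1,000,000 events (@count) in 10,000,000 ns (@nsec).
 * The event rate is 10^8 events/sec, so to get 1000 samples/sec the period
 * must be:
 *
 *	period = count * NSEC_PER_SEC / (nsec * sample_freq)
 *	       = 10^6 * 10^9 / (10^7 * 10^3)
 *	       = 100000 events per sample.
 */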
4009
4010static DEFINE_PER_CPU(int, perf_throttled_count);
4011static DEFINE_PER_CPU(u64, perf_throttled_seq);
4012
4013static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
4014{
4015 struct hw_perf_event *hwc = &event->hw;
4016 s64 period, sample_period;
4017 s64 delta;
4018
4019 period = perf_calculate_period(event, nsec, count);
4020
4021 delta = (s64)(period - hwc->sample_period);
4022 delta = (delta + 7) / 8;
4023
4024 sample_period = hwc->sample_period + delta;
4025
4026 if (!sample_period)
4027 sample_period = 1;
4028
4029 hwc->sample_period = sample_period;
4030
4031 if (local64_read(&hwc->period_left) > 8*sample_period) {
4032 if (disable)
4033 event->pmu->stop(event, PERF_EF_UPDATE);
4034
4035 local64_set(&hwc->period_left, 0);
4036
4037 if (disable)
4038 event->pmu->start(event, PERF_EF_RELOAD);
4039 }
4040}
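
/*
 * Worked example for the damping above: with a current sample_period of
 * 100000 and a newly computed target period of 180000, delta is 80000 and
 * only (delta + 7) / 8 = 10000 is applied, giving 110000. Repeated ticks
 * converge geometrically on the target instead of oscillating around it.
 */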
4041
/*
 * combine freq adjustment with unthrottling to avoid two passes over the
 * events. At the same time, make sure, having freq events does not change
 * the rate of unthrottling as that would introduce bias.
 */
4047static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
4048 int needs_unthr)
4049{
4050 struct perf_event *event;
4051 struct hw_perf_event *hwc;
4052 u64 now, period = TICK_NSEC;
4053 s64 delta;
4054
	/*
	 * only need to iterate over all events iff:
	 * - context have events in frequency mode (needs freq adjust)
	 * - there are events to unthrottle on this cpu
	 */
4060 if (!(ctx->nr_freq || needs_unthr))
4061 return;
4062
4063 raw_spin_lock(&ctx->lock);
4064 perf_pmu_disable(ctx->pmu);
4065
4066 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4067 if (event->state != PERF_EVENT_STATE_ACTIVE)
4068 continue;
4069
4070 if (!event_filter_match(event))
4071 continue;
4072
4073 perf_pmu_disable(event->pmu);
4074
4075 hwc = &event->hw;
4076
4077 if (hwc->interrupts == MAX_INTERRUPTS) {
4078 hwc->interrupts = 0;
4079 perf_log_throttle(event, 1);
4080 event->pmu->start(event, 0);
4081 }
4082
4083 if (!event->attr.freq || !event->attr.sample_freq)
4084 goto next;
4085
		/*
		 * stop the event and update event->count
		 */
4089 event->pmu->stop(event, PERF_EF_UPDATE);
4090
4091 now = local64_read(&event->count);
4092 delta = now - hwc->freq_count_stamp;
4093 hwc->freq_count_stamp = now;
4094
		/*
		 * restart the event
		 * reload only if value has changed
		 * we have stopped the event so tell that
		 * to perf_adjust_period() to avoid stopping it
		 * twice.
		 */
4102 if (delta > 0)
4103 perf_adjust_period(event, period, delta, false);
4104
4105 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4106 next:
4107 perf_pmu_enable(event->pmu);
4108 }
4109
4110 perf_pmu_enable(ctx->pmu);
4111 raw_spin_unlock(&ctx->lock);
4112}
4113
/*
 * Move @event to the tail of the @ctx's eligible events.
 */
4117static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4118{
	/*
	 * Rotate the first entry last of non-pinned groups. Rotation might be
	 * disabled by the inheritance code.
	 */
4123 if (ctx->rotate_disable)
4124 return;
4125
4126 perf_event_groups_delete(&ctx->flexible_groups, event);
4127 perf_event_groups_insert(&ctx->flexible_groups, event);
4128}
4129
4130
4131static inline struct perf_event *
4132ctx_event_to_rotate(struct perf_event_context *ctx)
4133{
4134 struct perf_event *event;
4135
4136
4137 event = list_first_entry_or_null(&ctx->flexible_active,
4138 struct perf_event, active_list);
4139
4140
4141 if (!event) {
4142 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4143 typeof(*event), group_node);
4144 }
4145
4146
4147
4148
4149
4150 ctx->rotate_necessary = 0;
4151
4152 return event;
4153}
4154
4155static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
4156{
4157 struct perf_event *cpu_event = NULL, *task_event = NULL;
4158 struct perf_event_context *task_ctx = NULL;
4159 int cpu_rotate, task_rotate;
4160
4161
4162
4163
4164
4165
4166 cpu_rotate = cpuctx->ctx.rotate_necessary;
4167 task_ctx = cpuctx->task_ctx;
4168 task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
4169
4170 if (!(cpu_rotate || task_rotate))
4171 return false;
4172
4173 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4174 perf_pmu_disable(cpuctx->ctx.pmu);
4175
4176 if (task_rotate)
4177 task_event = ctx_event_to_rotate(task_ctx);
4178 if (cpu_rotate)
4179 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
4180
4181
4182
4183
4184
4185 if (task_event || (task_ctx && cpu_event))
4186 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
4187 if (cpu_event)
4188 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
4189
4190 if (task_event)
4191 rotate_ctx(task_ctx, task_event);
4192 if (cpu_event)
4193 rotate_ctx(&cpuctx->ctx, cpu_event);
4194
4195 perf_event_sched_in(cpuctx, task_ctx, current);
4196
4197 perf_pmu_enable(cpuctx->ctx.pmu);
4198 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4199
4200 return true;
4201}
4202
4203void perf_event_task_tick(void)
4204{
4205 struct list_head *head = this_cpu_ptr(&active_ctx_list);
4206 struct perf_event_context *ctx, *tmp;
4207 int throttled;
4208
4209 lockdep_assert_irqs_disabled();
4210
4211 __this_cpu_inc(perf_throttled_seq);
4212 throttled = __this_cpu_xchg(perf_throttled_count, 0);
4213 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4214
4215 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
4216 perf_adjust_freq_unthr_context(ctx, throttled);
4217}
4218
4219static int event_enable_on_exec(struct perf_event *event,
4220 struct perf_event_context *ctx)
4221{
4222 if (!event->attr.enable_on_exec)
4223 return 0;
4224
4225 event->attr.enable_on_exec = 0;
4226 if (event->state >= PERF_EVENT_STATE_INACTIVE)
4227 return 0;
4228
4229 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4230
4231 return 1;
4232}
4233
4234
4235
4236
4237
4238static void perf_event_enable_on_exec(int ctxn)
4239{
4240 struct perf_event_context *ctx, *clone_ctx = NULL;
4241 enum event_type_t event_type = 0;
4242 struct perf_cpu_context *cpuctx;
4243 struct perf_event *event;
4244 unsigned long flags;
4245 int enabled = 0;
4246
4247 local_irq_save(flags);
4248 ctx = current->perf_event_ctxp[ctxn];
4249 if (!ctx || !ctx->nr_events)
4250 goto out;
4251
4252 cpuctx = __get_cpu_context(ctx);
4253 perf_ctx_lock(cpuctx, ctx);
4254 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
4255 list_for_each_entry(event, &ctx->event_list, event_entry) {
4256 enabled |= event_enable_on_exec(event, ctx);
4257 event_type |= get_event_type(event);
4258 }
4259
4260
4261
4262
4263 if (enabled) {
4264 clone_ctx = unclone_ctx(ctx);
4265 ctx_resched(cpuctx, ctx, event_type);
4266 } else {
4267 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
4268 }
4269 perf_ctx_unlock(cpuctx, ctx);
4270
4271out:
4272 local_irq_restore(flags);
4273
4274 if (clone_ctx)
4275 put_ctx(clone_ctx);
4276}
4277
4278static void perf_remove_from_owner(struct perf_event *event);
4279static void perf_event_exit_event(struct perf_event *event,
4280 struct perf_event_context *ctx);
4281
4282
4283
4284
4285
4286static void perf_event_remove_on_exec(int ctxn)
4287{
4288 struct perf_event_context *ctx, *clone_ctx = NULL;
4289 struct perf_event *event, *next;
4290 LIST_HEAD(free_list);
4291 unsigned long flags;
4292 bool modified = false;
4293
4294 ctx = perf_pin_task_context(current, ctxn);
4295 if (!ctx)
4296 return;
4297
4298 mutex_lock(&ctx->mutex);
4299
4300 if (WARN_ON_ONCE(ctx->task != current))
4301 goto unlock;
4302
4303 list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
4304 if (!event->attr.remove_on_exec)
4305 continue;
4306
4307 if (!is_kernel_event(event))
4308 perf_remove_from_owner(event);
4309
4310 modified = true;
4311
4312 perf_event_exit_event(event, ctx);
4313 }
4314
4315 raw_spin_lock_irqsave(&ctx->lock, flags);
4316 if (modified)
4317 clone_ctx = unclone_ctx(ctx);
4318 --ctx->pin_count;
4319 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4320
4321unlock:
4322 mutex_unlock(&ctx->mutex);
4323
4324 put_ctx(ctx);
4325 if (clone_ctx)
4326 put_ctx(clone_ctx);
4327}
4328
4329struct perf_read_data {
4330 struct perf_event *event;
4331 bool group;
4332 int ret;
4333};
4334
4335static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4336{
4337 u16 local_pkg, event_pkg;
4338
4339 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4340 int local_cpu = smp_processor_id();
4341
4342 event_pkg = topology_physical_package_id(event_cpu);
4343 local_pkg = topology_physical_package_id(local_cpu);
4344
4345 if (event_pkg == local_pkg)
4346 return local_cpu;
4347 }
4348
4349 return event_cpu;
4350}
4351
4352
4353
4354
4355static void __perf_event_read(void *info)
4356{
4357 struct perf_read_data *data = info;
4358 struct perf_event *sub, *event = data->event;
4359 struct perf_event_context *ctx = event->ctx;
4360 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4361 struct pmu *pmu = event->pmu;
4362
	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu.  If not it has been
	 * scheduled out before the smp call arrived.  In that case
	 * event->count would have been updated to a recent sample
	 * when the event was scheduled out.
	 */
4370 if (ctx->task && cpuctx->task_ctx != ctx)
4371 return;
4372
4373 raw_spin_lock(&ctx->lock);
4374 if (ctx->is_active & EVENT_TIME) {
4375 update_context_time(ctx);
4376 update_cgrp_time_from_event(event);
4377 }
4378
4379 perf_event_update_time(event);
4380 if (data->group)
4381 perf_event_update_sibling_time(event);
4382
4383 if (event->state != PERF_EVENT_STATE_ACTIVE)
4384 goto unlock;
4385
4386 if (!data->group) {
4387 pmu->read(event);
4388 data->ret = 0;
4389 goto unlock;
4390 }
4391
4392 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4393
4394 pmu->read(event);
4395
4396 for_each_sibling_event(sub, event) {
4397 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
			/*
			 * Use sibling's PMU rather than @event's since
			 * sibling could be on different (eg: software) PMU.
			 */
4402 sub->pmu->read(sub);
4403 }
4404 }
4405
4406 data->ret = pmu->commit_txn(pmu);
4407
4408unlock:
4409 raw_spin_unlock(&ctx->lock);
4410}
4411
4412static inline u64 perf_event_count(struct perf_event *event)
4413{
4414 return local64_read(&event->count) + atomic64_read(&event->child_count);
4415}
4416
/*
 * NMI-safe method to read a local event, that is an event that
 * is:
 *   - either for the current task, or for this CPU
 *   - does not have inherit set, for inherited task events
 *     will not be local and we cannot read them atomically
 *   - must not have a pmu::count method
 */
4425int perf_event_read_local(struct perf_event *event, u64 *value,
4426 u64 *enabled, u64 *running)
4427{
4428 unsigned long flags;
4429 int ret = 0;
4430
	/*
	 * Disabling interrupts avoids all counter scheduling (context
	 * switches, timer based rotation and IPIs).
	 */
4435 local_irq_save(flags);
4436
	/*
	 * It must not be an event with inherit set, we cannot read
	 * all child counters from atomic context.
	 */
4441 if (event->attr.inherit) {
4442 ret = -EOPNOTSUPP;
4443 goto out;
4444 }
4445
	/* If this is a per-task event, it must be for current */
4447 if ((event->attach_state & PERF_ATTACH_TASK) &&
4448 event->hw.target != current) {
4449 ret = -EINVAL;
4450 goto out;
4451 }
4452
	/* If this is a per-CPU event, it must be for this CPU */
4454 if (!(event->attach_state & PERF_ATTACH_TASK) &&
4455 event->cpu != smp_processor_id()) {
4456 ret = -EINVAL;
4457 goto out;
4458 }
4459
	/* If this is a pinned event it must be running on this CPU */
4461 if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4462 ret = -EBUSY;
4463 goto out;
4464 }
4465
	/*
	 * If the event is currently on this CPU, its either a per-task event,
	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
	 * oncpu == -1).
	 */
4471 if (event->oncpu == smp_processor_id())
4472 event->pmu->read(event);
4473
4474 *value = local64_read(&event->count);
4475 if (enabled || running) {
4476 u64 now = event->shadow_ctx_time + perf_clock();
4477 u64 __enabled, __running;
4478
4479 __perf_update_times(event, now, &__enabled, &__running);
4480 if (enabled)
4481 *enabled = __enabled;
4482 if (running)
4483 *running = __running;
4484 }
4485out:
4486 local_irq_restore(flags);
4487
4488 return ret;
4489}
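
/*
 * Minimal usage sketch (hypothetical helper, not in the original file): read
 * a counter known to be counting for the current task or this CPU. This is
 * the same entry point the BPF perf-event read helpers go through.
 */
static int example_read_self_counter(struct perf_event *event, u64 *val)
{
	u64 enabled, running;

	/* Fails with -EINVAL/-EOPNOTSUPP if the event isn't local. */
	return perf_event_read_local(event, val, &enabled, &running);
}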
4490
4491static int perf_event_read(struct perf_event *event, bool group)
4492{
4493 enum perf_event_state state = READ_ONCE(event->state);
4494 int event_cpu, ret = 0;
4495
	/*
	 * If event is enabled and currently active on a CPU, update the
	 * value in the event structure:
	 */
4500again:
4501 if (state == PERF_EVENT_STATE_ACTIVE) {
4502 struct perf_read_data data;
4503
		/*
		 * Orders the ->state and ->oncpu loads such that if we see
		 * ACTIVE we must also see the right ->oncpu.
		 *
		 * Matches the smp_wmb() from event_sched_in().
		 */
4511
4512 event_cpu = READ_ONCE(event->oncpu);
4513 if ((unsigned)event_cpu >= nr_cpu_ids)
4514 return 0;
4515
4516 data = (struct perf_read_data){
4517 .event = event,
4518 .group = group,
4519 .ret = 0,
4520 };
4521
4522 preempt_disable();
4523 event_cpu = __perf_event_read_cpu(event, event_cpu);
4524
		/*
		 * Purposely ignore the smp_call_function_single() return
		 * value.
		 *
		 * If event_cpu isn't a valid CPU it means the event got
		 * scheduled out and that will have updated the event count.
		 *
		 * Therefore, either way, we'll have an up-to-date event count
		 * after this.
		 */
4535 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4536 preempt_enable();
4537 ret = data.ret;
4538
4539 } else if (state == PERF_EVENT_STATE_INACTIVE) {
4540 struct perf_event_context *ctx = event->ctx;
4541 unsigned long flags;
4542
4543 raw_spin_lock_irqsave(&ctx->lock, flags);
4544 state = event->state;
4545 if (state != PERF_EVENT_STATE_INACTIVE) {
4546 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4547 goto again;
4548 }
4549
		/*
		 * May read event->count while the context is not active
		 * (e.g., thread is blocked), in that case we cannot update
		 * the context time.
		 */
4554 if (ctx->is_active & EVENT_TIME) {
4555 update_context_time(ctx);
4556 update_cgrp_time_from_event(event);
4557 }
4558
4559 perf_event_update_time(event);
4560 if (group)
4561 perf_event_update_sibling_time(event);
4562 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4563 }
4564
4565 return ret;
4566}
4567
/*
 * Initialize the perf_event context in a task_struct:
 */
4571static void __perf_event_init_context(struct perf_event_context *ctx)
4572{
4573 raw_spin_lock_init(&ctx->lock);
4574 mutex_init(&ctx->mutex);
4575 INIT_LIST_HEAD(&ctx->active_ctx_list);
4576 perf_event_groups_init(&ctx->pinned_groups);
4577 perf_event_groups_init(&ctx->flexible_groups);
4578 INIT_LIST_HEAD(&ctx->event_list);
4579 INIT_LIST_HEAD(&ctx->pinned_active);
4580 INIT_LIST_HEAD(&ctx->flexible_active);
4581 refcount_set(&ctx->refcount, 1);
4582}
4583
4584static struct perf_event_context *
4585alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4586{
4587 struct perf_event_context *ctx;
4588
4589 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4590 if (!ctx)
4591 return NULL;
4592
4593 __perf_event_init_context(ctx);
4594 if (task)
4595 ctx->task = get_task_struct(task);
4596 ctx->pmu = pmu;
4597
4598 return ctx;
4599}
4600
4601static struct task_struct *
4602find_lively_task_by_vpid(pid_t vpid)
4603{
4604 struct task_struct *task;
4605
4606 rcu_read_lock();
4607 if (!vpid)
4608 task = current;
4609 else
4610 task = find_task_by_vpid(vpid);
4611 if (task)
4612 get_task_struct(task);
4613 rcu_read_unlock();
4614
4615 if (!task)
4616 return ERR_PTR(-ESRCH);
4617
4618 return task;
4619}
4620
/*
 * Returns a matching context with refcount and pincount.
 */
4624static struct perf_event_context *
4625find_get_context(struct pmu *pmu, struct task_struct *task,
4626 struct perf_event *event)
4627{
4628 struct perf_event_context *ctx, *clone_ctx = NULL;
4629 struct perf_cpu_context *cpuctx;
4630 void *task_ctx_data = NULL;
4631 unsigned long flags;
4632 int ctxn, err;
4633 int cpu = event->cpu;
4634
4635 if (!task) {
		/* Must be root to operate on a CPU event: */
4637 err = perf_allow_cpu(&event->attr);
4638 if (err)
4639 return ERR_PTR(err);
4640
4641 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4642 ctx = &cpuctx->ctx;
4643 get_ctx(ctx);
4644 raw_spin_lock_irqsave(&ctx->lock, flags);
4645 ++ctx->pin_count;
4646 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4647
4648 return ctx;
4649 }
4650
4651 err = -EINVAL;
4652 ctxn = pmu->task_ctx_nr;
4653 if (ctxn < 0)
4654 goto errout;
4655
4656 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4657 task_ctx_data = alloc_task_ctx_data(pmu);
4658 if (!task_ctx_data) {
4659 err = -ENOMEM;
4660 goto errout;
4661 }
4662 }
4663
4664retry:
4665 ctx = perf_lock_task_context(task, ctxn, &flags);
4666 if (ctx) {
4667 clone_ctx = unclone_ctx(ctx);
4668 ++ctx->pin_count;
4669
4670 if (task_ctx_data && !ctx->task_ctx_data) {
4671 ctx->task_ctx_data = task_ctx_data;
4672 task_ctx_data = NULL;
4673 }
4674 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4675
4676 if (clone_ctx)
4677 put_ctx(clone_ctx);
4678 } else {
4679 ctx = alloc_perf_context(pmu, task);
4680 err = -ENOMEM;
4681 if (!ctx)
4682 goto errout;
4683
4684 if (task_ctx_data) {
4685 ctx->task_ctx_data = task_ctx_data;
4686 task_ctx_data = NULL;
4687 }
4688
4689 err = 0;
4690 mutex_lock(&task->perf_event_mutex);
4691
		/*
		 * Cannot attach a context to an exiting task; also bail if
		 * some other thread installed a context concurrently.
		 */
4695 if (task->flags & PF_EXITING)
4696 err = -ESRCH;
4697 else if (task->perf_event_ctxp[ctxn])
4698 err = -EAGAIN;
4699 else {
4700 get_ctx(ctx);
4701 ++ctx->pin_count;
4702 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4703 }
4704 mutex_unlock(&task->perf_event_mutex);
4705
4706 if (unlikely(err)) {
4707 put_ctx(ctx);
4708
4709 if (err == -EAGAIN)
4710 goto retry;
4711 goto errout;
4712 }
4713 }
4714
4715 free_task_ctx_data(pmu, task_ctx_data);
4716 return ctx;
4717
4718errout:
4719 free_task_ctx_data(pmu, task_ctx_data);
4720 return ERR_PTR(err);
4721}
4722
4723static void perf_event_free_filter(struct perf_event *event);
4724
4725static void free_event_rcu(struct rcu_head *head)
4726{
4727 struct perf_event *event;
4728
4729 event = container_of(head, struct perf_event, rcu_head);
4730 if (event->ns)
4731 put_pid_ns(event->ns);
4732 perf_event_free_filter(event);
4733 kmem_cache_free(perf_event_cache, event);
4734}
4735
4736static void ring_buffer_attach(struct perf_event *event,
4737 struct perf_buffer *rb);
4738
4739static void detach_sb_event(struct perf_event *event)
4740{
4741 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4742
4743 raw_spin_lock(&pel->lock);
4744 list_del_rcu(&event->sb_list);
4745 raw_spin_unlock(&pel->lock);
4746}
4747
4748static bool is_sb_event(struct perf_event *event)
4749{
4750 struct perf_event_attr *attr = &event->attr;
4751
4752 if (event->parent)
4753 return false;
4754
4755 if (event->attach_state & PERF_ATTACH_TASK)
4756 return false;
4757
4758 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4759 attr->comm || attr->comm_exec ||
4760 attr->task || attr->ksymbol ||
4761 attr->context_switch || attr->text_poke ||
4762 attr->bpf_event)
4763 return true;
4764 return false;
4765}
4766
4767static void unaccount_pmu_sb_event(struct perf_event *event)
4768{
4769 if (is_sb_event(event))
4770 detach_sb_event(event);
4771}
4772
4773static void unaccount_event_cpu(struct perf_event *event, int cpu)
4774{
4775 if (event->parent)
4776 return;
4777
4778 if (is_cgroup_event(event))
4779 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4780}
4781
4782#ifdef CONFIG_NO_HZ_FULL
4783static DEFINE_SPINLOCK(nr_freq_lock);
4784#endif
4785
4786static void unaccount_freq_event_nohz(void)
4787{
4788#ifdef CONFIG_NO_HZ_FULL
4789 spin_lock(&nr_freq_lock);
4790 if (atomic_dec_and_test(&nr_freq_events))
4791 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4792 spin_unlock(&nr_freq_lock);
4793#endif
4794}
4795
4796static void unaccount_freq_event(void)
4797{
4798 if (tick_nohz_full_enabled())
4799 unaccount_freq_event_nohz();
4800 else
4801 atomic_dec(&nr_freq_events);
4802}
4803
4804static void unaccount_event(struct perf_event *event)
4805{
4806 bool dec = false;
4807
4808 if (event->parent)
4809 return;
4810
4811 if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
4812 dec = true;
4813 if (event->attr.mmap || event->attr.mmap_data)
4814 atomic_dec(&nr_mmap_events);
4815 if (event->attr.build_id)
4816 atomic_dec(&nr_build_id_events);
4817 if (event->attr.comm)
4818 atomic_dec(&nr_comm_events);
4819 if (event->attr.namespaces)
4820 atomic_dec(&nr_namespaces_events);
4821 if (event->attr.cgroup)
4822 atomic_dec(&nr_cgroup_events);
4823 if (event->attr.task)
4824 atomic_dec(&nr_task_events);
4825 if (event->attr.freq)
4826 unaccount_freq_event();
4827 if (event->attr.context_switch) {
4828 dec = true;
4829 atomic_dec(&nr_switch_events);
4830 }
4831 if (is_cgroup_event(event))
4832 dec = true;
4833 if (has_branch_stack(event))
4834 dec = true;
4835 if (event->attr.ksymbol)
4836 atomic_dec(&nr_ksymbol_events);
4837 if (event->attr.bpf_event)
4838 atomic_dec(&nr_bpf_events);
4839 if (event->attr.text_poke)
4840 atomic_dec(&nr_text_poke_events);
4841
4842 if (dec) {
4843 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4844 schedule_delayed_work(&perf_sched_work, HZ);
4845 }
4846
4847 unaccount_event_cpu(event, event->cpu);
4848
4849 unaccount_pmu_sb_event(event);
4850}
4851
4852static void perf_sched_delayed(struct work_struct *work)
4853{
4854 mutex_lock(&perf_sched_mutex);
4855 if (atomic_dec_and_test(&perf_sched_count))
4856 static_branch_disable(&perf_sched_events);
4857 mutex_unlock(&perf_sched_mutex);
4858}
4859
/*
 * The following implement mutual exclusion of events on "exclusive" pmus
 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
 * at a time, so we disallow creating events that might conflict, namely:
 *
 *  1) cpu-wide events in the presence of per-task events,
 *  2) per-task events in the presence of cpu-wide events,
 *  3) two matching events on the same context.
 *
 * The former two cases are handled in the allocation path (perf_event_alloc(),
 * _free_event()), the latter -- before the first perf_install_in_context().
 */
4872static int exclusive_event_init(struct perf_event *event)
4873{
4874 struct pmu *pmu = event->pmu;
4875
4876 if (!is_exclusive_pmu(pmu))
4877 return 0;
4878
	/*
	 * Prevent co-existence of per-task and cpu-wide events on the
	 * same exclusive pmu.
	 *
	 * Negative pmu::exclusive_cnt means there are cpu-wide
	 * events on this "exclusive" pmu, positive means there are
	 * per-task events.
	 *
	 * Since this is called in perf_event_alloc() path, event::ctx
	 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
	 * to mean "per-task event", because unlike other attach states it
	 * never gets set in a detach path.
	 */
4892 if (event->attach_state & PERF_ATTACH_TASK) {
4893 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4894 return -EBUSY;
4895 } else {
4896 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4897 return -EBUSY;
4898 }
4899
4900 return 0;
4901}
4902
4903static void exclusive_event_destroy(struct perf_event *event)
4904{
4905 struct pmu *pmu = event->pmu;
4906
4907 if (!is_exclusive_pmu(pmu))
4908 return;
4909
4910
4911 if (event->attach_state & PERF_ATTACH_TASK)
4912 atomic_dec(&pmu->exclusive_cnt);
4913 else
4914 atomic_inc(&pmu->exclusive_cnt);
4915}
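
/*
 * Example of the counting scheme above: two per-task events on an exclusive
 * PMU take exclusive_cnt 0 -> 1 -> 2; a cpu-wide event arriving then fails
 * atomic_dec_unless_positive() and gets -EBUSY. Conversely a first cpu-wide
 * event drives the count to -1, locking out any later per-task event until
 * exclusive_event_destroy() brings it back toward zero.
 */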
4916
4917static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4918{
4919 if ((e1->pmu == e2->pmu) &&
4920 (e1->cpu == e2->cpu ||
4921 e1->cpu == -1 ||
4922 e2->cpu == -1))
4923 return true;
4924 return false;
4925}
4926
4927static bool exclusive_event_installable(struct perf_event *event,
4928 struct perf_event_context *ctx)
4929{
4930 struct perf_event *iter_event;
4931 struct pmu *pmu = event->pmu;
4932
4933 lockdep_assert_held(&ctx->mutex);
4934
4935 if (!is_exclusive_pmu(pmu))
4936 return true;
4937
4938 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4939 if (exclusive_event_match(iter_event, event))
4940 return false;
4941 }
4942
4943 return true;
4944}
4945
4946static void perf_addr_filters_splice(struct perf_event *event,
4947 struct list_head *head);
4948
4949static void _free_event(struct perf_event *event)
4950{
4951 irq_work_sync(&event->pending);
4952
4953 unaccount_event(event);
4954
4955 security_perf_event_free(event);
4956
4957 if (event->rb) {
4958
4959
4960
4961
4962
4963
4964 mutex_lock(&event->mmap_mutex);
4965 ring_buffer_attach(event, NULL);
4966 mutex_unlock(&event->mmap_mutex);
4967 }
4968
4969 if (is_cgroup_event(event))
4970 perf_detach_cgroup(event);
4971
4972 if (!event->parent) {
4973 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4974 put_callchain_buffers();
4975 }
4976
4977 perf_event_free_bpf_prog(event);
4978 perf_addr_filters_splice(event, NULL);
4979 kfree(event->addr_filter_ranges);
4980
4981 if (event->destroy)
4982 event->destroy(event);
4983
4984
4985
4986
4987
4988 if (event->hw.target)
4989 put_task_struct(event->hw.target);
4990
4991
4992
4993
4994
4995 if (event->ctx)
4996 put_ctx(event->ctx);
4997
4998 exclusive_event_destroy(event);
4999 module_put(event->pmu->module);
5000
5001 call_rcu(&event->rcu_head, free_event_rcu);
5002}
5003
5004
5005
5006
5007
5008static void free_event(struct perf_event *event)
5009{
5010 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
5011 "unexpected event refcount: %ld; ptr=%p\n",
5012 atomic_long_read(&event->refcount), event)) {
5013
5014 return;
5015 }
5016
5017 _free_event(event);
5018}
5019
5020
5021
5022
5023static void perf_remove_from_owner(struct perf_event *event)
5024{
5025 struct task_struct *owner;
5026
5027 rcu_read_lock();
5028
5029
5030
5031
5032
5033
5034 owner = READ_ONCE(event->owner);
5035 if (owner) {
5036
5037
5038
5039
5040
5041 get_task_struct(owner);
5042 }
5043 rcu_read_unlock();
5044
5045 if (owner) {
5046
5047
5048
5049
5050
5051
5052
5053
5054 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
5055
5056
5057
5058
5059
5060
5061
5062 if (event->owner) {
5063 list_del_init(&event->owner_entry);
5064 smp_store_release(&event->owner, NULL);
5065 }
5066 mutex_unlock(&owner->perf_event_mutex);
5067 put_task_struct(owner);
5068 }
5069}
5070
5071static void put_event(struct perf_event *event)
5072{
5073 if (!atomic_long_dec_and_test(&event->refcount))
5074 return;
5075
5076 _free_event(event);
5077}
5078
/*
 * Kill an event dead; the event is disconnected from its context and owner
 * and marked DEAD here so that any concurrent user sees it as unusable;
 * child events are detached and freed below. Used by both the final fput()
 * of the event file and by in-kernel counter users.
 */
5084int perf_event_release_kernel(struct perf_event *event)
5085{
5086 struct perf_event_context *ctx = event->ctx;
5087 struct perf_event *child, *tmp;
5088 LIST_HEAD(free_list);
5089
	/*
	 * If we got here through err_file: fput(event_file); we will not have
	 * attached to a context yet.
	 */
5094 if (!ctx) {
5095 WARN_ON_ONCE(event->attach_state &
5096 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
5097 goto no_ctx;
5098 }
5099
5100 if (!is_kernel_event(event))
5101 perf_remove_from_owner(event);
5102
5103 ctx = perf_event_ctx_lock(event);
5104 WARN_ON_ONCE(ctx->parent_ctx);
5105 perf_remove_from_context(event, DETACH_GROUP);
5106
5107 raw_spin_lock_irq(&ctx->lock);

	/*
	 * Mark this event as STATE_DEAD, there is no external reference to it
	 * anymore.
	 *
	 * Anybody acquiring event->child_mutex after the below loop _must_
	 * also see this, most importantly inherit_event() which will avoid
	 * placing more children on the list.
	 *
	 * Thus this guarantees that we will in fact observe and kill _ALL_
	 * child events.
	 */
5119 event->state = PERF_EVENT_STATE_DEAD;
5120 raw_spin_unlock_irq(&ctx->lock);
5121
5122 perf_event_ctx_unlock(event, ctx);
5123
5124again:
5125 mutex_lock(&event->child_mutex);
5126 list_for_each_entry(child, &event->child_list, child_list) {
5127
5128
5129
5130
5131
5132 ctx = READ_ONCE(child->ctx);
5133
5134
5135
5136
5137
5138
5139
5140
5141 get_ctx(ctx);
5142
5143
5144
5145
5146
5147
5148 mutex_unlock(&event->child_mutex);
5149 mutex_lock(&ctx->mutex);
5150 mutex_lock(&event->child_mutex);
5151
5152
5153
5154
5155
5156
5157 tmp = list_first_entry_or_null(&event->child_list,
5158 struct perf_event, child_list);
5159 if (tmp == child) {
5160 perf_remove_from_context(child, DETACH_GROUP);
5161 list_move(&child->child_list, &free_list);
5162
5163
5164
5165
5166 put_event(event);
5167 }
5168
5169 mutex_unlock(&event->child_mutex);
5170 mutex_unlock(&ctx->mutex);
5171 put_ctx(ctx);
5172 goto again;
5173 }
5174 mutex_unlock(&event->child_mutex);
5175
5176 list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5177 void *var = &child->ctx->refcount;
5178
5179 list_del(&child->child_list);
5180 free_event(child);
5181
5182
5183
5184
5185
5186 smp_mb();
5187 wake_up_var(var);
5188 }
5189
5190no_ctx:
5191 put_event(event);
5192 return 0;
5193}
5194EXPORT_SYMBOL_GPL(perf_event_release_kernel);
5195
/*
 * Called when the last reference to the file is gone.
 */
5199static int perf_release(struct inode *inode, struct file *file)
5200{
5201 perf_event_release_kernel(file->private_data);
5202 return 0;
5203}
5204
5205static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5206{
5207 struct perf_event *child;
5208 u64 total = 0;
5209
5210 *enabled = 0;
5211 *running = 0;
5212
5213 mutex_lock(&event->child_mutex);
5214
5215 (void)perf_event_read(event, false);
5216 total += perf_event_count(event);
5217
5218 *enabled += event->total_time_enabled +
5219 atomic64_read(&event->child_total_time_enabled);
5220 *running += event->total_time_running +
5221 atomic64_read(&event->child_total_time_running);
5222
5223 list_for_each_entry(child, &event->child_list, child_list) {
5224 (void)perf_event_read(child, false);
5225 total += perf_event_count(child);
5226 *enabled += child->total_time_enabled;
5227 *running += child->total_time_running;
5228 }
5229 mutex_unlock(&event->child_mutex);
5230
5231 return total;
5232}
5233
5234u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5235{
5236 struct perf_event_context *ctx;
5237 u64 count;
5238
5239 ctx = perf_event_ctx_lock(event);
5240 count = __perf_event_read_value(event, enabled, running);
5241 perf_event_ctx_unlock(event, ctx);
5242
5243 return count;
5244}
5245EXPORT_SYMBOL_GPL(perf_event_read_value);
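
/*
 * Usage sketch (hypothetical helper): read the aggregate count of an event
 * and all its inherited children, scaling for time-multiplexing the same way
 * userspace tools do when the event was not running the whole time.
 */
static u64 example_read_scaled(struct perf_event *event)
{
	u64 enabled, running, count;

	count = perf_event_read_value(event, &enabled, &running);
	/* Scale up for the time the event was scheduled out (sketch only;
	 * the multiplication can overflow for very large counts). */
	if (running && running < enabled)
		count = div64_u64(count * enabled, running);

	return count;
}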
5246
5247static int __perf_read_group_add(struct perf_event *leader,
5248 u64 read_format, u64 *values)
5249{
5250 struct perf_event_context *ctx = leader->ctx;
5251 struct perf_event *sub;
5252 unsigned long flags;
5253 int n = 1;
5254 int ret;
5255
5256 ret = perf_event_read(leader, true);
5257 if (ret)
5258 return ret;
5259
5260 raw_spin_lock_irqsave(&ctx->lock, flags);
5261
	/*
	 * Since we co-schedule groups, {enabled,running} times of siblings
	 * will be identical to those of the group leader, so we only need
	 * to read them once per group.
	 */
5267 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5268 values[n++] += leader->total_time_enabled +
5269 atomic64_read(&leader->child_total_time_enabled);
5270 }
5271
5272 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5273 values[n++] += leader->total_time_running +
5274 atomic64_read(&leader->child_total_time_running);
5275 }
5276
	/*
	 * Write {count,id} tuples for every sibling.
	 */
5280 values[n++] += perf_event_count(leader);
5281 if (read_format & PERF_FORMAT_ID)
5282 values[n++] = primary_event_id(leader);
5283
5284 for_each_sibling_event(sub, leader) {
5285 values[n++] += perf_event_count(sub);
5286 if (read_format & PERF_FORMAT_ID)
5287 values[n++] = primary_event_id(sub);
5288 }
5289
5290 raw_spin_unlock_irqrestore(&ctx->lock, flags);
5291 return 0;
5292}
5293
5294static int perf_read_group(struct perf_event *event,
5295 u64 read_format, char __user *buf)
5296{
5297 struct perf_event *leader = event->group_leader, *child;
5298 struct perf_event_context *ctx = leader->ctx;
5299 int ret;
5300 u64 *values;
5301
5302 lockdep_assert_held(&ctx->mutex);
5303
5304 values = kzalloc(event->read_size, GFP_KERNEL);
5305 if (!values)
5306 return -ENOMEM;
5307
5308 values[0] = 1 + leader->nr_siblings;
5309
	/*
	 * By locking the child_mutex of the leader we effectively
	 * lock the child list of all siblings.
	 */
5314 mutex_lock(&leader->child_mutex);
5315
5316 ret = __perf_read_group_add(leader, read_format, values);
5317 if (ret)
5318 goto unlock;
5319
5320 list_for_each_entry(child, &leader->child_list, child_list) {
5321 ret = __perf_read_group_add(child, read_format, values);
5322 if (ret)
5323 goto unlock;
5324 }
5325
5326 mutex_unlock(&leader->child_mutex);
5327
5328 ret = event->read_size;
5329 if (copy_to_user(buf, values, event->read_size))
5330 ret = -EFAULT;
5331 goto out;
5332
5333unlock:
5334 mutex_unlock(&leader->child_mutex);
5335out:
5336 kfree(values);
5337 return ret;
5338}
5339
5340static int perf_read_one(struct perf_event *event,
5341 u64 read_format, char __user *buf)
5342{
5343 u64 enabled, running;
5344 u64 values[4];
5345 int n = 0;
5346
5347 values[n++] = __perf_event_read_value(event, &enabled, &running);
5348 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5349 values[n++] = enabled;
5350 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5351 values[n++] = running;
5352 if (read_format & PERF_FORMAT_ID)
5353 values[n++] = primary_event_id(event);
5354
5355 if (copy_to_user(buf, values, n * sizeof(u64)))
5356 return -EFAULT;
5357
5358 return n * sizeof(u64);
5359}
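
/*
 * Layout example: with read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID, a read() returns four
 * u64 values in this order:
 *
 *	{ value, time_enabled, time_running, id }
 *
 * Userspace typically estimates the unmultiplexed count as
 * value * time_enabled / time_running.
 */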
5360
5361static bool is_event_hup(struct perf_event *event)
5362{
5363 bool no_children;
5364
5365 if (event->state > PERF_EVENT_STATE_EXIT)
5366 return false;
5367
5368 mutex_lock(&event->child_mutex);
5369 no_children = list_empty(&event->child_list);
5370 mutex_unlock(&event->child_mutex);
5371 return no_children;
5372}
5373
/*
 * Read the performance event - simple non blocking version for now
 */
5377static ssize_t
5378__perf_read(struct perf_event *event, char __user *buf, size_t count)
5379{
5380 u64 read_format = event->attr.read_format;
5381 int ret;
5382
	/*
	 * Return end-of-file for a read on an event that is in
	 * error state (i.e. because it was pinned but it couldn't be
	 * scheduled on to the CPU at some point).
	 */
5388 if (event->state == PERF_EVENT_STATE_ERROR)
5389 return 0;
5390
5391 if (count < event->read_size)
5392 return -ENOSPC;
5393
5394 WARN_ON_ONCE(event->ctx->parent_ctx);
5395 if (read_format & PERF_FORMAT_GROUP)
5396 ret = perf_read_group(event, read_format, buf);
5397 else
5398 ret = perf_read_one(event, read_format, buf);
5399
5400 return ret;
5401}
5402
5403static ssize_t
5404perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5405{
5406 struct perf_event *event = file->private_data;
5407 struct perf_event_context *ctx;
5408 int ret;
5409
5410 ret = security_perf_event_read(event);
5411 if (ret)
5412 return ret;
5413
5414 ctx = perf_event_ctx_lock(event);
5415 ret = __perf_read(event, buf, count);
5416 perf_event_ctx_unlock(event, ctx);
5417
5418 return ret;
5419}
5420
5421static __poll_t perf_poll(struct file *file, poll_table *wait)
5422{
5423 struct perf_event *event = file->private_data;
5424 struct perf_buffer *rb;
5425 __poll_t events = EPOLLHUP;
5426
5427 poll_wait(file, &event->waitq, wait);
5428
5429 if (is_event_hup(event))
5430 return events;
5431
5432 /*
5433  * Pin the event->rb by taking event->mmap_mutex; otherwise
5434  * perf_event_set_output() can swizzle our rb and make us miss wakeups.
5435  */
5436 mutex_lock(&event->mmap_mutex);
5437 rb = event->rb;
5438 if (rb)
5439 events = atomic_xchg(&rb->poll, 0);
5440 mutex_unlock(&event->mmap_mutex);
5441 return events;
5442}
5443
5444static void _perf_event_reset(struct perf_event *event)
5445{
5446 (void)perf_event_read(event, false);
5447 local64_set(&event->count, 0);
5448 perf_event_update_userpage(event);
5449}
5450
5451
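/*
 * perf_event_pause - disable the event and read its count, optionally
 * resetting it to zero. Done under the ctx lock so the returned count
 * is stable. Inherited events are not supported here, hence the WARN.
 */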
5452u64 perf_event_pause(struct perf_event *event, bool reset)
5453{
5454 struct perf_event_context *ctx;
5455 u64 count;
5456
5457 ctx = perf_event_ctx_lock(event);
5458 WARN_ON_ONCE(event->attr.inherit);
5459 _perf_event_disable(event);
5460 count = local64_read(&event->count);
5461 if (reset)
5462 local64_set(&event->count, 0);
5463 perf_event_ctx_unlock(event, ctx);
5464
5465 return count;
5466}
5467EXPORT_SYMBOL_GPL(perf_event_pause);
5468
5469/*
5470 * Holding the top-level event's child_mutex means that any
5471 * descendant process that has inherited this event will block
5472 * in perf_event_exit_event() if it goes to exit, thus satisfying the
5473 * task existence requirements of perf_event_enable/disable.
5474 */
5475static void perf_event_for_each_child(struct perf_event *event,
5476 void (*func)(struct perf_event *))
5477{
5478 struct perf_event *child;
5479
5480 WARN_ON_ONCE(event->ctx->parent_ctx);
5481
5482 mutex_lock(&event->child_mutex);
5483 func(event);
5484 list_for_each_entry(child, &event->child_list, child_list)
5485 func(child);
5486 mutex_unlock(&event->child_mutex);
5487}
5488
5489static void perf_event_for_each(struct perf_event *event,
5490 void (*func)(struct perf_event *))
5491{
5492 struct perf_event_context *ctx = event->ctx;
5493 struct perf_event *sibling;
5494
5495 lockdep_assert_held(&ctx->mutex);
5496
5497 event = event->group_leader;
5498
5499 perf_event_for_each_child(event, func);
5500 for_each_sibling_event(sibling, event)
5501 perf_event_for_each_child(sibling, func);
5502}
5503
5504static void __perf_event_period(struct perf_event *event,
5505 struct perf_cpu_context *cpuctx,
5506 struct perf_event_context *ctx,
5507 void *info)
5508{
5509 u64 value = *((u64 *)info);
5510 bool active;
5511
5512 if (event->attr.freq) {
5513 event->attr.sample_freq = value;
5514 } else {
5515 event->attr.sample_period = value;
5516 event->hw.sample_period = value;
5517 }
5518
5519 active = (event->state == PERF_EVENT_STATE_ACTIVE);
5520 if (active) {
5521 perf_pmu_disable(ctx->pmu);
5522 /*
5523  * We could be throttled; unthrottle now to avoid the tick
5524  * trying to unthrottle while we already re-started the event.
5525  */
5526 if (event->hw.interrupts == MAX_INTERRUPTS) {
5527 event->hw.interrupts = 0;
5528 perf_log_throttle(event, 1);
5529 }
5530 event->pmu->stop(event, PERF_EF_UPDATE);
5531 }
5532
5533 local64_set(&event->hw.period_left, 0);
5534
5535 if (active) {
5536 event->pmu->start(event, PERF_EF_RELOAD);
5537 perf_pmu_enable(ctx->pmu);
5538 }
5539}
5540
5541static int perf_event_check_period(struct perf_event *event, u64 value)
5542{
5543 return event->pmu->check_period(event, value);
5544}
5545
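/*
 * Validate a new sample period/frequency before propagating it via
 * event_function_call(): sampling events only, non-zero value, freq
 * mode bounded by sysctl_perf_event_sample_rate, a PMU veto through
 * ->check_period(), and the period must fit in a positive s64.
 */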
5546static int _perf_event_period(struct perf_event *event, u64 value)
5547{
5548 if (!is_sampling_event(event))
5549 return -EINVAL;
5550
5551 if (!value)
5552 return -EINVAL;
5553
5554 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5555 return -EINVAL;
5556
5557 if (perf_event_check_period(event, value))
5558 return -EINVAL;
5559
5560 if (!event->attr.freq && (value & (1ULL << 63)))
5561 return -EINVAL;
5562
5563 event_function_call(event, __perf_event_period, &value);
5564
5565 return 0;
5566}
5567
5568int perf_event_period(struct perf_event *event, u64 value)
5569{
5570 struct perf_event_context *ctx;
5571 int ret;
5572
5573 ctx = perf_event_ctx_lock(event);
5574 ret = _perf_event_period(event, value);
5575 perf_event_ctx_unlock(event, ctx);
5576
5577 return ret;
5578}
5579EXPORT_SYMBOL_GPL(perf_event_period);
5580
5581static const struct file_operations perf_fops;
5582
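/*
 * Resolve an fd to a perf event file; -EBADF unless the file is one
 * of ours, which is checked by comparing f_op against perf_fops.
 */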
5583static inline int perf_fget_light(int fd, struct fd *p)
5584{
5585 struct fd f = fdget(fd);
5586 if (!f.file)
5587 return -EBADF;
5588
5589 if (f.file->f_op != &perf_fops) {
5590 fdput(f);
5591 return -EBADF;
5592 }
5593 *p = f;
5594 return 0;
5595}
5596
5597static int perf_event_set_output(struct perf_event *event,
5598 struct perf_event *output_event);
5599static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5600static int perf_copy_attr(struct perf_event_attr __user *uattr,
5601 struct perf_event_attr *attr);
5602
5603static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5604{
5605 void (*func)(struct perf_event *);
5606 u32 flags = arg;
5607
5608 switch (cmd) {
5609 case PERF_EVENT_IOC_ENABLE:
5610 func = _perf_event_enable;
5611 break;
5612 case PERF_EVENT_IOC_DISABLE:
5613 func = _perf_event_disable;
5614 break;
5615 case PERF_EVENT_IOC_RESET:
5616 func = _perf_event_reset;
5617 break;
5618
5619 case PERF_EVENT_IOC_REFRESH:
5620 return _perf_event_refresh(event, arg);
5621
5622 case PERF_EVENT_IOC_PERIOD:
5623 {
5624 u64 value;
5625
5626 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5627 return -EFAULT;
5628
5629 return _perf_event_period(event, value);
5630 }
5631 case PERF_EVENT_IOC_ID:
5632 {
5633 u64 id = primary_event_id(event);
5634
5635 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5636 return -EFAULT;
5637 return 0;
5638 }
5639
5640 case PERF_EVENT_IOC_SET_OUTPUT:
5641 {
5642 int ret;
5643 if (arg != -1) {
5644 struct perf_event *output_event;
5645 struct fd output;
5646 ret = perf_fget_light(arg, &output);
5647 if (ret)
5648 return ret;
5649 output_event = output.file->private_data;
5650 ret = perf_event_set_output(event, output_event);
5651 fdput(output);
5652 } else {
5653 ret = perf_event_set_output(event, NULL);
5654 }
5655 return ret;
5656 }
5657
5658 case PERF_EVENT_IOC_SET_FILTER:
5659 return perf_event_set_filter(event, (void __user *)arg);
5660
5661 case PERF_EVENT_IOC_SET_BPF:
5662 {
5663 struct bpf_prog *prog;
5664 int err;
5665
5666 prog = bpf_prog_get(arg);
5667 if (IS_ERR(prog))
5668 return PTR_ERR(prog);
5669
5670 err = perf_event_set_bpf_prog(event, prog, 0);
5671 if (err) {
5672 bpf_prog_put(prog);
5673 return err;
5674 }
5675
5676 return 0;
5677 }
5678
5679 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5680 struct perf_buffer *rb;
5681
5682 rcu_read_lock();
5683 rb = rcu_dereference(event->rb);
5684 if (!rb || !rb->nr_pages) {
5685 rcu_read_unlock();
5686 return -EINVAL;
5687 }
5688 rb_toggle_paused(rb, !!arg);
5689 rcu_read_unlock();
5690 return 0;
5691 }
5692
5693 case PERF_EVENT_IOC_QUERY_BPF:
5694 return perf_event_query_prog_array(event, (void __user *)arg);
5695
5696 case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5697 struct perf_event_attr new_attr;
5698 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5699 &new_attr);
5700
5701 if (err)
5702 return err;
5703
5704 return perf_event_modify_attr(event, &new_attr);
5705 }
5706 default:
5707 return -ENOTTY;
5708 }
5709
5710 if (flags & PERF_IOC_FLAG_GROUP)
5711 perf_event_for_each(event, func);
5712 else
5713 perf_event_for_each_child(event, func);
5714
5715 return 0;
5716}
5717
5718static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5719{
5720 struct perf_event *event = file->private_data;
5721 struct perf_event_context *ctx;
5722 long ret;
5723
5724
5725 ret = security_perf_event_write(event);
5726 if (ret)
5727 return ret;
5728
5729 ctx = perf_event_ctx_lock(event);
5730 ret = _perf_ioctl(event, cmd, arg);
5731 perf_event_ctx_unlock(event, ctx);
5732
5733 return ret;
5734}
5735
5736#ifdef CONFIG_COMPAT
5737static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5738 unsigned long arg)
5739{
5740 switch (_IOC_NR(cmd)) {
5741 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5742 case _IOC_NR(PERF_EVENT_IOC_ID):
5743 case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5744 case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5745 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
5746 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5747 cmd &= ~IOCSIZE_MASK;
5748 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5749 }
5750 break;
5751 }
5752 return perf_ioctl(file, cmd, arg);
5753}
5754#else
5755# define perf_compat_ioctl NULL
5756#endif
5757
5758int perf_event_task_enable(void)
5759{
5760 struct perf_event_context *ctx;
5761 struct perf_event *event;
5762
5763 mutex_lock(&current->perf_event_mutex);
5764 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5765 ctx = perf_event_ctx_lock(event);
5766 perf_event_for_each_child(event, _perf_event_enable);
5767 perf_event_ctx_unlock(event, ctx);
5768 }
5769 mutex_unlock(&current->perf_event_mutex);
5770
5771 return 0;
5772}
5773
5774int perf_event_task_disable(void)
5775{
5776 struct perf_event_context *ctx;
5777 struct perf_event *event;
5778
5779 mutex_lock(&current->perf_event_mutex);
5780 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5781 ctx = perf_event_ctx_lock(event);
5782 perf_event_for_each_child(event, _perf_event_disable);
5783 perf_event_ctx_unlock(event, ctx);
5784 }
5785 mutex_unlock(&current->perf_event_mutex);
5786
5787 return 0;
5788}
5789
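/*
 * Index exported to userspace via the mmap'ed control page; 0 when the
 * event is not currently scheduled on a hardware counter (or the PMU
 * exports no index at all).
 */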
5790static int perf_event_index(struct perf_event *event)
5791{
5792 if (event->hw.state & PERF_HES_STOPPED)
5793 return 0;
5794
5795 if (event->state != PERF_EVENT_STATE_ACTIVE)
5796 return 0;
5797
5798 return event->pmu->event_idx(event);
5799}
5800
5801static void calc_timer_values(struct perf_event *event,
5802 u64 *now,
5803 u64 *enabled,
5804 u64 *running)
5805{
5806 u64 ctx_time;
5807
5808 *now = perf_clock();
5809 ctx_time = event->shadow_ctx_time + *now;
5810 __perf_update_times(event, ctx_time, enabled, running);
5811}
5812
5813static void perf_event_init_userpage(struct perf_event *event)
5814{
5815 struct perf_event_mmap_page *userpg;
5816 struct perf_buffer *rb;
5817
5818 rcu_read_lock();
5819 rb = rcu_dereference(event->rb);
5820 if (!rb)
5821 goto unlock;
5822
5823 userpg = rb->user_page;
5824
5825 /* Allow new userspace to detect that bit 0 is deprecated */
5826 userpg->cap_bit0_is_deprecated = 1;
5827 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5828 userpg->data_offset = PAGE_SIZE;
5829 userpg->data_size = perf_data_size(rb);
5830
5831unlock:
5832 rcu_read_unlock();
5833}
5834
5835void __weak arch_perf_update_userpage(
5836 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5837{
5838}
5839
5840/*
5841 * Callers need to ensure there can be no nesting of this function,
5842 * otherwise the seqlock logic goes bad. We can not serialize this
5843 * because the arch code calls this from NMI context.
5844 */
5845void perf_event_update_userpage(struct perf_event *event)
5846{
5847 struct perf_event_mmap_page *userpg;
5848 struct perf_buffer *rb;
5849 u64 enabled, running, now;
5850
5851 rcu_read_lock();
5852 rb = rcu_dereference(event->rb);
5853 if (!rb)
5854 goto unlock;
5855
5856 /*
5857  * compute total_time_enabled, total_time_running
5858  * based on snapshot values taken when the event
5859  * was last scheduled in.
5860  *
5861  * we cannot simply call update_context_time()
5862  * because of locking issue as we can be called in
5863  * NMI context
5864  */
5865 calc_timer_values(event, &now, &enabled, &running);
5866
5867 userpg = rb->user_page;
5868
5869 /*
5870  * Disable preemption so consistent time stamps are stored to the user page.
5871  */
5872 preempt_disable();
5873 ++userpg->lock;
5874 barrier();
5875 userpg->index = perf_event_index(event);
5876 userpg->offset = perf_event_count(event);
5877 if (userpg->index)
5878 userpg->offset -= local64_read(&event->hw.prev_count);
5879
5880 userpg->time_enabled = enabled +
5881 atomic64_read(&event->child_total_time_enabled);
5882
5883 userpg->time_running = running +
5884 atomic64_read(&event->child_total_time_running);
5885
5886 arch_perf_update_userpage(event, userpg, now);
5887
5888 barrier();
5889 ++userpg->lock;
5890 preempt_enable();
5891unlock:
5892 rcu_read_unlock();
5893}
5894EXPORT_SYMBOL_GPL(perf_event_update_userpage);
5895
5896static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5897{
5898 struct perf_event *event = vmf->vma->vm_file->private_data;
5899 struct perf_buffer *rb;
5900 vm_fault_t ret = VM_FAULT_SIGBUS;
5901
5902 if (vmf->flags & FAULT_FLAG_MKWRITE) {
5903 if (vmf->pgoff == 0)
5904 ret = 0;
5905 return ret;
5906 }
5907
5908 rcu_read_lock();
5909 rb = rcu_dereference(event->rb);
5910 if (!rb)
5911 goto unlock;
5912
5913 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5914 goto unlock;
5915
5916 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5917 if (!vmf->page)
5918 goto unlock;
5919
5920 get_page(vmf->page);
5921 vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5922 vmf->page->index = vmf->pgoff;
5923
5924 ret = 0;
5925unlock:
5926 rcu_read_unlock();
5927
5928 return ret;
5929}
5930
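/*
 * Swap event::rb for a new (possibly NULL) buffer, keeping the
 * buffer's rb_entry list consistent and synchronizing with RCU
 * readers via get_state_synchronize_rcu()/cond_synchronize_rcu().
 */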
5931static void ring_buffer_attach(struct perf_event *event,
5932 struct perf_buffer *rb)
5933{
5934 struct perf_buffer *old_rb = NULL;
5935 unsigned long flags;
5936
5937 if (event->rb) {
5938 /*
5939  * Should be impossible, we set this when removing
5940  * event->rb_entry and wait/clear when adding event->rb_entry.
5941  */
5942 WARN_ON_ONCE(event->rcu_pending);
5943
5944 old_rb = event->rb;
5945 spin_lock_irqsave(&old_rb->event_lock, flags);
5946 list_del_rcu(&event->rb_entry);
5947 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5948
5949 event->rcu_batches = get_state_synchronize_rcu();
5950 event->rcu_pending = 1;
5951 }
5952
5953 if (rb) {
5954 if (event->rcu_pending) {
5955 cond_synchronize_rcu(event->rcu_batches);
5956 event->rcu_pending = 0;
5957 }
5958
5959 spin_lock_irqsave(&rb->event_lock, flags);
5960 list_add_rcu(&event->rb_entry, &rb->event_list);
5961 spin_unlock_irqrestore(&rb->event_lock, flags);
5962 }
5963
5964 /*
5965  * Avoid racing with perf_mmap_close(AUX): stop the event
5966  * before swizzling the event::rb pointer; if it's getting
5967  * unmapped, its aux_mmap_count will be 0 and it won't
5968  * restart. See the comment in __perf_pmu_output_stop().
5969  *
5970  * Data will inevitably be lost when set_output is done in
5971  * mid-air, but then again, whoever does it like this is
5972  * not in for the data anyway.
5973  */
5974 if (has_aux(event))
5975 perf_event_stop(event, 0);
5976
5977 rcu_assign_pointer(event->rb, rb);
5978
5979 if (old_rb) {
5980 ring_buffer_put(old_rb);
5981 /*
5982  * Since we detached before setting the new rb, so that we
5983  * could attach the new rb, we could have missed a wakeup.
5984  * Provide it now.
5985  */
5986 wake_up_all(&event->waitq);
5987 }
5988}
5989
5990static void ring_buffer_wakeup(struct perf_event *event)
5991{
5992 struct perf_buffer *rb;
5993
5994 rcu_read_lock();
5995 rb = rcu_dereference(event->rb);
5996 if (rb) {
5997 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5998 wake_up_all(&event->waitq);
5999 }
6000 rcu_read_unlock();
6001}
6002
6003struct perf_buffer *ring_buffer_get(struct perf_event *event)
6004{
6005 struct perf_buffer *rb;
6006
6007 rcu_read_lock();
6008 rb = rcu_dereference(event->rb);
6009 if (rb) {
6010 if (!refcount_inc_not_zero(&rb->refcount))
6011 rb = NULL;
6012 }
6013 rcu_read_unlock();
6014
6015 return rb;
6016}
6017
6018void ring_buffer_put(struct perf_buffer *rb)
6019{
6020 if (!refcount_dec_and_test(&rb->refcount))
6021 return;
6022
6023 WARN_ON_ONCE(!list_empty(&rb->event_list));
6024
6025 call_rcu(&rb->rcu_head, rb_free_rcu);
6026}
6027
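/*
 * A mapping of the buffer is being duplicated (e.g. across fork() or
 * a VMA split); take the references that perf_mmap_close() will drop.
 */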
6028static void perf_mmap_open(struct vm_area_struct *vma)
6029{
6030 struct perf_event *event = vma->vm_file->private_data;
6031
6032 atomic_inc(&event->mmap_count);
6033 atomic_inc(&event->rb->mmap_count);
6034
6035 if (vma->vm_pgoff)
6036 atomic_inc(&event->rb->aux_mmap_count);
6037
6038 if (event->pmu->event_mapped)
6039 event->pmu->event_mapped(event, vma->vm_mm);
6040}
6041
6042static void perf_pmu_output_stop(struct perf_event *event);
6043
6044
6045/*
6046 * A buffer can be mmap()ed multiple times; either directly through the same
6047 * event, or through other events by use of perf_event_set_output().
6048 *
6049 * In order to undo the VM accounting done by perf_mmap() we need to restore
6050 * this size, as it was the size that is accounted mm/user wise.
6051 */
6052static void perf_mmap_close(struct vm_area_struct *vma)
6053{
6054 struct perf_event *event = vma->vm_file->private_data;
6055 struct perf_buffer *rb = ring_buffer_get(event);
6056 struct user_struct *mmap_user = rb->mmap_user;
6057 int mmap_locked = rb->mmap_locked;
6058 unsigned long size = perf_data_size(rb);
6059 bool detach_rest = false;
6060
6061 if (event->pmu->event_unmapped)
6062 event->pmu->event_unmapped(event, vma->vm_mm);
6063
6064 /*
6065  * rb->aux_mmap_count will always drop before rb->mmap_count and
6066  * event->mmap_count, so it is ok to use event->mmap_mutex to
6067  * serialize with perf_mmap here.
6068  */
6069 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
6070 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
6071 /*
6072  * Stop all AUX events that are writing to this buffer,
6073  * so that we can free its AUX pages and corresponding PMU
6074  * data. Note that after rb::aux_mmap_count dropped to zero,
6075  * they won't start any more (see perf_aux_output_begin()).
6076  */
6077 perf_pmu_output_stop(event);
6078
6079 /* now it's safe to free the pages */
6080 atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
6081 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
6082
6083 /* this has to be the last one */
6084 rb_free_aux(rb);
6085 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
6086
6087 mutex_unlock(&event->mmap_mutex);
6088 }
6089
6090 if (atomic_dec_and_test(&rb->mmap_count))
6091 detach_rest = true;
6092
6093 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
6094 goto out_put;
6095
6096 ring_buffer_attach(event, NULL);
6097 mutex_unlock(&event->mmap_mutex);
6098
6099 /* If there's still other mmap()s of this buffer, we're done. */
6100 if (!detach_rest)
6101 goto out_put;
6102
6103 /*
6104  * No other mmap()s, detach from all other events that might redirect
6105  * into the now unreachable buffer. Somewhat complicated by the
6106  * fact that rb::event_lock otherwise nests inside mmap_mutex.
6107  */
6108again:
6109 rcu_read_lock();
6110 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
6111 if (!atomic_long_inc_not_zero(&event->refcount)) {
6112 /*
6113  * This event is en-route to free_event() which will
6114  * detach it and remove it from the list.
6115  */
6116 continue;
6117 }
6118 rcu_read_unlock();
6119
6120 mutex_lock(&event->mmap_mutex);
6121 /*
6122  * Check we didn't race with perf_event_set_output() which can
6123  * swizzle the rb from under us while we were waiting to
6124  * acquire mmap_mutex.
6125  *
6126  * If we find a different rb; ignore this event, a next
6127  * iteration will no longer find it on the list. We have to
6128  * still restart the iteration to make sure we're not now
6129  * iterating the wrong list.
6130  */
6131 if (event->rb == rb)
6132 ring_buffer_attach(event, NULL);
6133
6134 mutex_unlock(&event->mmap_mutex);
6135 put_event(event);
6136
6137 /*
6138  * Restart the iteration; either we're on the wrong list or
6139  * destroyed its integrity by doing a deletion.
6140  */
6141 goto again;
6142 }
6143 rcu_read_unlock();
6144
6145 /*
6146  * Undo the VM accounting done in perf_mmap(); the '+ 1' covers
6147  * the user control page.
6148  *
6149  * Pages charged as mmap_locked were accounted to the mm's
6150  * pinned_vm rather than the user's locked_vm, so subtract
6151  * them from each counter accordingly before dropping the
6152  * user reference.
6153  */
6154 atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6155 &mmap_user->locked_vm);
6156 atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
6157 free_uid(mmap_user);
6158
6159out_put:
6160 ring_buffer_put(rb);
6161}
6162
6163static const struct vm_operations_struct perf_mmap_vmops = {
6164 .open = perf_mmap_open,
6165 .close = perf_mmap_close,
6166 .fault = perf_mmap_fault,
6167 .page_mkwrite = perf_mmap_fault,
6168};
6169
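/*
 * Map the ring buffer (pgoff 0: user page plus data pages) or the AUX
 * area (pgoff == aux_offset) into userspace, charging the pages against
 * sysctl_perf_event_mlock and, beyond that, RLIMIT_MEMLOCK.
 */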
6170static int perf_mmap(struct file *file, struct vm_area_struct *vma)
6171{
6172 struct perf_event *event = file->private_data;
6173 unsigned long user_locked, user_lock_limit;
6174 struct user_struct *user = current_user();
6175 struct perf_buffer *rb = NULL;
6176 unsigned long locked, lock_limit;
6177 unsigned long vma_size;
6178 unsigned long nr_pages;
6179 long user_extra = 0, extra = 0;
6180 int ret = 0, flags = 0;
6181
6182 /*
6183  * Don't allow mmap() of inherited per-task counters. This would
6184  * create a performance issue due to all children writing to the
6185  * same rb.
6186  */
6187 if (event->cpu == -1 && event->attr.inherit)
6188 return -EINVAL;
6189
6190 if (!(vma->vm_flags & VM_SHARED))
6191 return -EINVAL;
6192
6193 ret = security_perf_event_read(event);
6194 if (ret)
6195 return ret;
6196
6197 vma_size = vma->vm_end - vma->vm_start;
6198
6199 if (vma->vm_pgoff == 0) {
6200 nr_pages = (vma_size / PAGE_SIZE) - 1;
6201 } else {
6202 /*
6203  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
6204  * mapped, all subsequent mappings should have the same size
6205  * and offset. Must be above the normal perf buffer.
6206  */
6207 u64 aux_offset, aux_size;
6208
6209 if (!event->rb)
6210 return -EINVAL;
6211
6212 nr_pages = vma_size / PAGE_SIZE;
6213
6214 mutex_lock(&event->mmap_mutex);
6215 ret = -EINVAL;
6216
6217 rb = event->rb;
6218 if (!rb)
6219 goto aux_unlock;
6220
6221 aux_offset = READ_ONCE(rb->user_page->aux_offset);
6222 aux_size = READ_ONCE(rb->user_page->aux_size);
6223
6224 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
6225 goto aux_unlock;
6226
6227 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
6228 goto aux_unlock;
6229
6230 /* already mapped with a different offset */
6231 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
6232 goto aux_unlock;
6233
6234 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
6235 goto aux_unlock;
6236
6237 /* already mapped with a different size */
6238 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
6239 goto aux_unlock;
6240
6241 if (!is_power_of_2(nr_pages))
6242 goto aux_unlock;
6243
6244 if (!atomic_inc_not_zero(&rb->mmap_count))
6245 goto aux_unlock;
6246
6247 if (rb_has_aux(rb)) {
6248 atomic_inc(&rb->aux_mmap_count);
6249 ret = 0;
6250 goto unlock;
6251 }
6252
6253 atomic_set(&rb->aux_mmap_count, 1);
6254 user_extra = nr_pages;
6255
6256 goto accounting;
6257 }
6258
6259 /*
6260  * If we have rb pages ensure they're a power-of-two number, so we
6261  * can do bitmasks instead of modulo.
6262  */
6263 if (nr_pages != 0 && !is_power_of_2(nr_pages))
6264 return -EINVAL;
6265
6266 if (vma_size != PAGE_SIZE * (1 + nr_pages))
6267 return -EINVAL;
6268
6269 WARN_ON_ONCE(event->ctx->parent_ctx);
6270again:
6271 mutex_lock(&event->mmap_mutex);
6272 if (event->rb) {
6273 if (event->rb->nr_pages != nr_pages) {
6274 ret = -EINVAL;
6275 goto unlock;
6276 }
6277
6278 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
6279
6280
6281
6282
6283
6284 mutex_unlock(&event->mmap_mutex);
6285 goto again;
6286 }
6287
6288 goto unlock;
6289 }
6290
6291 user_extra = nr_pages + 1;
6292
6293accounting:
6294 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6295
6296 /*
6297  * Increase the limit linearly with more CPUs:
6298  */
6299 user_lock_limit *= num_online_cpus();
6300
6301 user_locked = atomic_long_read(&user->locked_vm);
6302
6303 /*
6304  * sysctl_perf_event_mlock may have changed, so that
6305  * user->locked_vm > user_lock_limit
6306  */
6307 if (user_locked > user_lock_limit)
6308 user_locked = user_lock_limit;
6309 user_locked += user_extra;
6310
6311 if (user_locked > user_lock_limit) {
6312 /*
6313  * charge locked_vm until it hits user_lock_limit;
6314  * charge the rest from pinned_vm
6315  */
6316 extra = user_locked - user_lock_limit;
6317 user_extra -= extra;
6318 }
6319
6320 lock_limit = rlimit(RLIMIT_MEMLOCK);
6321 lock_limit >>= PAGE_SHIFT;
6322 locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
6323
6324 if ((locked > lock_limit) && perf_is_paranoid() &&
6325 !capable(CAP_IPC_LOCK)) {
6326 ret = -EPERM;
6327 goto unlock;
6328 }
6329
6330 WARN_ON(!rb && event->rb);
6331
6332 if (vma->vm_flags & VM_WRITE)
6333 flags |= RING_BUFFER_WRITABLE;
6334
6335 if (!rb) {
6336 rb = rb_alloc(nr_pages,
6337 event->attr.watermark ? event->attr.wakeup_watermark : 0,
6338 event->cpu, flags);
6339
6340 if (!rb) {
6341 ret = -ENOMEM;
6342 goto unlock;
6343 }
6344
6345 atomic_set(&rb->mmap_count, 1);
6346 rb->mmap_user = get_current_user();
6347 rb->mmap_locked = extra;
6348
6349 ring_buffer_attach(event, rb);
6350
6351 perf_event_update_time(event);
6352 perf_set_shadow_time(event, event->ctx);
6353 perf_event_init_userpage(event);
6354 perf_event_update_userpage(event);
6355 } else {
6356 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
6357 event->attr.aux_watermark, flags);
6358 if (!ret)
6359 rb->aux_mmap_locked = extra;
6360 }
6361
6362unlock:
6363 if (!ret) {
6364 atomic_long_add(user_extra, &user->locked_vm);
6365 atomic64_add(extra, &vma->vm_mm->pinned_vm);
6366
6367 atomic_inc(&event->mmap_count);
6368 } else if (rb) {
6369 atomic_dec(&rb->mmap_count);
6370 }
6371aux_unlock:
6372 mutex_unlock(&event->mmap_mutex);
6373
6374 /*
6375  * Since pinned accounting is per vm we cannot allow fork() to copy our
6376  * vma.
6377  */
6378 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
6379 vma->vm_ops = &perf_mmap_vmops;
6380
6381 if (event->pmu->event_mapped)
6382 event->pmu->event_mapped(event, vma->vm_mm);
6383
6384 return ret;
6385}
6386
6387static int perf_fasync(int fd, struct file *filp, int on)
6388{
6389 struct inode *inode = file_inode(filp);
6390 struct perf_event *event = filp->private_data;
6391 int retval;
6392
6393 inode_lock(inode);
6394 retval = fasync_helper(fd, filp, on, &event->fasync);
6395 inode_unlock(inode);
6396
6397 if (retval < 0)
6398 return retval;
6399
6400 return 0;
6401}
6402
6403static const struct file_operations perf_fops = {
6404 .llseek = no_llseek,
6405 .release = perf_release,
6406 .read = perf_read,
6407 .poll = perf_poll,
6408 .unlocked_ioctl = perf_ioctl,
6409 .compat_ioctl = perf_compat_ioctl,
6410 .mmap = perf_mmap,
6411 .fasync = perf_fasync,
6412};
6413
6414/*
6415 * Perf event wakeup
6416 *
6417 * If there's data, ensure we set the poll() state and publish everything
6418 * to user-space before waking everybody up.
6419 */
6420
6421static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
6422{
6423 /* only the parent has fasync state */
6424 if (event->parent)
6425 event = event->parent;
6426 return &event->fasync;
6427}
6428
6429void perf_event_wakeup(struct perf_event *event)
6430{
6431 ring_buffer_wakeup(event);
6432
6433 if (event->pending_kill) {
6434 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
6435 event->pending_kill = 0;
6436 }
6437}
6438
6439static void perf_sigtrap(struct perf_event *event)
6440{
6441 /*
6442  * We'd expect this to only occur if the irq_work is delayed and either
6443  * ctx->task or current has changed in the meantime. This can be the
6444  * case on architectures that do not implement arch_irq_work_raise().
6445  */
6446 if (WARN_ON_ONCE(event->ctx->task != current))
6447 return;
6448
6449 /*
6450  * perf_pending_event() can race with the task exiting.
6451  */
6452 if (current->flags & PF_EXITING)
6453 return;
6454
6455 force_sig_perf((void __user *)event->pending_addr,
6456 event->attr.type, event->attr.sig_data);
6457}
6458
6459static void perf_pending_event_disable(struct perf_event *event)
6460{
6461 int cpu = READ_ONCE(event->pending_disable);
6462
6463 if (cpu < 0)
6464 return;
6465
6466 if (cpu == smp_processor_id()) {
6467 WRITE_ONCE(event->pending_disable, -1);
6468
6469 if (event->attr.sigtrap) {
6470 perf_sigtrap(event);
6471 atomic_set_release(&event->event_limit, 1); /* rearm event */
6472 return;
6473 }
6474
6475 perf_event_disable_local(event);
6476 return;
6477 }
6478
6479 /*
6480  *  CPU-A			CPU-B
6481  *
6482  *  perf_event_disable_inatomic()
6483  *    @pending_disable = CPU-A;
6484  *    irq_work_queue();
6485  *
6486  *  sched-out
6487  *    @pending_disable = -1;
6488  *
6489  *				sched-in
6490  *				perf_event_disable_inatomic()
6491  *				  @pending_disable = CPU-B;
6492  *				  irq_work_queue(); // FAILS
6493  *
6494  *  irq_work_run()
6495  *    perf_pending_event()
6496  *
6497  * But the event runs on CPU-B and wants disabling there.
6498  */
6499 irq_work_queue_on(&event->pending, cpu);
6500}
6501
6502static void perf_pending_event(struct irq_work *entry)
6503{
6504 struct perf_event *event = container_of(entry, struct perf_event, pending);
6505 int rctx;
6506
6507 rctx = perf_swevent_get_recursion_context();
6508
6509 /*
6510  * If we 'fail' here, that's OK, it means recursion is already disabled
6511  * and we won't recurse 'further'.
6512  */
6513 perf_pending_event_disable(event);
6514
6515 if (event->pending_wakeup) {
6516 event->pending_wakeup = 0;
6517 perf_event_wakeup(event);
6518 }
6519
6520 if (rctx >= 0)
6521 perf_swevent_put_recursion_context(rctx);
6522}
6523
6524/*
6525 * We assume there is only KVM supporting the callbacks.
6526 * Later on, we might change it to a list if there is
6527 * another virtualization implementation supporting the callbacks.
6528 */
6529struct perf_guest_info_callbacks *perf_guest_cbs;
6530
6531int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6532{
6533 perf_guest_cbs = cbs;
6534 return 0;
6535}
6536EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
6537
6538int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6539{
6540 perf_guest_cbs = NULL;
6541 return 0;
6542}
6543EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
6544
6545static void
6546perf_output_sample_regs(struct perf_output_handle *handle,
6547 struct pt_regs *regs, u64 mask)
6548{
6549 int bit;
6550 DECLARE_BITMAP(_mask, 64);
6551
6552 bitmap_from_u64(_mask, mask);
6553 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
6554 u64 val;
6555
6556 val = perf_reg_value(regs, bit);
6557 perf_output_put(handle, val);
6558 }
6559}
6560
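/*
 * User registers for a sample: taken directly from the interrupted
 * user context, or recovered from the kernel-entry state for a task
 * running in the kernel. Kernel threads have no user registers.
 */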
6561static void perf_sample_regs_user(struct perf_regs *regs_user,
6562 struct pt_regs *regs)
6563{
6564 if (user_mode(regs)) {
6565 regs_user->abi = perf_reg_abi(current);
6566 regs_user->regs = regs;
6567 } else if (!(current->flags & PF_KTHREAD)) {
6568 perf_get_regs_user(regs_user, regs);
6569 } else {
6570 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
6571 regs_user->regs = NULL;
6572 }
6573}
6574
6575static void perf_sample_regs_intr(struct perf_regs *regs_intr,
6576 struct pt_regs *regs)
6577{
6578 regs_intr->regs = regs;
6579 regs_intr->abi = perf_reg_abi(current);
6580}
6581
6582
6583/*
6584 * Get remaining task size from user stack pointer.
6585 *
6586 * It'd be better to take stack vma map and limit this more
6587 * precisely, but there's no way to get it safely under interrupt,
6588 * so using TASK_SIZE as limit.
6589 */
6590static u64 perf_ustack_task_size(struct pt_regs *regs)
6591{
6592 unsigned long addr = perf_user_stack_pointer(regs);
6593
6594 if (!addr || addr >= TASK_SIZE)
6595 return 0;
6596
6597 return TASK_SIZE - addr;
6598}
6599
6600static u16
6601perf_sample_ustack_size(u16 stack_size, u16 header_size,
6602 struct pt_regs *regs)
6603{
6604 u64 task_size;
6605
6606 /* No regs, no stack pointer, no dump. */
6607 if (!regs)
6608 return 0;
6609
6610 /*
6611  * Check if we fit in with the requested stack size into the:
6612  *
6613  * TASK_SIZE
6614  *   If we don't, we limit the size to the TASK_SIZE.
6615  *
6616  * remaining sample size
6617  *   If we don't, we customize the stack size to
6618  *   fit in to the remaining sample size.
6619  */
6620 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6621 stack_size = min(stack_size, (u16) task_size);
6622
6623 /* Current header size plus static size and dynamic size. */
6624 header_size += 2 * sizeof(u64);
6625
6626 /* Do we fit in with the current stack dump size? */
6627 if ((u16) (header_size + stack_size) < header_size) {
6628 /*
6629  * If we overflow the maximum size for the sample,
6630  * we customize the stack dump size to fit in.
6631  */
6632 stack_size = USHRT_MAX - header_size - sizeof(u64);
6633 stack_size = round_up(stack_size, sizeof(u64));
6634 }
6635
6636 return stack_size;
6637}
6638
6639static void
6640perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6641 struct pt_regs *regs)
6642{
6643 /* Case of a kernel thread, nothing to dump */
6644 if (!regs) {
6645 u64 size = 0;
6646 perf_output_put(handle, size);
6647 } else {
6648 unsigned long sp;
6649 unsigned int rem;
6650 u64 dyn_size;
6651 mm_segment_t fs;
6652
6653 /*
6654  * We dump:
6655  *
6656  * static size
6657  *   - the size requested by user or the best one we can fit
6658  *     in to the sample max size
6659  * data
6660  *   - user stack dump data
6661  * dynamic size
6662  *   - the actual dumped size
6663  */
6664 /* Static size. */
6665 perf_output_put(handle, dump_size);
6666
6667 /* Data. */
6668 sp = perf_user_stack_pointer(regs);
6669 fs = force_uaccess_begin();
6670 rem = __output_copy_user(handle, (void *) sp, dump_size);
6671 force_uaccess_end(fs);
6672 dyn_size = dump_size - rem;
6673
6674 perf_output_skip(handle, rem);
6675
6676 /* Dynamic size. */
6677 perf_output_put(handle, dyn_size);
6678 }
6679}
6680
6681static unsigned long perf_prepare_sample_aux(struct perf_event *event,
6682 struct perf_sample_data *data,
6683 size_t size)
6684{
6685 struct perf_event *sampler = event->aux_event;
6686 struct perf_buffer *rb;
6687
6688 data->aux_size = 0;
6689
6690 if (!sampler)
6691 goto out;
6692
6693 if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
6694 goto out;
6695
6696 if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
6697 goto out;
6698
6699 rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
6700 if (!rb)
6701 goto out;
6702
6703 /*
6704  * If this is an NMI hit inside sampling code, don't take
6705  * the sample. See also perf_aux_sample_output().
6706  */
6707 if (READ_ONCE(rb->aux_in_sampling)) {
6708 data->aux_size = 0;
6709 } else {
6710 size = min_t(size_t, size, perf_aux_size(rb));
6711 data->aux_size = ALIGN(size, sizeof(u64));
6712 }
6713 ring_buffer_put(rb);
6714
6715out:
6716 return data->aux_size;
6717}
6718
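/*
 * Snapshot the AUX buffer into a sample via the PMU's ->snapshot_aux()
 * callback, with IRQs disabled and rb->aux_in_sampling guarding
 * against recursion from NMI hits inside this critical section.
 */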
6719static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
6720 struct perf_event *event,
6721 struct perf_output_handle *handle,
6722 unsigned long size)
6723{
6724 unsigned long flags;
6725 long ret;
6726
6727 /*
6728  * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
6729  * paths. If we start calling them in NMI context, they may race with
6730  * the IRQ ones, that is, for example, re-starting an event that's just
6731  * been stopped, which is why we're using a separate callback that
6732  * doesn't change the event state.
6733  *
6734  * IRQs need to be disabled to prevent IPIs from racing with us.
6735  */
6736 local_irq_save(flags);
6737
6738 /*
6739  * Guard against NMI hits inside the critical section.
6740  */
6741 WRITE_ONCE(rb->aux_in_sampling, 1);
6742 barrier();
6743
6744 ret = event->pmu->snapshot_aux(event, handle, size);
6745
6746 barrier();
6747 WRITE_ONCE(rb->aux_in_sampling, 0);
6748 local_irq_restore(flags);
6749
6750 return ret;
6751}
6752
6753static void perf_aux_sample_output(struct perf_event *event,
6754 struct perf_output_handle *handle,
6755 struct perf_sample_data *data)
6756{
6757 struct perf_event *sampler = event->aux_event;
6758 struct perf_buffer *rb;
6759 unsigned long pad;
6760 long size;
6761
6762 if (WARN_ON_ONCE(!sampler || !data->aux_size))
6763 return;
6764
6765 rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
6766 if (!rb)
6767 return;
6768
6769 size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
6770
6771 /*
6772  * An error here means that perf_output_copy() failed (returned a
6773  * non-zero surplus that it didn't copy), which in its current
6774  * enlightened implementation is not possible. If that changes, we'd
6775  * like to know.
6776  */
6777 if (WARN_ON_ONCE(size < 0))
6778 goto out_put;
6779
6780 /*
6781  * The pad comes from ALIGN()ing data->aux_size up to u64 boundary,
6782  * so it should always be smaller than a u64.
6783  */
6784 pad = data->aux_size - size;
6785 if (WARN_ON_ONCE(pad >= sizeof(u64)))
6786 pad = 8;
6787
6788 if (pad) {
6789 u64 zero = 0;
6790 perf_output_copy(handle, &zero, pad);
6791 }
6792
6793out_put:
6794 ring_buffer_put(rb);
6795}
6796
6797static void __perf_event_header__init_id(struct perf_event_header *header,
6798 struct perf_sample_data *data,
6799 struct perf_event *event)
6800{
6801 u64 sample_type = event->attr.sample_type;
6802
6803 data->type = sample_type;
6804 header->size += event->id_header_size;
6805
6806 if (sample_type & PERF_SAMPLE_TID) {
6807 /* namespace issues */
6808 data->tid_entry.pid = perf_event_pid(event, current);
6809 data->tid_entry.tid = perf_event_tid(event, current);
6810 }
6811
6812 if (sample_type & PERF_SAMPLE_TIME)
6813 data->time = perf_event_clock(event);
6814
6815 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6816 data->id = primary_event_id(event);
6817
6818 if (sample_type & PERF_SAMPLE_STREAM_ID)
6819 data->stream_id = event->id;
6820
6821 if (sample_type & PERF_SAMPLE_CPU) {
6822 data->cpu_entry.cpu = raw_smp_processor_id();
6823 data->cpu_entry.reserved = 0;
6824 }
6825}
6826
6827void perf_event_header__init_id(struct perf_event_header *header,
6828 struct perf_sample_data *data,
6829 struct perf_event *event)
6830{
6831 if (event->attr.sample_id_all)
6832 __perf_event_header__init_id(header, data, event);
6833}
6834
6835static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6836 struct perf_sample_data *data)
6837{
6838 u64 sample_type = data->type;
6839
6840 if (sample_type & PERF_SAMPLE_TID)
6841 perf_output_put(handle, data->tid_entry);
6842
6843 if (sample_type & PERF_SAMPLE_TIME)
6844 perf_output_put(handle, data->time);
6845
6846 if (sample_type & PERF_SAMPLE_ID)
6847 perf_output_put(handle, data->id);
6848
6849 if (sample_type & PERF_SAMPLE_STREAM_ID)
6850 perf_output_put(handle, data->stream_id);
6851
6852 if (sample_type & PERF_SAMPLE_CPU)
6853 perf_output_put(handle, data->cpu_entry);
6854
6855 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6856 perf_output_put(handle, data->id);
6857}
6858
6859void perf_event__output_id_sample(struct perf_event *event,
6860 struct perf_output_handle *handle,
6861 struct perf_sample_data *sample)
6862{
6863 if (event->attr.sample_id_all)
6864 __perf_event__output_id_sample(handle, sample);
6865}
6866
6867static void perf_output_read_one(struct perf_output_handle *handle,
6868 struct perf_event *event,
6869 u64 enabled, u64 running)
6870{
6871 u64 read_format = event->attr.read_format;
6872 u64 values[4];
6873 int n = 0;
6874
6875 values[n++] = perf_event_count(event);
6876 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6877 values[n++] = enabled +
6878 atomic64_read(&event->child_total_time_enabled);
6879 }
6880 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6881 values[n++] = running +
6882 atomic64_read(&event->child_total_time_running);
6883 }
6884 if (read_format & PERF_FORMAT_ID)
6885 values[n++] = primary_event_id(event);
6886
6887 __output_copy(handle, values, n * sizeof(u64));
6888}
6889
6890static void perf_output_read_group(struct perf_output_handle *handle,
6891 struct perf_event *event,
6892 u64 enabled, u64 running)
6893{
6894 struct perf_event *leader = event->group_leader, *sub;
6895 u64 read_format = event->attr.read_format;
6896 u64 values[5];
6897 int n = 0;
6898
6899 values[n++] = 1 + leader->nr_siblings;
6900
6901 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6902 values[n++] = enabled;
6903
6904 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6905 values[n++] = running;
6906
6907 if ((leader != event) &&
6908 (leader->state == PERF_EVENT_STATE_ACTIVE))
6909 leader->pmu->read(leader);
6910
6911 values[n++] = perf_event_count(leader);
6912 if (read_format & PERF_FORMAT_ID)
6913 values[n++] = primary_event_id(leader);
6914
6915 __output_copy(handle, values, n * sizeof(u64));
6916
6917 for_each_sibling_event(sub, leader) {
6918 n = 0;
6919
6920 if ((sub != event) &&
6921 (sub->state == PERF_EVENT_STATE_ACTIVE))
6922 sub->pmu->read(sub);
6923
6924 values[n++] = perf_event_count(sub);
6925 if (read_format & PERF_FORMAT_ID)
6926 values[n++] = primary_event_id(sub);
6927
6928 __output_copy(handle, values, n * sizeof(u64));
6929 }
6930}
6931
6932#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
6933 PERF_FORMAT_TOTAL_TIME_RUNNING)
6934
6935/*
6936 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
6937 *
6938 * The problem is that its both hard and excessively expensive to iterate the
6939 * child list, not to mention that its impossible to IPI the children running
6940 * on another CPU, from interrupt/NMI context.
6941 */
6942static void perf_output_read(struct perf_output_handle *handle,
6943 struct perf_event *event)
6944{
6945 u64 enabled = 0, running = 0, now;
6946 u64 read_format = event->attr.read_format;
6947
6948 /*
6949  * compute total_time_enabled, total_time_running
6950  * based on snapshot values taken when the event
6951  * was last scheduled in.
6952  *
6953  * we cannot simply call update_context_time()
6954  * because of locking issue as we are called in
6955  * NMI context
6956  */
6957 if (read_format & PERF_FORMAT_TOTAL_TIMES)
6958 calc_timer_values(event, &now, &enabled, &running);
6959
6960 if (event->attr.read_format & PERF_FORMAT_GROUP)
6961 perf_output_read_group(handle, event, enabled, running);
6962 else
6963 perf_output_read_one(handle, event, enabled, running);
6964}
6965
6966static inline bool perf_sample_save_hw_index(struct perf_event *event)
6967{
6968 return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
6969}
6970
6971void perf_output_sample(struct perf_output_handle *handle,
6972 struct perf_event_header *header,
6973 struct perf_sample_data *data,
6974 struct perf_event *event)
6975{
6976 u64 sample_type = data->type;
6977
6978 perf_output_put(handle, *header);
6979
6980 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6981 perf_output_put(handle, data->id);
6982
6983 if (sample_type & PERF_SAMPLE_IP)
6984 perf_output_put(handle, data->ip);
6985
6986 if (sample_type & PERF_SAMPLE_TID)
6987 perf_output_put(handle, data->tid_entry);
6988
6989 if (sample_type & PERF_SAMPLE_TIME)
6990 perf_output_put(handle, data->time);
6991
6992 if (sample_type & PERF_SAMPLE_ADDR)
6993 perf_output_put(handle, data->addr);
6994
6995 if (sample_type & PERF_SAMPLE_ID)
6996 perf_output_put(handle, data->id);
6997
6998 if (sample_type & PERF_SAMPLE_STREAM_ID)
6999 perf_output_put(handle, data->stream_id);
7000
7001 if (sample_type & PERF_SAMPLE_CPU)
7002 perf_output_put(handle, data->cpu_entry);
7003
7004 if (sample_type & PERF_SAMPLE_PERIOD)
7005 perf_output_put(handle, data->period);
7006
7007 if (sample_type & PERF_SAMPLE_READ)
7008 perf_output_read(handle, event);
7009
7010 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7011 int size = 1;
7012
7013 size += data->callchain->nr;
7014 size *= sizeof(u64);
7015 __output_copy(handle, data->callchain, size);
7016 }
7017
7018 if (sample_type & PERF_SAMPLE_RAW) {
7019 struct perf_raw_record *raw = data->raw;
7020
7021 if (raw) {
7022 struct perf_raw_frag *frag = &raw->frag;
7023
7024 perf_output_put(handle, raw->size);
7025 do {
7026 if (frag->copy) {
7027 __output_custom(handle, frag->copy,
7028 frag->data, frag->size);
7029 } else {
7030 __output_copy(handle, frag->data,
7031 frag->size);
7032 }
7033 if (perf_raw_frag_last(frag))
7034 break;
7035 frag = frag->next;
7036 } while (1);
7037 if (frag->pad)
7038 __output_skip(handle, NULL, frag->pad);
7039 } else {
7040 struct {
7041 u32 size;
7042 u32 data;
7043 } raw = {
7044 .size = sizeof(u32),
7045 .data = 0,
7046 };
7047 perf_output_put(handle, raw);
7048 }
7049 }
7050
7051 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7052 if (data->br_stack) {
7053 size_t size;
7054
7055 size = data->br_stack->nr
7056 * sizeof(struct perf_branch_entry);
7057
7058 perf_output_put(handle, data->br_stack->nr);
7059 if (perf_sample_save_hw_index(event))
7060 perf_output_put(handle, data->br_stack->hw_idx);
7061 perf_output_copy(handle, data->br_stack->entries, size);
7062 } else {
7063 /*
7064  * we always store at least the value of nr
7065  */
7066 u64 nr = 0;
7067 perf_output_put(handle, nr);
7068 }
7069 }
7070
7071 if (sample_type & PERF_SAMPLE_REGS_USER) {
7072 u64 abi = data->regs_user.abi;
7073
7074 /*
7075  * If there are no regs to dump, notice it through
7076  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
7077  */
7078 perf_output_put(handle, abi);
7079
7080 if (abi) {
7081 u64 mask = event->attr.sample_regs_user;
7082 perf_output_sample_regs(handle,
7083 data->regs_user.regs,
7084 mask);
7085 }
7086 }
7087
7088 if (sample_type & PERF_SAMPLE_STACK_USER) {
7089 perf_output_sample_ustack(handle,
7090 data->stack_user_size,
7091 data->regs_user.regs);
7092 }
7093
7094 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
7095 perf_output_put(handle, data->weight.full);
7096
7097 if (sample_type & PERF_SAMPLE_DATA_SRC)
7098 perf_output_put(handle, data->data_src.val);
7099
7100 if (sample_type & PERF_SAMPLE_TRANSACTION)
7101 perf_output_put(handle, data->txn);
7102
7103 if (sample_type & PERF_SAMPLE_REGS_INTR) {
7104 u64 abi = data->regs_intr.abi;
7105
7106 /*
7107  * As above: a zero first u64 means PERF_SAMPLE_REGS_ABI_NONE.
7108  */
7109 perf_output_put(handle, abi);
7110
7111 if (abi) {
7112 u64 mask = event->attr.sample_regs_intr;
7113
7114 perf_output_sample_regs(handle,
7115 data->regs_intr.regs,
7116 mask);
7117 }
7118 }
7119
7120 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7121 perf_output_put(handle, data->phys_addr);
7122
7123 if (sample_type & PERF_SAMPLE_CGROUP)
7124 perf_output_put(handle, data->cgroup);
7125
7126 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
7127 perf_output_put(handle, data->data_page_size);
7128
7129 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
7130 perf_output_put(handle, data->code_page_size);
7131
7132 if (sample_type & PERF_SAMPLE_AUX) {
7133 perf_output_put(handle, data->aux_size);
7134
7135 if (data->aux_size)
7136 perf_aux_sample_output(event, handle, data);
7137 }
7138
7139 if (!event->attr.watermark) {
7140 int wakeup_events = event->attr.wakeup_events;
7141
7142 if (wakeup_events) {
7143 struct perf_buffer *rb = handle->rb;
7144 int events = local_inc_return(&rb->events);
7145
7146 if (events >= wakeup_events) {
7147 local_sub(wakeup_events, &rb->events);
7148 local_inc(&rb->wakeup);
7149 }
7150 }
7151 }
7152}
7153
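/*
 * Translate a sampled virtual address to a physical one without
 * faulting: kernel addresses go through virt_to_phys() (vmalloc is
 * excluded), user addresses through a non-faulting gup of the page.
 */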
7154static u64 perf_virt_to_phys(u64 virt)
7155{
7156 u64 phys_addr = 0;
7157 struct page *p = NULL;
7158
7159 if (!virt)
7160 return 0;
7161
7162 if (virt >= TASK_SIZE) {
7163 /* If it's vmalloc()d memory, leave phys_addr as 0 */
7164 if (virt_addr_valid((void *)(uintptr_t)virt) &&
7165 !(virt >= VMALLOC_START && virt < VMALLOC_END))
7166 phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
7167 } else {
7168 /*
7169  * Walking the pages tables for user address.
7170  * Interrupts are disabled, so it prevents any tear down
7171  * of the page tables.
7172  * Try IRQ-safe get_user_page_fast_only first.
7173  * If failed, leave phys_addr as 0.
7174  */
7175 if (current->mm != NULL) {
7176 pagefault_disable();
7177 if (get_user_page_fast_only(virt, 0, &p))
7178 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
7179 pagefault_enable();
7180 }
7181
7182 if (p)
7183 put_page(p);
7184 }
7185
7186 return phys_addr;
7187}
7188
7189/*
7190 * Return the pagetable size of a given virtual address.
7191 */
7192static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
7193{
7194 u64 size = 0;
7195
7196#ifdef CONFIG_HAVE_FAST_GUP
7197 pgd_t *pgdp, pgd;
7198 p4d_t *p4dp, p4d;
7199 pud_t *pudp, pud;
7200 pmd_t *pmdp, pmd;
7201 pte_t *ptep, pte;
7202
7203 pgdp = pgd_offset(mm, addr);
7204 pgd = READ_ONCE(*pgdp);
7205 if (pgd_none(pgd))
7206 return 0;
7207
7208 if (pgd_leaf(pgd))
7209 return pgd_leaf_size(pgd);
7210
7211 p4dp = p4d_offset_lockless(pgdp, pgd, addr);
7212 p4d = READ_ONCE(*p4dp);
7213 if (!p4d_present(p4d))
7214 return 0;
7215
7216 if (p4d_leaf(p4d))
7217 return p4d_leaf_size(p4d);
7218
7219 pudp = pud_offset_lockless(p4dp, p4d, addr);
7220 pud = READ_ONCE(*pudp);
7221 if (!pud_present(pud))
7222 return 0;
7223
7224 if (pud_leaf(pud))
7225 return pud_leaf_size(pud);
7226
7227 pmdp = pmd_offset_lockless(pudp, pud, addr);
7228 pmd = READ_ONCE(*pmdp);
7229 if (!pmd_present(pmd))
7230 return 0;
7231
7232 if (pmd_leaf(pmd))
7233 return pmd_leaf_size(pmd);
7234
7235 ptep = pte_offset_map(&pmd, addr);
7236 pte = ptep_get_lockless(ptep);
7237 if (pte_present(pte))
7238 size = pte_leaf_size(pte);
7239 pte_unmap(ptep);
7240#endif
7241
7242 return size;
7243}
7244
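/*
 * Return the page size backing @addr in the current mm (or init_mm
 * for kernel threads); 0 if it cannot be determined.
 */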
7245static u64 perf_get_page_size(unsigned long addr)
7246{
7247 struct mm_struct *mm;
7248 unsigned long flags;
7249 u64 size;
7250
7251 if (!addr)
7252 return 0;
7253
7254 /*
7255  * Software page-table walkers must disable IRQs,
7256  * which prevents any tear down of the page tables.
7257  */
7258 local_irq_save(flags);
7259
7260 mm = current->mm;
7261 if (!mm) {
7262 /*
7263  * For kernel threads and the like, use init_mm so that
7264  * we can find kernel memory.
7265  */
7266 mm = &init_mm;
7267 }
7268
7269 size = perf_get_pgtable_size(mm, addr);
7270
7271 local_irq_restore(flags);
7272
7273 return size;
7274}
7275
7276static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
7277
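/*
 * Collect the callchain for a sample. Returns the shared empty chain
 * (nr == 0) rather than NULL when both kernel and user chains are
 * excluded or no buffer is available.
 */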
7278struct perf_callchain_entry *
7279perf_callchain(struct perf_event *event, struct pt_regs *regs)
7280{
7281 bool kernel = !event->attr.exclude_callchain_kernel;
7282 bool user = !event->attr.exclude_callchain_user;
7283
7284 bool crosstask = event->ctx->task && event->ctx->task != current;
7285 const u32 max_stack = event->attr.sample_max_stack;
7286 struct perf_callchain_entry *callchain;
7287
7288 if (!kernel && !user)
7289 return &__empty_callchain;
7290
7291 callchain = get_perf_callchain(regs, 0, kernel, user,
7292 max_stack, crosstask, true);
7293 return callchain ?: &__empty_callchain;
7294}
7295
7296void perf_prepare_sample(struct perf_event_header *header,
7297 struct perf_sample_data *data,
7298 struct perf_event *event,
7299 struct pt_regs *regs)
7300{
7301 u64 sample_type = event->attr.sample_type;
7302
7303 header->type = PERF_RECORD_SAMPLE;
7304 header->size = sizeof(*header) + event->header_size;
7305
7306 header->misc = 0;
7307 header->misc |= perf_misc_flags(regs);
7308
7309 __perf_event_header__init_id(header, data, event);
7310
7311 if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
7312 data->ip = perf_instruction_pointer(regs);
7313
7314 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7315 int size = 1;
7316
7317 if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
7318 data->callchain = perf_callchain(event, regs);
7319
7320 size += data->callchain->nr;
7321
7322 header->size += size * sizeof(u64);
7323 }
7324
7325 if (sample_type & PERF_SAMPLE_RAW) {
7326 struct perf_raw_record *raw = data->raw;
7327 int size;
7328
7329 if (raw) {
7330 struct perf_raw_frag *frag = &raw->frag;
7331 u32 sum = 0;
7332
7333 do {
7334 sum += frag->size;
7335 if (perf_raw_frag_last(frag))
7336 break;
7337 frag = frag->next;
7338 } while (1);
7339
7340 size = round_up(sum + sizeof(u32), sizeof(u64));
7341 raw->size = size - sizeof(u32);
7342 frag->pad = raw->size - sum;
7343 } else {
7344 size = sizeof(u64);
7345 }
7346
7347 header->size += size;
7348 }
7349
7350 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7351 int size = sizeof(u64);
7352 if (data->br_stack) {
7353 if (perf_sample_save_hw_index(event))
7354 size += sizeof(u64);
7355
7356 size += data->br_stack->nr
7357 * sizeof(struct perf_branch_entry);
7358 }
7359 header->size += size;
7360 }
7361
7362 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
7363 perf_sample_regs_user(&data->regs_user, regs);
7364
7365 if (sample_type & PERF_SAMPLE_REGS_USER) {
7366 /* regs dump ABI info */
7367 int size = sizeof(u64);
7368
7369 if (data->regs_user.regs) {
7370 u64 mask = event->attr.sample_regs_user;
7371 size += hweight64(mask) * sizeof(u64);
7372 }
7373
7374 header->size += size;
7375 }
7376
7377 if (sample_type & PERF_SAMPLE_STACK_USER) {
7378 /*
7379  * Either we need PERF_SAMPLE_STACK_USER bit to be always
7380  * processed as the last one or have additional check added
7381  * in case new sample type is added, because we could eat
7382  * up the rest of the sample size.
7383  */
7384 u16 stack_size = event->attr.sample_stack_user;
7385 u16 size = sizeof(u64);
7386
7387 stack_size = perf_sample_ustack_size(stack_size, header->size,
7388 data->regs_user.regs);
7389
7390 /*
7391  * If there is something to dump, add space for the dump
7392  * itself and for the field that tells the dynamic size,
7393  * which is how many have been actually dumped.
7394  */
7395 if (stack_size)
7396 size += sizeof(u64) + stack_size;
7397
7398 data->stack_user_size = stack_size;
7399 header->size += size;
7400 }
7401
7402 if (sample_type & PERF_SAMPLE_REGS_INTR) {
7403 /* regs dump ABI info */
7404 int size = sizeof(u64);
7405
7406 perf_sample_regs_intr(&data->regs_intr, regs);
7407
7408 if (data->regs_intr.regs) {
7409 u64 mask = event->attr.sample_regs_intr;
7410
7411 size += hweight64(mask) * sizeof(u64);
7412 }
7413
7414 header->size += size;
7415 }
7416
7417 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7418 data->phys_addr = perf_virt_to_phys(data->addr);
7419
7420#ifdef CONFIG_CGROUP_PERF
7421 if (sample_type & PERF_SAMPLE_CGROUP) {
7422 struct cgroup *cgrp;
7423
7424 /* protected by RCU */
7425 cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7426 data->cgroup = cgroup_id(cgrp);
7427 }
7428#endif
7429
7430 /*
7431  * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
7432  * require PERF_SAMPLE_ADDR, the kernel implicitly retrieves data->addr,
7433  * but the value will not be dumped to userspace.
7434  */
7435 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
7436 data->data_page_size = perf_get_page_size(data->addr);
7437
7438 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
7439 data->code_page_size = perf_get_page_size(data->ip);
7440
7441 if (sample_type & PERF_SAMPLE_AUX) {
7442 u64 size;
7443
7444 header->size += sizeof(u64);
7445
7446 /*
7447  * Given the 16bit nature of header::size, an AUX sample can
7448  * easily overflow it, what with all the preceding sample bits.
7449  * Make sure this doesn't happen by using up to U16_MAX bytes
7450  * per sample in total (rounded down to 8 byte boundary).
7451  */
7452 size = min_t(size_t, U16_MAX - header->size,
7453 event->attr.aux_sample_size);
7454 size = rounddown(size, 8);
7455 size = perf_prepare_sample_aux(event, data, size);
7456
7457 WARN_ON_ONCE(size + header->size > U16_MAX);
7458 header->size += size;
7459 }
7460
7461
7462
7463 /*
7464  * All sample fields above are u64-aligned, so header::size must
7465  * remain a multiple of 8; the WARN below catches any newly added
7466  * sample type that breaks this invariant.
7467  */
7468 WARN_ON_ONCE(header->size & 7);
7469}
7470
7471static __always_inline int
7472__perf_event_output(struct perf_event *event,
7473 struct perf_sample_data *data,
7474 struct pt_regs *regs,
7475 int (*output_begin)(struct perf_output_handle *,
7476 struct perf_sample_data *,
7477 struct perf_event *,
7478 unsigned int))
7479{
7480 struct perf_output_handle handle;
7481 struct perf_event_header header;
7482 int err;
7483
7484 /* protect the callchain buffers */
7485 rcu_read_lock();
7486
7487 perf_prepare_sample(&header, data, event, regs);
7488
7489 err = output_begin(&handle, data, event, header.size);
7490 if (err)
7491 goto exit;
7492
7493 perf_output_sample(&handle, &header, data, event);
7494
7495 perf_output_end(&handle);
7496
7497exit:
7498 rcu_read_unlock();
7499 return err;
7500}
7501
7502void
7503perf_event_output_forward(struct perf_event *event,
7504 struct perf_sample_data *data,
7505 struct pt_regs *regs)
7506{
7507 __perf_event_output(event, data, regs, perf_output_begin_forward);
7508}
7509
7510void
7511perf_event_output_backward(struct perf_event *event,
7512 struct perf_sample_data *data,
7513 struct pt_regs *regs)
7514{
7515 __perf_event_output(event, data, regs, perf_output_begin_backward);
7516}
7517
7518int
7519perf_event_output(struct perf_event *event,
7520 struct perf_sample_data *data,
7521 struct pt_regs *regs)
7522{
7523 return __perf_event_output(event, data, regs, perf_output_begin);
7524}
7525
7526
7527/*
7528 * read event_id
7529 */
7530struct perf_read_event {
7531 struct perf_event_header header;
7532
7533 u32 pid;
7534 u32 tid;
7535};
7536
7537static void
7538perf_event_read_event(struct perf_event *event,
7539 struct task_struct *task)
7540{
7541 struct perf_output_handle handle;
7542 struct perf_sample_data sample;
7543 struct perf_read_event read_event = {
7544 .header = {
7545 .type = PERF_RECORD_READ,
7546 .misc = 0,
7547 .size = sizeof(read_event) + event->read_size,
7548 },
7549 .pid = perf_event_pid(event, task),
7550 .tid = perf_event_tid(event, task),
7551 };
7552 int ret;
7553
7554 perf_event_header__init_id(&read_event.header, &sample, event);
7555 ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
7556 if (ret)
7557 return;
7558
7559 perf_output_put(&handle, read_event);
7560 perf_output_read(&handle, event);
7561 perf_event__output_id_sample(event, &handle, &sample);
7562
7563 perf_output_end(&handle);
7564}
7565
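/*
 * Callback type used by perf_iterate_ctx()/perf_iterate_sb() to
 * deliver side-band records to every matching event.
 */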
7566typedef void (perf_iterate_f)(struct perf_event *event, void *data);
7567
7568static void
7569perf_iterate_ctx(struct perf_event_context *ctx,
7570 perf_iterate_f output,
7571 void *data, bool all)
7572{
7573 struct perf_event *event;
7574
7575 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7576 if (!all) {
7577 if (event->state < PERF_EVENT_STATE_INACTIVE)
7578 continue;
7579 if (!event_filter_match(event))
7580 continue;
7581 }
7582
7583 output(event, data);
7584 }
7585}
7586
7587static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
7588{
7589 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
7590 struct perf_event *event;
7591
7592 list_for_each_entry_rcu(event, &pel->list, sb_list) {
7593 /*
7594  * Skip events that are not fully formed yet; ensure that
7595  * if we observe event->ctx, both event and ctx will be
7596  * complete enough. See perf_install_in_context().
7597  */
7598 if (!smp_load_acquire(&event->ctx))
7599 continue;
7600
7601 if (event->state < PERF_EVENT_STATE_INACTIVE)
7602 continue;
7603 if (!event_filter_match(event))
7604 continue;
7605 output(event, data);
7606 }
7607}
7608
7609/*
7610 * Iterate all events that need to receive side-band events.
7611 *
7612 * For new callers; ensure that account_pmu_sb_event() includes
7613 * your event, otherwise it might not get delivered.
7614 */
7615static void
7616perf_iterate_sb(perf_iterate_f output, void *data,
7617 struct perf_event_context *task_ctx)
7618{
7619 struct perf_event_context *ctx;
7620 int ctxn;
7621
7622 rcu_read_lock();
7623 preempt_disable();
7624
7625 /*
7626  * If we have task_ctx != NULL we only notify the task context itself.
7627  * The task_ctx is set only for EXIT events before releasing task
7628  * context.
7629  */
7630 if (task_ctx) {
7631 perf_iterate_ctx(task_ctx, output, data, false);
7632 goto done;
7633 }
7634
7635 perf_iterate_sb_cpu(output, data);
7636
7637 for_each_task_context_nr(ctxn) {
7638 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7639 if (ctx)
7640 perf_iterate_ctx(ctx, output, data, false);
7641 }
7642done:
7643 preempt_enable();
7644 rcu_read_unlock();
7645}
7646
7647/*
7648 * Clear all file-based filters at exec, they'll have to be
7649 * re-instated when/if these objects are mmapped again.
7650 */
7651static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
7652{
7653 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7654 struct perf_addr_filter *filter;
7655 unsigned int restart = 0, count = 0;
7656 unsigned long flags;
7657
7658 if (!has_addr_filter(event))
7659 return;
7660
7661 raw_spin_lock_irqsave(&ifh->lock, flags);
7662 list_for_each_entry(filter, &ifh->list, entry) {
7663 if (filter->path.dentry) {
7664 event->addr_filter_ranges[count].start = 0;
7665 event->addr_filter_ranges[count].size = 0;
7666 restart++;
7667 }
7668
7669 count++;
7670 }
7671
7672 if (restart)
7673 event->addr_filters_gen++;
7674 raw_spin_unlock_irqrestore(&ifh->lock, flags);
7675
7676 if (restart)
7677 perf_event_stop(event, 1);
7678}
7679
7680void perf_event_exec(void)
7681{
7682 struct perf_event_context *ctx;
7683 int ctxn;
7684
7685 for_each_task_context_nr(ctxn) {
7686 perf_event_enable_on_exec(ctxn);
7687 perf_event_remove_on_exec(ctxn);
7688
7689 rcu_read_lock();
7690 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7691 if (ctx) {
7692 perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
7693 NULL, true);
7694 }
7695 rcu_read_unlock();
7696 }
7697}
7698
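/*
 * State for stopping AUX writers into a ring buffer from a remote
 * CPU; see perf_pmu_output_stop() and __perf_event_output_stop().
 */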
7699struct remote_output {
7700 struct perf_buffer *rb;
7701 int err;
7702};
7703
7704static void __perf_event_output_stop(struct perf_event *event, void *data)
7705{
7706 struct perf_event *parent = event->parent;
7707 struct remote_output *ro = data;
7708 struct perf_buffer *rb = ro->rb;
7709 struct stop_event_data sd = {
7710 .event = event,
7711 };
7712
7713 if (!has_aux(event))
7714 return;
7715
7716 if (!parent)
7717 parent = event;
7718
7719 /*
7720  * In case of inheritance, it will be the parent that links to the
7721  * ring-buffer, but it will be the child that's actually using it.
7722  *
7723  * We are using event::rb to determine if the event should be stopped,
7724  * however this may race with ring_buffer_attach() (through set_output),
7725  * which will make us skip the event that actually needs to be stopped.
7726  * So ring_buffer_attach() has to stop an aux event before re-assigning
7727  * its rb pointer.
7728  */
7729 if (rcu_dereference(parent->rb) == rb)
7730 ro->err = __perf_event_stop(&sd);
7731}
7732
7733static int __perf_pmu_output_stop(void *info)
7734{
7735 struct perf_event *event = info;
7736 struct pmu *pmu = event->ctx->pmu;
7737 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7738 struct remote_output ro = {
7739 .rb = event->rb,
7740 };
7741
7742 rcu_read_lock();
7743 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
7744 if (cpuctx->task_ctx)
7745 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
7746 &ro, false);
7747 rcu_read_unlock();
7748
7749 return ro.err;
7750}
7751
7752static void perf_pmu_output_stop(struct perf_event *event)
7753{
7754 struct perf_event *iter;
7755 int err, cpu;
7756
7757restart:
7758 rcu_read_lock();
7759 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
7760 /*
7761  * For per-CPU events, we need to make sure that neither they
7762  * nor their children are running; for cpu==-1 events it's
7763  * sufficient to stop the event itself if it's active, since
7764  * it can't have children.
7765  */
7766 cpu = iter->cpu;
7767 if (cpu == -1)
7768 cpu = READ_ONCE(iter->oncpu);
7769
7770 if (cpu == -1)
7771 continue;
7772
7773 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
7774 if (err == -EAGAIN) {
7775 rcu_read_unlock();
7776 goto restart;
7777 }
7778 }
7779 rcu_read_unlock();
7780}
7781
7782/*
7783 * task tracking -- fork/exit
7784 *
7785 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
7786 */
7787
7788struct perf_task_event {
7789 struct task_struct *task;
7790 struct perf_event_context *task_ctx;
7791
7792 struct {
7793 struct perf_event_header header;
7794
7795 u32 pid;
7796 u32 ppid;
7797 u32 tid;
7798 u32 ptid;
7799 u64 time;
7800 } event_id;
7801};
7802
7803static int perf_event_task_match(struct perf_event *event)
7804{
7805 return event->attr.comm || event->attr.mmap ||
7806 event->attr.mmap2 || event->attr.mmap_data ||
7807 event->attr.task;
7808}
7809
7810static void perf_event_task_output(struct perf_event *event,
7811 void *data)
7812{
7813 struct perf_task_event *task_event = data;
7814 struct perf_output_handle handle;
7815 struct perf_sample_data sample;
7816 struct task_struct *task = task_event->task;
7817 int ret, size = task_event->event_id.header.size;
7818
7819 if (!perf_event_task_match(event))
7820 return;
7821
7822 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
7823
7824 ret = perf_output_begin(&handle, &sample, event,
7825 task_event->event_id.header.size);
7826 if (ret)
7827 goto out;
7828
7829 task_event->event_id.pid = perf_event_pid(event, task);
7830 task_event->event_id.tid = perf_event_tid(event, task);
7831
7832 if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
7833 task_event->event_id.ppid = perf_event_pid(event,
7834 task->real_parent);
7835 task_event->event_id.ptid = perf_event_pid(event,
7836 task->real_parent);
7837 } else {
7838 task_event->event_id.ppid = perf_event_pid(event, current);
7839 task_event->event_id.ptid = perf_event_tid(event, current);
7840 }
7841
7842 task_event->event_id.time = perf_event_clock(event);
7843
7844 perf_output_put(&handle, task_event->event_id);
7845
7846 perf_event__output_id_sample(event, &handle, &sample);
7847
7848 perf_output_end(&handle);
7849out:
7850 task_event->event_id.header.size = size;
7851}
7852
7853static void perf_event_task(struct task_struct *task,
7854 struct perf_event_context *task_ctx,
7855 int new)
7856{
7857 struct perf_task_event task_event;
7858
7859 if (!atomic_read(&nr_comm_events) &&
7860 !atomic_read(&nr_mmap_events) &&
7861 !atomic_read(&nr_task_events))
7862 return;
7863
7864 task_event = (struct perf_task_event){
7865 .task = task,
7866 .task_ctx = task_ctx,
7867 .event_id = {
7868 .header = {
7869 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
7870 .misc = 0,
7871 .size = sizeof(task_event.event_id),
7872 },
7873 /* .pid  */
7874 /* .ppid */
7875 /* .tid  */
7876 /* .ptid */
7877 /* .time */
7878 },
7879 };
7880
7881 perf_iterate_sb(perf_event_task_output,
7882 &task_event,
7883 task_ctx);
7884}
7885
7886void perf_event_fork(struct task_struct *task)
7887{
7888 perf_event_task(task, NULL, 1);
7889 perf_event_namespaces(task);
7890}
7891
7892/*
7893 * comm tracking
7894 */
7895
7896struct perf_comm_event {
7897 struct task_struct *task;
7898 char *comm;
7899 int comm_size;
7900
7901 struct {
7902 struct perf_event_header header;
7903
7904 u32 pid;
7905 u32 tid;
7906 } event_id;
7907};
7908
7909static int perf_event_comm_match(struct perf_event *event)
7910{
7911 return event->attr.comm;
7912}
7913
7914static void perf_event_comm_output(struct perf_event *event,
7915 void *data)
7916{
7917 struct perf_comm_event *comm_event = data;
7918 struct perf_output_handle handle;
7919 struct perf_sample_data sample;
7920 int size = comm_event->event_id.header.size;
7921 int ret;
7922
7923 if (!perf_event_comm_match(event))
7924 return;
7925
7926 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7927 ret = perf_output_begin(&handle, &sample, event,
7928 comm_event->event_id.header.size);
7929
7930 if (ret)
7931 goto out;
7932
7933 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
7934 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
7935
7936 perf_output_put(&handle, comm_event->event_id);
7937 __output_copy(&handle, comm_event->comm,
7938 comm_event->comm_size);
7939
7940 perf_event__output_id_sample(event, &handle, &sample);
7941
7942 perf_output_end(&handle);
7943out:
7944 comm_event->event_id.header.size = size;
7945}
7946
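/*
 * Snapshot the task's comm, pad it to a u64 boundary and broadcast a
 * PERF_RECORD_COMM through the side-band iterator.
 */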
7947static void perf_event_comm_event(struct perf_comm_event *comm_event)
7948{
7949 char comm[TASK_COMM_LEN];
7950 unsigned int size;
7951
7952 memset(comm, 0, sizeof(comm));
7953 strlcpy(comm, comm_event->task->comm, sizeof(comm));
7954 size = ALIGN(strlen(comm)+1, sizeof(u64));
7955
7956 comm_event->comm = comm;
7957 comm_event->comm_size = size;
7958
7959 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
7960
7961 perf_iterate_sb(perf_event_comm_output,
7962 comm_event,
7963 NULL);
7964}
7965
7966void perf_event_comm(struct task_struct *task, bool exec)
7967{
7968 struct perf_comm_event comm_event;
7969
7970 if (!atomic_read(&nr_comm_events))
7971 return;
7972
7973 comm_event = (struct perf_comm_event){
7974 .task = task,
7975		/* .comm      */
7976		/* .comm_size */
7977		.event_id  = {
7978			.header = {
7979				.type = PERF_RECORD_COMM,
7980				.misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
7981				/* .size */
7982			},
7983			/* .pid */
7984			/* .tid */
7985		},
7986 };
7987
7988 perf_event_comm_event(&comm_event);
7989}
7990
7991/*
7992 * namespaces tracking
7993 */
7994
7995struct perf_namespaces_event {
7996 struct task_struct *task;
7997
7998 struct {
7999 struct perf_event_header header;
8000
8001 u32 pid;
8002 u32 tid;
8003 u64 nr_namespaces;
8004 struct perf_ns_link_info link_info[NR_NAMESPACES];
8005 } event_id;
8006};
8007
8008static int perf_event_namespaces_match(struct perf_event *event)
8009{
8010 return event->attr.namespaces;
8011}
8012
8013static void perf_event_namespaces_output(struct perf_event *event,
8014 void *data)
8015{
8016 struct perf_namespaces_event *namespaces_event = data;
8017 struct perf_output_handle handle;
8018 struct perf_sample_data sample;
8019 u16 header_size = namespaces_event->event_id.header.size;
8020 int ret;
8021
8022 if (!perf_event_namespaces_match(event))
8023 return;
8024
8025 perf_event_header__init_id(&namespaces_event->event_id.header,
8026 &sample, event);
8027 ret = perf_output_begin(&handle, &sample, event,
8028 namespaces_event->event_id.header.size);
8029 if (ret)
8030 goto out;
8031
8032 namespaces_event->event_id.pid = perf_event_pid(event,
8033 namespaces_event->task);
8034 namespaces_event->event_id.tid = perf_event_tid(event,
8035 namespaces_event->task);
8036
8037 perf_output_put(&handle, namespaces_event->event_id);
8038
8039 perf_event__output_id_sample(event, &handle, &sample);
8040
8041 perf_output_end(&handle);
8042out:
8043 namespaces_event->event_id.header.size = header_size;
8044}
8045
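/*
 * Resolve one namespace of @task to a (device, inode) pair; on error the
 * link_info entry is left zeroed.
 */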
8046static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
8047 struct task_struct *task,
8048 const struct proc_ns_operations *ns_ops)
8049{
8050 struct path ns_path;
8051 struct inode *ns_inode;
8052 int error;
8053
8054 error = ns_get_path(&ns_path, task, ns_ops);
8055 if (!error) {
8056 ns_inode = ns_path.dentry->d_inode;
8057 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
8058 ns_link_info->ino = ns_inode->i_ino;
8059 path_put(&ns_path);
8060 }
8061}
8062
8063void perf_event_namespaces(struct task_struct *task)
8064{
8065 struct perf_namespaces_event namespaces_event;
8066 struct perf_ns_link_info *ns_link_info;
8067
8068 if (!atomic_read(&nr_namespaces_events))
8069 return;
8070
8071 namespaces_event = (struct perf_namespaces_event){
8072 .task = task,
8073 .event_id = {
8074 .header = {
8075 .type = PERF_RECORD_NAMESPACES,
8076 .misc = 0,
8077 .size = sizeof(namespaces_event.event_id),
8078 },
8079			/* .pid */
8080			/* .tid */
8081			.nr_namespaces = NR_NAMESPACES,
8082			/* .link_info[NR_NAMESPACES] */
8083 },
8084 };
8085
8086 ns_link_info = namespaces_event.event_id.link_info;
8087
8088 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
8089 task, &mntns_operations);
8090
8091#ifdef CONFIG_USER_NS
8092 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
8093 task, &userns_operations);
8094#endif
8095#ifdef CONFIG_NET_NS
8096 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
8097 task, &netns_operations);
8098#endif
8099#ifdef CONFIG_UTS_NS
8100 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
8101 task, &utsns_operations);
8102#endif
8103#ifdef CONFIG_IPC_NS
8104 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
8105 task, &ipcns_operations);
8106#endif
8107#ifdef CONFIG_PID_NS
8108 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
8109 task, &pidns_operations);
8110#endif
8111#ifdef CONFIG_CGROUPS
8112 perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
8113 task, &cgroupns_operations);
8114#endif
8115
8116 perf_iterate_sb(perf_event_namespaces_output,
8117 &namespaces_event,
8118 NULL);
8119}
8120
8121/*
8122 * cgroup tracking
8123 */
8124#ifdef CONFIG_CGROUP_PERF
8125
8126struct perf_cgroup_event {
8127 char *path;
8128 int path_size;
8129 struct {
8130 struct perf_event_header header;
8131 u64 id;
8132 char path[];
8133 } event_id;
8134};
8135
8136static int perf_event_cgroup_match(struct perf_event *event)
8137{
8138 return event->attr.cgroup;
8139}
8140
8141static void perf_event_cgroup_output(struct perf_event *event, void *data)
8142{
8143 struct perf_cgroup_event *cgroup_event = data;
8144 struct perf_output_handle handle;
8145 struct perf_sample_data sample;
8146 u16 header_size = cgroup_event->event_id.header.size;
8147 int ret;
8148
8149 if (!perf_event_cgroup_match(event))
8150 return;
8151
8152 perf_event_header__init_id(&cgroup_event->event_id.header,
8153 &sample, event);
8154 ret = perf_output_begin(&handle, &sample, event,
8155 cgroup_event->event_id.header.size);
8156 if (ret)
8157 goto out;
8158
8159 perf_output_put(&handle, cgroup_event->event_id);
8160 __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
8161
8162 perf_event__output_id_sample(event, &handle, &sample);
8163
8164 perf_output_end(&handle);
8165out:
8166 cgroup_event->event_id.header.size = header_size;
8167}
8168
8169static void perf_event_cgroup(struct cgroup *cgrp)
8170{
8171 struct perf_cgroup_event cgroup_event;
8172 char path_enomem[16] = "//enomem";
8173 char *pathname;
8174 size_t size;
8175
8176 if (!atomic_read(&nr_cgroup_events))
8177 return;
8178
8179 cgroup_event = (struct perf_cgroup_event){
8180 .event_id = {
8181 .header = {
8182 .type = PERF_RECORD_CGROUP,
8183 .misc = 0,
8184 .size = sizeof(cgroup_event.event_id),
8185 },
8186 .id = cgroup_id(cgrp),
8187 },
8188 };
8189
8190 pathname = kmalloc(PATH_MAX, GFP_KERNEL);
8191 if (pathname == NULL) {
8192 cgroup_event.path = path_enomem;
8193 } else {
8194		/* just to be sure to have enough space for alignment */
8195 cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
8196 cgroup_event.path = pathname;
8197 }
8198
8199	/*
8200	 * Since our buffer works in 8 byte units we need to align our string
8201	 * size to a multiple of 8. However, we must guarantee the tail end is
8202	 * zero'd out to avoid leaking random bits to userspace.
8203	 */
8204 size = strlen(cgroup_event.path) + 1;
8205 while (!IS_ALIGNED(size, sizeof(u64)))
8206 cgroup_event.path[size++] = '\0';
8207
8208 cgroup_event.event_id.header.size += size;
8209 cgroup_event.path_size = size;
8210
8211 perf_iterate_sb(perf_event_cgroup_output,
8212 &cgroup_event,
8213 NULL);
8214
8215 kfree(pathname);
8216}
8217
8218#endif
8219
8220/*
8221 * mmap tracking
8222 */
8223
8224struct perf_mmap_event {
8225 struct vm_area_struct *vma;
8226
8227 const char *file_name;
8228 int file_size;
8229 int maj, min;
8230 u64 ino;
8231 u64 ino_generation;
8232 u32 prot, flags;
8233 u8 build_id[BUILD_ID_SIZE_MAX];
8234 u32 build_id_size;
8235
8236 struct {
8237 struct perf_event_header header;
8238
8239 u32 pid;
8240 u32 tid;
8241 u64 start;
8242 u64 len;
8243 u64 pgoff;
8244 } event_id;
8245};
8246
8247static int perf_event_mmap_match(struct perf_event *event,
8248 void *data)
8249{
8250 struct perf_mmap_event *mmap_event = data;
8251 struct vm_area_struct *vma = mmap_event->vma;
8252 int executable = vma->vm_flags & VM_EXEC;
8253
8254 return (!executable && event->attr.mmap_data) ||
8255 (executable && (event->attr.mmap || event->attr.mmap2));
8256}
8257
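/*
 * Emit one PERF_RECORD_MMAP/MMAP2 record. For mmap2 consumers the header
 * grows by the (maj, min, ino, ino_generation, prot, flags) fields; when a
 * build-id is available it replaces the device/inode data.
 */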
8258static void perf_event_mmap_output(struct perf_event *event,
8259 void *data)
8260{
8261 struct perf_mmap_event *mmap_event = data;
8262 struct perf_output_handle handle;
8263 struct perf_sample_data sample;
8264 int size = mmap_event->event_id.header.size;
8265 u32 type = mmap_event->event_id.header.type;
8266 bool use_build_id;
8267 int ret;
8268
8269 if (!perf_event_mmap_match(event, data))
8270 return;
8271
8272 if (event->attr.mmap2) {
8273 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
8274 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
8275 mmap_event->event_id.header.size += sizeof(mmap_event->min);
8276 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
8277 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
8278 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
8279 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
8280 }
8281
8282 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
8283 ret = perf_output_begin(&handle, &sample, event,
8284 mmap_event->event_id.header.size);
8285 if (ret)
8286 goto out;
8287
8288 mmap_event->event_id.pid = perf_event_pid(event, current);
8289 mmap_event->event_id.tid = perf_event_tid(event, current);
8290
8291 use_build_id = event->attr.build_id && mmap_event->build_id_size;
8292
8293 if (event->attr.mmap2 && use_build_id)
8294 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
8295
8296 perf_output_put(&handle, mmap_event->event_id);
8297
8298 if (event->attr.mmap2) {
8299 if (use_build_id) {
8300 u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
8301
8302 __output_copy(&handle, size, 4);
8303 __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
8304 } else {
8305 perf_output_put(&handle, mmap_event->maj);
8306 perf_output_put(&handle, mmap_event->min);
8307 perf_output_put(&handle, mmap_event->ino);
8308 perf_output_put(&handle, mmap_event->ino_generation);
8309 }
8310 perf_output_put(&handle, mmap_event->prot);
8311 perf_output_put(&handle, mmap_event->flags);
8312 }
8313
8314 __output_copy(&handle, mmap_event->file_name,
8315 mmap_event->file_size);
8316
8317 perf_event__output_id_sample(event, &handle, &sample);
8318
8319 perf_output_end(&handle);
8320out:
8321 mmap_event->event_id.header.size = size;
8322 mmap_event->event_id.header.type = type;
8323}
8324
8325static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
8326{
8327 struct vm_area_struct *vma = mmap_event->vma;
8328 struct file *file = vma->vm_file;
8329 int maj = 0, min = 0;
8330 u64 ino = 0, gen = 0;
8331 u32 prot = 0, flags = 0;
8332 unsigned int size;
8333 char tmp[16];
8334 char *buf = NULL;
8335 char *name;
8336
8337 if (vma->vm_flags & VM_READ)
8338 prot |= PROT_READ;
8339 if (vma->vm_flags & VM_WRITE)
8340 prot |= PROT_WRITE;
8341 if (vma->vm_flags & VM_EXEC)
8342 prot |= PROT_EXEC;
8343
8344 if (vma->vm_flags & VM_MAYSHARE)
8345 flags = MAP_SHARED;
8346 else
8347 flags = MAP_PRIVATE;
8348
8349 if (vma->vm_flags & VM_LOCKED)
8350 flags |= MAP_LOCKED;
8351 if (is_vm_hugetlb_page(vma))
8352 flags |= MAP_HUGETLB;
8353
8354 if (file) {
8355 struct inode *inode;
8356 dev_t dev;
8357
8358 buf = kmalloc(PATH_MAX, GFP_KERNEL);
8359 if (!buf) {
8360 name = "//enomem";
8361 goto cpy_name;
8362 }
8363		/*
8364		 * d_path() works from the end of the rb backwards, so we
8365		 * need to add enough zero bytes after the string to handle
8366		 * the 64bit alignment we do later.
8367		 */
8368 name = file_path(file, buf, PATH_MAX - sizeof(u64));
8369 if (IS_ERR(name)) {
8370 name = "//toolong";
8371 goto cpy_name;
8372 }
8373 inode = file_inode(vma->vm_file);
8374 dev = inode->i_sb->s_dev;
8375 ino = inode->i_ino;
8376 gen = inode->i_generation;
8377 maj = MAJOR(dev);
8378 min = MINOR(dev);
8379
8380 goto got_name;
8381 } else {
8382 if (vma->vm_ops && vma->vm_ops->name) {
8383 name = (char *) vma->vm_ops->name(vma);
8384 if (name)
8385 goto cpy_name;
8386 }
8387
8388 name = (char *)arch_vma_name(vma);
8389 if (name)
8390 goto cpy_name;
8391
8392 if (vma->vm_start <= vma->vm_mm->start_brk &&
8393 vma->vm_end >= vma->vm_mm->brk) {
8394 name = "[heap]";
8395 goto cpy_name;
8396 }
8397 if (vma->vm_start <= vma->vm_mm->start_stack &&
8398 vma->vm_end >= vma->vm_mm->start_stack) {
8399 name = "[stack]";
8400 goto cpy_name;
8401 }
8402
8403 name = "//anon";
8404 goto cpy_name;
8405 }
8406
8407cpy_name:
8408 strlcpy(tmp, name, sizeof(tmp));
8409 name = tmp;
8410got_name:
8411	/*
8412	 * Since our buffer works in 8 byte units we need to align our string
8413	 * size to a multiple of 8. However, we must guarantee the tail end is
8414	 * zero'd out to avoid leaking random bits to userspace.
8415	 */
8416 size = strlen(name)+1;
8417 while (!IS_ALIGNED(size, sizeof(u64)))
8418 name[size++] = '\0';
8419
8420 mmap_event->file_name = name;
8421 mmap_event->file_size = size;
8422 mmap_event->maj = maj;
8423 mmap_event->min = min;
8424 mmap_event->ino = ino;
8425 mmap_event->ino_generation = gen;
8426 mmap_event->prot = prot;
8427 mmap_event->flags = flags;
8428
8429 if (!(vma->vm_flags & VM_EXEC))
8430 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
8431
8432 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
8433
8434 if (atomic_read(&nr_build_id_events))
8435 build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
8436
8437 perf_iterate_sb(perf_event_mmap_output,
8438 mmap_event,
8439 NULL);
8440
8441 kfree(buf);
8442}
8443
8444/*
8445 * Check whether inode and dev match.
8446 */
8447static bool perf_addr_filter_match(struct perf_addr_filter *filter,
8448 struct file *file, unsigned long offset,
8449 unsigned long size)
8450{
8451	/* d_inode(NULL) won't be equal to any mapped user-space file */
8452 if (!filter->path.dentry)
8453 return false;
8454
8455 if (d_inode(filter->path.dentry) != file_inode(file))
8456 return false;
8457
8458 if (filter->offset > offset + size)
8459 return false;
8460
8461 if (filter->offset + filter->size < offset)
8462 return false;
8463
8464 return true;
8465}
8466
8467static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
8468 struct vm_area_struct *vma,
8469 struct perf_addr_filter_range *fr)
8470{
8471 unsigned long vma_size = vma->vm_end - vma->vm_start;
8472 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8473 struct file *file = vma->vm_file;
8474
8475 if (!perf_addr_filter_match(filter, file, off, vma_size))
8476 return false;
8477
8478 if (filter->offset < off) {
8479 fr->start = vma->vm_start;
8480 fr->size = min(vma_size, filter->size - (off - filter->offset));
8481 } else {
8482 fr->start = vma->vm_start + filter->offset - off;
8483 fr->size = min(vma->vm_end - fr->start, filter->size);
8484 }
8485
8486 return true;
8487}
8488
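/*
 * Re-evaluate an event's address filters against a newly mapped vma and
 * bump addr_filters_gen so that the PMU reprograms them on restart.
 */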
8489static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
8490{
8491 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8492 struct vm_area_struct *vma = data;
8493 struct perf_addr_filter *filter;
8494 unsigned int restart = 0, count = 0;
8495 unsigned long flags;
8496
8497 if (!has_addr_filter(event))
8498 return;
8499
8500 if (!vma->vm_file)
8501 return;
8502
8503 raw_spin_lock_irqsave(&ifh->lock, flags);
8504 list_for_each_entry(filter, &ifh->list, entry) {
8505 if (perf_addr_filter_vma_adjust(filter, vma,
8506 &event->addr_filter_ranges[count]))
8507 restart++;
8508
8509 count++;
8510 }
8511
8512 if (restart)
8513 event->addr_filters_gen++;
8514 raw_spin_unlock_irqrestore(&ifh->lock, flags);
8515
8516 if (restart)
8517 perf_event_stop(event, 1);
8518}
8519
8520/*
8521 * Adjust all task's events' filters to the new vma
8522 */
8523static void perf_addr_filters_adjust(struct vm_area_struct *vma)
8524{
8525 struct perf_event_context *ctx;
8526 int ctxn;
8527
8528	/*
8529	 * Data tracing isn't supported yet and as such there is no need
8530	 * to keep track of anything that isn't related to executable code:
8531	 */
8532 if (!(vma->vm_flags & VM_EXEC))
8533 return;
8534
8535 rcu_read_lock();
8536 for_each_task_context_nr(ctxn) {
8537 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
8538 if (!ctx)
8539 continue;
8540
8541 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
8542 }
8543 rcu_read_unlock();
8544}
8545
8546void perf_event_mmap(struct vm_area_struct *vma)
8547{
8548 struct perf_mmap_event mmap_event;
8549
8550 if (!atomic_read(&nr_mmap_events))
8551 return;
8552
8553 mmap_event = (struct perf_mmap_event){
8554 .vma = vma,
8555		/* .file_name */
8556		/* .file_size */
8557		.event_id  = {
8558			.header = {
8559				.type = PERF_RECORD_MMAP,
8560				.misc = PERF_RECORD_MISC_USER,
8561				/* .size */
8562			},
8563			/* .pid */
8564			/* .tid */
8565			.start  = vma->vm_start,
8566			.len    = vma->vm_end - vma->vm_start,
8567			.pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
8568		},
8569		/* .maj (attr_mmap2 only) */
8570		/* .min (attr_mmap2 only) */
8571		/* .ino (attr_mmap2 only) */
8572		/* .ino_generation (attr_mmap2 only) */
8573		/* .prot (attr_mmap2 only) */
8574		/* .flags (attr_mmap2 only) */
8575 };
8576
8577 perf_addr_filters_adjust(vma);
8578 perf_event_mmap_event(&mmap_event);
8579}
8580
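/*
 * Report that @size bytes of AUX data beginning at offset @head have been
 * made available for consumption (PERF_RECORD_AUX).
 */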
8581void perf_event_aux_event(struct perf_event *event, unsigned long head,
8582 unsigned long size, u64 flags)
8583{
8584 struct perf_output_handle handle;
8585 struct perf_sample_data sample;
8586 struct perf_aux_event {
8587 struct perf_event_header header;
8588 u64 offset;
8589 u64 size;
8590 u64 flags;
8591 } rec = {
8592 .header = {
8593 .type = PERF_RECORD_AUX,
8594 .misc = 0,
8595 .size = sizeof(rec),
8596 },
8597 .offset = head,
8598 .size = size,
8599 .flags = flags,
8600 };
8601 int ret;
8602
8603 perf_event_header__init_id(&rec.header, &sample, event);
8604 ret = perf_output_begin(&handle, &sample, event, rec.header.size);
8605
8606 if (ret)
8607 return;
8608
8609 perf_output_put(&handle, rec);
8610 perf_event__output_id_sample(event, &handle, &sample);
8611
8612 perf_output_end(&handle);
8613}
8614
8615/*
8616 * Lost/dropped samples logging
8617 */
8618void perf_log_lost_samples(struct perf_event *event, u64 lost)
8619{
8620 struct perf_output_handle handle;
8621 struct perf_sample_data sample;
8622 int ret;
8623
8624 struct {
8625 struct perf_event_header header;
8626 u64 lost;
8627 } lost_samples_event = {
8628 .header = {
8629 .type = PERF_RECORD_LOST_SAMPLES,
8630 .misc = 0,
8631 .size = sizeof(lost_samples_event),
8632 },
8633 .lost = lost,
8634 };
8635
8636 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
8637
8638 ret = perf_output_begin(&handle, &sample, event,
8639 lost_samples_event.header.size);
8640 if (ret)
8641 return;
8642
8643 perf_output_put(&handle, lost_samples_event);
8644 perf_event__output_id_sample(event, &handle, &sample);
8645 perf_output_end(&handle);
8646}
8647
8648/*
8649 * context_switch tracking
8650 */
8651
8652struct perf_switch_event {
8653 struct task_struct *task;
8654 struct task_struct *next_prev;
8655
8656 struct {
8657 struct perf_event_header header;
8658 u32 next_prev_pid;
8659 u32 next_prev_tid;
8660 } event_id;
8661};
8662
8663static int perf_event_switch_match(struct perf_event *event)
8664{
8665 return event->attr.context_switch;
8666}
8667
8668static void perf_event_switch_output(struct perf_event *event, void *data)
8669{
8670 struct perf_switch_event *se = data;
8671 struct perf_output_handle handle;
8672 struct perf_sample_data sample;
8673 int ret;
8674
8675 if (!perf_event_switch_match(event))
8676 return;
8677
8678	/* Only CPU-wide events are allowed to see next/prev pid/tid */
8679 if (event->ctx->task) {
8680 se->event_id.header.type = PERF_RECORD_SWITCH;
8681 se->event_id.header.size = sizeof(se->event_id.header);
8682 } else {
8683 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
8684 se->event_id.header.size = sizeof(se->event_id);
8685 se->event_id.next_prev_pid =
8686 perf_event_pid(event, se->next_prev);
8687 se->event_id.next_prev_tid =
8688 perf_event_tid(event, se->next_prev);
8689 }
8690
8691 perf_event_header__init_id(&se->event_id.header, &sample, event);
8692
8693 ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
8694 if (ret)
8695 return;
8696
8697 if (event->ctx->task)
8698 perf_output_put(&handle, se->event_id.header);
8699 else
8700 perf_output_put(&handle, se->event_id);
8701
8702 perf_event__output_id_sample(event, &handle, &sample);
8703
8704 perf_output_end(&handle);
8705}
8706
8707static void perf_event_switch(struct task_struct *task,
8708 struct task_struct *next_prev, bool sched_in)
8709{
8710 struct perf_switch_event switch_event;
8711
8712	/* N.B. caller checks nr_switch_events != 0 */
8713
8714 switch_event = (struct perf_switch_event){
8715 .task = task,
8716 .next_prev = next_prev,
8717 .event_id = {
8718 .header = {
8719				/* .type */
8720				.misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
8721				/* .size */
8722			},
8723			/* .next_prev_pid */
8724			/* .next_prev_tid */
8725 },
8726 };
8727
8728 if (!sched_in && task->on_rq) {
8729 switch_event.event_id.header.misc |=
8730 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
8731 }
8732
8733 perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
8734}
8735
8736/*
8737 * IRQ throttle logging
8738 */
8739
8740static void perf_log_throttle(struct perf_event *event, int enable)
8741{
8742 struct perf_output_handle handle;
8743 struct perf_sample_data sample;
8744 int ret;
8745
8746 struct {
8747 struct perf_event_header header;
8748 u64 time;
8749 u64 id;
8750 u64 stream_id;
8751 } throttle_event = {
8752 .header = {
8753 .type = PERF_RECORD_THROTTLE,
8754 .misc = 0,
8755 .size = sizeof(throttle_event),
8756 },
8757 .time = perf_event_clock(event),
8758 .id = primary_event_id(event),
8759 .stream_id = event->id,
8760 };
8761
8762 if (enable)
8763 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
8764
8765 perf_event_header__init_id(&throttle_event.header, &sample, event);
8766
8767 ret = perf_output_begin(&handle, &sample, event,
8768 throttle_event.header.size);
8769 if (ret)
8770 return;
8771
8772 perf_output_put(&handle, throttle_event);
8773 perf_event__output_id_sample(event, &handle, &sample);
8774 perf_output_end(&handle);
8775}
8776
8777/*
8778 * ksymbol register/unregister tracking
8779 */
8780
8781struct perf_ksymbol_event {
8782 const char *name;
8783 int name_len;
8784 struct {
8785 struct perf_event_header header;
8786 u64 addr;
8787 u32 len;
8788 u16 ksym_type;
8789 u16 flags;
8790 } event_id;
8791};
8792
8793static int perf_event_ksymbol_match(struct perf_event *event)
8794{
8795 return event->attr.ksymbol;
8796}
8797
8798static void perf_event_ksymbol_output(struct perf_event *event, void *data)
8799{
8800 struct perf_ksymbol_event *ksymbol_event = data;
8801 struct perf_output_handle handle;
8802 struct perf_sample_data sample;
8803 int ret;
8804
8805 if (!perf_event_ksymbol_match(event))
8806 return;
8807
8808 perf_event_header__init_id(&ksymbol_event->event_id.header,
8809 &sample, event);
8810 ret = perf_output_begin(&handle, &sample, event,
8811 ksymbol_event->event_id.header.size);
8812 if (ret)
8813 return;
8814
8815 perf_output_put(&handle, ksymbol_event->event_id);
8816 __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
8817 perf_event__output_id_sample(event, &handle, &sample);
8818
8819 perf_output_end(&handle);
8820}
8821
8822void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
8823 const char *sym)
8824{
8825 struct perf_ksymbol_event ksymbol_event;
8826 char name[KSYM_NAME_LEN];
8827 u16 flags = 0;
8828 int name_len;
8829
8830 if (!atomic_read(&nr_ksymbol_events))
8831 return;
8832
8833 if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
8834 ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
8835 goto err;
8836
8837 strlcpy(name, sym, KSYM_NAME_LEN);
8838 name_len = strlen(name) + 1;
8839 while (!IS_ALIGNED(name_len, sizeof(u64)))
8840 name[name_len++] = '\0';
8841 BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
8842
8843 if (unregister)
8844 flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
8845
8846 ksymbol_event = (struct perf_ksymbol_event){
8847 .name = name,
8848 .name_len = name_len,
8849 .event_id = {
8850 .header = {
8851 .type = PERF_RECORD_KSYMBOL,
8852 .size = sizeof(ksymbol_event.event_id) +
8853 name_len,
8854 },
8855 .addr = addr,
8856 .len = len,
8857 .ksym_type = ksym_type,
8858 .flags = flags,
8859 },
8860 };
8861
8862 perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
8863 return;
8864err:
8865 WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
8866}
8867
8868/*
8869 * bpf program load/unload tracking
8870 */
8871
8872struct perf_bpf_event {
8873 struct bpf_prog *prog;
8874 struct {
8875 struct perf_event_header header;
8876 u16 type;
8877 u16 flags;
8878 u32 id;
8879 u8 tag[BPF_TAG_SIZE];
8880 } event_id;
8881};
8882
8883static int perf_event_bpf_match(struct perf_event *event)
8884{
8885 return event->attr.bpf_event;
8886}
8887
8888static void perf_event_bpf_output(struct perf_event *event, void *data)
8889{
8890 struct perf_bpf_event *bpf_event = data;
8891 struct perf_output_handle handle;
8892 struct perf_sample_data sample;
8893 int ret;
8894
8895 if (!perf_event_bpf_match(event))
8896 return;
8897
8898 perf_event_header__init_id(&bpf_event->event_id.header,
8899 &sample, event);
8900	ret = perf_output_begin(&handle, &sample, event,
8901 bpf_event->event_id.header.size);
8902 if (ret)
8903 return;
8904
8905 perf_output_put(&handle, bpf_event->event_id);
8906 perf_event__output_id_sample(event, &handle, &sample);
8907
8908 perf_output_end(&handle);
8909}
8910
8911static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
8912 enum perf_bpf_event_type type)
8913{
8914 bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
8915 int i;
8916
8917 if (prog->aux->func_cnt == 0) {
8918 perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
8919 (u64)(unsigned long)prog->bpf_func,
8920 prog->jited_len, unregister,
8921 prog->aux->ksym.name);
8922 } else {
8923 for (i = 0; i < prog->aux->func_cnt; i++) {
8924 struct bpf_prog *subprog = prog->aux->func[i];
8925
8926 perf_event_ksymbol(
8927 PERF_RECORD_KSYMBOL_TYPE_BPF,
8928 (u64)(unsigned long)subprog->bpf_func,
8929 subprog->jited_len, unregister,
8930				subprog->aux->ksym.name);
8931 }
8932 }
8933}
8934
8935void perf_event_bpf_event(struct bpf_prog *prog,
8936 enum perf_bpf_event_type type,
8937 u16 flags)
8938{
8939 struct perf_bpf_event bpf_event;
8940
8941 if (type <= PERF_BPF_EVENT_UNKNOWN ||
8942 type >= PERF_BPF_EVENT_MAX)
8943 return;
8944
8945 switch (type) {
8946 case PERF_BPF_EVENT_PROG_LOAD:
8947 case PERF_BPF_EVENT_PROG_UNLOAD:
8948 if (atomic_read(&nr_ksymbol_events))
8949 perf_event_bpf_emit_ksymbols(prog, type);
8950 break;
8951 default:
8952 break;
8953 }
8954
8955 if (!atomic_read(&nr_bpf_events))
8956 return;
8957
8958 bpf_event = (struct perf_bpf_event){
8959 .prog = prog,
8960 .event_id = {
8961 .header = {
8962 .type = PERF_RECORD_BPF_EVENT,
8963 .size = sizeof(bpf_event.event_id),
8964 },
8965 .type = type,
8966 .flags = flags,
8967 .id = prog->aux->id,
8968 },
8969 };
8970
8971 BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
8972
8973 memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
8974 perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
8975}
8976
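/*
 * Kernel text poke tracking: PERF_RECORD_TEXT_POKE carries the address plus
 * the old and new instruction bytes, padded to a u64 boundary.
 */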
8977struct perf_text_poke_event {
8978 const void *old_bytes;
8979 const void *new_bytes;
8980 size_t pad;
8981 u16 old_len;
8982 u16 new_len;
8983
8984 struct {
8985 struct perf_event_header header;
8986
8987 u64 addr;
8988 } event_id;
8989};
8990
8991static int perf_event_text_poke_match(struct perf_event *event)
8992{
8993 return event->attr.text_poke;
8994}
8995
8996static void perf_event_text_poke_output(struct perf_event *event, void *data)
8997{
8998 struct perf_text_poke_event *text_poke_event = data;
8999 struct perf_output_handle handle;
9000 struct perf_sample_data sample;
9001 u64 padding = 0;
9002 int ret;
9003
9004 if (!perf_event_text_poke_match(event))
9005 return;
9006
9007 perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
9008
9009 ret = perf_output_begin(&handle, &sample, event,
9010 text_poke_event->event_id.header.size);
9011 if (ret)
9012 return;
9013
9014 perf_output_put(&handle, text_poke_event->event_id);
9015 perf_output_put(&handle, text_poke_event->old_len);
9016 perf_output_put(&handle, text_poke_event->new_len);
9017
9018 __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
9019 __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
9020
9021 if (text_poke_event->pad)
9022 __output_copy(&handle, &padding, text_poke_event->pad);
9023
9024 perf_event__output_id_sample(event, &handle, &sample);
9025
9026 perf_output_end(&handle);
9027}
9028
9029void perf_event_text_poke(const void *addr, const void *old_bytes,
9030 size_t old_len, const void *new_bytes, size_t new_len)
9031{
9032 struct perf_text_poke_event text_poke_event;
9033 size_t tot, pad;
9034
9035 if (!atomic_read(&nr_text_poke_events))
9036 return;
9037
9038 tot = sizeof(text_poke_event.old_len) + old_len;
9039 tot += sizeof(text_poke_event.new_len) + new_len;
9040 pad = ALIGN(tot, sizeof(u64)) - tot;
9041
9042 text_poke_event = (struct perf_text_poke_event){
9043 .old_bytes = old_bytes,
9044 .new_bytes = new_bytes,
9045 .pad = pad,
9046 .old_len = old_len,
9047 .new_len = new_len,
9048 .event_id = {
9049 .header = {
9050 .type = PERF_RECORD_TEXT_POKE,
9051 .misc = PERF_RECORD_MISC_KERNEL,
9052 .size = sizeof(text_poke_event.event_id) + tot + pad,
9053 },
9054 .addr = (unsigned long)addr,
9055 },
9056 };
9057
9058 perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
9059}
9060
9061void perf_event_itrace_started(struct perf_event *event)
9062{
9063 event->attach_state |= PERF_ATTACH_ITRACE;
9064}
9065
9066static void perf_log_itrace_start(struct perf_event *event)
9067{
9068 struct perf_output_handle handle;
9069 struct perf_sample_data sample;
9070 struct perf_aux_event {
9071 struct perf_event_header header;
9072 u32 pid;
9073 u32 tid;
9074 } rec;
9075 int ret;
9076
9077 if (event->parent)
9078 event = event->parent;
9079
9080 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
9081 event->attach_state & PERF_ATTACH_ITRACE)
9082 return;
9083
9084 rec.header.type = PERF_RECORD_ITRACE_START;
9085 rec.header.misc = 0;
9086 rec.header.size = sizeof(rec);
9087 rec.pid = perf_event_pid(event, current);
9088 rec.tid = perf_event_tid(event, current);
9089
9090 perf_event_header__init_id(&rec.header, &sample, event);
9091 ret = perf_output_begin(&handle, &sample, event, rec.header.size);
9092
9093 if (ret)
9094 return;
9095
9096 perf_output_put(&handle, rec);
9097 perf_event__output_id_sample(event, &handle, &sample);
9098
9099 perf_output_end(&handle);
9100}
9101
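/*
 * Account one PMI against the event: count interrupts within the current
 * throttling period (throttling when the limit is exceeded) and, for freq
 * events, re-adjust the sampling period.
 */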
9102static int
9103__perf_event_account_interrupt(struct perf_event *event, int throttle)
9104{
9105 struct hw_perf_event *hwc = &event->hw;
9106 int ret = 0;
9107 u64 seq;
9108
9109 seq = __this_cpu_read(perf_throttled_seq);
9110 if (seq != hwc->interrupts_seq) {
9111 hwc->interrupts_seq = seq;
9112 hwc->interrupts = 1;
9113 } else {
9114 hwc->interrupts++;
9115 if (unlikely(throttle
9116 && hwc->interrupts >= max_samples_per_tick)) {
9117 __this_cpu_inc(perf_throttled_count);
9118 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
9119 hwc->interrupts = MAX_INTERRUPTS;
9120 perf_log_throttle(event, 0);
9121 ret = 1;
9122 }
9123 }
9124
9125 if (event->attr.freq) {
9126 u64 now = perf_clock();
9127 s64 delta = now - hwc->freq_time_stamp;
9128
9129 hwc->freq_time_stamp = now;
9130
9131 if (delta > 0 && delta < 2*TICK_NSEC)
9132 perf_adjust_period(event, delta, hwc->last_period, true);
9133 }
9134
9135 return ret;
9136}
9137
9138int perf_event_account_interrupt(struct perf_event *event)
9139{
9140 return __perf_event_account_interrupt(event, 1);
9141}
9142
9143/*
9144 * Generic event overflow handling, sampling.
9145 */
9146
9147static int __perf_event_overflow(struct perf_event *event,
9148 int throttle, struct perf_sample_data *data,
9149 struct pt_regs *regs)
9150{
9151 int events = atomic_read(&event->event_limit);
9152 int ret = 0;
9153
9154	/*
9155	 * Non-sampling counters might still use the PMI to fold short
9156	 * hardware counters, ignore those.
9157	 */
9158 if (unlikely(!is_sampling_event(event)))
9159 return 0;
9160
9161 ret = __perf_event_account_interrupt(event, throttle);
9162
9163	/*
9164	 * XXX event_limit might not quite work as expected on inherited
9165	 * events
9166	 */
9167
9168 event->pending_kill = POLL_IN;
9169 if (events && atomic_dec_and_test(&event->event_limit)) {
9170 ret = 1;
9171 event->pending_kill = POLL_HUP;
9172 event->pending_addr = data->addr;
9173
9174 perf_event_disable_inatomic(event);
9175 }
9176
9177 READ_ONCE(event->overflow_handler)(event, data, regs);
9178
9179 if (*perf_event_fasync(event) && event->pending_kill) {
9180 event->pending_wakeup = 1;
9181 irq_work_queue(&event->pending);
9182 }
9183
9184 return ret;
9185}
9186
9187int perf_event_overflow(struct perf_event *event,
9188 struct perf_sample_data *data,
9189 struct pt_regs *regs)
9190{
9191 return __perf_event_overflow(event, 1, data, regs);
9192}
9193
9194/*
9195 * Generic software event infrastructure
9196 */
9197
9198struct swevent_htable {
9199 struct swevent_hlist *swevent_hlist;
9200 struct mutex hlist_mutex;
9201 int hlist_refcount;
9202
9203	/* Recursion avoidance in each context */
9204 int recursion[PERF_NR_CONTEXTS];
9205};
9206
9207static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
9208
9209/*
9210 * We directly increment event->count and keep a second value in
9211 * event->hw.period_left to count intervals. This period event
9212 * is kept in the range [-sample_period, 0] so that we can use the
9213 * sign as trigger.
9214 */
9215
9216u64 perf_swevent_set_period(struct perf_event *event)
9217{
9218 struct hw_perf_event *hwc = &event->hw;
9219 u64 period = hwc->last_period;
9220 u64 nr, offset;
9221 s64 old, val;
9222
9223 hwc->last_period = hwc->sample_period;
9224
9225again:
9226 old = val = local64_read(&hwc->period_left);
9227 if (val < 0)
9228 return 0;
9229
9230 nr = div64_u64(period + val, period);
9231 offset = nr * period;
9232 val -= offset;
9233 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
9234 goto again;
9235
9236 return nr;
9237}
9238
9239static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
9240 struct perf_sample_data *data,
9241 struct pt_regs *regs)
9242{
9243 struct hw_perf_event *hwc = &event->hw;
9244 int throttle = 0;
9245
9246 if (!overflow)
9247 overflow = perf_swevent_set_period(event);
9248
9249 if (hwc->interrupts == MAX_INTERRUPTS)
9250 return;
9251
9252 for (; overflow; overflow--) {
9253 if (__perf_event_overflow(event, throttle,
9254 data, regs)) {
9255			/*
9256			 * We inhibit the overflow from happening when
9257			 * hwc->interrupts == MAX_INTERRUPTS.
9258			 */
9259 break;
9260 }
9261 throttle = 1;
9262 }
9263}
9264
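/*
 * Add @nr to the event count and, for sampling events, work out whether
 * crossing the sample period boundary should raise an overflow.
 */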
9265static void perf_swevent_event(struct perf_event *event, u64 nr,
9266 struct perf_sample_data *data,
9267 struct pt_regs *regs)
9268{
9269 struct hw_perf_event *hwc = &event->hw;
9270
9271 local64_add(nr, &event->count);
9272
9273 if (!regs)
9274 return;
9275
9276 if (!is_sampling_event(event))
9277 return;
9278
9279 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
9280 data->period = nr;
9281 return perf_swevent_overflow(event, 1, data, regs);
9282 } else
9283 data->period = event->hw.last_period;
9284
9285 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
9286 return perf_swevent_overflow(event, 1, data, regs);
9287
9288 if (local64_add_negative(nr, &hwc->period_left))
9289 return;
9290
9291 perf_swevent_overflow(event, 0, data, regs);
9292}
9293
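/* Honour PERF_HES_STOPPED and the user/kernel exclusion attributes. */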
9294static int perf_exclude_event(struct perf_event *event,
9295 struct pt_regs *regs)
9296{
9297 if (event->hw.state & PERF_HES_STOPPED)
9298 return 1;
9299
9300 if (regs) {
9301 if (event->attr.exclude_user && user_mode(regs))
9302 return 1;
9303
9304 if (event->attr.exclude_kernel && !user_mode(regs))
9305 return 1;
9306 }
9307
9308 return 0;
9309}
9310
9311static int perf_swevent_match(struct perf_event *event,
9312 enum perf_type_id type,
9313 u32 event_id,
9314 struct perf_sample_data *data,
9315 struct pt_regs *regs)
9316{
9317 if (event->attr.type != type)
9318 return 0;
9319
9320 if (event->attr.config != event_id)
9321 return 0;
9322
9323 if (perf_exclude_event(event, regs))
9324 return 0;
9325
9326 return 1;
9327}
9328
9329static inline u64 swevent_hash(u64 type, u32 event_id)
9330{
9331 u64 val = event_id | (type << 32);
9332
9333 return hash_64(val, SWEVENT_HLIST_BITS);
9334}
9335
9336static inline struct hlist_head *
9337__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
9338{
9339 u64 hash = swevent_hash(type, event_id);
9340
9341 return &hlist->heads[hash];
9342}
9343
9344/* For the read side: events when they trigger */
9345static inline struct hlist_head *
9346find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
9347{
9348 struct swevent_hlist *hlist;
9349
9350 hlist = rcu_dereference(swhash->swevent_hlist);
9351 if (!hlist)
9352 return NULL;
9353
9354 return __find_swevent_head(hlist, type, event_id);
9355}
9356
9357/* For the event head insertion and removal in the hlist */
9358static inline struct hlist_head *
9359find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
9360{
9361 struct swevent_hlist *hlist;
9362 u32 event_id = event->attr.config;
9363 u64 type = event->attr.type;
9364
9365	/*
9366	 * Event scheduling is always serialized against hlist allocation
9367	 * and release. Which makes the protected version suitable here.
9368	 * The context lock guarantees that.
9369	 */
9370 hlist = rcu_dereference_protected(swhash->swevent_hlist,
9371 lockdep_is_held(&event->ctx->lock));
9372 if (!hlist)
9373 return NULL;
9374
9375 return __find_swevent_head(hlist, type, event_id);
9376}
9377
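/*
 * Deliver a software event to every matching event hashed under
 * (type, event_id) on this CPU.
 */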
9378static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
9379 u64 nr,
9380 struct perf_sample_data *data,
9381 struct pt_regs *regs)
9382{
9383 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9384 struct perf_event *event;
9385 struct hlist_head *head;
9386
9387 rcu_read_lock();
9388 head = find_swevent_head_rcu(swhash, type, event_id);
9389 if (!head)
9390 goto end;
9391
9392 hlist_for_each_entry_rcu(event, head, hlist_entry) {
9393 if (perf_swevent_match(event, type, event_id, data, regs))
9394 perf_swevent_event(event, nr, data, regs);
9395 }
9396end:
9397 rcu_read_unlock();
9398}
9399
9400DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
9401
9402int perf_swevent_get_recursion_context(void)
9403{
9404 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9405
9406 return get_recursion_context(swhash->recursion);
9407}
9408EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
9409
9410void perf_swevent_put_recursion_context(int rctx)
9411{
9412 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9413
9414 put_recursion_context(swhash->recursion, rctx);
9415}
9416
9417void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9418{
9419 struct perf_sample_data data;
9420
9421 if (WARN_ON_ONCE(!regs))
9422 return;
9423
9424 perf_sample_data_init(&data, addr, 0);
9425 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
9426}
9427
9428void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9429{
9430 int rctx;
9431
9432 preempt_disable_notrace();
9433 rctx = perf_swevent_get_recursion_context();
9434 if (unlikely(rctx < 0))
9435 goto fail;
9436
9437 ___perf_sw_event(event_id, nr, regs, addr);
9438
9439 perf_swevent_put_recursion_context(rctx);
9440fail:
9441 preempt_enable_notrace();
9442}
9443
9444static void perf_swevent_read(struct perf_event *event)
9445{
9446}
9447
9448static int perf_swevent_add(struct perf_event *event, int flags)
9449{
9450 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9451 struct hw_perf_event *hwc = &event->hw;
9452 struct hlist_head *head;
9453
9454 if (is_sampling_event(event)) {
9455 hwc->last_period = hwc->sample_period;
9456 perf_swevent_set_period(event);
9457 }
9458
9459 hwc->state = !(flags & PERF_EF_START);
9460
9461 head = find_swevent_head(swhash, event);
9462 if (WARN_ON_ONCE(!head))
9463 return -EINVAL;
9464
9465 hlist_add_head_rcu(&event->hlist_entry, head);
9466 perf_event_update_userpage(event);
9467
9468 return 0;
9469}
9470
9471static void perf_swevent_del(struct perf_event *event, int flags)
9472{
9473 hlist_del_rcu(&event->hlist_entry);
9474}
9475
9476static void perf_swevent_start(struct perf_event *event, int flags)
9477{
9478 event->hw.state = 0;
9479}
9480
9481static void perf_swevent_stop(struct perf_event *event, int flags)
9482{
9483 event->hw.state = PERF_HES_STOPPED;
9484}
9485
9486/* Deref the hlist from the update side */
9487static inline struct swevent_hlist *
9488swevent_hlist_deref(struct swevent_htable *swhash)
9489{
9490 return rcu_dereference_protected(swhash->swevent_hlist,
9491 lockdep_is_held(&swhash->hlist_mutex));
9492}
9493
9494static void swevent_hlist_release(struct swevent_htable *swhash)
9495{
9496 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
9497
9498 if (!hlist)
9499 return;
9500
9501 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
9502 kfree_rcu(hlist, rcu_head);
9503}
9504
9505static void swevent_hlist_put_cpu(int cpu)
9506{
9507 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9508
9509 mutex_lock(&swhash->hlist_mutex);
9510
9511 if (!--swhash->hlist_refcount)
9512 swevent_hlist_release(swhash);
9513
9514 mutex_unlock(&swhash->hlist_mutex);
9515}
9516
9517static void swevent_hlist_put(void)
9518{
9519 int cpu;
9520
9521 for_each_possible_cpu(cpu)
9522 swevent_hlist_put_cpu(cpu);
9523}
9524
9525static int swevent_hlist_get_cpu(int cpu)
9526{
9527 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9528 int err = 0;
9529
9530 mutex_lock(&swhash->hlist_mutex);
9531 if (!swevent_hlist_deref(swhash) &&
9532 cpumask_test_cpu(cpu, perf_online_mask)) {
9533 struct swevent_hlist *hlist;
9534
9535 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
9536 if (!hlist) {
9537 err = -ENOMEM;
9538 goto exit;
9539 }
9540 rcu_assign_pointer(swhash->swevent_hlist, hlist);
9541 }
9542 swhash->hlist_refcount++;
9543exit:
9544 mutex_unlock(&swhash->hlist_mutex);
9545
9546 return err;
9547}
9548
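/*
 * Grab a hashtable reference on each possible CPU, rolling back the CPUs
 * already done if an allocation fails.
 */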
9549static int swevent_hlist_get(void)
9550{
9551 int err, cpu, failed_cpu;
9552
9553 mutex_lock(&pmus_lock);
9554 for_each_possible_cpu(cpu) {
9555 err = swevent_hlist_get_cpu(cpu);
9556 if (err) {
9557 failed_cpu = cpu;
9558 goto fail;
9559 }
9560 }
9561 mutex_unlock(&pmus_lock);
9562 return 0;
9563fail:
9564 for_each_possible_cpu(cpu) {
9565 if (cpu == failed_cpu)
9566 break;
9567 swevent_hlist_put_cpu(cpu);
9568 }
9569 mutex_unlock(&pmus_lock);
9570 return err;
9571}
9572
9573struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
9574
9575static void sw_perf_event_destroy(struct perf_event *event)
9576{
9577 u64 event_id = event->attr.config;
9578
9579 WARN_ON(event->parent);
9580
9581 static_key_slow_dec(&perf_swevent_enabled[event_id]);
9582 swevent_hlist_put();
9583}
9584
9585static int perf_swevent_init(struct perf_event *event)
9586{
9587 u64 event_id = event->attr.config;
9588
9589 if (event->attr.type != PERF_TYPE_SOFTWARE)
9590 return -ENOENT;
9591
9592	/*
9593	 * no branch sampling for software events
9594	 */
9595 if (has_branch_stack(event))
9596 return -EOPNOTSUPP;
9597
9598 switch (event_id) {
9599 case PERF_COUNT_SW_CPU_CLOCK:
9600 case PERF_COUNT_SW_TASK_CLOCK:
9601 return -ENOENT;
9602
9603 default:
9604 break;
9605 }
9606
9607 if (event_id >= PERF_COUNT_SW_MAX)
9608 return -ENOENT;
9609
9610 if (!event->parent) {
9611 int err;
9612
9613 err = swevent_hlist_get();
9614 if (err)
9615 return err;
9616
9617 static_key_slow_inc(&perf_swevent_enabled[event_id]);
9618 event->destroy = sw_perf_event_destroy;
9619 }
9620
9621 return 0;
9622}
9623
9624static struct pmu perf_swevent = {
9625 .task_ctx_nr = perf_sw_context,
9626
9627 .capabilities = PERF_PMU_CAP_NO_NMI,
9628
9629 .event_init = perf_swevent_init,
9630 .add = perf_swevent_add,
9631 .del = perf_swevent_del,
9632 .start = perf_swevent_start,
9633 .stop = perf_swevent_stop,
9634 .read = perf_swevent_read,
9635};
9636
9637#ifdef CONFIG_EVENT_TRACING
9638
9639static int perf_tp_filter_match(struct perf_event *event,
9640 struct perf_sample_data *data)
9641{
9642 void *record = data->raw->frag.data;
9643
9644	/* only top level events have filters set */
9645 if (event->parent)
9646 event = event->parent;
9647
9648 if (likely(!event->filter) || filter_match_preds(event->filter, record))
9649 return 1;
9650 return 0;
9651}
9652
9653static int perf_tp_event_match(struct perf_event *event,
9654 struct perf_sample_data *data,
9655 struct pt_regs *regs)
9656{
9657 if (event->hw.state & PERF_HES_STOPPED)
9658 return 0;
9659	/*
9660	 * If exclude_kernel, only trace user-space tracepoints (uprobes)
9661	 */
9662 if (event->attr.exclude_kernel && !user_mode(regs))
9663 return 0;
9664
9665 if (!perf_tp_filter_match(event, data))
9666 return 0;
9667
9668 return 1;
9669}
9670
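/*
 * Tracepoint fast path with BPF attached: run the programs first and bail
 * out when they filter the event out or nothing is listening.
 */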
9671void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
9672 struct trace_event_call *call, u64 count,
9673 struct pt_regs *regs, struct hlist_head *head,
9674 struct task_struct *task)
9675{
9676 if (bpf_prog_array_valid(call)) {
9677 *(struct pt_regs **)raw_data = regs;
9678 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
9679 perf_swevent_put_recursion_context(rctx);
9680 return;
9681 }
9682 }
9683 perf_tp_event(call->event.type, count, raw_data, size, regs, head,
9684 rctx, task);
9685}
9686EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
9687
9688void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
9689 struct pt_regs *regs, struct hlist_head *head, int rctx,
9690 struct task_struct *task)
9691{
9692 struct perf_sample_data data;
9693 struct perf_event *event;
9694
9695 struct perf_raw_record raw = {
9696 .frag = {
9697 .size = entry_size,
9698 .data = record,
9699 },
9700 };
9701
9702 perf_sample_data_init(&data, 0, 0);
9703 data.raw = &raw;
9704
9705 perf_trace_buf_update(record, event_type);
9706
9707 hlist_for_each_entry_rcu(event, head, hlist_entry) {
9708 if (perf_tp_event_match(event, &data, regs))
9709 perf_swevent_event(event, count, &data, regs);
9710 }
9711
9712	/*
9713	 * If we got specified a target task, also iterate its context and
9714	 * deliver this event there too.
9715	 */
9716 if (task && task != current) {
9717 struct perf_event_context *ctx;
9718 struct trace_entry *entry = record;
9719
9720 rcu_read_lock();
9721 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
9722 if (!ctx)
9723 goto unlock;
9724
9725 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
9726 if (event->cpu != smp_processor_id())
9727 continue;
9728 if (event->attr.type != PERF_TYPE_TRACEPOINT)
9729 continue;
9730 if (event->attr.config != entry->type)
9731 continue;
9732 if (perf_tp_event_match(event, &data, regs))
9733 perf_swevent_event(event, count, &data, regs);
9734 }
9735unlock:
9736 rcu_read_unlock();
9737 }
9738
9739 perf_swevent_put_recursion_context(rctx);
9740}
9741EXPORT_SYMBOL_GPL(perf_tp_event);
9742
9743static void tp_perf_event_destroy(struct perf_event *event)
9744{
9745 perf_trace_destroy(event);
9746}
9747
9748static int perf_tp_event_init(struct perf_event *event)
9749{
9750 int err;
9751
9752 if (event->attr.type != PERF_TYPE_TRACEPOINT)
9753 return -ENOENT;
9754
9755	/*
9756	 * no branch sampling for tracepoint events
9757	 */
9758 if (has_branch_stack(event))
9759 return -EOPNOTSUPP;
9760
9761 err = perf_trace_init(event);
9762 if (err)
9763 return err;
9764
9765 event->destroy = tp_perf_event_destroy;
9766
9767 return 0;
9768}
9769
9770static struct pmu perf_tracepoint = {
9771 .task_ctx_nr = perf_sw_context,
9772
9773 .event_init = perf_tp_event_init,
9774 .add = perf_trace_add,
9775 .del = perf_trace_del,
9776 .start = perf_swevent_start,
9777 .stop = perf_swevent_stop,
9778 .read = perf_swevent_read,
9779};
9780
9781#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
9782
9783/*
9784 * Flags in config, used by dynamic PMU kprobe and uprobe
9785 * The flags should match following PMU_FORMAT_ATTR().
9786 *
9787 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
9788 *                               if not set, create kprobe/uprobe
9789 *
9790 * The following values specify a reference counter (or semaphore in the
9791 * terminology of tools like dtrace, systemtap, etc.) for Userspace Statically
9792 * Defined Tracepoints (USDT):
9793 * PERF_UPROBE_REF_CTR_OFFSET_BITS	# of bits in config for the offset
9794 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT	# of bits to shift left
9795 */
9796enum perf_probe_config {
9797 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,
9798 PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
9799 PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
9800};
9801
9802PMU_FORMAT_ATTR(retprobe, "config:0");
9803#endif
9804
9805#ifdef CONFIG_KPROBE_EVENTS
9806static struct attribute *kprobe_attrs[] = {
9807 &format_attr_retprobe.attr,
9808 NULL,
9809};
9810
9811static struct attribute_group kprobe_format_group = {
9812 .name = "format",
9813 .attrs = kprobe_attrs,
9814};
9815
9816static const struct attribute_group *kprobe_attr_groups[] = {
9817 &kprobe_format_group,
9818 NULL,
9819};
9820
9821static int perf_kprobe_event_init(struct perf_event *event);
9822static struct pmu perf_kprobe = {
9823 .task_ctx_nr = perf_sw_context,
9824 .event_init = perf_kprobe_event_init,
9825 .add = perf_trace_add,
9826 .del = perf_trace_del,
9827 .start = perf_swevent_start,
9828 .stop = perf_swevent_stop,
9829 .read = perf_swevent_read,
9830 .attr_groups = kprobe_attr_groups,
9831};
9832
9833static int perf_kprobe_event_init(struct perf_event *event)
9834{
9835 int err;
9836 bool is_retprobe;
9837
9838 if (event->attr.type != perf_kprobe.type)
9839 return -ENOENT;
9840
9841 if (!perfmon_capable())
9842 return -EACCES;
9843
9844	/*
9845	 * no branch sampling for probe events
9846	 */
9847 if (has_branch_stack(event))
9848 return -EOPNOTSUPP;
9849
9850 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9851 err = perf_kprobe_init(event, is_retprobe);
9852 if (err)
9853 return err;
9854
9855 event->destroy = perf_kprobe_destroy;
9856
9857 return 0;
9858}
9859#endif
9860
9861#ifdef CONFIG_UPROBE_EVENTS
9862PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
9863
9864static struct attribute *uprobe_attrs[] = {
9865 &format_attr_retprobe.attr,
9866 &format_attr_ref_ctr_offset.attr,
9867 NULL,
9868};
9869
9870static struct attribute_group uprobe_format_group = {
9871 .name = "format",
9872 .attrs = uprobe_attrs,
9873};
9874
9875static const struct attribute_group *uprobe_attr_groups[] = {
9876 &uprobe_format_group,
9877 NULL,
9878};
9879
9880static int perf_uprobe_event_init(struct perf_event *event);
9881static struct pmu perf_uprobe = {
9882 .task_ctx_nr = perf_sw_context,
9883 .event_init = perf_uprobe_event_init,
9884 .add = perf_trace_add,
9885 .del = perf_trace_del,
9886 .start = perf_swevent_start,
9887 .stop = perf_swevent_stop,
9888 .read = perf_swevent_read,
9889 .attr_groups = uprobe_attr_groups,
9890};
9891
9892static int perf_uprobe_event_init(struct perf_event *event)
9893{
9894 int err;
9895 unsigned long ref_ctr_offset;
9896 bool is_retprobe;
9897
9898 if (event->attr.type != perf_uprobe.type)
9899 return -ENOENT;
9900
9901 if (!perfmon_capable())
9902 return -EACCES;
9903
9904	/*
9905	 * no branch sampling for probe events
9906	 */
9907 if (has_branch_stack(event))
9908 return -EOPNOTSUPP;
9909
9910 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9911 ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
9912 err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
9913 if (err)
9914 return err;
9915
9916 event->destroy = perf_uprobe_destroy;
9917
9918 return 0;
9919}
9920#endif
9921
9922static inline void perf_tp_register(void)
9923{
9924 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
9925#ifdef CONFIG_KPROBE_EVENTS
9926 perf_pmu_register(&perf_kprobe, "kprobe", -1);
9927#endif
9928#ifdef CONFIG_UPROBE_EVENTS
9929 perf_pmu_register(&perf_uprobe, "uprobe", -1);
9930#endif
9931}
9932
9933static void perf_event_free_filter(struct perf_event *event)
9934{
9935 ftrace_profile_free_filter(event);
9936}
9937
9938#ifdef CONFIG_BPF_SYSCALL
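/*
 * Overflow handler wrapper that runs the attached BPF program; the original
 * handler only runs when the program returns non-zero.
 */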
9939static void bpf_overflow_handler(struct perf_event *event,
9940 struct perf_sample_data *data,
9941 struct pt_regs *regs)
9942{
9943 struct bpf_perf_event_data_kern ctx = {
9944 .data = data,
9945 .event = event,
9946 };
9947 struct bpf_prog *prog;
9948 int ret = 0;
9949
9950 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
9951 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
9952 goto out;
9953 rcu_read_lock();
9954 prog = READ_ONCE(event->prog);
9955 if (prog)
9956 ret = bpf_prog_run(prog, &ctx);
9957 rcu_read_unlock();
9958out:
9959 __this_cpu_dec(bpf_prog_active);
9960 if (!ret)
9961 return;
9962
9963 event->orig_overflow_handler(event, data, regs);
9964}
9965
9966static int perf_event_set_bpf_handler(struct perf_event *event,
9967 struct bpf_prog *prog,
9968 u64 bpf_cookie)
9969{
9970 if (event->overflow_handler_context)
9971		/* hw breakpoint or kernel counter */
9972 return -EINVAL;
9973
9974 if (event->prog)
9975 return -EEXIST;
9976
9977 if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
9978 return -EINVAL;
9979
9980 if (event->attr.precise_ip &&
9981 prog->call_get_stack &&
9982 (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
9983 event->attr.exclude_callchain_kernel ||
9984 event->attr.exclude_callchain_user)) {
9985		/*
9986		 * On perf_event with precise_ip, calling bpf_get_stack()
9987		 * may trigger unwinder warnings and occasional crashes.
9988		 * bpf_get_[stack|stackid] works around this issue by using
9989		 * callchain attached to perf_sample_data. If the
9990		 * perf_event does not have a full (kernel and user) callchain
9991		 * attached to perf_sample_data, do not allow attaching BPF
9992		 * programs that call bpf_get_[stack|stackid].
9993		 */
9994 return -EPROTO;
9995 }
9996
9997 event->prog = prog;
9998 event->bpf_cookie = bpf_cookie;
9999 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
10000 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
10001 return 0;
10002}
10003
10004static void perf_event_free_bpf_handler(struct perf_event *event)
10005{
10006 struct bpf_prog *prog = event->prog;
10007
10008 if (!prog)
10009 return;
10010
10011 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
10012 event->prog = NULL;
10013 bpf_prog_put(prog);
10014}
10015#else
10016static int perf_event_set_bpf_handler(struct perf_event *event,
10017 struct bpf_prog *prog,
10018 u64 bpf_cookie)
10019{
10020 return -EOPNOTSUPP;
10021}
10022static void perf_event_free_bpf_handler(struct perf_event *event)
10023{
10024}
10025#endif
10026
10027/*
10028 * returns true if the event is a tracepoint, or a kprobe/uprobe created
10029 * with perf_kprobe or perf_uprobe
10030 */
10031static inline bool perf_event_is_tracing(struct perf_event *event)
10032{
10033 if (event->pmu == &perf_tracepoint)
10034 return true;
10035#ifdef CONFIG_KPROBE_EVENTS
10036 if (event->pmu == &perf_kprobe)
10037 return true;
10038#endif
10039#ifdef CONFIG_UPROBE_EVENTS
10040 if (event->pmu == &perf_uprobe)
10041 return true;
10042#endif
10043 return false;
10044}
10045
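/*
 * Attach a BPF program to an event: tracing events (tracepoint, k/uprobe,
 * syscall) go through the trace infrastructure, anything else gets the BPF
 * overflow handler.
 */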
10046int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
10047 u64 bpf_cookie)
10048{
10049 bool is_kprobe, is_tracepoint, is_syscall_tp;
10050
10051 if (!perf_event_is_tracing(event))
10052 return perf_event_set_bpf_handler(event, prog, bpf_cookie);
10053
10054 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
10055 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
10056 is_syscall_tp = is_syscall_trace_event(event->tp_event);
10057 if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
10058		/* bpf programs can only be attached to u/kprobe or tracepoint */
10059 return -EINVAL;
10060
10061 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
10062 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
10063 (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
10064 return -EINVAL;
10065
10066	/* Kprobe override only works for kprobes, not uprobes. */
10067 if (prog->kprobe_override &&
10068 !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
10069 return -EINVAL;
10070
10071 if (is_tracepoint || is_syscall_tp) {
10072 int off = trace_event_get_offsets(event->tp_event);
10073
10074 if (prog->aux->max_ctx_offset > off)
10075 return -EACCES;
10076 }
10077
10078 return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
10079}
10080
10081void perf_event_free_bpf_prog(struct perf_event *event)
10082{
10083 if (!perf_event_is_tracing(event)) {
10084 perf_event_free_bpf_handler(event);
10085 return;
10086 }
10087 perf_event_detach_bpf_prog(event);
10088}
10089
10090#else
10091
10092static inline void perf_tp_register(void)
10093{
10094}
10095
10096static void perf_event_free_filter(struct perf_event *event)
10097{
10098}
10099
10100int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
10101 u64 bpf_cookie)
10102{
10103 return -ENOENT;
10104}
10105
10106void perf_event_free_bpf_prog(struct perf_event *event)
10107{
10108}
10109#endif
10110
10111#ifdef CONFIG_HAVE_HW_BREAKPOINT
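/* Hardware breakpoint PMI: deliver one sample at the breakpoint address. */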
10112void perf_bp_event(struct perf_event *bp, void *data)
10113{
10114 struct perf_sample_data sample;
10115 struct pt_regs *regs = data;
10116
10117 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
10118
10119 if (!bp->hw.state && !perf_exclude_event(bp, regs))
10120 perf_swevent_event(bp, 1, &sample, regs);
10121}
10122#endif
10123
10124/*
10125 * Allocate a new address filter
10126 */
10127static struct perf_addr_filter *
10128perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
10129{
10130 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
10131 struct perf_addr_filter *filter;
10132
10133 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
10134 if (!filter)
10135 return NULL;
10136
10137 INIT_LIST_HEAD(&filter->entry);
10138 list_add_tail(&filter->entry, filters);
10139
10140 return filter;
10141}
10142
10143static void free_filters_list(struct list_head *filters)
10144{
10145 struct perf_addr_filter *filter, *iter;
10146
10147 list_for_each_entry_safe(filter, iter, filters, entry) {
10148 path_put(&filter->path);
10149 list_del(&filter->entry);
10150 kfree(filter);
10151 }
10152}
10153
10154/*
10155 * Free existing address filters and optionally install new ones
10156 */
10157static void perf_addr_filters_splice(struct perf_event *event,
10158 struct list_head *head)
10159{
10160 unsigned long flags;
10161 LIST_HEAD(list);
10162
10163 if (!has_addr_filter(event))
10164 return;
10165
10166	/* don't bother with children, they don't have their own filters */
10167 if (event->parent)
10168 return;
10169
10170 raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
10171
10172 list_splice_init(&event->addr_filters.list, &list);
10173 if (head)
10174 list_splice(head, &event->addr_filters.list);
10175
10176 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
10177
10178 free_filters_list(&list);
10179}
10180
10181/*
10182 * Scan through mm's vmas and see if one of them matches the
10183 * @filter; if so, adjust filter's address range.
10184 * Called with mm::mmap_lock down for reading.
10185 */
10186static void perf_addr_filter_apply(struct perf_addr_filter *filter,
10187 struct mm_struct *mm,
10188 struct perf_addr_filter_range *fr)
10189{
10190 struct vm_area_struct *vma;
10191
10192 for (vma = mm->mmap; vma; vma = vma->vm_next) {
10193 if (!vma->vm_file)
10194 continue;
10195
10196 if (perf_addr_filter_vma_adjust(filter, vma, fr))
10197 return;
10198 }
10199}
10200
10201/*
10202 * Update event's address range filters based on the
10203 * task's existing mappings, if any.
10204 */
10205static void perf_event_addr_filters_apply(struct perf_event *event)
10206{
10207 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
10208 struct task_struct *task = READ_ONCE(event->ctx->task);
10209 struct perf_addr_filter *filter;
10210 struct mm_struct *mm = NULL;
10211 unsigned int count = 0;
10212 unsigned long flags;
10213
10214	/*
10215	 * We may observe TASK_TOMBSTONE, which means that the event tear-down
10216	 * will stop on the parent's child_mutex that our caller is also holding
10217	 */
10218 if (task == TASK_TOMBSTONE)
10219 return;
10220
10221 if (ifh->nr_file_filters) {
10222 mm = get_task_mm(task);
10223 if (!mm)
10224 goto restart;
10225
10226 mmap_read_lock(mm);
10227 }
10228
10229 raw_spin_lock_irqsave(&ifh->lock, flags);
10230 list_for_each_entry(filter, &ifh->list, entry) {
10231 if (filter->path.dentry) {
 /*
  * Adjust base offset if the filter is associated to a binary
  * that needs to be mapped:
  */
10236 event->addr_filter_ranges[count].start = 0;
10237 event->addr_filter_ranges[count].size = 0;
10238
10239 perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
10240 } else {
10241 event->addr_filter_ranges[count].start = filter->offset;
10242 event->addr_filter_ranges[count].size = filter->size;
10243 }
10244
10245 count++;
10246 }
10247
10248 event->addr_filters_gen++;
10249 raw_spin_unlock_irqrestore(&ifh->lock, flags);
10250
10251 if (ifh->nr_file_filters) {
10252 mmap_read_unlock(mm);
10253
10254 mmput(mm);
10255 }
10256
10257restart:
10258 perf_event_stop(event, 1);
10259}
10260
/*
 * Address range filtering: limiting the data to certain
 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ascii strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of the
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region;
 * RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * if <size> is not specified or is zero, the range is treated as
 * a single address; not valid for ACTION=="filter".
 */
10280enum {
10281 IF_ACT_NONE = -1,
10282 IF_ACT_FILTER,
10283 IF_ACT_START,
10284 IF_ACT_STOP,
10285 IF_SRC_FILE,
10286 IF_SRC_KERNEL,
10287 IF_SRC_FILEADDR,
10288 IF_SRC_KERNELADDR,
10289};
10290
10291enum {
10292 IF_STATE_ACTION = 0,
10293 IF_STATE_SOURCE,
10294 IF_STATE_END,
10295};
10296
10297static const match_table_t if_tokens = {
10298 { IF_ACT_FILTER, "filter" },
10299 { IF_ACT_START, "start" },
10300 { IF_ACT_STOP, "stop" },
10301 { IF_SRC_FILE, "%u/%u@%s" },
10302 { IF_SRC_KERNEL, "%u/%u" },
10303 { IF_SRC_FILEADDR, "%u@%s" },
10304 { IF_SRC_KERNELADDR, "%u" },
10305 { IF_ACT_NONE, NULL },
10306};
10307
/*
 * Address filter string parser
 */
10311static int
10312perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
10313 struct list_head *filters)
10314{
10315 struct perf_addr_filter *filter = NULL;
10316 char *start, *orig, *filename = NULL;
10317 substring_t args[MAX_OPT_ARGS];
10318 int state = IF_STATE_ACTION, token;
10319 unsigned int kernel = 0;
10320 int ret = -EINVAL;
10321
10322 orig = fstr = kstrdup(fstr, GFP_KERNEL);
10323 if (!fstr)
10324 return -ENOMEM;
10325
10326 while ((start = strsep(&fstr, " ,\n")) != NULL) {
10327 static const enum perf_addr_filter_action_t actions[] = {
10328 [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
10329 [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
10330 [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
10331 };
10332 ret = -EINVAL;
10333
10334 if (!*start)
10335 continue;
10336
 /* filter definition begins */
10338 if (state == IF_STATE_ACTION) {
10339 filter = perf_addr_filter_new(event, filters);
10340 if (!filter)
10341 goto fail;
10342 }
10343
10344 token = match_token(start, if_tokens, args);
10345 switch (token) {
10346 case IF_ACT_FILTER:
10347 case IF_ACT_START:
10348 case IF_ACT_STOP:
10349 if (state != IF_STATE_ACTION)
10350 goto fail;
10351
10352 filter->action = actions[token];
10353 state = IF_STATE_SOURCE;
10354 break;
10355
10356 case IF_SRC_KERNELADDR:
10357 case IF_SRC_KERNEL:
10358 kernel = 1;
10359 fallthrough;
10360
10361 case IF_SRC_FILEADDR:
10362 case IF_SRC_FILE:
10363 if (state != IF_STATE_SOURCE)
10364 goto fail;
10365
10366 *args[0].to = 0;
10367 ret = kstrtoul(args[0].from, 0, &filter->offset);
10368 if (ret)
10369 goto fail;
10370
10371 if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
10372 *args[1].to = 0;
10373 ret = kstrtoul(args[1].from, 0, &filter->size);
10374 if (ret)
10375 goto fail;
10376 }
10377
10378 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
10379 int fpos = token == IF_SRC_FILE ? 2 : 1;
10380
10381 kfree(filename);
10382 filename = match_strdup(&args[fpos]);
10383 if (!filename) {
10384 ret = -ENOMEM;
10385 goto fail;
10386 }
10387 }
10388
10389 state = IF_STATE_END;
10390 break;
10391
10392 default:
10393 goto fail;
10394 }
10395
 /*
  * Filter definition is fully parsed, validate and install it.
  * Make sure that it doesn't contradict itself or the event's
  * attribute.
  */
10401 if (state == IF_STATE_END) {
10402 ret = -EINVAL;
10403 if (kernel && event->attr.exclude_kernel)
10404 goto fail;
10405
 /*
  * ACTION "filter" must have a non-zero length region
  * specified.
  */
10410 if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
10411 !filter->size)
10412 goto fail;
10413
10414 if (!kernel) {
10415 if (!filename)
10416 goto fail;
10417
 /*
  * For now, only file-based filters are supported in
  * per-task events; doing so for CPU-wide events
  * requires additional context switching trickery,
  * since the same object code will be mapped at
  * different virtual addresses in different
  * processes.
  */
10426 ret = -EOPNOTSUPP;
10427 if (!event->ctx->task)
10428 goto fail;
10429
 /* look up the path and grab its inode */
10431 ret = kern_path(filename, LOOKUP_FOLLOW,
10432 &filter->path);
10433 if (ret)
10434 goto fail;
10435
10436 ret = -EINVAL;
10437 if (!filter->path.dentry ||
10438 !S_ISREG(d_inode(filter->path.dentry)
10439 ->i_mode))
10440 goto fail;
10441
10442 event->addr_filters.nr_file_filters++;
10443 }
10444
10445
10446 state = IF_STATE_ACTION;
10447 filter = NULL;
10448 }
10449 }
10450
10451 if (state != IF_STATE_ACTION)
10452 goto fail;
10453
10454 kfree(filename);
10455 kfree(orig);
10456
10457 return 0;
10458
10459fail:
10460 kfree(filename);
10461 free_filters_list(filters);
10462 kfree(orig);
10463
10464 return ret;
10465}
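
/*
 * Illustrative usage (userspace sketch, not part of this file): filter
 * strings in the format parsed above are installed through the
 * PERF_EVENT_IOC_SET_FILTER ioctl() on an event fd; the path and the
 * addresses below are made up:
 *
 *	// trace a 0x2000-byte region of libc starting at file offset 0x1000:
 *	ioctl(event_fd, PERF_EVENT_IOC_SET_FILTER,
 *	      "filter 0x1000/0x2000@/usr/lib/libc.so.6");
 *
 *	// start tracing when a (made-up) kernel address is reached:
 *	ioctl(event_fd, PERF_EVENT_IOC_SET_FILTER, "start 0xffffffff81000000");
 */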
10466
10467static int
10468perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
10469{
10470 LIST_HEAD(filters);
10471 int ret;
10472
10473
10474
10475
10476
10477 lockdep_assert_held(&event->ctx->mutex);
10478
10479 if (WARN_ON_ONCE(event->parent))
10480 return -EINVAL;
10481
10482 ret = perf_event_parse_addr_filter(event, filter_str, &filters);
10483 if (ret)
10484 goto fail_clear_files;
10485
10486 ret = event->pmu->addr_filters_validate(&filters);
10487 if (ret)
10488 goto fail_free_filters;
10489
 /* remove existing filters, if any */
10491 perf_addr_filters_splice(event, &filters);
10492
 /* install new filters */
10494 perf_event_for_each_child(event, perf_event_addr_filters_apply);
10495
10496 return ret;
10497
10498fail_free_filters:
10499 free_filters_list(&filters);
10500
10501fail_clear_files:
10502 event->addr_filters.nr_file_filters = 0;
10503
10504 return ret;
10505}
10506
10507static int perf_event_set_filter(struct perf_event *event, void __user *arg)
10508{
10509 int ret = -EINVAL;
10510 char *filter_str;
10511
10512 filter_str = strndup_user(arg, PAGE_SIZE);
10513 if (IS_ERR(filter_str))
10514 return PTR_ERR(filter_str);
10515
10516#ifdef CONFIG_EVENT_TRACING
10517 if (perf_event_is_tracing(event)) {
10518 struct perf_event_context *ctx = event->ctx;
10519
 /*
  * Beware, here be dragons!!
  *
  * the tracepoint muck will deadlock against ctx->mutex, but
  * the tracepoint stuff does not actually need it. So
  * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
  * already have a reference on ctx.
  *
  * This can result in event getting moved to a different ctx,
  * but that does not affect the tracepoint state.
  */
10531 mutex_unlock(&ctx->mutex);
10532 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
10533 mutex_lock(&ctx->mutex);
10534 } else
10535#endif
10536 if (has_addr_filter(event))
10537 ret = perf_event_set_addr_filter(event, filter_str);
10538
10539 kfree(filter_str);
10540 return ret;
10541}
10542
/*
 * hrtimer based swevent callback
 */
10547static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
10548{
10549 enum hrtimer_restart ret = HRTIMER_RESTART;
10550 struct perf_sample_data data;
10551 struct pt_regs *regs;
10552 struct perf_event *event;
10553 u64 period;
10554
10555 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
10556
10557 if (event->state != PERF_EVENT_STATE_ACTIVE)
10558 return HRTIMER_NORESTART;
10559
10560 event->pmu->read(event);
10561
10562 perf_sample_data_init(&data, 0, event->hw.last_period);
10563 regs = get_irq_regs();
10564
10565 if (regs && !perf_exclude_event(event, regs)) {
10566 if (!(event->attr.exclude_idle && is_idle_task(current)))
10567 if (__perf_event_overflow(event, 1, &data, regs))
10568 ret = HRTIMER_NORESTART;
10569 }
10570
10571 period = max_t(u64, 10000, event->hw.sample_period);
10572 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
10573
10574 return ret;
10575}
10576
10577static void perf_swevent_start_hrtimer(struct perf_event *event)
10578{
10579 struct hw_perf_event *hwc = &event->hw;
10580 s64 period;
10581
10582 if (!is_sampling_event(event))
10583 return;
10584
10585 period = local64_read(&hwc->period_left);
10586 if (period) {
10587 if (period < 0)
10588 period = 10000;
10589
10590 local64_set(&hwc->period_left, 0);
10591 } else {
10592 period = max_t(u64, 10000, hwc->sample_period);
10593 }
10594 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
10595 HRTIMER_MODE_REL_PINNED_HARD);
10596}
10597
10598static void perf_swevent_cancel_hrtimer(struct perf_event *event)
10599{
10600 struct hw_perf_event *hwc = &event->hw;
10601
10602 if (is_sampling_event(event)) {
10603 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
10604 local64_set(&hwc->period_left, ktime_to_ns(remaining));
10605
10606 hrtimer_cancel(&hwc->hrtimer);
10607 }
10608}
10609
10610static void perf_swevent_init_hrtimer(struct perf_event *event)
10611{
10612 struct hw_perf_event *hwc = &event->hw;
10613
10614 if (!is_sampling_event(event))
10615 return;
10616
10617 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
10618 hwc->hrtimer.function = perf_swevent_hrtimer;
10619
 /*
  * Since hrtimers have a fixed rate, we can do a static freq->period
  * mapping and avoid the whole period adjust feedback stuff.
  */
10624 if (event->attr.freq) {
10625 long freq = event->attr.sample_freq;
10626
10627 event->attr.sample_period = NSEC_PER_SEC / freq;
10628 hwc->sample_period = event->attr.sample_period;
10629 local64_set(&hwc->period_left, hwc->sample_period);
10630 hwc->last_period = hwc->sample_period;
10631 event->attr.freq = 0;
10632 }
10633}
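
/*
 * Worked example of the static freq->period mapping above (illustrative):
 * with attr.freq = 1 and attr.sample_freq = 1000 (Hz), the period becomes
 * NSEC_PER_SEC / 1000 = 1,000,000 ns, i.e. one timer expiry per 1 ms;
 * attr.freq is then cleared, so no dynamic period adjustment happens later.
 */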
10634
/*
 * Software event: cpu wall time clock
 */
10639static void cpu_clock_event_update(struct perf_event *event)
10640{
10641 s64 prev;
10642 u64 now;
10643
10644 now = local_clock();
10645 prev = local64_xchg(&event->hw.prev_count, now);
10646 local64_add(now - prev, &event->count);
10647}
10648
10649static void cpu_clock_event_start(struct perf_event *event, int flags)
10650{
10651 local64_set(&event->hw.prev_count, local_clock());
10652 perf_swevent_start_hrtimer(event);
10653}
10654
10655static void cpu_clock_event_stop(struct perf_event *event, int flags)
10656{
10657 perf_swevent_cancel_hrtimer(event);
10658 cpu_clock_event_update(event);
10659}
10660
10661static int cpu_clock_event_add(struct perf_event *event, int flags)
10662{
10663 if (flags & PERF_EF_START)
10664 cpu_clock_event_start(event, flags);
10665 perf_event_update_userpage(event);
10666
10667 return 0;
10668}
10669
10670static void cpu_clock_event_del(struct perf_event *event, int flags)
10671{
10672 cpu_clock_event_stop(event, flags);
10673}
10674
10675static void cpu_clock_event_read(struct perf_event *event)
10676{
10677 cpu_clock_event_update(event);
10678}
10679
10680static int cpu_clock_event_init(struct perf_event *event)
10681{
10682 if (event->attr.type != PERF_TYPE_SOFTWARE)
10683 return -ENOENT;
10684
10685 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
10686 return -ENOENT;
10687
10688
10689
10690
10691 if (has_branch_stack(event))
10692 return -EOPNOTSUPP;
10693
10694 perf_swevent_init_hrtimer(event);
10695
10696 return 0;
10697}
10698
10699static struct pmu perf_cpu_clock = {
10700 .task_ctx_nr = perf_sw_context,
10701
10702 .capabilities = PERF_PMU_CAP_NO_NMI,
10703
10704 .event_init = cpu_clock_event_init,
10705 .add = cpu_clock_event_add,
10706 .del = cpu_clock_event_del,
10707 .start = cpu_clock_event_start,
10708 .stop = cpu_clock_event_stop,
10709 .read = cpu_clock_event_read,
10710};
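
/*
 * Illustrative userspace use of this software PMU (a sketch, not part of
 * this file; error handling omitted):
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_SOFTWARE,
 *		.size	= sizeof(attr),
 *		.config	= PERF_COUNT_SW_CPU_CLOCK,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *	u64 ns;
 *	read(fd, &ns, sizeof(ns));	// CPU clock time, in nanoseconds
 */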
10711
/*
 * Software event: task time clock
 */
10716static void task_clock_event_update(struct perf_event *event, u64 now)
10717{
10718 u64 prev;
10719 s64 delta;
10720
10721 prev = local64_xchg(&event->hw.prev_count, now);
10722 delta = now - prev;
10723 local64_add(delta, &event->count);
10724}
10725
10726static void task_clock_event_start(struct perf_event *event, int flags)
10727{
10728 local64_set(&event->hw.prev_count, event->ctx->time);
10729 perf_swevent_start_hrtimer(event);
10730}
10731
10732static void task_clock_event_stop(struct perf_event *event, int flags)
10733{
10734 perf_swevent_cancel_hrtimer(event);
10735 task_clock_event_update(event, event->ctx->time);
10736}
10737
10738static int task_clock_event_add(struct perf_event *event, int flags)
10739{
10740 if (flags & PERF_EF_START)
10741 task_clock_event_start(event, flags);
10742 perf_event_update_userpage(event);
10743
10744 return 0;
10745}
10746
10747static void task_clock_event_del(struct perf_event *event, int flags)
10748{
10749 task_clock_event_stop(event, PERF_EF_UPDATE);
10750}
10751
10752static void task_clock_event_read(struct perf_event *event)
10753{
10754 u64 now = perf_clock();
10755 u64 delta = now - event->ctx->timestamp;
10756 u64 time = event->ctx->time + delta;
10757
10758 task_clock_event_update(event, time);
10759}
10760
10761static int task_clock_event_init(struct perf_event *event)
10762{
10763 if (event->attr.type != PERF_TYPE_SOFTWARE)
10764 return -ENOENT;
10765
10766 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
10767 return -ENOENT;
10768
10769
10770
10771
10772 if (has_branch_stack(event))
10773 return -EOPNOTSUPP;
10774
10775 perf_swevent_init_hrtimer(event);
10776
10777 return 0;
10778}
10779
10780static struct pmu perf_task_clock = {
10781 .task_ctx_nr = perf_sw_context,
10782
10783 .capabilities = PERF_PMU_CAP_NO_NMI,
10784
10785 .event_init = task_clock_event_init,
10786 .add = task_clock_event_add,
10787 .del = task_clock_event_del,
10788 .start = task_clock_event_start,
10789 .stop = task_clock_event_stop,
10790 .read = task_clock_event_read,
10791};
10792
10793static void perf_pmu_nop_void(struct pmu *pmu)
10794{
10795}
10796
10797static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
10798{
10799}
10800
10801static int perf_pmu_nop_int(struct pmu *pmu)
10802{
10803 return 0;
10804}
10805
10806static int perf_event_nop_int(struct perf_event *event, u64 value)
10807{
10808 return 0;
10809}
10810
10811static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
10812
10813static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
10814{
10815 __this_cpu_write(nop_txn_flags, flags);
10816
10817 if (flags & ~PERF_PMU_TXN_ADD)
10818 return;
10819
10820 perf_pmu_disable(pmu);
10821}
10822
10823static int perf_pmu_commit_txn(struct pmu *pmu)
10824{
10825 unsigned int flags = __this_cpu_read(nop_txn_flags);
10826
10827 __this_cpu_write(nop_txn_flags, 0);
10828
10829 if (flags & ~PERF_PMU_TXN_ADD)
10830 return 0;
10831
10832 perf_pmu_enable(pmu);
10833 return 0;
10834}
10835
10836static void perf_pmu_cancel_txn(struct pmu *pmu)
10837{
10838 unsigned int flags = __this_cpu_read(nop_txn_flags);
10839
10840 __this_cpu_write(nop_txn_flags, 0);
10841
10842 if (flags & ~PERF_PMU_TXN_ADD)
10843 return;
10844
10845 perf_pmu_enable(pmu);
10846}
10847
10848static int perf_event_idx_default(struct perf_event *event)
10849{
10850 return 0;
10851}
10852
/*
 * Ensures all contexts with the same task_ctx_nr have the same
 * pmu_cpu_context too.
 */
10857static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
10858{
10859 struct pmu *pmu;
10860
10861 if (ctxn < 0)
10862 return NULL;
10863
10864 list_for_each_entry(pmu, &pmus, entry) {
10865 if (pmu->task_ctx_nr == ctxn)
10866 return pmu->pmu_cpu_context;
10867 }
10868
10869 return NULL;
10870}
10871
10872static void free_pmu_context(struct pmu *pmu)
10873{
 /*
  * Static contexts such as perf_sw_context have a global lifetime
  * and may be shared between different PMUs; only free a context
  * this PMU allocated for itself.
  */
10879 if (pmu->task_ctx_nr > perf_invalid_context)
10880 return;
10881
10882 free_percpu(pmu->pmu_cpu_context);
10883}
10884
/*
 * Let userspace know that this PMU supports address range filtering:
 */
10888static ssize_t nr_addr_filters_show(struct device *dev,
10889 struct device_attribute *attr,
10890 char *page)
10891{
10892 struct pmu *pmu = dev_get_drvdata(dev);
10893
10894 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
10895}
10896DEVICE_ATTR_RO(nr_addr_filters);
10897
10898static struct idr pmu_idr;
10899
10900static ssize_t
10901type_show(struct device *dev, struct device_attribute *attr, char *page)
10902{
10903 struct pmu *pmu = dev_get_drvdata(dev);
10904
10905 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
10906}
10907static DEVICE_ATTR_RO(type);
10908
10909static ssize_t
10910perf_event_mux_interval_ms_show(struct device *dev,
10911 struct device_attribute *attr,
10912 char *page)
10913{
10914 struct pmu *pmu = dev_get_drvdata(dev);
10915
10916 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
10917}
10918
10919static DEFINE_MUTEX(mux_interval_mutex);
10920
10921static ssize_t
10922perf_event_mux_interval_ms_store(struct device *dev,
10923 struct device_attribute *attr,
10924 const char *buf, size_t count)
10925{
10926 struct pmu *pmu = dev_get_drvdata(dev);
10927 int timer, cpu, ret;
10928
10929 ret = kstrtoint(buf, 0, &timer);
10930 if (ret)
10931 return ret;
10932
10933 if (timer < 1)
10934 return -EINVAL;
10935
 /* same value, nothing to do */
10937 if (timer == pmu->hrtimer_interval_ms)
10938 return count;
10939
10940 mutex_lock(&mux_interval_mutex);
10941 pmu->hrtimer_interval_ms = timer;
10942
 /* update all cpuctx for this PMU */
10944 cpus_read_lock();
10945 for_each_online_cpu(cpu) {
10946 struct perf_cpu_context *cpuctx;
10947 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
10948 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
10949
10950 cpu_function_call(cpu,
10951 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
10952 }
10953 cpus_read_unlock();
10954 mutex_unlock(&mux_interval_mutex);
10955
10956 return count;
10957}
10958static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
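
/*
 * The attribute above ends up in sysfs; e.g. (illustrative, the PMU name
 * and the current value vary by system):
 *
 *	# cat /sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms
 *	4
 *	# echo 2 > /sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms
 */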
10959
10960static struct attribute *pmu_dev_attrs[] = {
10961 &dev_attr_type.attr,
10962 &dev_attr_perf_event_mux_interval_ms.attr,
10963 NULL,
10964};
10965ATTRIBUTE_GROUPS(pmu_dev);
10966
10967static int pmu_bus_running;
10968static struct bus_type pmu_bus = {
10969 .name = "event_source",
10970 .dev_groups = pmu_dev_groups,
10971};
10972
10973static void pmu_dev_release(struct device *dev)
10974{
10975 kfree(dev);
10976}
10977
10978static int pmu_dev_alloc(struct pmu *pmu)
10979{
10980 int ret = -ENOMEM;
10981
10982 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
10983 if (!pmu->dev)
10984 goto out;
10985
10986 pmu->dev->groups = pmu->attr_groups;
10987 device_initialize(pmu->dev);
10988 ret = dev_set_name(pmu->dev, "%s", pmu->name);
10989 if (ret)
10990 goto free_dev;
10991
10992 dev_set_drvdata(pmu->dev, pmu);
10993 pmu->dev->bus = &pmu_bus;
10994 pmu->dev->release = pmu_dev_release;
10995 ret = device_add(pmu->dev);
10996 if (ret)
10997 goto free_dev;
10998
 /* For PMUs with address filters, throw in an extra attribute: */
11000 if (pmu->nr_addr_filters)
11001 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
11002
11003 if (ret)
11004 goto del_dev;
11005
11006 if (pmu->attr_update)
11007 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
11008
11009 if (ret)
11010 goto del_dev;
11011
11012out:
11013 return ret;
11014
11015del_dev:
11016 device_del(pmu->dev);
11017
11018free_dev:
11019 put_device(pmu->dev);
11020 goto out;
11021}
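
/*
 * Resulting sysfs layout for a named PMU (illustrative):
 *
 *	/sys/bus/event_source/devices/<pmu->name>/type
 *	/sys/bus/event_source/devices/<pmu->name>/perf_event_mux_interval_ms
 *	/sys/bus/event_source/devices/<pmu->name>/nr_addr_filters	(if any)
 *
 * plus whatever the driver supplies via pmu->attr_groups / pmu->attr_update.
 */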
11022
11023static struct lock_class_key cpuctx_mutex;
11024static struct lock_class_key cpuctx_lock;
11025
11026int perf_pmu_register(struct pmu *pmu, const char *name, int type)
11027{
11028 int cpu, ret, max = PERF_TYPE_MAX;
11029
11030 mutex_lock(&pmus_lock);
11031 ret = -ENOMEM;
11032 pmu->pmu_disable_count = alloc_percpu(int);
11033 if (!pmu->pmu_disable_count)
11034 goto unlock;
11035
11036 pmu->type = -1;
11037 if (!name)
11038 goto skip_type;
11039 pmu->name = name;
11040
11041 if (type != PERF_TYPE_SOFTWARE) {
11042 if (type >= 0)
11043 max = type;
11044
11045 ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
11046 if (ret < 0)
11047 goto free_pdc;
11048
11049 WARN_ON(type >= 0 && ret != type);
11050
11051 type = ret;
11052 }
11053 pmu->type = type;
11054
11055 if (pmu_bus_running) {
11056 ret = pmu_dev_alloc(pmu);
11057 if (ret)
11058 goto free_idr;
11059 }
11060
11061skip_type:
11062 if (pmu->task_ctx_nr == perf_hw_context) {
11063 static int hw_context_taken = 0;
11064
 /*
  * Other than systems with heterogeneous CPUs, it never makes
  * sense for two PMUs to share perf_hw_context. PMUs which are
  * uncore must use perf_invalid_context.
  */
11070 if (WARN_ON_ONCE(hw_context_taken &&
11071 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
11072 pmu->task_ctx_nr = perf_invalid_context;
11073
11074 hw_context_taken = 1;
11075 }
11076
11077 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
11078 if (pmu->pmu_cpu_context)
11079 goto got_cpu_context;
11080
11081 ret = -ENOMEM;
11082 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
11083 if (!pmu->pmu_cpu_context)
11084 goto free_dev;
11085
11086 for_each_possible_cpu(cpu) {
11087 struct perf_cpu_context *cpuctx;
11088
11089 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11090 __perf_event_init_context(&cpuctx->ctx);
11091 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
11092 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
11093 cpuctx->ctx.pmu = pmu;
11094 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
11095
11096 __perf_mux_hrtimer_init(cpuctx, cpu);
11097
11098 cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
11099 cpuctx->heap = cpuctx->heap_default;
11100 }
11101
11102got_cpu_context:
11103 if (!pmu->start_txn) {
11104 if (pmu->pmu_enable) {
 /*
  * If we have pmu_enable/pmu_disable calls, install
  * transaction stubs that use that to try and batch
  * hardware accesses.
  */
11110 pmu->start_txn = perf_pmu_start_txn;
11111 pmu->commit_txn = perf_pmu_commit_txn;
11112 pmu->cancel_txn = perf_pmu_cancel_txn;
11113 } else {
11114 pmu->start_txn = perf_pmu_nop_txn;
11115 pmu->commit_txn = perf_pmu_nop_int;
11116 pmu->cancel_txn = perf_pmu_nop_void;
11117 }
11118 }
11119
11120 if (!pmu->pmu_enable) {
11121 pmu->pmu_enable = perf_pmu_nop_void;
11122 pmu->pmu_disable = perf_pmu_nop_void;
11123 }
11124
11125 if (!pmu->check_period)
11126 pmu->check_period = perf_event_nop_int;
11127
11128 if (!pmu->event_idx)
11129 pmu->event_idx = perf_event_idx_default;
11130
 /*
  * Software PMUs (and unnamed PMUs) are never in the idr, so keep
  * them at the head of the list; the fallback linear scan in
  * perf_init_event() then finds them first.
  */
11136 if (type == PERF_TYPE_SOFTWARE || !name)
11137 list_add_rcu(&pmu->entry, &pmus);
11138 else
11139 list_add_tail_rcu(&pmu->entry, &pmus);
11140
11141 atomic_set(&pmu->exclusive_cnt, 0);
11142 ret = 0;
11143unlock:
11144 mutex_unlock(&pmus_lock);
11145
11146 return ret;
11147
11148free_dev:
11149 device_del(pmu->dev);
11150 put_device(pmu->dev);
11151
11152free_idr:
11153 if (pmu->type != PERF_TYPE_SOFTWARE)
11154 idr_remove(&pmu_idr, pmu->type);
11155
11156free_pdc:
11157 free_percpu(pmu->pmu_disable_count);
11158 goto unlock;
11159}
11160EXPORT_SYMBOL_GPL(perf_pmu_register);
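
/*
 * Minimal driver-side sketch of a caller (illustrative only; the "my_pmu_*"
 * callbacks are assumptions, not functions from this file):
 *
 *	static struct pmu my_pmu = {
 *		.task_ctx_nr	= perf_invalid_context,	// uncore-style PMU
 *		.event_init	= my_pmu_event_init,
 *		.add		= my_pmu_add,
 *		.del		= my_pmu_del,
 *		.start		= my_pmu_start,
 *		.stop		= my_pmu_stop,
 *		.read		= my_pmu_read,
 *	};
 *
 *	ret = perf_pmu_register(&my_pmu, "my_pmu", -1);	// -1: allocate a type
 */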
11161
11162void perf_pmu_unregister(struct pmu *pmu)
11163{
11164 mutex_lock(&pmus_lock);
11165 list_del_rcu(&pmu->entry);
11166
 /*
  * We dereference the pmu list under both SRCU and regular RCU, so
  * synchronize against both of those.
  */
11171 synchronize_srcu(&pmus_srcu);
11172 synchronize_rcu();
11173
11174 free_percpu(pmu->pmu_disable_count);
11175 if (pmu->type != PERF_TYPE_SOFTWARE)
11176 idr_remove(&pmu_idr, pmu->type);
11177 if (pmu_bus_running) {
11178 if (pmu->nr_addr_filters)
11179 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
11180 device_del(pmu->dev);
11181 put_device(pmu->dev);
11182 }
11183 free_pmu_context(pmu);
11184 mutex_unlock(&pmus_lock);
11185}
11186EXPORT_SYMBOL_GPL(perf_pmu_unregister);
11187
11188static inline bool has_extended_regs(struct perf_event *event)
11189{
11190 return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
11191 (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
11192}
11193
11194static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
11195{
11196 struct perf_event_context *ctx = NULL;
11197 int ret;
11198
11199 if (!try_module_get(pmu->module))
11200 return -ENODEV;
11201
 /*
  * A number of pmu->event_init() implementations iterate the
  * sibling_list to, for example, validate if the group fits on the
  * PMU. Therefore, if this is a sibling event, acquire the
  * ctx->mutex to protect the sibling_list.
  */
11208 if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
 /*
  * This ctx->mutex can nest when we're called through
  * inheritance. See the perf_event_ctx_lock_nested() comment.
  */
11213 ctx = perf_event_ctx_lock_nested(event->group_leader,
11214 SINGLE_DEPTH_NESTING);
11215 BUG_ON(!ctx);
11216 }
11217
11218 event->pmu = pmu;
11219 ret = pmu->event_init(event);
11220
11221 if (ctx)
11222 perf_event_ctx_unlock(event->group_leader, ctx);
11223
11224 if (!ret) {
11225 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
11226 has_extended_regs(event))
11227 ret = -EOPNOTSUPP;
11228
11229 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
11230 event_has_any_exclude_flag(event))
11231 ret = -EINVAL;
11232
11233 if (ret && event->destroy)
11234 event->destroy(event);
11235 }
11236
11237 if (ret)
11238 module_put(pmu->module);
11239
11240 return ret;
11241}
11242
11243static struct pmu *perf_init_event(struct perf_event *event)
11244{
11245 bool extended_type = false;
11246 int idx, type, ret;
11247 struct pmu *pmu;
11248
11249 idx = srcu_read_lock(&pmus_srcu);
11250
11251
11252 if (event->parent && event->parent->pmu) {
11253 pmu = event->parent->pmu;
11254 ret = perf_try_init_event(pmu, event);
11255 if (!ret)
11256 goto unlock;
11257 }
11258
 /*
  * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
  * are often aliases for PERF_TYPE_RAW.
  */
11263 type = event->attr.type;
11264 if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
11265 type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
11266 if (!type) {
11267 type = PERF_TYPE_RAW;
11268 } else {
11269 extended_type = true;
11270 event->attr.config &= PERF_HW_EVENT_MASK;
11271 }
11272 }
11273
11274again:
11275 rcu_read_lock();
11276 pmu = idr_find(&pmu_idr, type);
11277 rcu_read_unlock();
11278 if (pmu) {
11279 if (event->attr.type != type && type != PERF_TYPE_RAW &&
11280 !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
11281 goto fail;
11282
11283 ret = perf_try_init_event(pmu, event);
11284 if (ret == -ENOENT && event->attr.type != type && !extended_type) {
11285 type = event->attr.type;
11286 goto again;
11287 }
11288
11289 if (ret)
11290 pmu = ERR_PTR(ret);
11291
11292 goto unlock;
11293 }
11294
11295 list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
11296 ret = perf_try_init_event(pmu, event);
11297 if (!ret)
11298 goto unlock;
11299
11300 if (ret != -ENOENT) {
11301 pmu = ERR_PTR(ret);
11302 goto unlock;
11303 }
11304 }
11305fail:
11306 pmu = ERR_PTR(-ENOENT);
11307unlock:
11308 srcu_read_unlock(&pmus_srcu, idx);
11309
11310 return pmu;
11311}
11312
11313static void attach_sb_event(struct perf_event *event)
11314{
11315 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
11316
11317 raw_spin_lock(&pel->lock);
11318 list_add_rcu(&event->sb_list, &pel->list);
11319 raw_spin_unlock(&pel->lock);
11320}
11321
/*
 * We keep a list of all !task (and therefore per-cpu) events
 * that need to receive side-band records.
 *
 * This avoids having to scan all the various PMU per-cpu contexts
 * looking for them.
 */
11329static void account_pmu_sb_event(struct perf_event *event)
11330{
11331 if (is_sb_event(event))
11332 attach_sb_event(event);
11333}
11334
11335static void account_event_cpu(struct perf_event *event, int cpu)
11336{
11337 if (event->parent)
11338 return;
11339
11340 if (is_cgroup_event(event))
11341 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
11342}
11343
11344
11345static void account_freq_event_nohz(void)
11346{
11347#ifdef CONFIG_NO_HZ_FULL
11348
11349 spin_lock(&nr_freq_lock);
11350 if (atomic_inc_return(&nr_freq_events) == 1)
11351 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
11352 spin_unlock(&nr_freq_lock);
11353#endif
11354}
11355
11356static void account_freq_event(void)
11357{
11358 if (tick_nohz_full_enabled())
11359 account_freq_event_nohz();
11360 else
11361 atomic_inc(&nr_freq_events);
11362}
11363
11364
11365static void account_event(struct perf_event *event)
11366{
11367 bool inc = false;
11368
11369 if (event->parent)
11370 return;
11371
11372 if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
11373 inc = true;
11374 if (event->attr.mmap || event->attr.mmap_data)
11375 atomic_inc(&nr_mmap_events);
11376 if (event->attr.build_id)
11377 atomic_inc(&nr_build_id_events);
11378 if (event->attr.comm)
11379 atomic_inc(&nr_comm_events);
11380 if (event->attr.namespaces)
11381 atomic_inc(&nr_namespaces_events);
11382 if (event->attr.cgroup)
11383 atomic_inc(&nr_cgroup_events);
11384 if (event->attr.task)
11385 atomic_inc(&nr_task_events);
11386 if (event->attr.freq)
11387 account_freq_event();
11388 if (event->attr.context_switch) {
11389 atomic_inc(&nr_switch_events);
11390 inc = true;
11391 }
11392 if (has_branch_stack(event))
11393 inc = true;
11394 if (is_cgroup_event(event))
11395 inc = true;
11396 if (event->attr.ksymbol)
11397 atomic_inc(&nr_ksymbol_events);
11398 if (event->attr.bpf_event)
11399 atomic_inc(&nr_bpf_events);
11400 if (event->attr.text_poke)
11401 atomic_inc(&nr_text_poke_events);
11402
11403 if (inc) {
 /*
  * We need the mutex here because static_branch_enable()
  * must complete *before* the perf_sched_count increment
  * becomes visible.
  */
11409 if (atomic_inc_not_zero(&perf_sched_count))
11410 goto enabled;
11411
11412 mutex_lock(&perf_sched_mutex);
11413 if (!atomic_read(&perf_sched_count)) {
11414 static_branch_enable(&perf_sched_events);
11415
11416
11417
11418
11419
11420 synchronize_rcu();
11421 }
11422
11423
11424
11425
11426 atomic_inc(&perf_sched_count);
11427 mutex_unlock(&perf_sched_mutex);
11428 }
11429enabled:
11430
11431 account_event_cpu(event, event->cpu);
11432
11433 account_pmu_sb_event(event);
11434}
11435
/*
 * Allocate and initialize an event structure
 */
11439static struct perf_event *
11440perf_event_alloc(struct perf_event_attr *attr, int cpu,
11441 struct task_struct *task,
11442 struct perf_event *group_leader,
11443 struct perf_event *parent_event,
11444 perf_overflow_handler_t overflow_handler,
11445 void *context, int cgroup_fd)
11446{
11447 struct pmu *pmu;
11448 struct perf_event *event;
11449 struct hw_perf_event *hwc;
11450 long err = -EINVAL;
11451 int node;
11452
11453 if ((unsigned)cpu >= nr_cpu_ids) {
11454 if (!task || cpu != -1)
11455 return ERR_PTR(-EINVAL);
11456 }
11457 if (attr->sigtrap && !task) {
11458
11459 return ERR_PTR(-EINVAL);
11460 }
11461
11462 node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
11463 event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
11464 node);
11465 if (!event)
11466 return ERR_PTR(-ENOMEM);
11467
 /*
  * Single events are their own group leaders, with an
  * empty sibling list:
  */
11472 if (!group_leader)
11473 group_leader = event;
11474
11475 mutex_init(&event->child_mutex);
11476 INIT_LIST_HEAD(&event->child_list);
11477
11478 INIT_LIST_HEAD(&event->event_entry);
11479 INIT_LIST_HEAD(&event->sibling_list);
11480 INIT_LIST_HEAD(&event->active_list);
11481 init_event_group(event);
11482 INIT_LIST_HEAD(&event->rb_entry);
11483 INIT_LIST_HEAD(&event->active_entry);
11484 INIT_LIST_HEAD(&event->addr_filters.list);
11485 INIT_HLIST_NODE(&event->hlist_entry);
11486
11487
11488 init_waitqueue_head(&event->waitq);
11489 event->pending_disable = -1;
11490 init_irq_work(&event->pending, perf_pending_event);
11491
11492 mutex_init(&event->mmap_mutex);
11493 raw_spin_lock_init(&event->addr_filters.lock);
11494
11495 atomic_long_set(&event->refcount, 1);
11496 event->cpu = cpu;
11497 event->attr = *attr;
11498 event->group_leader = group_leader;
11499 event->pmu = NULL;
11500 event->oncpu = -1;
11501
11502 event->parent = parent_event;
11503
11504 event->ns = get_pid_ns(task_active_pid_ns(current));
11505 event->id = atomic64_inc_return(&perf_event_id);
11506
11507 event->state = PERF_EVENT_STATE_INACTIVE;
11508
11509 if (event->attr.sigtrap)
11510 atomic_set(&event->event_limit, 1);
11511
11512 if (task) {
11513 event->attach_state = PERF_ATTACH_TASK;
 /*
  * XXX pmu::event_init needs to know what task to account to
  * and we cannot use the ctx information because we need the
  * pmu before we get a ctx.
  */
11519 event->hw.target = get_task_struct(task);
11520 }
11521
11522 event->clock = &local_clock;
11523 if (parent_event)
11524 event->clock = parent_event->clock;
11525
11526 if (!overflow_handler && parent_event) {
11527 overflow_handler = parent_event->overflow_handler;
11528 context = parent_event->overflow_handler_context;
11529#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
11530 if (overflow_handler == bpf_overflow_handler) {
11531 struct bpf_prog *prog = parent_event->prog;
11532
11533 bpf_prog_inc(prog);
11534 event->prog = prog;
11535 event->orig_overflow_handler =
11536 parent_event->orig_overflow_handler;
11537 }
11538#endif
11539 }
11540
11541 if (overflow_handler) {
11542 event->overflow_handler = overflow_handler;
11543 event->overflow_handler_context = context;
 } else if (is_write_backward(event)) {
11545 event->overflow_handler = perf_event_output_backward;
11546 event->overflow_handler_context = NULL;
11547 } else {
11548 event->overflow_handler = perf_event_output_forward;
11549 event->overflow_handler_context = NULL;
11550 }
11551
11552 perf_event__state_init(event);
11553
11554 pmu = NULL;
11555
11556 hwc = &event->hw;
11557 hwc->sample_period = attr->sample_period;
11558 if (attr->freq && attr->sample_freq)
11559 hwc->sample_period = 1;
11560 hwc->last_period = hwc->sample_period;
11561
11562 local64_set(&hwc->period_left, hwc->sample_period);
11563
 /*
  * We currently do not support PERF_SAMPLE_READ on inherited
  * events. See perf_output_read().
  */
11568 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
11569 goto err_ns;
11570
11571 if (!has_branch_stack(event))
11572 event->attr.branch_sample_type = 0;
11573
11574 pmu = perf_init_event(event);
11575 if (IS_ERR(pmu)) {
11576 err = PTR_ERR(pmu);
11577 goto err_ns;
11578 }
11579
 /*
  * Disallow uncore-cgroup events, they don't make sense as the
  * cgroup will be different on other CPUs in the uncore mask.
  */
11584 if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
11585 err = -EINVAL;
11586 goto err_pmu;
11587 }
11588
11589 if (event->attr.aux_output &&
11590 !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
11591 err = -EOPNOTSUPP;
11592 goto err_pmu;
11593 }
11594
11595 if (cgroup_fd != -1) {
11596 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
11597 if (err)
11598 goto err_pmu;
11599 }
11600
11601 err = exclusive_event_init(event);
11602 if (err)
11603 goto err_pmu;
11604
11605 if (has_addr_filter(event)) {
11606 event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
11607 sizeof(struct perf_addr_filter_range),
11608 GFP_KERNEL);
11609 if (!event->addr_filter_ranges) {
11610 err = -ENOMEM;
11611 goto err_per_task;
11612 }
11613
 /*
  * Clone the parent's vma offsets: they are valid until exec()
  * even if the mm is not shared with the parent.
  */
11618 if (event->parent) {
11619 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
11620
11621 raw_spin_lock_irq(&ifh->lock);
11622 memcpy(event->addr_filter_ranges,
11623 event->parent->addr_filter_ranges,
11624 pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
11625 raw_spin_unlock_irq(&ifh->lock);
11626 }
11627
 /* force hw sync on the address filters */
11629 event->addr_filters_gen = 1;
11630 }
11631
11632 if (!event->parent) {
11633 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
11634 err = get_callchain_buffers(attr->sample_max_stack);
11635 if (err)
11636 goto err_addr_filters;
11637 }
11638 }
11639
11640 err = security_perf_event_alloc(event);
11641 if (err)
11642 goto err_callchain_buffer;
11643
 /* symmetric to unaccount_event() in _free_event() */
11645 account_event(event);
11646
11647 return event;
11648
11649err_callchain_buffer:
11650 if (!event->parent) {
11651 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
11652 put_callchain_buffers();
11653 }
11654err_addr_filters:
11655 kfree(event->addr_filter_ranges);
11656
11657err_per_task:
11658 exclusive_event_destroy(event);
11659
11660err_pmu:
11661 if (is_cgroup_event(event))
11662 perf_detach_cgroup(event);
11663 if (event->destroy)
11664 event->destroy(event);
11665 module_put(pmu->module);
11666err_ns:
11667 if (event->ns)
11668 put_pid_ns(event->ns);
11669 if (event->hw.target)
11670 put_task_struct(event->hw.target);
11671 kmem_cache_free(perf_event_cache, event);
11672
11673 return ERR_PTR(err);
11674}
11675
11676static int perf_copy_attr(struct perf_event_attr __user *uattr,
11677 struct perf_event_attr *attr)
11678{
11679 u32 size;
11680 int ret;
11681
 /* Zero the full structure, so that a short copy will be nice. */
11683 memset(attr, 0, sizeof(*attr));
11684
11685 ret = get_user(size, &uattr->size);
11686 if (ret)
11687 return ret;
11688
 /* ABI compatibility: a size of zero means the original VER0 layout */
11690 if (!size)
11691 size = PERF_ATTR_SIZE_VER0;
11692 if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
11693 goto err_size;
11694
11695 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
11696 if (ret) {
11697 if (ret == -E2BIG)
11698 goto err_size;
11699 return ret;
11700 }
11701
11702 attr->size = size;
11703
11704 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
11705 return -EINVAL;
11706
11707 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
11708 return -EINVAL;
11709
11710 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
11711 return -EINVAL;
11712
11713 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
11714 u64 mask = attr->branch_sample_type;
11715
11716
11717 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
11718 return -EINVAL;
11719
11720
11721 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
11722 return -EINVAL;
11723
11724
11725 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
11726
11727
11728 if (!attr->exclude_kernel)
11729 mask |= PERF_SAMPLE_BRANCH_KERNEL;
11730
11731 if (!attr->exclude_user)
11732 mask |= PERF_SAMPLE_BRANCH_USER;
11733
11734 if (!attr->exclude_hv)
11735 mask |= PERF_SAMPLE_BRANCH_HV;
11736
11737
11738
11739 attr->branch_sample_type = mask;
11740 }
11741
11742 if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
11743 ret = perf_allow_kernel(attr);
11744 if (ret)
11745 return ret;
11746 }
11747 }
11748
11749 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
11750 ret = perf_reg_validate(attr->sample_regs_user);
11751 if (ret)
11752 return ret;
11753 }
11754
11755 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
11756 if (!arch_perf_have_user_stack_dump())
11757 return -ENOSYS;
11758
 /*
  * We have a __u32 for the stack size, but only sizes up to
  * USHRT_MAX are supported, and the size must be a multiple
  * of u64:
  */
11764 if (attr->sample_stack_user >= USHRT_MAX)
11765 return -EINVAL;
11766 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
11767 return -EINVAL;
11768 }
11769
11770 if (!attr->sample_max_stack)
11771 attr->sample_max_stack = sysctl_perf_event_max_stack;
11772
11773 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
11774 ret = perf_reg_validate(attr->sample_regs_intr);
11775
11776#ifndef CONFIG_CGROUP_PERF
11777 if (attr->sample_type & PERF_SAMPLE_CGROUP)
11778 return -EINVAL;
11779#endif
11780 if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
11781 (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
11782 return -EINVAL;
11783
11784 if (!attr->inherit && attr->inherit_thread)
11785 return -EINVAL;
11786
11787 if (attr->remove_on_exec && attr->enable_on_exec)
11788 return -EINVAL;
11789
11790 if (attr->sigtrap && !attr->remove_on_exec)
11791 return -EINVAL;
11792
11793out:
11794 return ret;
11795
11796err_size:
11797 put_user(sizeof(*attr), &uattr->size);
11798 ret = -E2BIG;
11799 goto out;
11800}
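
/*
 * ABI note (illustrative): a binary built against an older header passes a
 * smaller attr.size; copy_struct_from_user() above zero-fills the tail, so
 * e.g.
 *
 *	struct perf_event_attr attr = { .size = PERF_ATTR_SIZE_VER0, ... };
 *
 * still works, while a *larger* size with non-zero trailing bytes fails
 * with -E2BIG, and the size we do support is written back to uattr->size.
 */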
11801
11802static int
11803perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
11804{
11805 struct perf_buffer *rb = NULL;
11806 int ret = -EINVAL;
11807
11808 if (!output_event)
11809 goto set;
11810
11811
11812 if (event == output_event)
11813 goto out;
11814
 /*
  * Don't allow cross-cpu buffers
  */
11818 if (output_event->cpu != event->cpu)
11819 goto out;
11820
11821
11822
11823
11824 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
11825 goto out;
11826
 /*
  * Mixing clocks in the same buffer is trouble you don't need.
  */
11830 if (output_event->clock != event->clock)
11831 goto out;
11832
 /*
  * Either writing ring buffer from beginning or from end.
  * Mixing is not allowed.
  */
11837 if (is_write_backward(output_event) != is_write_backward(event))
11838 goto out;
11839
 /*
  * If both events generate aux data, they must be on the same PMU
  */
11843 if (has_aux(event) && has_aux(output_event) &&
11844 event->pmu != output_event->pmu)
11845 goto out;
11846
11847set:
11848 mutex_lock(&event->mmap_mutex);
11849
11850 if (atomic_read(&event->mmap_count))
11851 goto unlock;
11852
11853 if (output_event) {
11854
11855 rb = ring_buffer_get(output_event);
11856 if (!rb)
11857 goto unlock;
11858 }
11859
11860 ring_buffer_attach(event, rb);
11861
11862 ret = 0;
11863unlock:
11864 mutex_unlock(&event->mmap_mutex);
11865
11866out:
11867 return ret;
11868}
11869
11870static void mutex_lock_double(struct mutex *a, struct mutex *b)
11871{
11872 if (b < a)
11873 swap(a, b);
11874
11875 mutex_lock(a);
11876 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
11877}
11878
11879static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
11880{
11881 bool nmi_safe = false;
11882
11883 switch (clk_id) {
11884 case CLOCK_MONOTONIC:
11885 event->clock = &ktime_get_mono_fast_ns;
11886 nmi_safe = true;
11887 break;
11888
11889 case CLOCK_MONOTONIC_RAW:
11890 event->clock = &ktime_get_raw_fast_ns;
11891 nmi_safe = true;
11892 break;
11893
11894 case CLOCK_REALTIME:
11895 event->clock = &ktime_get_real_ns;
11896 break;
11897
11898 case CLOCK_BOOTTIME:
11899 event->clock = &ktime_get_boottime_ns;
11900 break;
11901
11902 case CLOCK_TAI:
11903 event->clock = &ktime_get_clocktai_ns;
11904 break;
11905
11906 default:
11907 return -EINVAL;
11908 }
11909
11910 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
11911 return -EINVAL;
11912
11913 return 0;
11914}
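
/*
 * Userspace selects one of the clocks above through the event attributes
 * (illustrative):
 *
 *	attr.use_clockid = 1;
 *	attr.clockid	 = CLOCK_MONOTONIC_RAW;
 *
 * PMUs that can deliver events from NMI context are restricted to the
 * NMI-safe clocks, as checked at the end of the function.
 */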
11915
/*
 * Variation on perf_event_ctx_lock_nested(), except we take two context
 * mutexes.
 */
11920static struct perf_event_context *
11921__perf_event_ctx_lock_double(struct perf_event *group_leader,
11922 struct perf_event_context *ctx)
11923{
11924 struct perf_event_context *gctx;
11925
11926again:
11927 rcu_read_lock();
11928 gctx = READ_ONCE(group_leader->ctx);
11929 if (!refcount_inc_not_zero(&gctx->refcount)) {
11930 rcu_read_unlock();
11931 goto again;
11932 }
11933 rcu_read_unlock();
11934
11935 mutex_lock_double(&gctx->mutex, &ctx->mutex);
11936
11937 if (group_leader->ctx != gctx) {
11938 mutex_unlock(&ctx->mutex);
11939 mutex_unlock(&gctx->mutex);
11940 put_ctx(gctx);
11941 goto again;
11942 }
11943
11944 return gctx;
11945}
11946
11947static bool
11948perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
11949{
11950 unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
11951 bool is_capable = perfmon_capable();
11952
11953 if (attr->sigtrap) {
 /*
  * perf_event_attr::sigtrap sends signals to the other task.
  * Require the current task to also have CAP_KILL.
  */
11958 rcu_read_lock();
11959 is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
11960 rcu_read_unlock();
11961
 /*
  * If the required capabilities aren't available, checks for
  * ptrace permissions: upgrade to ATTACH, since sending signals
  * can effectively change the target task.
  */
11967 ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
11968 }
11969
11970
11971
11972
11973
11974
11975 return is_capable || ptrace_may_access(task, ptrace_mode);
11976}
11977
/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader event fd
 * @flags:		perf event open flags
 */
11987SYSCALL_DEFINE5(perf_event_open,
11988 struct perf_event_attr __user *, attr_uptr,
11989 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
11990{
11991 struct perf_event *group_leader = NULL, *output_event = NULL;
11992 struct perf_event *event, *sibling;
11993 struct perf_event_attr attr;
11994 struct perf_event_context *ctx, *gctx;
11995 struct file *event_file = NULL;
11996 struct fd group = {NULL, 0};
11997 struct task_struct *task = NULL;
11998 struct pmu *pmu;
11999 int event_fd;
12000 int move_group = 0;
12001 int err;
12002 int f_flags = O_RDWR;
12003 int cgroup_fd = -1;
12004
 /* for future expandability... */
12006 if (flags & ~PERF_FLAG_ALL)
12007 return -EINVAL;
12008
 /* Do we allow access to perf_event_open(2) ? */
12010 err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
12011 if (err)
12012 return err;
12013
12014 err = perf_copy_attr(attr_uptr, &attr);
12015 if (err)
12016 return err;
12017
12018 if (!attr.exclude_kernel) {
12019 err = perf_allow_kernel(&attr);
12020 if (err)
12021 return err;
12022 }
12023
12024 if (attr.namespaces) {
12025 if (!perfmon_capable())
12026 return -EACCES;
12027 }
12028
12029 if (attr.freq) {
12030 if (attr.sample_freq > sysctl_perf_event_sample_rate)
12031 return -EINVAL;
12032 } else {
12033 if (attr.sample_period & (1ULL << 63))
12034 return -EINVAL;
12035 }
12036
 /* Only privileged users can get physical addresses */
12038 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
12039 err = perf_allow_kernel(&attr);
12040 if (err)
12041 return err;
12042 }
12043
 /* REGS_INTR can leak data, lockdown must prevent this */
12045 if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
12046 err = security_locked_down(LOCKDOWN_PERF);
12047 if (err)
12048 return err;
12049 }
12050
 /*
  * In cgroup mode, the pid argument is used to pass the fd
  * opened to the cgroup directory in cgroupfs. The cpu argument
  * designates the cpu on which to monitor threads from that
  * cgroup.
  */
12057 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
12058 return -EINVAL;
12059
12060 if (flags & PERF_FLAG_FD_CLOEXEC)
12061 f_flags |= O_CLOEXEC;
12062
12063 event_fd = get_unused_fd_flags(f_flags);
12064 if (event_fd < 0)
12065 return event_fd;
12066
12067 if (group_fd != -1) {
12068 err = perf_fget_light(group_fd, &group);
12069 if (err)
12070 goto err_fd;
12071 group_leader = group.file->private_data;
12072 if (flags & PERF_FLAG_FD_OUTPUT)
12073 output_event = group_leader;
12074 if (flags & PERF_FLAG_FD_NO_GROUP)
12075 group_leader = NULL;
12076 }
12077
12078 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
12079 task = find_lively_task_by_vpid(pid);
12080 if (IS_ERR(task)) {
12081 err = PTR_ERR(task);
12082 goto err_group_fd;
12083 }
12084 }
12085
12086 if (task && group_leader &&
12087 group_leader->attr.inherit != attr.inherit) {
12088 err = -EINVAL;
12089 goto err_task;
12090 }
12091
12092 if (flags & PERF_FLAG_PID_CGROUP)
12093 cgroup_fd = pid;
12094
12095 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
12096 NULL, NULL, cgroup_fd);
12097 if (IS_ERR(event)) {
12098 err = PTR_ERR(event);
12099 goto err_task;
12100 }
12101
12102 if (is_sampling_event(event)) {
12103 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
12104 err = -EOPNOTSUPP;
12105 goto err_alloc;
12106 }
12107 }
12108
 /*
  * Special case software events and allow them to be part of
  * any hardware group.
  */
12113 pmu = event->pmu;
12114
12115 if (attr.use_clockid) {
12116 err = perf_event_set_clock(event, attr.clockid);
12117 if (err)
12118 goto err_alloc;
12119 }
12120
12121 if (pmu->task_ctx_nr == perf_sw_context)
12122 event->event_caps |= PERF_EV_CAP_SOFTWARE;
12123
12124 if (group_leader) {
12125 if (is_software_event(event) &&
12126 !in_software_context(group_leader)) {
 /*
  * If the event is a sw event, but the group_leader
  * is on hw context.
  *
  * Allow the addition of software events to hw
  * groups, this is safe because software events
  * never fail to schedule.
  */
12135 pmu = group_leader->ctx->pmu;
12136 } else if (!is_software_event(event) &&
12137 is_software_event(group_leader) &&
12138 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
 /*
  * In case the group is a pure software group, and we
  * try to add a hardware event, move the whole group to
  * the hardware context.
  */
12144 move_group = 1;
12145 }
12146 }
12147
 /*
  * Get the target context (task or percpu):
  */
12151 ctx = find_get_context(pmu, task, event);
12152 if (IS_ERR(ctx)) {
12153 err = PTR_ERR(ctx);
12154 goto err_alloc;
12155 }
12156
12157
12158
12159
12160 if (group_leader) {
12161 err = -EINVAL;

 /*
  * Do not allow a recursive hierarchy (this new sibling
  * becoming part of another group-sibling):
  */
12167 if (group_leader->group_leader != group_leader)
12168 goto err_context;
12169
 /* All events in a group should have the same clock */
12171 if (group_leader->clock != event->clock)
12172 goto err_context;
12173
 /*
  * Make sure we're both events for the same CPU;
  * grouping events for different CPUs is broken, since
  * you can never concurrently schedule them anyhow.
  */
12179 if (group_leader->cpu != event->cpu)
12180 goto err_context;
12181
 /*
  * Make sure we're both on the same task, or both
  * per-CPU events.
  */
12186 if (group_leader->ctx->task != ctx->task)
12187 goto err_context;
12188
 /*
  * Do not allow to attach to a group in a different task
  * or CPU context. If we're moving SW events, we'll fix
  * this up later, so allow that.
  */
12194 if (!move_group && group_leader->ctx != ctx)
12195 goto err_context;
12196
 /*
  * Only a group leader can be exclusive or pinned
  */
12200 if (attr.exclusive || attr.pinned)
12201 goto err_context;
12202 }
12203
12204 if (output_event) {
12205 err = perf_event_set_output(event, output_event);
12206 if (err)
12207 goto err_context;
12208 }
12209
12210 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
12211 f_flags);
12212 if (IS_ERR(event_file)) {
12213 err = PTR_ERR(event_file);
12214 event_file = NULL;
12215 goto err_context;
12216 }
12217
12218 if (task) {
12219 err = down_read_interruptible(&task->signal->exec_update_lock);
12220 if (err)
12221 goto err_file;
12222
 /*
  * We must hold exec_update_lock across this and any potential
  * perf_install_in_context() call for this new event to
  * serialize against exec() altering our credentials (and the
  * perf_event_exit_task() that could imply).
  */
12229 err = -EACCES;
12230 if (!perf_check_permission(&attr, task))
12231 goto err_cred;
12232 }
12233
12234 if (move_group) {
12235 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
12236
12237 if (gctx->task == TASK_TOMBSTONE) {
12238 err = -ESRCH;
12239 goto err_locked;
12240 }
12241
 /*
  * Check if we raced against another sys_perf_event_open() call
  * moving the software group underneath us.
  */
12246 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
 /*
  * If someone moved the group out from under us, check
  * if this new event wound up on the same ctx; if so
  * it's the regular !move_group case, otherwise fail.
  */
12252 if (gctx != ctx) {
12253 err = -EINVAL;
12254 goto err_locked;
12255 } else {
12256 perf_event_ctx_unlock(group_leader, gctx);
12257 move_group = 0;
12258 }
12259 }
12260
12261
12262
12263
12264 err = -EBUSY;
12265 if (!exclusive_event_installable(group_leader, ctx))
12266 goto err_locked;
12267
12268 for_each_sibling_event(sibling, group_leader) {
12269 if (!exclusive_event_installable(sibling, ctx))
12270 goto err_locked;
12271 }
12272 } else {
12273 mutex_lock(&ctx->mutex);
12274 }
12275
12276 if (ctx->task == TASK_TOMBSTONE) {
12277 err = -ESRCH;
12278 goto err_locked;
12279 }
12280
12281 if (!perf_event_validate_size(event)) {
12282 err = -E2BIG;
12283 goto err_locked;
12284 }
12285
12286 if (!task) {
 /*
  * Check if the @cpu we're creating an event for is online.
  *
  * We use the perf_cpu_context::ctx::mutex to serialize against
  * the hotplug notifiers. See perf_event_{init, exit}_cpu().
  */
12293 struct perf_cpu_context *cpuctx =
12294 container_of(ctx, struct perf_cpu_context, ctx);
12295
12296 if (!cpuctx->online) {
12297 err = -ENODEV;
12298 goto err_locked;
12299 }
12300 }
12301
12302 if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
12303 err = -EINVAL;
12304 goto err_locked;
12305 }
12306
 /*
  * Must be under the same ctx::mutex as perf_install_in_context(),
  * because we need to serialize with concurrent event creation.
  */
12311 if (!exclusive_event_installable(event, ctx)) {
12312 err = -EBUSY;
12313 goto err_locked;
12314 }
12315
12316 WARN_ON_ONCE(ctx->parent_ctx);
12317
 /*
  * This is the point of no return; we cannot fail hereafter. This is
  * where we start modifying current state.
  */

12323 if (move_group) {
 /*
  * See perf_event_ctx_lock() for comments on the details
  * of swizzling perf_event::ctx.
  */
12328 perf_remove_from_context(group_leader, 0);
12329 put_ctx(gctx);
12330
12331 for_each_sibling_event(sibling, group_leader) {
12332 perf_remove_from_context(sibling, 0);
12333 put_ctx(gctx);
12334 }
12335
 /*
  * Wait for everybody to stop referencing the events through
  * the old lists, before installing them on the new lists.
  */
12340 synchronize_rcu();
12341
 /*
  * Install the group siblings before the group leader.
  *
  * Because a group leader will try and install the entire group
  * (through the sibling list, which is still intact), we can
  * end up with siblings installed in the wrong context.
  *
  * By installing the siblings first, the leader, installed last,
  * always sees a fully set-up group.
  */
12352 for_each_sibling_event(sibling, group_leader) {
12353 perf_event__state_init(sibling);
12354 perf_install_in_context(ctx, sibling, sibling->cpu);
12355 get_ctx(ctx);
12356 }
12357
 /*
  * Removing from the context ends up with a disabled
  * event. What we want here is an event in the initial
  * startup state, ready to be added into the new context.
  */
12363 perf_event__state_init(group_leader);
12364 perf_install_in_context(ctx, group_leader, group_leader->cpu);
12365 get_ctx(ctx);
12366 }
12367
12368
12369
12370
12371
12372
12373
12374 perf_event__header_size(event);
12375 perf_event__id_header_size(event);
12376
12377 event->owner = current;
12378
12379 perf_install_in_context(ctx, event, event->cpu);
12380 perf_unpin_context(ctx);
12381
12382 if (move_group)
12383 perf_event_ctx_unlock(group_leader, gctx);
12384 mutex_unlock(&ctx->mutex);
12385
12386 if (task) {
12387 up_read(&task->signal->exec_update_lock);
12388 put_task_struct(task);
12389 }
12390
 mutex_lock(&current->perf_event_mutex);
 list_add_tail(&event->owner_entry, &current->perf_event_list);
12393 mutex_unlock(¤t->perf_event_mutex);
12394
 /*
  * Drop the reference on the group_event after placing the
  * new event on the sibling_list. This ensures destruction
  * of the group leader will find the pointer to itself in
  * perf_group_detach().
  */
12401 fdput(group);
12402 fd_install(event_fd, event_file);
12403 return event_fd;
12404
12405err_locked:
12406 if (move_group)
12407 perf_event_ctx_unlock(group_leader, gctx);
12408 mutex_unlock(&ctx->mutex);
12409err_cred:
12410 if (task)
12411 up_read(&task->signal->exec_update_lock);
12412err_file:
12413 fput(event_file);
12414err_context:
12415 perf_unpin_context(ctx);
12416 put_ctx(ctx);
12417err_alloc:
 /*
  * If event_file is set, the fput() above will have called ->release()
  * and that will take care of freeing the event.
  */
12422 if (!event_file)
12423 free_event(event);
12424err_task:
12425 if (task)
12426 put_task_struct(task);
12427err_group_fd:
12428 fdput(group);
12429err_fd:
12430 put_unused_fd(event_fd);
12431 return err;
12432}
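
/*
 * Illustrative grouping use of the syscall above (userspace sketch): the
 * leader is opened with group_fd == -1, siblings pass the leader's fd, and
 * the whole group is then scheduled onto the PMU as a unit:
 *
 *	int leader = syscall(__NR_perf_event_open, &attr1, pid, cpu, -1, 0);
 *	int member = syscall(__NR_perf_event_open, &attr2, pid, cpu, leader, 0);
 */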
12433
/**
 * perf_event_create_kernel_counter
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu in which the counter is bound
 * @task: task to profile (NULL for percpu)
 * @overflow_handler: callback to trigger when we hit the event
 * @context: context data could be used in overflow_handler callback
 */
12443struct perf_event *
12444perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
12445 struct task_struct *task,
12446 perf_overflow_handler_t overflow_handler,
12447 void *context)
12448{
12449 struct perf_event_context *ctx;
12450 struct perf_event *event;
12451 int err;
12452
 /*
  * Grouping is not supported for kernel events, neither is 'AUX',
  * make sure the caller's intentions are adjusted.
  */
12457 if (attr->aux_output)
12458 return ERR_PTR(-EINVAL);
12459
12460 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
12461 overflow_handler, context, -1);
12462 if (IS_ERR(event)) {
12463 err = PTR_ERR(event);
12464 goto err;
12465 }
12466
12467
12468 event->owner = TASK_TOMBSTONE;
12469
12470
12471
12472
12473 ctx = find_get_context(event->pmu, task, event);
12474 if (IS_ERR(ctx)) {
12475 err = PTR_ERR(ctx);
12476 goto err_free;
12477 }
12478
12479 WARN_ON_ONCE(ctx->parent_ctx);
12480 mutex_lock(&ctx->mutex);
12481 if (ctx->task == TASK_TOMBSTONE) {
12482 err = -ESRCH;
12483 goto err_unlock;
12484 }
12485
12486 if (!task) {
 /*
  * Check if the @cpu we're creating an event for is online.
  *
  * We use the perf_cpu_context::ctx::mutex to serialize against
  * the hotplug notifiers. See perf_event_{init, exit}_cpu().
  */
12493 struct perf_cpu_context *cpuctx =
12494 container_of(ctx, struct perf_cpu_context, ctx);
12495 if (!cpuctx->online) {
12496 err = -ENODEV;
12497 goto err_unlock;
12498 }
12499 }
12500
12501 if (!exclusive_event_installable(event, ctx)) {
12502 err = -EBUSY;
12503 goto err_unlock;
12504 }
12505
12506 perf_install_in_context(ctx, event, event->cpu);
12507 perf_unpin_context(ctx);
12508 mutex_unlock(&ctx->mutex);
12509
12510 return event;
12511
12512err_unlock:
12513 mutex_unlock(&ctx->mutex);
12514 perf_unpin_context(ctx);
12515 put_ctx(ctx);
12516err_free:
12517 free_event(event);
12518err:
12519 return ERR_PTR(err);
12520}
12521EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
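
/*
 * Illustrative in-kernel use (a sketch; the callback and period names are
 * stand-ins, not symbols from this file). The hardlockup detector creates
 * a per-CPU cycle counter in roughly this way:
 *
 *	struct perf_event_attr wd_attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(wd_attr),
 *		.sample_period	= period,		// period: placeholder
 *	};
 *	evt = perf_event_create_kernel_counter(&wd_attr, cpu, NULL,
 *						overflow_callback, NULL);
 */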
12522
12523void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
12524{
12525 struct perf_event_context *src_ctx;
12526 struct perf_event_context *dst_ctx;
12527 struct perf_event *event, *tmp;
12528 LIST_HEAD(events);
12529
12530 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
12531 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
12532
 /*
  * See perf_event_ctx_lock() for comments on the details
  * of swizzling perf_event::ctx.
  */
12537 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
12538 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
12539 event_entry) {
12540 perf_remove_from_context(event, 0);
12541 unaccount_event_cpu(event, src_cpu);
12542 put_ctx(src_ctx);
12543 list_add(&event->migrate_entry, &events);
12544 }
12545
 /*
  * Wait for the events to quiesce before re-instating them.
  */
12549 synchronize_rcu();
12550
 /*
  * Re-instate events in 2 passes.
  *
  * Skip over group leaders and only install siblings on this first
  * pass, siblings will not get enabled without a leader, however a
  * leader will enable its siblings, even if those are still on the
  * old context.
  */
12559 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12560 if (event->group_leader == event)
12561 continue;
12562
12563 list_del(&event->migrate_entry);
12564 if (event->state >= PERF_EVENT_STATE_OFF)
12565 event->state = PERF_EVENT_STATE_INACTIVE;
12566 account_event_cpu(event, dst_cpu);
12567 perf_install_in_context(dst_ctx, event, dst_cpu);
12568 get_ctx(dst_ctx);
12569 }
12570
 /*
  * Once all the siblings are setup properly, install the group leaders
  * to make it go.
  */
12575 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12576 list_del(&event->migrate_entry);
12577 if (event->state >= PERF_EVENT_STATE_OFF)
12578 event->state = PERF_EVENT_STATE_INACTIVE;
12579 account_event_cpu(event, dst_cpu);
12580 perf_install_in_context(dst_ctx, event, dst_cpu);
12581 get_ctx(dst_ctx);
12582 }
12583 mutex_unlock(&dst_ctx->mutex);
12584 mutex_unlock(&src_ctx->mutex);
12585}
12586EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
12587
12588static void sync_child_event(struct perf_event *child_event)
12589{
12590 struct perf_event *parent_event = child_event->parent;
12591 u64 child_val;
12592
12593 if (child_event->attr.inherit_stat) {
12594 struct task_struct *task = child_event->ctx->task;
12595
12596 if (task && task != TASK_TOMBSTONE)
12597 perf_event_read_event(child_event, task);
12598 }
12599
12600 child_val = perf_event_count(child_event);
12601
12602
12603
12604
12605 atomic64_add(child_val, &parent_event->child_count);
12606 atomic64_add(child_event->total_time_enabled,
12607 &parent_event->child_total_time_enabled);
12608 atomic64_add(child_event->total_time_running,
12609 &parent_event->child_total_time_running);
12610}
12611
12612static void
12613perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
12614{
12615 struct perf_event *parent_event = event->parent;
12616 unsigned long detach_flags = 0;
12617
12618 if (parent_event) {
 /*
  * Do not destroy the 'original' grouping; because of the context
  * switch optimization the original events could've ended up in a
  * random child task.
  *
  * If we were to destroy the original group, all group related
  * operations would cease to function properly after this random
  * child dies.
  *
  * Do destroy all inherited groups, we don't care about those
  * and being thorough is better.
  */
12631 detach_flags = DETACH_GROUP | DETACH_CHILD;
12632 mutex_lock(&parent_event->child_mutex);
12633 }
12634
12635 perf_remove_from_context(event, detach_flags);
12636
12637 raw_spin_lock_irq(&ctx->lock);
12638 if (event->state > PERF_EVENT_STATE_EXIT)
12639 perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
12640 raw_spin_unlock_irq(&ctx->lock);
12641
12642
12643
12644
12645 if (parent_event) {
12646 mutex_unlock(&parent_event->child_mutex);
12647
12648
12649
12650 perf_event_wakeup(parent_event);
12651 free_event(event);
12652 put_event(parent_event);
12653 return;
12654 }
12655
12656
12657
12658
12659 perf_event_wakeup(event);
12660}
12661
12662static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
12663{
12664 struct perf_event_context *child_ctx, *clone_ctx = NULL;
12665 struct perf_event *child_event, *next;
12666
12667 WARN_ON_ONCE(child != current);
12668
12669 child_ctx = perf_pin_task_context(child, ctxn);
12670 if (!child_ctx)
12671 return;
12672
 /*
  * In order to reduce the amount of trickery in ctx tear-down, we hold
  * ctx::mutex over the entire thing. This serializes against almost
  * everything that wants to access the ctx.
  *
  * The exception is sys_perf_event_open() /
  * perf_event_create_kernel_counter() which does find_get_context()
  * without ctx::mutex (it cannot because of the move_group double
  * mutex lock).
  */
12683 mutex_lock(&child_ctx->mutex);
12684
 /*
  * In a single ctx::lock section, de-schedule the events and detach the
  * context from the task such that we cannot ever get it scheduled back
  * in.
  */
12690 raw_spin_lock_irq(&child_ctx->lock);
12691 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
12692
 /*
  * Now that the context is inactive, destroy the task <-> ctx relation
  * and mark the context dead.
  */
12697 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
12698 put_ctx(child_ctx);
12699 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
12700 put_task_struct(current);
12701
12702 clone_ctx = unclone_ctx(child_ctx);
12703 raw_spin_unlock_irq(&child_ctx->lock);
12704
12705 if (clone_ctx)
12706 put_ctx(clone_ctx);
12707
 /*
  * Report the task dead after unscheduling the events so that we
  * won't get any samples after PERF_RECORD_EXIT. We can however still
  * get a few PERF_RECORD_READ events.
  */
12713 perf_event_task(child, child_ctx, 0);
12714
12715 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
12716 perf_event_exit_event(child_event, child_ctx);
12717
12718 mutex_unlock(&child_ctx->mutex);
12719
12720 put_ctx(child_ctx);
12721}
12722
/*
 * When a child task exits, feed back event values to parent events.
 *
 * Can be called with exec_update_lock held when called from
 * setup_new_exec().
 */
12729void perf_event_exit_task(struct task_struct *child)
12730{
12731 struct perf_event *event, *tmp;
12732 int ctxn;
12733
12734 mutex_lock(&child->perf_event_mutex);
12735 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
12736 owner_entry) {
12737 list_del_init(&event->owner_entry);

 /*
  * Ensure the list deletion is visible before we clear
  * the owner, closes a race against perf_release() where
  * we need to serialize on the owner->perf_event_mutex.
  */
12744 smp_store_release(&event->owner, NULL);
12745 }
12746 mutex_unlock(&child->perf_event_mutex);
12747
12748 for_each_task_context_nr(ctxn)
12749 perf_event_exit_task_context(child, ctxn);
12750
 /*
  * The perf_event_exit_task_context() calls above generate EXIT
  * events for the child contexts and clear
  * child->perf_event_ctxp[]. At this point we still need to send
  * EXIT events to the cpu contexts:
  */
12757 perf_event_task(child, NULL, 0);
12758}

static void perf_free_event(struct perf_event *event,
			    struct perf_event_context *ctx)
{
	struct perf_event *parent = event->parent;

	if (WARN_ON_ONCE(!parent))
		return;

	mutex_lock(&parent->child_mutex);
	list_del_init(&event->child_list);
	mutex_unlock(&parent->child_mutex);

	put_event(parent);

	raw_spin_lock_irq(&ctx->lock);
	perf_group_detach(event);
	list_del_event(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
	free_event(event);
}

/*
 * Free a context as created by inheritance by perf_event_init_task() below,
 * used by fork() in case of fail.
 *
 * Even though the task has never lived, the context and events have been
 * exposed through the child_list, so we must take care tearing it all down.
 */
void perf_event_free_task(struct task_struct *task)
{
	struct perf_event_context *ctx;
	struct perf_event *event, *tmp;
	int ctxn;

	for_each_task_context_nr(ctxn) {
		ctx = task->perf_event_ctxp[ctxn];
		if (!ctx)
			continue;

		mutex_lock(&ctx->mutex);
		raw_spin_lock_irq(&ctx->lock);
		/*
		 * Destroy the task <-> ctx relation and mark the context dead.
		 *
		 * This is important because even though the task hasn't been
		 * exposed yet the context has been (through child_list).
		 */
		RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
		WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
		put_task_struct(task); /* cannot be last */
		raw_spin_unlock_irq(&ctx->lock);

		list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
			perf_free_event(event, ctx);

		mutex_unlock(&ctx->mutex);

		/*
		 * perf_event_release_kernel() could've stolen some of our
		 * child events and still have them on its free_list. In that
		 * case we must wait for these events to have been freed (in
		 * particular all their references to this task must've been
		 * dropped).
		 *
		 * Without this copy_process() will unconditionally free this
		 * task (irrespective of its reference count) and
		 * _free_event()'s put_task_struct(event->hw.target) will be a
		 * use-after-free.
		 *
		 * Wait for all events to drop their context reference.
		 */
		wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
		put_ctx(ctx); /* must be last */
	}
}

void perf_event_delayed_put(struct task_struct *task)
{
	int ctxn;

	for_each_task_context_nr(ctxn)
		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
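
/*
 * Call-site sketch (illustrative; the real code lives in kernel/exit.c, not
 * here): the RCU callback that finally drops the task reference uses this as
 * a sanity check that every context was detached earlier, roughly:
 *
 *	static void delayed_put_task_struct(struct rcu_head *rhp)
 *	{
 *		struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 *
 *		perf_event_delayed_put(tsk);
 *		trace_sched_process_free(tsk);
 *		put_task_struct(tsk);
 *	}
 */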

struct file *perf_event_get(unsigned int fd)
{
	struct file *file = fget(fd);

	if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_op != &perf_fops) {
		fput(file);
		return ERR_PTR(-EBADF);
	}

	return file;
}
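
/*
 * Usage sketch (illustrative): an in-kernel user resolves a perf event fd
 * and then owns the returned file reference, which must be dropped with
 * fput() when done:
 *
 *	struct file *file = perf_event_get(fd);
 *
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);
 *	...
 *	fput(file);
 */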

const struct perf_event *perf_get_event(struct file *file)
{
	if (file->f_op != &perf_fops)
		return ERR_PTR(-EINVAL);

	return file->private_data;
}

const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
	if (!event)
		return ERR_PTR(-EINVAL);

	return &event->attr;
}
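
/*
 * Usage sketch (illustrative): the two accessors above compose, e.g. to
 * check what kind of event hides behind a perf file ("event" below is a
 * hypothetical local):
 *
 *	const struct perf_event *event = perf_get_event(file);
 *
 *	if (!IS_ERR(event) && event->attr.type == PERF_TYPE_TRACEPOINT)
 *		...
 */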

/*
 * Inherit an event from parent task to child task.
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL when orphaned (note that this is different from an error pointer)
 *  - error pointer when the allocation failed
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event *group_leader,
	      struct perf_event_context *child_ctx)
{
	enum perf_event_state parent_state = parent_event->state;
	struct perf_event *child_event;
	unsigned long flags;

	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_event->parent)
		parent_event = parent_event->parent;

	child_event = perf_event_alloc(&parent_event->attr,
					   parent_event->cpu,
					   child,
					   group_leader, parent_event,
					   NULL, NULL, -1);
	if (IS_ERR(child_event))
		return child_event;

	if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
	    !child_ctx->task_ctx_data) {
		struct pmu *pmu = child_event->pmu;

		child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
		if (!child_ctx->task_ctx_data) {
			free_event(child_event);
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
	 * must both be done under the same child_mutex section in order to
	 * serialize against perf_event_release_kernel(): either it observes
	 * our new child, or we observe the parent going away and bail.
	 */
	mutex_lock(&parent_event->child_mutex);
	if (is_orphaned_event(parent_event) ||
	    !atomic_long_inc_not_zero(&parent_event->refcount)) {
		mutex_unlock(&parent_event->child_mutex);
		/* task_ctx_data is freed with child_ctx */
		free_event(child_event);
		return NULL;
	}

	get_ctx(child_ctx);

	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
	if (parent_state >= PERF_EVENT_STATE_INACTIVE)
		child_event->state = PERF_EVENT_STATE_INACTIVE;
	else
		child_event->state = PERF_EVENT_STATE_OFF;

	if (parent_event->attr.freq) {
		u64 sample_period = parent_event->hw.sample_period;
		struct hw_perf_event *hwc = &child_event->hw;

		hwc->sample_period = sample_period;
		hwc->last_period   = sample_period;

		local64_set(&hwc->period_left, sample_period);
	}

	child_event->ctx = child_ctx;
	child_event->overflow_handler = parent_event->overflow_handler;
	child_event->overflow_handler_context
		= parent_event->overflow_handler_context;

	/*
	 * Precalculate sample_data sizes:
	 */
	perf_event__header_size(child_event);
	perf_event__id_header_size(child_event);

	/*
	 * Link it up in the child's context:
	 */
	raw_spin_lock_irqsave(&child_ctx->lock, flags);
	add_event_to_ctx(child_event, child_ctx);
	child_event->attach_state |= PERF_ATTACH_CHILD;
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Link this into the parent event's child list:
	 */
	list_add_tail(&child_event->child_list, &parent_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	return child_event;
}

/*
 * Inherits an event group.
 *
 * This will quietly suppress orphaned events; !inherit_event() is not an error.
 * This matches with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int inherit_group(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);

	/*
	 * Inherit each sibling of the parent group into the child's
	 * group, under the freshly created child leader.
	 */
	for_each_sibling_event(sub, parent_event) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					    child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);

		/*
		 * If this sibling sampled on the parent's AUX event,
		 * re-establish that link against the new leader.
		 */
		if (sub->aux_event == parent_event && child_ctr &&
		    !perf_get_aux_event(child_ctr, leader))
			return -EINVAL;
	}
	return 0;
}

/*
 * Creates the child task context and tries to inherit the event-group.
 *
 * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
 * inherited_all set when we 'fail' to inherit an orphaned event; this is
 * consistent with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child, int ctxn,
		   u64 clone_flags, int *inherited_all)
{
	int ret;
	struct perf_event_context *child_ctx;

	if (!event->attr.inherit ||
	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
	    /* Do not inherit if sigtrap and signal handlers were cleared. */
	    (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
		*inherited_all = 0;
		return 0;
	}

	child_ctx = child->perf_event_ctxp[ctxn];
	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
		if (!child_ctx)
			return -ENOMEM;

		child->perf_event_ctxp[ctxn] = child_ctx;
	}

	ret = inherit_group(event, parent, parent_ctx,
			    child, child_ctx);

	if (ret)
		*inherited_all = 0;

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
static int perf_event_init_context(struct task_struct *child, int ctxn,
				   u64 clone_flags)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	unsigned long flags;
	int ret = 0;

	if (likely(!parent->perf_event_ctxp[ctxn]))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent, ctxn);
	if (!parent_ctx)
		return 0;

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, clone_flags,
					 &inherited_all);
		if (ret)
			goto out_unlock;
	}

	/*
	 * We can't hold ctx->lock when iterating the ->flexible_group list due
	 * to allocations, but we need to prevent rotation because
	 * rotate_ctx() will change the list from interrupt context.
	 */
	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 1;
	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

	perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, clone_flags,
					 &inherited_all);
		if (ret)
			goto out_unlock;
	}

	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 0;

	child_ctx = child->perf_event_ctxp[ctxn];

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 *
		 * Note that if the parent is a clone, the holding of
		 * parent_ctx->lock avoids it from being uncloned.
		 */
		cloned_ctx = parent_ctx->parent_ctx;
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
out_unlock:
	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);
	put_ctx(parent_ctx);

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
	int ctxn, ret;

	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	for_each_task_context_nr(ctxn) {
		ret = perf_event_init_context(child, ctxn, clone_flags);
		if (ret) {
			perf_event_free_task(child);
			return ret;
		}
	}

	return 0;
}
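
/*
 * Call-site sketch (illustrative; the real code lives in kernel/fork.c, not
 * here): copy_process() runs this while assembling the child, and unwinds
 * through perf_event_free_task() via its error path on failure, roughly:
 *
 *	retval = perf_event_init_task(p, clone_flags);
 *	if (retval)
 *		goto bad_fork_cleanup_policy;
 */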

static void __init perf_event_init_all_cpus(void)
{
	struct swevent_htable *swhash;
	int cpu;

	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);

	for_each_possible_cpu(cpu) {
		swhash = &per_cpu(swevent_htable, cpu);
		mutex_init(&swhash->hlist_mutex);
		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));

		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

#ifdef CONFIG_CGROUP_PERF
		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
	}
}

static void perf_swevent_init_cpu(unsigned int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
		struct swevent_hlist *hlist;

		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
		WARN_ON(!hlist);
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
	}
	mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
	struct perf_event_context *ctx = __info;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event *event;

	raw_spin_lock(&ctx->lock);
	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
	list_for_each_entry(event, &ctx->event_list, event_entry)
		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
	raw_spin_unlock(&ctx->lock);
}

static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;

	mutex_lock(&pmus_lock);
	list_for_each_entry(pmu, &pmus, entry) {
		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		mutex_lock(&ctx->mutex);
		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
		cpuctx->online = 0;
		mutex_unlock(&ctx->mutex);
	}
	cpumask_clear_cpu(cpu, perf_online_mask);
	mutex_unlock(&pmus_lock);
}
#else

static void perf_event_exit_cpu_context(int cpu) { }

#endif

int perf_event_init_cpu(unsigned int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;

	perf_swevent_init_cpu(cpu);

	mutex_lock(&pmus_lock);
	cpumask_set_cpu(cpu, perf_online_mask);
	list_for_each_entry(pmu, &pmus, entry) {
		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		mutex_lock(&ctx->mutex);
		cpuctx->online = 1;
		mutex_unlock(&ctx->mutex);
	}
	mutex_unlock(&pmus_lock);

	return 0;
}

int perf_event_exit_cpu(unsigned int cpu)
{
	perf_event_exit_cpu_context(cpu);
	return 0;
}
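
/*
 * Hotplug wiring sketch (illustrative; the real table lives in kernel/cpu.c,
 * not here): the two callbacks above are invoked through the CPU hotplug
 * state machine rather than called directly, roughly:
 *
 *	[CPUHP_PERF_PREPARE] = {
 *		.name			= "perf:prepare",
 *		.startup.single		= perf_event_init_cpu,
 *		.teardown.single	= perf_event_exit_cpu,
 *	},
 */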

static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
	int cpu;

	for_each_online_cpu(cpu)
		perf_event_exit_cpu(cpu);

	return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
	.notifier_call = perf_reboot,
	.priority = INT_MIN,
};

void __init perf_event_init(void)
{
	int ret;

	idr_init(&pmu_idr);

	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
	perf_pmu_register(&perf_cpu_clock, NULL, -1);
	perf_pmu_register(&perf_task_clock, NULL, -1);
	perf_tp_register();
	perf_event_init_cpu(smp_processor_id());
	register_reboot_notifier(&perf_reboot_notifier);

	ret = init_hw_breakpoint();
	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

	perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);

	/*
	 * Build time assertion that we keep the data_head at the intended
	 * location.  IOW, validation we got the __reserved[] size right.
	 */
	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
		     != 1024);
}
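
/*
 * Boot ordering note (illustrative): start_kernel() in init/main.c calls
 * perf_event_init() before secondary CPUs exist, which is why the boot CPU
 * is onlined explicitly above via perf_event_init_cpu(smp_processor_id());
 * every other CPU goes through the hotplug callbacks instead.
 */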

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s\n", pmu_attr->event_str);

	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
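
/*
 * Usage sketch (illustrative): PMU drivers typically pair this helper with
 * PMU_EVENT_ATTR_STRING() from <linux/perf_event.h> to expose named events
 * in sysfs; the event name and string below are made up:
 *
 *	PMU_EVENT_ATTR_STRING(cycles, evattr_cycles, "event=0x3c");
 *
 * The macro builds a perf_pmu_events_attr whose ->event_str is then printed
 * verbatim by perf_event_sysfs_show().
 */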

static int __init perf_event_sysfs_init(void)
{
	struct pmu *pmu;
	int ret;

	mutex_lock(&pmus_lock);

	ret = bus_register(&pmu_bus);
	if (ret)
		goto unlock;

	list_for_each_entry(pmu, &pmus, entry) {
		if (!pmu->name || pmu->type < 0)
			continue;

		ret = pmu_dev_alloc(pmu);
		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
	}
	pmu_bus_running = 1;
	ret = 0;

unlock:
	mutex_unlock(&pmus_lock);

	return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct perf_cgroup *jc;

	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
	if (!jc)
		return ERR_PTR(-ENOMEM);

	jc->info = alloc_percpu(struct perf_cgroup_info);
	if (!jc->info) {
		kfree(jc);
		return ERR_PTR(-ENOMEM);
	}

	return &jc->css;
}

static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);

	free_percpu(jc->info);
	kfree(jc);
}

static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
	perf_event_cgroup(css->cgroup);
	return 0;
}

static int __perf_cgroup_move(void *info)
{
	struct task_struct *task = info;

	rcu_read_lock();
	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
	rcu_read_unlock();
	return 0;
}

static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		task_function_call(task, __perf_cgroup_move, task);
}

struct cgroup_subsys perf_event_cgrp_subsys = {
	.css_alloc	= perf_cgroup_css_alloc,
	.css_free	= perf_cgroup_css_free,
	.css_online	= perf_cgroup_css_online,
	.attach		= perf_cgroup_attach,
	/*
	 * Implicitly enable on dfl hierarchy so that perf events can
	 * always be filtered by cgroup2 path as long as perf_event
	 * controller is not mounted on a legacy hierarchy.
	 */
	.implicit_on_dfl = true,
	.threaded	= true,
};
#endif /* CONFIG_CGROUP_PERF */