// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code.
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>

#include "internal.h"

#include <asm/irq_regs.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		/* target task migrated away; leave -EAGAIN so the caller retries */
		if (task_cpu(p) != smp_processor_id())
			return;

		/*
		 * Now that we're on the right CPU with IRQs disabled, we can
		 * test whether we hit the right task without races.
		 */
		tfc->ret = -ESRCH; /* No such (running) process */
		if (p != current)
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}
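
/*
 * task_function_call - call a function on the CPU on which a task runs
 * @p:    the task to evaluate
 * @func: the function to be called
 * @info: the function call argument
 *
 * Calls @func while the task is currently running, which might be on the
 * local CPU.  Keeps retrying while the call fails with -EAGAIN (e.g. the
 * task migrated to another CPU in the meantime).
 *
 * Returns @func's return value, or -ESRCH when the task isn't running.
 */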
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,
	};
	int ret;

	for (;;) {
		ret = smp_call_function_single(task_cpu(p), remote_function,
					       &data, 1);
		if (!ret)
			ret = data.ret;

		if (ret != -EAGAIN)
			break;

		cond_resched();
	}

	return ret;
}
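
/*
 * cpu_function_call - call a function on a given CPU
 * @cpu:  target CPU to run the function on
 * @func: the function to be called
 * @info: the function call argument
 *
 * Calls @func on @cpu; @func runs with interrupts disabled.
 *
 * Returns @func's return value, or -ENXIO when the CPU is offline.
 */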
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx)
		raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		raw_spin_unlock(&ctx->lock);
	raw_spin_unlock(&cpuctx->ctx.lock);
}

/*
 * ctx->task is set to TASK_TOMBSTONE once the owning task is gone;
 * kernel (non-syscall) events mark event->owner with the same value.
 */
#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}
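
/*
 * On task ctx scheduling:
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context().
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */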
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
			struct perf_event_context *, void *);

struct event_function_struct {
	struct perf_event *event;
	event_f func;
	void *data;
};

static int event_function(void *info)
{
	struct event_function_struct *efs = info;
	struct perf_event *event = efs->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	int ret = 0;

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, task_ctx);
	/*
	 * Since we do the IPI call without holding ctx->lock things can have
	 * changed, double check we hit the task we set out to hit.
	 */
	if (ctx->task) {
		if (ctx->task != current) {
			ret = -ESRCH;
			goto unlock;
		}

		/*
		 * We only use event_function_call() on established contexts,
		 * and event_function() is only ever called when active (or
		 * rather, we'll have bailed in task_function_call() or the
		 * above ctx->task != current test), therefore we must have
		 * ctx->is_active here.
		 */
		WARN_ON_ONCE(!ctx->is_active);
		/*
		 * And since we have ctx->is_active, cpuctx->task_ctx must
		 * match.
		 */
		WARN_ON_ONCE(task_ctx != ctx);
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	efs->func(event, cpuctx, ctx, efs->data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}
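
/*
 * Remotely invoke @func on @event's context, coping with the task moving,
 * exiting, or the context going inactive underneath us.  Falls back to
 * calling @func directly under ctx->lock once the context is inactive.
 */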
256static void event_function_call(struct perf_event *event, event_f func, void *data)
257{
258 struct perf_event_context *ctx = event->ctx;
259 struct task_struct *task = READ_ONCE(ctx->task);
260 struct event_function_struct efs = {
261 .event = event,
262 .func = func,
263 .data = data,
264 };
265
266 if (!event->parent) {
		/*
		 * If this is a !child event, we must hold ctx::mutex to
		 * stabilize the event->ctx relation. See
		 * perf_event_ctx_lock().
		 */
272 lockdep_assert_held(&ctx->mutex);
273 }
274
275 if (!task) {
276 cpu_function_call(event->cpu, event_function, &efs);
277 return;
278 }
279
280 if (task == TASK_TOMBSTONE)
281 return;
282
283again:
284 if (!task_function_call(task, event_function, &efs))
285 return;
286
287 raw_spin_lock_irq(&ctx->lock);
	/*
	 * Reload the task pointer, it might have been changed by
	 * a concurrent perf_event_context_sched_out().
	 */
292 task = ctx->task;
293 if (task == TASK_TOMBSTONE) {
294 raw_spin_unlock_irq(&ctx->lock);
295 return;
296 }
297 if (ctx->is_active) {
298 raw_spin_unlock_irq(&ctx->lock);
299 goto again;
300 }
301 func(event, NULL, ctx, data);
302 raw_spin_unlock_irq(&ctx->lock);
303}
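
/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 */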
309static void event_function_local(struct perf_event *event, event_f func, void *data)
310{
311 struct perf_event_context *ctx = event->ctx;
312 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
313 struct task_struct *task = READ_ONCE(ctx->task);
314 struct perf_event_context *task_ctx = NULL;
315
316 lockdep_assert_irqs_disabled();
317
318 if (task) {
319 if (task == TASK_TOMBSTONE)
320 return;
321
322 task_ctx = ctx;
323 }
324
325 perf_ctx_lock(cpuctx, task_ctx);
326
327 task = ctx->task;
328 if (task == TASK_TOMBSTONE)
329 goto unlock;
330
331 if (task) {
		/*
		 * We must be either inactive, or active and on the right
		 * task/CPU; unlike event_function_call() there is no IPI to
		 * fall back to here.
		 */
337 if (ctx->is_active) {
338 if (WARN_ON_ONCE(task != current))
339 goto unlock;
340
341 if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
342 goto unlock;
343 }
344 } else {
345 WARN_ON_ONCE(&cpuctx->ctx != ctx);
346 }
347
348 func(event, cpuctx, ctx, data);
349unlock:
350 perf_ctx_unlock(cpuctx, task_ctx);
351}
352
353#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
354 PERF_FLAG_FD_OUTPUT |\
355 PERF_FLAG_PID_CGROUP |\
356 PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
361#define PERF_SAMPLE_BRANCH_PERM_PLM \
362 (PERF_SAMPLE_BRANCH_KERNEL |\
363 PERF_SAMPLE_BRANCH_HV)
364
365enum event_type_t {
366 EVENT_FLEXIBLE = 0x1,
367 EVENT_PINNED = 0x2,
368 EVENT_TIME = 0x4,
	/* see ctx_resched() for details */
370 EVENT_CPU = 0x8,
371 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
372};
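
/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */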
379static void perf_sched_delayed(struct work_struct *work);
380DEFINE_STATIC_KEY_FALSE(perf_sched_events);
381static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
382static DEFINE_MUTEX(perf_sched_mutex);
383static atomic_t perf_sched_count;
384
385static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
386static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
387
388static atomic_t nr_mmap_events __read_mostly;
389static atomic_t nr_comm_events __read_mostly;
390static atomic_t nr_namespaces_events __read_mostly;
391static atomic_t nr_task_events __read_mostly;
392static atomic_t nr_freq_events __read_mostly;
393static atomic_t nr_switch_events __read_mostly;
394static atomic_t nr_ksymbol_events __read_mostly;
395static atomic_t nr_bpf_events __read_mostly;
396static atomic_t nr_cgroup_events __read_mostly;
397static atomic_t nr_text_poke_events __read_mostly;
398
399static LIST_HEAD(pmus);
400static DEFINE_MUTEX(pmus_lock);
401static struct srcu_struct pmus_srcu;
402static cpumask_var_t perf_online_mask;
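
/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */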
411int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 user control page */
414int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
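
/*
 * Max perf event sample rate
 */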
419#define DEFAULT_MAX_SAMPLE_RATE 100000
420#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
421#define DEFAULT_CPU_TIME_MAX_PERCENT 25
422
423int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
424
425static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
426static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
427
428static int perf_sample_allowed_ns __read_mostly =
429 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
430
431static void update_perf_cpu_limits(void)
432{
433 u64 tmp = perf_sample_period_ns;
434
435 tmp *= sysctl_perf_cpu_time_max_percent;
436 tmp = div_u64(tmp, 100);
437 if (!tmp)
438 tmp = 1;
439
440 WRITE_ONCE(perf_sample_allowed_ns, tmp);
441}
442
443static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
444
445int perf_proc_update_handler(struct ctl_table *table, int write,
446 void *buffer, size_t *lenp, loff_t *ppos)
447{
448 int ret;
449 int perf_cpu = sysctl_perf_cpu_time_max_percent;
	/*
	 * If throttling is disabled don't allow the write:
	 */
453 if (write && (perf_cpu == 100 || perf_cpu == 0))
454 return -EINVAL;
455
456 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
457 if (ret || !write)
458 return ret;
459
460 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
461 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
462 update_perf_cpu_limits();
463
464 return 0;
465}
466
467int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
468
469int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
470 void *buffer, size_t *lenp, loff_t *ppos)
471{
472 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
473
474 if (ret || !write)
475 return ret;
476
477 if (sysctl_perf_cpu_time_max_percent == 100 ||
478 sysctl_perf_cpu_time_max_percent == 0) {
479 printk(KERN_WARNING
480 "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
481 WRITE_ONCE(perf_sample_allowed_ns, 0);
482 } else {
483 update_perf_cpu_limits();
484 }
485
486 return 0;
487}
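
/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */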
495#define NR_ACCUMULATED_SAMPLES 128
496static DEFINE_PER_CPU(u64, running_sample_length);
497
498static u64 __report_avg;
499static u64 __report_allowed;
500
501static void perf_duration_warn(struct irq_work *w)
502{
503 printk_ratelimited(KERN_INFO
504 "perf: interrupt took too long (%lld > %lld), lowering "
505 "kernel.perf_event_max_sample_rate to %d\n",
506 __report_avg, __report_allowed,
507 sysctl_perf_event_sample_rate);
508}
509
510static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
511
512void perf_sample_event_took(u64 sample_len_ns)
513{
514 u64 max_len = READ_ONCE(perf_sample_allowed_ns);
515 u64 running_len;
516 u64 avg_len;
517 u32 max;
518
519 if (max_len == 0)
520 return;

	/* Decay the counter by 1 average sample. */
523 running_len = __this_cpu_read(running_sample_length);
524 running_len -= running_len/NR_ACCUMULATED_SAMPLES;
525 running_len += sample_len_ns;
526 __this_cpu_write(running_sample_length, running_len);

	/*
	 * Note: this will be biased artificially low until we have
	 * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
	 * from having to maintain a count.
	 */
533 avg_len = running_len/NR_ACCUMULATED_SAMPLES;
534 if (avg_len <= max_len)
535 return;
536
537 __report_avg = avg_len;
538 __report_allowed = max_len;

	/* Set the new allowed duration 25% above the observed average. */
543 avg_len += avg_len / 4;
544 max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
545 if (avg_len < max)
546 max /= (u32)avg_len;
547 else
548 max = 1;
549
550 WRITE_ONCE(perf_sample_allowed_ns, avg_len);
551 WRITE_ONCE(max_samples_per_tick, max);
552
553 sysctl_perf_event_sample_rate = max * HZ;
554 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
555
556 if (!irq_work_queue(&perf_duration_work)) {
557 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
558 "kernel.perf_event_max_sample_rate to %d\n",
559 __report_avg, __report_allowed,
560 sysctl_perf_event_sample_rate);
561 }
562}
563
564static atomic64_t perf_event_id;
565
566static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
567 enum event_type_t event_type);
568
569static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
570 enum event_type_t event_type,
571 struct task_struct *task);
572
573static void update_context_time(struct perf_event_context *ctx);
574static u64 perf_event_time(struct perf_event *event);
575
576void __weak perf_event_print_debug(void) { }
577
578extern __weak const char *perf_pmu_name(void)
579{
580 return "pmu";
581}
582
583static inline u64 perf_clock(void)
584{
585 return local_clock();
586}
587
588static inline u64 perf_event_clock(struct perf_event *event)
589{
590 return event->clock();
591}
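
/*
 * State based event timekeeping.
 *
 * Time is accumulated against an event according to its state:
 * total_time_enabled grows while the event is at least INACTIVE,
 * total_time_running only while it is ACTIVE.  A group member's effective
 * state is additionally capped by its group leader's state, see
 * __perf_effective_state().
 */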
615static __always_inline enum perf_event_state
616__perf_effective_state(struct perf_event *event)
617{
618 struct perf_event *leader = event->group_leader;
619
620 if (leader->state <= PERF_EVENT_STATE_OFF)
621 return leader->state;
622
623 return event->state;
624}
625
626static __always_inline void
627__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
628{
629 enum perf_event_state state = __perf_effective_state(event);
630 u64 delta = now - event->tstamp;
631
632 *enabled = event->total_time_enabled;
633 if (state >= PERF_EVENT_STATE_INACTIVE)
634 *enabled += delta;
635
636 *running = event->total_time_running;
637 if (state >= PERF_EVENT_STATE_ACTIVE)
638 *running += delta;
639}
640
641static void perf_event_update_time(struct perf_event *event)
642{
643 u64 now = perf_event_time(event);
644
645 __perf_update_times(event, now, &event->total_time_enabled,
646 &event->total_time_running);
647 event->tstamp = now;
648}
649
650static void perf_event_update_sibling_time(struct perf_event *leader)
651{
652 struct perf_event *sibling;
653
654 for_each_sibling_event(sibling, leader)
655 perf_event_update_time(sibling);
656}
657
658static void
659perf_event_set_state(struct perf_event *event, enum perf_event_state state)
660{
661 if (event->state == state)
662 return;
663
664 perf_event_update_time(event);
665
666
667
668
669 if ((event->state < 0) ^ (state < 0))
670 perf_event_update_sibling_time(event);
671
672 WRITE_ONCE(event->state, state);
673}
674
675#ifdef CONFIG_CGROUP_PERF
676
677static inline bool
678perf_cgroup_match(struct perf_event *event)
679{
680 struct perf_event_context *ctx = event->ctx;
681 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/* @event doesn't care about cgroup */
684 if (!event->cgrp)
685 return true;

	/* wants specific cgroup scope but @cpuctx isn't associated with any */
688 if (!cpuctx->cgrp)
689 return false;

	/*
	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
	 * also enabled for all its descendant cgroups.  If @cpuctx's
	 * cgroup is a descendant of @event's (the test covers identity
	 * case), it's a match.
	 */
697 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
698 event->cgrp->css.cgroup);
699}
700
701static inline void perf_detach_cgroup(struct perf_event *event)
702{
703 css_put(&event->cgrp->css);
704 event->cgrp = NULL;
705}
706
707static inline int is_cgroup_event(struct perf_event *event)
708{
709 return event->cgrp != NULL;
710}
711
712static inline u64 perf_cgroup_event_time(struct perf_event *event)
713{
714 struct perf_cgroup_info *t;
715
716 t = per_cpu_ptr(event->cgrp->info, event->cpu);
717 return t->time;
718}
719
720static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
721{
722 struct perf_cgroup_info *info;
723 u64 now;
724
725 now = perf_clock();
726
727 info = this_cpu_ptr(cgrp->info);
728
729 info->time += now - info->timestamp;
730 info->timestamp = now;
731}
732
733static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
734{
735 struct perf_cgroup *cgrp = cpuctx->cgrp;
736 struct cgroup_subsys_state *css;
737
738 if (cgrp) {
739 for (css = &cgrp->css; css; css = css->parent) {
740 cgrp = container_of(css, struct perf_cgroup, css);
741 __update_cgrp_time(cgrp);
742 }
743 }
744}
745
746static inline void update_cgrp_time_from_event(struct perf_event *event)
747{
748 struct perf_cgroup *cgrp;
749
750
751
752
753
754 if (!is_cgroup_event(event))
755 return;
756
757 cgrp = perf_cgroup_from_task(current, event->ctx);
758
759
760
761 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
762 __update_cgrp_time(event->cgrp);
763}
764
765static inline void
766perf_cgroup_set_timestamp(struct task_struct *task,
767 struct perf_event_context *ctx)
768{
769 struct perf_cgroup *cgrp;
770 struct perf_cgroup_info *info;
771 struct cgroup_subsys_state *css;
772
773
774
775
776
777
778 if (!task || !ctx->nr_cgroups)
779 return;
780
781 cgrp = perf_cgroup_from_task(task, ctx);
782
783 for (css = &cgrp->css; css; css = css->parent) {
784 cgrp = container_of(css, struct perf_cgroup, css);
785 info = this_cpu_ptr(cgrp->info);
786 info->timestamp = ctx->timestamp;
787 }
788}
789
790static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
791
792#define PERF_CGROUP_SWOUT 0x1
793#define PERF_CGROUP_SWIN 0x2
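
/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN  : schedule in based on cgroup for next
 */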
801static void perf_cgroup_switch(struct task_struct *task, int mode)
802{
803 struct perf_cpu_context *cpuctx;
804 struct list_head *list;
805 unsigned long flags;
806
807
808
809
810
811 local_irq_save(flags);
812
813 list = this_cpu_ptr(&cgrp_cpuctx_list);
814 list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
815 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
816
817 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
818 perf_pmu_disable(cpuctx->ctx.pmu);
819
820 if (mode & PERF_CGROUP_SWOUT) {
821 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
822
823
824
825
826 cpuctx->cgrp = NULL;
827 }
828
829 if (mode & PERF_CGROUP_SWIN) {
830 WARN_ON_ONCE(cpuctx->cgrp);
831
832
833
834
835
836
837
838 cpuctx->cgrp = perf_cgroup_from_task(task,
839 &cpuctx->ctx);
840 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
841 }
842 perf_pmu_enable(cpuctx->ctx.pmu);
843 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
844 }
845
846 local_irq_restore(flags);
847}
848
849static inline void perf_cgroup_sched_out(struct task_struct *task,
850 struct task_struct *next)
851{
852 struct perf_cgroup *cgrp1;
853 struct perf_cgroup *cgrp2 = NULL;
854
855 rcu_read_lock();
856
857
858
859
860
861 cgrp1 = perf_cgroup_from_task(task, NULL);
862 cgrp2 = perf_cgroup_from_task(next, NULL);
863
864
865
866
867
868
869 if (cgrp1 != cgrp2)
870 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
871
872 rcu_read_unlock();
873}
874
875static inline void perf_cgroup_sched_in(struct task_struct *prev,
876 struct task_struct *task)
877{
878 struct perf_cgroup *cgrp1;
879 struct perf_cgroup *cgrp2 = NULL;
880
881 rcu_read_lock();
882
883
884
885
886
887 cgrp1 = perf_cgroup_from_task(task, NULL);
888 cgrp2 = perf_cgroup_from_task(prev, NULL);
889
890
891
892
893
894
895 if (cgrp1 != cgrp2)
896 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
897
898 rcu_read_unlock();
899}
900
901static int perf_cgroup_ensure_storage(struct perf_event *event,
902 struct cgroup_subsys_state *css)
903{
904 struct perf_cpu_context *cpuctx;
905 struct perf_event **storage;
906 int cpu, heap_size, ret = 0;
907
908
909
910
911
912 for (heap_size = 1; css; css = css->parent)
913 heap_size++;
914
915 for_each_possible_cpu(cpu) {
916 cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
917 if (heap_size <= cpuctx->heap_size)
918 continue;
919
920 storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
921 GFP_KERNEL, cpu_to_node(cpu));
922 if (!storage) {
923 ret = -ENOMEM;
924 break;
925 }
926
927 raw_spin_lock_irq(&cpuctx->ctx.lock);
928 if (cpuctx->heap_size < heap_size) {
929 swap(cpuctx->heap, storage);
930 if (storage == cpuctx->heap_default)
931 storage = NULL;
932 cpuctx->heap_size = heap_size;
933 }
934 raw_spin_unlock_irq(&cpuctx->ctx.lock);
935
936 kfree(storage);
937 }
938
939 return ret;
940}
941
942static inline int perf_cgroup_connect(int fd, struct perf_event *event,
943 struct perf_event_attr *attr,
944 struct perf_event *group_leader)
945{
946 struct perf_cgroup *cgrp;
947 struct cgroup_subsys_state *css;
948 struct fd f = fdget(fd);
949 int ret = 0;
950
951 if (!f.file)
952 return -EBADF;
953
954 css = css_tryget_online_from_dir(f.file->f_path.dentry,
955 &perf_event_cgrp_subsys);
956 if (IS_ERR(css)) {
957 ret = PTR_ERR(css);
958 goto out;
959 }
960
961 ret = perf_cgroup_ensure_storage(event, css);
962 if (ret)
963 goto out;
964
965 cgrp = container_of(css, struct perf_cgroup, css);
966 event->cgrp = cgrp;
967
968
969
970
971
972
973 if (group_leader && group_leader->cgrp != cgrp) {
974 perf_detach_cgroup(event);
975 ret = -EINVAL;
976 }
977out:
978 fdput(f);
979 return ret;
980}
981
982static inline void
983perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
984{
985 struct perf_cgroup_info *t;
986 t = per_cpu_ptr(event->cgrp->info, event->cpu);
987 event->shadow_ctx_time = now - t->timestamp;
988}
989
990static inline void
991perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
992{
993 struct perf_cpu_context *cpuctx;
994
995 if (!is_cgroup_event(event))
996 return;
997
998
999
1000
1001
1002 cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1003
1004
1005
1006
1007
1008
1009
1010 if (ctx->is_active && !cpuctx->cgrp) {
1011 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
1012
1013 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
1014 cpuctx->cgrp = cgrp;
1015 }
1016
1017 if (ctx->nr_cgroups++)
1018 return;
1019
1020 list_add(&cpuctx->cgrp_cpuctx_entry,
1021 per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
1022}
1023
1024static inline void
1025perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1026{
1027 struct perf_cpu_context *cpuctx;
1028
1029 if (!is_cgroup_event(event))
1030 return;
1031
1032
1033
1034
1035
1036 cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1037
1038 if (--ctx->nr_cgroups)
1039 return;
1040
1041 if (ctx->is_active && cpuctx->cgrp)
1042 cpuctx->cgrp = NULL;
1043
1044 list_del(&cpuctx->cgrp_cpuctx_entry);
1045}
1046
1047#else
1048
1049static inline bool
1050perf_cgroup_match(struct perf_event *event)
1051{
1052 return true;
1053}
1054
1055static inline void perf_detach_cgroup(struct perf_event *event)
1056{}
1057
1058static inline int is_cgroup_event(struct perf_event *event)
1059{
1060 return 0;
1061}
1062
1063static inline void update_cgrp_time_from_event(struct perf_event *event)
1064{
1065}
1066
1067static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
1068{
1069}
1070
1071static inline void perf_cgroup_sched_out(struct task_struct *task,
1072 struct task_struct *next)
1073{
1074}
1075
1076static inline void perf_cgroup_sched_in(struct task_struct *prev,
1077 struct task_struct *task)
1078{
1079}
1080
1081static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1082 struct perf_event_attr *attr,
1083 struct perf_event *group_leader)
1084{
1085 return -EINVAL;
1086}
1087
1088static inline void
1089perf_cgroup_set_timestamp(struct task_struct *task,
1090 struct perf_event_context *ctx)
1091{
1092}
1093
1094static inline void
1095perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
1096{
1097}
1098
1099static inline void
1100perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
1101{
1102}
1103
1104static inline u64 perf_cgroup_event_time(struct perf_event *event)
1105{
1106 return 0;
1107}
1108
1109static inline void
1110perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1111{
1112}
1113
1114static inline void
1115perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1116{
1117}
1118#endif
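
/*
 * Default multiplexing interval: one timer tick worth of milliseconds
 * (1000 / HZ).
 */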
1124#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * Runs from hrtimer hard interrupt context, i.e. with interrupts disabled.
 */
1128static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1129{
1130 struct perf_cpu_context *cpuctx;
1131 bool rotations;
1132
1133 lockdep_assert_irqs_disabled();
1134
1135 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1136 rotations = perf_rotate_context(cpuctx);
1137
1138 raw_spin_lock(&cpuctx->hrtimer_lock);
1139 if (rotations)
1140 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1141 else
1142 cpuctx->hrtimer_active = 0;
1143 raw_spin_unlock(&cpuctx->hrtimer_lock);
1144
1145 return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1146}
1147
1148static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1149{
1150 struct hrtimer *timer = &cpuctx->hrtimer;
1151 struct pmu *pmu = cpuctx->ctx.pmu;
1152 u64 interval;
1153
1154
1155 if (pmu->task_ctx_nr == perf_sw_context)
1156 return;
1157
1158
1159
1160
1161
1162 interval = pmu->hrtimer_interval_ms;
1163 if (interval < 1)
1164 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1165
1166 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1167
1168 raw_spin_lock_init(&cpuctx->hrtimer_lock);
1169 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1170 timer->function = perf_mux_hrtimer_handler;
1171}
1172
1173static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1174{
1175 struct hrtimer *timer = &cpuctx->hrtimer;
1176 struct pmu *pmu = cpuctx->ctx.pmu;
1177 unsigned long flags;
1178
1179
1180 if (pmu->task_ctx_nr == perf_sw_context)
1181 return 0;
1182
1183 raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1184 if (!cpuctx->hrtimer_active) {
1185 cpuctx->hrtimer_active = 1;
1186 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1187 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1188 }
1189 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1190
1191 return 0;
1192}
1193
1194void perf_pmu_disable(struct pmu *pmu)
1195{
1196 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1197 if (!(*count)++)
1198 pmu->pmu_disable(pmu);
1199}
1200
1201void perf_pmu_enable(struct pmu *pmu)
1202{
1203 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1204 if (!--(*count))
1205 pmu->pmu_enable(pmu);
1206}
1207
1208static DEFINE_PER_CPU(struct list_head, active_ctx_list);
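
/*
 * Per-CPU list of active contexts; only manipulated with IRQs disabled on
 * the owning CPU, so it is fully serialized against the scheduler tick.
 */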
1216static void perf_event_ctx_activate(struct perf_event_context *ctx)
1217{
1218 struct list_head *head = this_cpu_ptr(&active_ctx_list);
1219
1220 lockdep_assert_irqs_disabled();
1221
1222 WARN_ON(!list_empty(&ctx->active_ctx_list));
1223
1224 list_add(&ctx->active_ctx_list, head);
1225}
1226
1227static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1228{
1229 lockdep_assert_irqs_disabled();
1230
1231 WARN_ON(list_empty(&ctx->active_ctx_list));
1232
1233 list_del_init(&ctx->active_ctx_list);
1234}
1235
1236static void get_ctx(struct perf_event_context *ctx)
1237{
1238 refcount_inc(&ctx->refcount);
1239}
1240
1241static void *alloc_task_ctx_data(struct pmu *pmu)
1242{
1243 if (pmu->task_ctx_cache)
1244 return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1245
1246 return NULL;
1247}
1248
1249static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1250{
1251 if (pmu->task_ctx_cache && task_ctx_data)
1252 kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
1253}
1254
1255static void free_ctx(struct rcu_head *head)
1256{
1257 struct perf_event_context *ctx;
1258
1259 ctx = container_of(head, struct perf_event_context, rcu_head);
1260 free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
1261 kfree(ctx);
1262}
1263
1264static void put_ctx(struct perf_event_context *ctx)
1265{
1266 if (refcount_dec_and_test(&ctx->refcount)) {
1267 if (ctx->parent_ctx)
1268 put_ctx(ctx->parent_ctx);
1269 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1270 put_task_struct(ctx->task);
1271 call_rcu(&ctx->rcu_head, free_ctx);
1272 }
1273}
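
/*
 * Because event->ctx can change concurrently (e.g. move_group in
 * sys_perf_event_open() or perf_pmu_migrate_context()), locking it takes a
 * retry loop: grab a reference to the current ctx, take ctx->mutex, and
 * re-check that event->ctx still points at that context.
 */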
1341static struct perf_event_context *
1342perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1343{
1344 struct perf_event_context *ctx;
1345
1346again:
1347 rcu_read_lock();
1348 ctx = READ_ONCE(event->ctx);
1349 if (!refcount_inc_not_zero(&ctx->refcount)) {
1350 rcu_read_unlock();
1351 goto again;
1352 }
1353 rcu_read_unlock();
1354
1355 mutex_lock_nested(&ctx->mutex, nesting);
1356 if (event->ctx != ctx) {
1357 mutex_unlock(&ctx->mutex);
1358 put_ctx(ctx);
1359 goto again;
1360 }
1361
1362 return ctx;
1363}
1364
1365static inline struct perf_event_context *
1366perf_event_ctx_lock(struct perf_event *event)
1367{
1368 return perf_event_ctx_lock_nested(event, 0);
1369}
1370
1371static void perf_event_ctx_unlock(struct perf_event *event,
1372 struct perf_event_context *ctx)
1373{
1374 mutex_unlock(&ctx->mutex);
1375 put_ctx(ctx);
1376}
1377
1378
1379
1380
1381
1382
1383static __must_check struct perf_event_context *
1384unclone_ctx(struct perf_event_context *ctx)
1385{
1386 struct perf_event_context *parent_ctx = ctx->parent_ctx;
1387
1388 lockdep_assert_held(&ctx->lock);
1389
1390 if (parent_ctx)
1391 ctx->parent_ctx = NULL;
1392 ctx->generation++;
1393
1394 return parent_ctx;
1395}
1396
1397static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1398 enum pid_type type)
1399{
1400 u32 nr;
1401
1402
1403
1404 if (event->parent)
1405 event = event->parent;
1406
1407 nr = __task_pid_nr_ns(p, type, event->ns);
1408
1409 if (!nr && !pid_alive(p))
1410 nr = -1;
1411 return nr;
1412}
1413
1414static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1415{
1416 return perf_event_pid_type(event, p, PIDTYPE_TGID);
1417}
1418
1419static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1420{
1421 return perf_event_pid_type(event, p, PIDTYPE_PID);
1422}
1423
1424
1425
1426
1427
1428static u64 primary_event_id(struct perf_event *event)
1429{
1430 u64 id = event->id;
1431
1432 if (event->parent)
1433 id = event->parent->id;
1434
1435 return id;
1436}
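
/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */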
1444static struct perf_event_context *
1445perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1446{
1447 struct perf_event_context *ctx;
1448
1449retry:
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459 local_irq_save(*flags);
1460 rcu_read_lock();
1461 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1462 if (ctx) {
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473 raw_spin_lock(&ctx->lock);
1474 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1475 raw_spin_unlock(&ctx->lock);
1476 rcu_read_unlock();
1477 local_irq_restore(*flags);
1478 goto retry;
1479 }
1480
1481 if (ctx->task == TASK_TOMBSTONE ||
1482 !refcount_inc_not_zero(&ctx->refcount)) {
1483 raw_spin_unlock(&ctx->lock);
1484 ctx = NULL;
1485 } else {
1486 WARN_ON_ONCE(ctx->task != task);
1487 }
1488 }
1489 rcu_read_unlock();
1490 if (!ctx)
1491 local_irq_restore(*flags);
1492 return ctx;
1493}
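
/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also takes a reference on
 * the context; release with perf_unpin_context() + put_ctx().
 */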
1500static struct perf_event_context *
1501perf_pin_task_context(struct task_struct *task, int ctxn)
1502{
1503 struct perf_event_context *ctx;
1504 unsigned long flags;
1505
1506 ctx = perf_lock_task_context(task, ctxn, &flags);
1507 if (ctx) {
1508 ++ctx->pin_count;
1509 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1510 }
1511 return ctx;
1512}
1513
1514static void perf_unpin_context(struct perf_event_context *ctx)
1515{
1516 unsigned long flags;
1517
1518 raw_spin_lock_irqsave(&ctx->lock, flags);
1519 --ctx->pin_count;
1520 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1521}

/*
 * Update the record of the current time in a context.
 */
1526static void update_context_time(struct perf_event_context *ctx)
1527{
1528 u64 now = perf_clock();
1529
1530 ctx->time += now - ctx->timestamp;
1531 ctx->timestamp = now;
1532}
1533
1534static u64 perf_event_time(struct perf_event *event)
1535{
1536 struct perf_event_context *ctx = event->ctx;
1537
1538 if (is_cgroup_event(event))
1539 return perf_cgroup_event_time(event);
1540
1541 return ctx ? ctx->time : 0;
1542}
1543
1544static enum event_type_t get_event_type(struct perf_event *event)
1545{
1546 struct perf_event_context *ctx = event->ctx;
1547 enum event_type_t event_type;
1548
1549 lockdep_assert_held(&ctx->lock);
1550
1551
1552
1553
1554
1555 if (event->group_leader != event)
1556 event = event->group_leader;
1557
1558 event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1559 if (!ctx->task)
1560 event_type |= EVENT_CPU;
1561
1562 return event_type;
1563}
1564
1565
1566
1567
1568static void init_event_group(struct perf_event *event)
1569{
1570 RB_CLEAR_NODE(&event->group_node);
1571 event->group_index = 0;
1572}
1573
1574
1575
1576
1577
1578static struct perf_event_groups *
1579get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1580{
1581 if (event->attr.pinned)
1582 return &ctx->pinned_groups;
1583 else
1584 return &ctx->flexible_groups;
1585}
1586
1587
1588
1589
1590static void perf_event_groups_init(struct perf_event_groups *groups)
1591{
1592 groups->tree = RB_ROOT;
1593 groups->index = 0;
1594}
1595
1596
1597
1598
1599
1600
1601
1602static bool
1603perf_event_groups_less(struct perf_event *left, struct perf_event *right)
1604{
1605 if (left->cpu < right->cpu)
1606 return true;
1607 if (left->cpu > right->cpu)
1608 return false;
1609
1610#ifdef CONFIG_CGROUP_PERF
1611 if (left->cgrp != right->cgrp) {
1612 if (!left->cgrp || !left->cgrp->css.cgroup) {
1613
1614
1615
1616
1617 return true;
1618 }
1619 if (!right->cgrp || !right->cgrp->css.cgroup) {
1620
1621
1622
1623
1624 return false;
1625 }
1626
1627 if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
1628 return true;
1629
1630 return false;
1631 }
1632#endif
1633
1634 if (left->group_index < right->group_index)
1635 return true;
1636 if (left->group_index > right->group_index)
1637 return false;
1638
1639 return false;
1640}
1641
1642
1643
1644
1645
1646
1647static void
1648perf_event_groups_insert(struct perf_event_groups *groups,
1649 struct perf_event *event)
1650{
1651 struct perf_event *node_event;
1652 struct rb_node *parent;
1653 struct rb_node **node;
1654
1655 event->group_index = ++groups->index;
1656
1657 node = &groups->tree.rb_node;
1658 parent = *node;
1659
1660 while (*node) {
1661 parent = *node;
1662 node_event = container_of(*node, struct perf_event, group_node);
1663
1664 if (perf_event_groups_less(event, node_event))
1665 node = &parent->rb_left;
1666 else
1667 node = &parent->rb_right;
1668 }
1669
1670 rb_link_node(&event->group_node, parent, node);
1671 rb_insert_color(&event->group_node, &groups->tree);
1672}
1673
1674
1675
1676
1677static void
1678add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1679{
1680 struct perf_event_groups *groups;
1681
1682 groups = get_event_groups(event, ctx);
1683 perf_event_groups_insert(groups, event);
1684}
1685
1686
1687
1688
1689static void
1690perf_event_groups_delete(struct perf_event_groups *groups,
1691 struct perf_event *event)
1692{
1693 WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1694 RB_EMPTY_ROOT(&groups->tree));
1695
1696 rb_erase(&event->group_node, &groups->tree);
1697 init_event_group(event);
1698}
1699
1700
1701
1702
1703static void
1704del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1705{
1706 struct perf_event_groups *groups;
1707
1708 groups = get_event_groups(event, ctx);
1709 perf_event_groups_delete(groups, event);
1710}
1711
1712
1713
1714
1715static struct perf_event *
1716perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1717 struct cgroup *cgrp)
1718{
1719 struct perf_event *node_event = NULL, *match = NULL;
1720 struct rb_node *node = groups->tree.rb_node;
1721#ifdef CONFIG_CGROUP_PERF
1722 u64 node_cgrp_id, cgrp_id = 0;
1723
1724 if (cgrp)
1725 cgrp_id = cgrp->kn->id;
1726#endif
1727
1728 while (node) {
1729 node_event = container_of(node, struct perf_event, group_node);
1730
1731 if (cpu < node_event->cpu) {
1732 node = node->rb_left;
1733 continue;
1734 }
1735 if (cpu > node_event->cpu) {
1736 node = node->rb_right;
1737 continue;
1738 }
1739#ifdef CONFIG_CGROUP_PERF
1740 node_cgrp_id = 0;
1741 if (node_event->cgrp && node_event->cgrp->css.cgroup)
1742 node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
1743
1744 if (cgrp_id < node_cgrp_id) {
1745 node = node->rb_left;
1746 continue;
1747 }
1748 if (cgrp_id > node_cgrp_id) {
1749 node = node->rb_right;
1750 continue;
1751 }
1752#endif
1753 match = node_event;
1754 node = node->rb_left;
1755 }
1756
1757 return match;
1758}
1759
1760
1761
1762
1763static struct perf_event *
1764perf_event_groups_next(struct perf_event *event)
1765{
1766 struct perf_event *next;
1767#ifdef CONFIG_CGROUP_PERF
1768 u64 curr_cgrp_id = 0;
1769 u64 next_cgrp_id = 0;
1770#endif
1771
1772 next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1773 if (next == NULL || next->cpu != event->cpu)
1774 return NULL;
1775
1776#ifdef CONFIG_CGROUP_PERF
1777 if (event->cgrp && event->cgrp->css.cgroup)
1778 curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
1779
1780 if (next->cgrp && next->cgrp->css.cgroup)
1781 next_cgrp_id = next->cgrp->css.cgroup->kn->id;
1782
1783 if (curr_cgrp_id != next_cgrp_id)
1784 return NULL;
1785#endif
1786 return next;
1787}
1788
1789
1790
1791
1792#define perf_event_groups_for_each(event, groups) \
1793 for (event = rb_entry_safe(rb_first(&((groups)->tree)), \
1794 typeof(*event), group_node); event; \
1795 event = rb_entry_safe(rb_next(&event->group_node), \
1796 typeof(*event), group_node))
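
/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */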
1802static void
1803list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1804{
1805 lockdep_assert_held(&ctx->lock);
1806
1807 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1808 event->attach_state |= PERF_ATTACH_CONTEXT;
1809
1810 event->tstamp = perf_event_time(event);
1811
1812
1813
1814
1815
1816
1817 if (event->group_leader == event) {
1818 event->group_caps = event->event_caps;
1819 add_event_to_groups(event, ctx);
1820 }
1821
1822 list_add_rcu(&event->event_entry, &ctx->event_list);
1823 ctx->nr_events++;
1824 if (event->attr.inherit_stat)
1825 ctx->nr_stat++;
1826
1827 if (event->state > PERF_EVENT_STATE_OFF)
1828 perf_cgroup_event_enable(event, ctx);
1829
1830 ctx->generation++;
1831}
1832
1833
1834
1835
1836static inline void perf_event__state_init(struct perf_event *event)
1837{
1838 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1839 PERF_EVENT_STATE_INACTIVE;
1840}
1841
1842static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1843{
1844 int entry = sizeof(u64);
1845 int size = 0;
1846 int nr = 1;
1847
1848 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1849 size += sizeof(u64);
1850
1851 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1852 size += sizeof(u64);
1853
1854 if (event->attr.read_format & PERF_FORMAT_ID)
1855 entry += sizeof(u64);
1856
1857 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1858 nr += nr_siblings;
1859 size += sizeof(u64);
1860 }
1861
1862 size += entry * nr;
1863 event->read_size = size;
1864}
1865
1866static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1867{
1868 struct perf_sample_data *data;
1869 u16 size = 0;
1870
1871 if (sample_type & PERF_SAMPLE_IP)
1872 size += sizeof(data->ip);
1873
1874 if (sample_type & PERF_SAMPLE_ADDR)
1875 size += sizeof(data->addr);
1876
1877 if (sample_type & PERF_SAMPLE_PERIOD)
1878 size += sizeof(data->period);
1879
1880 if (sample_type & PERF_SAMPLE_WEIGHT)
1881 size += sizeof(data->weight);
1882
1883 if (sample_type & PERF_SAMPLE_READ)
1884 size += event->read_size;
1885
1886 if (sample_type & PERF_SAMPLE_DATA_SRC)
1887 size += sizeof(data->data_src.val);
1888
1889 if (sample_type & PERF_SAMPLE_TRANSACTION)
1890 size += sizeof(data->txn);
1891
1892 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1893 size += sizeof(data->phys_addr);
1894
1895 if (sample_type & PERF_SAMPLE_CGROUP)
1896 size += sizeof(data->cgroup);
1897
1898 event->header_size = size;
1899}
1900
1901
1902
1903
1904
1905static void perf_event__header_size(struct perf_event *event)
1906{
1907 __perf_event_read_size(event,
1908 event->group_leader->nr_siblings);
1909 __perf_event_header_size(event, event->attr.sample_type);
1910}
1911
1912static void perf_event__id_header_size(struct perf_event *event)
1913{
1914 struct perf_sample_data *data;
1915 u64 sample_type = event->attr.sample_type;
1916 u16 size = 0;
1917
1918 if (sample_type & PERF_SAMPLE_TID)
1919 size += sizeof(data->tid_entry);
1920
1921 if (sample_type & PERF_SAMPLE_TIME)
1922 size += sizeof(data->time);
1923
1924 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1925 size += sizeof(data->id);
1926
1927 if (sample_type & PERF_SAMPLE_ID)
1928 size += sizeof(data->id);
1929
1930 if (sample_type & PERF_SAMPLE_STREAM_ID)
1931 size += sizeof(data->stream_id);
1932
1933 if (sample_type & PERF_SAMPLE_CPU)
1934 size += sizeof(data->cpu_entry);
1935
1936 event->id_header_size = size;
1937}
1938
1939static bool perf_event_validate_size(struct perf_event *event)
1940{
1941
1942
1943
1944
1945 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1946 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1947 perf_event__id_header_size(event);
1948
1949
1950
1951
1952
1953 if (event->read_size + event->header_size +
1954 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1955 return false;
1956
1957 return true;
1958}
1959
1960static void perf_group_attach(struct perf_event *event)
1961{
1962 struct perf_event *group_leader = event->group_leader, *pos;
1963
1964 lockdep_assert_held(&event->ctx->lock);
1965
1966
1967
1968
1969 if (event->attach_state & PERF_ATTACH_GROUP)
1970 return;
1971
1972 event->attach_state |= PERF_ATTACH_GROUP;
1973
1974 if (group_leader == event)
1975 return;
1976
1977 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1978
1979 group_leader->group_caps &= event->event_caps;
1980
1981 list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1982 group_leader->nr_siblings++;
1983
1984 perf_event__header_size(group_leader);
1985
1986 for_each_sibling_event(pos, group_leader)
1987 perf_event__header_size(pos);
1988}
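
/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */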
1994static void
1995list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1996{
1997 WARN_ON_ONCE(event->ctx != ctx);
1998 lockdep_assert_held(&ctx->lock);
1999
2000
2001
2002
2003 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
2004 return;
2005
2006 event->attach_state &= ~PERF_ATTACH_CONTEXT;
2007
2008 ctx->nr_events--;
2009 if (event->attr.inherit_stat)
2010 ctx->nr_stat--;
2011
2012 list_del_rcu(&event->event_entry);
2013
2014 if (event->group_leader == event)
2015 del_event_from_groups(event, ctx);
2016
2017
2018
2019
2020
2021
2022
2023
2024 if (event->state > PERF_EVENT_STATE_OFF) {
2025 perf_cgroup_event_disable(event, ctx);
2026 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2027 }
2028
2029 ctx->generation++;
2030}
2031
2032static int
2033perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2034{
2035 if (!has_aux(aux_event))
2036 return 0;
2037
2038 if (!event->pmu->aux_output_match)
2039 return 0;
2040
2041 return event->pmu->aux_output_match(aux_event);
2042}
2043
2044static void put_event(struct perf_event *event);
2045static void event_sched_out(struct perf_event *event,
2046 struct perf_cpu_context *cpuctx,
2047 struct perf_event_context *ctx);
2048
2049static void perf_put_aux_event(struct perf_event *event)
2050{
2051 struct perf_event_context *ctx = event->ctx;
2052 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2053 struct perf_event *iter;
2054
2055
2056
2057
2058 if (event->aux_event) {
2059 iter = event->aux_event;
2060 event->aux_event = NULL;
2061 put_event(iter);
2062 return;
2063 }
2064
2065
2066
2067
2068
2069 for_each_sibling_event(iter, event->group_leader) {
2070 if (iter->aux_event != event)
2071 continue;
2072
2073 iter->aux_event = NULL;
2074 put_event(event);
2075
2076
2077
2078
2079
2080
2081 event_sched_out(iter, cpuctx, ctx);
2082 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2083 }
2084}
2085
2086static bool perf_need_aux_event(struct perf_event *event)
2087{
2088 return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2089}
2090
2091static int perf_get_aux_event(struct perf_event *event,
2092 struct perf_event *group_leader)
2093{
2094
2095
2096
2097
2098
2099
2100 if (!group_leader)
2101 return 0;
2102
2103
2104
2105
2106 if (event->attr.aux_output && event->attr.aux_sample_size)
2107 return 0;
2108
2109 if (event->attr.aux_output &&
2110 !perf_aux_output_match(event, group_leader))
2111 return 0;
2112
2113 if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2114 return 0;
2115
2116 if (!atomic_long_inc_not_zero(&group_leader->refcount))
2117 return 0;
2118
2119
2120
2121
2122
2123
2124
2125 event->aux_event = group_leader;
2126
2127 return 1;
2128}
2129
2130static inline struct list_head *get_event_list(struct perf_event *event)
2131{
2132 struct perf_event_context *ctx = event->ctx;
2133 return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2134}
2135
2136
2137
2138
2139
2140
2141
2142static inline void perf_remove_sibling_event(struct perf_event *event)
2143{
2144 struct perf_event_context *ctx = event->ctx;
2145 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2146
2147 event_sched_out(event, cpuctx, ctx);
2148 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2149}
2150
2151static void perf_group_detach(struct perf_event *event)
2152{
2153 struct perf_event *leader = event->group_leader;
2154 struct perf_event *sibling, *tmp;
2155 struct perf_event_context *ctx = event->ctx;
2156
2157 lockdep_assert_held(&ctx->lock);
2158
2159
2160
2161
2162 if (!(event->attach_state & PERF_ATTACH_GROUP))
2163 return;
2164
2165 event->attach_state &= ~PERF_ATTACH_GROUP;
2166
2167 perf_put_aux_event(event);
2168
2169
2170
2171
2172 if (leader != event) {
2173 list_del_init(&event->sibling_list);
2174 event->group_leader->nr_siblings--;
2175 goto out;
2176 }
2177
2178
2179
2180
2181
2182
2183 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2184
2185 if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2186 perf_remove_sibling_event(sibling);
2187
2188 sibling->group_leader = sibling;
2189 list_del_init(&sibling->sibling_list);
2190
2191
2192 sibling->group_caps = event->group_caps;
2193
2194 if (!RB_EMPTY_NODE(&event->group_node)) {
2195 add_event_to_groups(sibling, event->ctx);
2196
2197 if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2198 list_add_tail(&sibling->active_list, get_event_list(sibling));
2199 }
2200
2201 WARN_ON_ONCE(sibling->ctx != event->ctx);
2202 }
2203
2204out:
2205 for_each_sibling_event(tmp, leader)
2206 perf_event__header_size(tmp);
2207
2208 perf_event__header_size(leader);
2209}
2210
2211static bool is_orphaned_event(struct perf_event *event)
2212{
2213 return event->state == PERF_EVENT_STATE_DEAD;
2214}
2215
2216static inline int __pmu_filter_match(struct perf_event *event)
2217{
2218 struct pmu *pmu = event->pmu;
2219 return pmu->filter_match ? pmu->filter_match(event) : 1;
2220}
2221
2222
2223
2224
2225
2226
2227
2228static inline int pmu_filter_match(struct perf_event *event)
2229{
2230 struct perf_event *sibling;
2231
2232 if (!__pmu_filter_match(event))
2233 return 0;
2234
2235 for_each_sibling_event(sibling, event) {
2236 if (!__pmu_filter_match(sibling))
2237 return 0;
2238 }
2239
2240 return 1;
2241}
2242
2243static inline int
2244event_filter_match(struct perf_event *event)
2245{
2246 return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2247 perf_cgroup_match(event) && pmu_filter_match(event);
2248}
2249
2250static void
2251event_sched_out(struct perf_event *event,
2252 struct perf_cpu_context *cpuctx,
2253 struct perf_event_context *ctx)
2254{
2255 enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2256
2257 WARN_ON_ONCE(event->ctx != ctx);
2258 lockdep_assert_held(&ctx->lock);
2259
2260 if (event->state != PERF_EVENT_STATE_ACTIVE)
2261 return;
2262
2263
2264
2265
2266
2267
2268 list_del_init(&event->active_list);
2269
2270 perf_pmu_disable(event->pmu);
2271
2272 event->pmu->del(event, 0);
2273 event->oncpu = -1;
2274
2275 if (READ_ONCE(event->pending_disable) >= 0) {
2276 WRITE_ONCE(event->pending_disable, -1);
2277 perf_cgroup_event_disable(event, ctx);
2278 state = PERF_EVENT_STATE_OFF;
2279 }
2280 perf_event_set_state(event, state);
2281
2282 if (!is_software_event(event))
2283 cpuctx->active_oncpu--;
2284 if (!--ctx->nr_active)
2285 perf_event_ctx_deactivate(ctx);
2286 if (event->attr.freq && event->attr.sample_freq)
2287 ctx->nr_freq--;
2288 if (event->attr.exclusive || !cpuctx->active_oncpu)
2289 cpuctx->exclusive = 0;
2290
2291 perf_pmu_enable(event->pmu);
2292}
2293
2294static void
2295group_sched_out(struct perf_event *group_event,
2296 struct perf_cpu_context *cpuctx,
2297 struct perf_event_context *ctx)
2298{
2299 struct perf_event *event;
2300
2301 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2302 return;
2303
2304 perf_pmu_disable(ctx->pmu);
2305
2306 event_sched_out(group_event, cpuctx, ctx);
2307
2308
2309
2310
2311 for_each_sibling_event(event, group_event)
2312 event_sched_out(event, cpuctx, ctx);
2313
2314 perf_pmu_enable(ctx->pmu);
2315}
2316
2317#define DETACH_GROUP 0x01UL
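
/*
 * Cross CPU call to remove a performance event.
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */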
2325static void
2326__perf_remove_from_context(struct perf_event *event,
2327 struct perf_cpu_context *cpuctx,
2328 struct perf_event_context *ctx,
2329 void *info)
2330{
2331 unsigned long flags = (unsigned long)info;
2332
2333 if (ctx->is_active & EVENT_TIME) {
2334 update_context_time(ctx);
2335 update_cgrp_time_from_cpuctx(cpuctx);
2336 }
2337
2338 event_sched_out(event, cpuctx, ctx);
2339 if (flags & DETACH_GROUP)
2340 perf_group_detach(event);
2341 list_del_event(event, ctx);
2342
2343 if (!ctx->nr_events && ctx->is_active) {
2344 ctx->is_active = 0;
2345 ctx->rotate_necessary = 0;
2346 if (ctx->task) {
2347 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2348 cpuctx->task_ctx = NULL;
2349 }
2350 }
2351}
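
/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * The caller must hold ctx->mutex.  event_function_call() can no-op when
 * the context's task is already gone (TASK_TOMBSTONE), so the group detach
 * is repeated here under ctx->lock if it is still needed.
 */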
2363static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2364{
2365 struct perf_event_context *ctx = event->ctx;
2366
2367 lockdep_assert_held(&ctx->mutex);
2368
2369 event_function_call(event, __perf_remove_from_context, (void *)flags);

	/*
	 * The above event_function_call() can NO-OP when it hits
	 * TASK_TOMBSTONE. In that case we must already have been detached
	 * from the context, but the grouping might still be intact.
	 */
2377 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
2378 if ((flags & DETACH_GROUP) &&
2379 (event->attach_state & PERF_ATTACH_GROUP)) {
		/*
		 * Since in that case we cannot possibly be scheduled, simply
		 * detach now.
		 */
2384 raw_spin_lock_irq(&ctx->lock);
2385 perf_group_detach(event);
2386 raw_spin_unlock_irq(&ctx->lock);
2387 }
2388}
2389
2390
2391
2392
2393static void __perf_event_disable(struct perf_event *event,
2394 struct perf_cpu_context *cpuctx,
2395 struct perf_event_context *ctx,
2396 void *info)
2397{
2398 if (event->state < PERF_EVENT_STATE_INACTIVE)
2399 return;
2400
2401 if (ctx->is_active & EVENT_TIME) {
2402 update_context_time(ctx);
2403 update_cgrp_time_from_event(event);
2404 }
2405
2406 if (event == event->group_leader)
2407 group_sched_out(event, cpuctx, ctx);
2408 else
2409 event_sched_out(event, cpuctx, ctx);
2410
2411 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2412 perf_cgroup_event_disable(event, ctx);
2413}
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429static void _perf_event_disable(struct perf_event *event)
2430{
2431 struct perf_event_context *ctx = event->ctx;
2432
2433 raw_spin_lock_irq(&ctx->lock);
2434 if (event->state <= PERF_EVENT_STATE_OFF) {
2435 raw_spin_unlock_irq(&ctx->lock);
2436 return;
2437 }
2438 raw_spin_unlock_irq(&ctx->lock);
2439
2440 event_function_call(event, __perf_event_disable, NULL);
2441}
2442
2443void perf_event_disable_local(struct perf_event *event)
2444{
2445 event_function_local(event, __perf_event_disable, NULL);
2446}
2447
2448
2449
2450
2451
2452void perf_event_disable(struct perf_event *event)
2453{
2454 struct perf_event_context *ctx;
2455
2456 ctx = perf_event_ctx_lock(event);
2457 _perf_event_disable(event);
2458 perf_event_ctx_unlock(event, ctx);
2459}
2460EXPORT_SYMBOL_GPL(perf_event_disable);
2461
2462void perf_event_disable_inatomic(struct perf_event *event)
2463{
2464 WRITE_ONCE(event->pending_disable, smp_processor_id());
2465
2466 irq_work_queue(&event->pending);
2467}
2468
2469static void perf_set_shadow_time(struct perf_event *event,
2470 struct perf_event_context *ctx)
2471{
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497 if (is_cgroup_event(event))
2498 perf_cgroup_set_shadow_time(event, event->tstamp);
2499 else
2500 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2501}
2502
2503#define MAX_INTERRUPTS (~0ULL)
2504
2505static void perf_log_throttle(struct perf_event *event, int enable);
2506static void perf_log_itrace_start(struct perf_event *event);
2507
2508static int
2509event_sched_in(struct perf_event *event,
2510 struct perf_cpu_context *cpuctx,
2511 struct perf_event_context *ctx)
2512{
2513 int ret = 0;
2514
2515 WARN_ON_ONCE(event->ctx != ctx);
2516
2517 lockdep_assert_held(&ctx->lock);
2518
2519 if (event->state <= PERF_EVENT_STATE_OFF)
2520 return 0;
2521
2522 WRITE_ONCE(event->oncpu, smp_processor_id());
2523
2524
2525
2526
2527
2528 smp_wmb();
2529 perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2530
2531
2532
2533
2534
2535
2536 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2537 perf_log_throttle(event, 1);
2538 event->hw.interrupts = 0;
2539 }
2540
2541 perf_pmu_disable(event->pmu);
2542
2543 perf_set_shadow_time(event, ctx);
2544
2545 perf_log_itrace_start(event);
2546
2547 if (event->pmu->add(event, PERF_EF_START)) {
2548 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2549 event->oncpu = -1;
2550 ret = -EAGAIN;
2551 goto out;
2552 }
2553
2554 if (!is_software_event(event))
2555 cpuctx->active_oncpu++;
2556 if (!ctx->nr_active++)
2557 perf_event_ctx_activate(ctx);
2558 if (event->attr.freq && event->attr.sample_freq)
2559 ctx->nr_freq++;
2560
2561 if (event->attr.exclusive)
2562 cpuctx->exclusive = 1;
2563
2564out:
2565 perf_pmu_enable(event->pmu);
2566
2567 return ret;
2568}
2569
2570static int
2571group_sched_in(struct perf_event *group_event,
2572 struct perf_cpu_context *cpuctx,
2573 struct perf_event_context *ctx)
2574{
2575 struct perf_event *event, *partial_group = NULL;
2576 struct pmu *pmu = ctx->pmu;
2577
2578 if (group_event->state == PERF_EVENT_STATE_OFF)
2579 return 0;
2580
2581 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2582
2583 if (event_sched_in(group_event, cpuctx, ctx))
2584 goto error;
2585
2586
2587
2588
2589 for_each_sibling_event(event, group_event) {
2590 if (event_sched_in(event, cpuctx, ctx)) {
2591 partial_group = event;
2592 goto group_error;
2593 }
2594 }
2595
2596 if (!pmu->commit_txn(pmu))
2597 return 0;
2598
2599group_error:
2600
2601
2602
2603
2604
2605 for_each_sibling_event(event, group_event) {
2606 if (event == partial_group)
2607 break;
2608
2609 event_sched_out(event, cpuctx, ctx);
2610 }
2611 event_sched_out(group_event, cpuctx, ctx);
2612
2613error:
2614 pmu->cancel_txn(pmu);
2615 return -EAGAIN;
2616}
2617
2618
2619
2620
2621static int group_can_go_on(struct perf_event *event,
2622 struct perf_cpu_context *cpuctx,
2623 int can_add_hw)
2624{
2625
2626
2627
2628 if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2629 return 1;
2630
2631
2632
2633
2634 if (cpuctx->exclusive)
2635 return 0;
2636
2637
2638
2639
2640 if (event->attr.exclusive && !list_empty(get_event_list(event)))
2641 return 0;
2642
2643
2644
2645
2646 return can_add_hw;
2647}
2648
2649static void add_event_to_ctx(struct perf_event *event,
2650 struct perf_event_context *ctx)
2651{
2652 list_add_event(event, ctx);
2653 perf_group_attach(event);
2654}
2655
2656static void ctx_sched_out(struct perf_event_context *ctx,
2657 struct perf_cpu_context *cpuctx,
2658 enum event_type_t event_type);
2659static void
2660ctx_sched_in(struct perf_event_context *ctx,
2661 struct perf_cpu_context *cpuctx,
2662 enum event_type_t event_type,
2663 struct task_struct *task);
2664
2665static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2666 struct perf_event_context *ctx,
2667 enum event_type_t event_type)
2668{
2669 if (!cpuctx->task_ctx)
2670 return;
2671
2672 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2673 return;
2674
2675 ctx_sched_out(ctx, cpuctx, event_type);
2676}
2677
2678static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2679 struct perf_event_context *ctx,
2680 struct task_struct *task)
2681{
2682 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2683 if (ctx)
2684 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2685 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2686 if (ctx)
2687 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2688}
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705static void ctx_resched(struct perf_cpu_context *cpuctx,
2706 struct perf_event_context *task_ctx,
2707 enum event_type_t event_type)
2708{
2709 enum event_type_t ctx_event_type;
2710 bool cpu_event = !!(event_type & EVENT_CPU);
2711
2712
2713
2714
2715
2716 if (event_type & EVENT_PINNED)
2717 event_type |= EVENT_FLEXIBLE;
2718
2719 ctx_event_type = event_type & EVENT_ALL;
2720
2721 perf_pmu_disable(cpuctx->ctx.pmu);
2722 if (task_ctx)
2723 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2724
2725
2726
2727
2728
2729
2730
2731
2732 if (cpu_event)
2733 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2734 else if (ctx_event_type & EVENT_PINNED)
2735 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2736
2737 perf_event_sched_in(cpuctx, task_ctx, current);
2738 perf_pmu_enable(cpuctx->ctx.pmu);
2739}
2740
2741void perf_pmu_resched(struct pmu *pmu)
2742{
2743 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2744 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2745
2746 perf_ctx_lock(cpuctx, task_ctx);
2747 ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2748 perf_ctx_unlock(cpuctx, task_ctx);
2749}
2750
2751
2752
2753
2754
2755
2756
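/*
 * Cross CPU call to install and enable a performance event.
 *
 * Very similar to remote_function() + event_function(), but it cannot assume
 * that things like ctx->is_active and cpuctx->task_ctx are set.
 */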
2757static int __perf_install_in_context(void *info)
2758{
2759 struct perf_event *event = info;
2760 struct perf_event_context *ctx = event->ctx;
2761 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2762 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2763 bool reprogram = true;
2764 int ret = 0;
2765
2766 raw_spin_lock(&cpuctx->ctx.lock);
2767 if (ctx->task) {
2768 raw_spin_lock(&ctx->lock);
2769 task_ctx = ctx;
2770
2771 reprogram = (ctx->task == current);
2772
2773
2774
2775
2776
2777
2778
2779
2780 if (task_curr(ctx->task) && !reprogram) {
2781 ret = -ESRCH;
2782 goto unlock;
2783 }
2784
2785 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2786 } else if (task_ctx) {
2787 raw_spin_lock(&task_ctx->lock);
2788 }
2789
2790#ifdef CONFIG_CGROUP_PERF
2791 if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2792
2793
2794
2795
2796 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2797 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2798 event->cgrp->css.cgroup);
2799 }
2800#endif
2801
2802 if (reprogram) {
2803 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2804 add_event_to_ctx(event, ctx);
2805 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2806 } else {
2807 add_event_to_ctx(event, ctx);
2808 }
2809
2810unlock:
2811 perf_ctx_unlock(cpuctx, task_ctx);
2812
2813 return ret;
2814}
2815
2816static bool exclusive_event_installable(struct perf_event *event,
2817 struct perf_event_context *ctx);
2818
2819
2820
2821
2822
2823
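/*
 * Attach a performance event to a context.
 *
 * Very similar in structure to event_function_call(), see the comment there.
 */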
2824static void
2825perf_install_in_context(struct perf_event_context *ctx,
2826 struct perf_event *event,
2827 int cpu)
2828{
2829 struct task_struct *task = READ_ONCE(ctx->task);
2830
2831 lockdep_assert_held(&ctx->mutex);
2832
2833 WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2834
2835 if (event->cpu != -1)
2836 event->cpu = cpu;
2837
2838
2839
2840
2841
2842 smp_store_release(&event->ctx, ctx);
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852 if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2853 raw_spin_lock_irq(&ctx->lock);
2854 if (ctx->task == TASK_TOMBSTONE) {
2855 raw_spin_unlock_irq(&ctx->lock);
2856 return;
2857 }
2858 add_event_to_ctx(event, ctx);
2859 raw_spin_unlock_irq(&ctx->lock);
2860 return;
2861 }
2862
2863 if (!task) {
2864 cpu_function_call(cpu, __perf_install_in_context, event);
2865 return;
2866 }
2867
2868
2869
2870
2871 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2872 return;
2873
2903
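	/*
	 * Order the earlier perf_event_ctxp[] publication against the
	 * task_cpu() load in task_function_call(): if the IPI does not find
	 * the task running, any future context switch of that task must
	 * observe the context and take ctx->lock, serializing against the
	 * install below.
	 */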
2904 smp_mb();
2905again:
2906 if (!task_function_call(task, __perf_install_in_context, event))
2907 return;
2908
2909 raw_spin_lock_irq(&ctx->lock);
2910 task = ctx->task;
2911 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2912
2913
2914
2915
2916
2917 raw_spin_unlock_irq(&ctx->lock);
2918 return;
2919 }
2920
2921
2922
2923
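	/*
	 * If the task is currently running we must retry the IPI; otherwise
	 * ctx->lock keeps it from being scheduled in, so we can install the
	 * event directly.
	 */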
2924 if (task_curr(task)) {
2925 raw_spin_unlock_irq(&ctx->lock);
2926 goto again;
2927 }
2928 add_event_to_ctx(event, ctx);
2929 raw_spin_unlock_irq(&ctx->lock);
2930}
2931
2932
2933
2934
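/*
 * Cross CPU call to enable a performance event.
 */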
2935static void __perf_event_enable(struct perf_event *event,
2936 struct perf_cpu_context *cpuctx,
2937 struct perf_event_context *ctx,
2938 void *info)
2939{
2940 struct perf_event *leader = event->group_leader;
2941 struct perf_event_context *task_ctx;
2942
2943 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2944 event->state <= PERF_EVENT_STATE_ERROR)
2945 return;
2946
2947 if (ctx->is_active)
2948 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2949
2950 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2951 perf_cgroup_event_enable(event, ctx);
2952
2953 if (!ctx->is_active)
2954 return;
2955
2956 if (!event_filter_match(event)) {
2957 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2958 return;
2959 }
2960
2961
2962
2963
2964
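	/*
	 * If the event is in a group and isn't the group leader, then don't
	 * put it on unless the group as a whole is on.
	 */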
2965 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2966 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2967 return;
2968 }
2969
2970 task_ctx = cpuctx->task_ctx;
2971 if (ctx->task)
2972 WARN_ON_ONCE(task_ctx != ctx);
2973
2974 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2975}
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
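/*
 * Enable an event.
 *
 * Events that are OFF (or in ERROR, which is first cleared back to OFF where
 * permitted) are enabled via event_function_call(); events that are already
 * enabled, or are EXIT/DEAD, are left alone.
 */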
2986static void _perf_event_enable(struct perf_event *event)
2987{
2988 struct perf_event_context *ctx = event->ctx;
2989
2990 raw_spin_lock_irq(&ctx->lock);
2991 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2992 event->state < PERF_EVENT_STATE_ERROR) {
2993out:
2994 raw_spin_unlock_irq(&ctx->lock);
2995 return;
2996 }
2997
2998
2999
3000
3001
3002
3003
3004
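	/*
	 * If the event is in error state, clear it back to OFF so that it can
	 * be enabled again. Detached SIBLING events must stay in ERROR.
	 */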
3005 if (event->state == PERF_EVENT_STATE_ERROR) {
3006
3007
3008
3009 if (event->event_caps & PERF_EV_CAP_SIBLING &&
3010 event->group_leader == event)
3011 goto out;
3012
3013 event->state = PERF_EVENT_STATE_OFF;
3014 }
3015 raw_spin_unlock_irq(&ctx->lock);
3016
3017 event_function_call(event, __perf_event_enable, NULL);
3018}
3019
3020
3021
3022
3023void perf_event_enable(struct perf_event *event)
3024{
3025 struct perf_event_context *ctx;
3026
3027 ctx = perf_event_ctx_lock(event);
3028 _perf_event_enable(event);
3029 perf_event_ctx_unlock(event, ctx);
3030}
3031EXPORT_SYMBOL_GPL(perf_event_enable);
3032
3033struct stop_event_data {
3034 struct perf_event *event;
3035 unsigned int restart;
3036};
3037
3038static int __perf_event_stop(void *info)
3039{
3040 struct stop_event_data *sd = info;
3041 struct perf_event *event = sd->event;
3042
3043
3044 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3045 return 0;
3046
3047
3048 smp_rmb();
3049
3050
3051
3052
3053
3054 if (READ_ONCE(event->oncpu) != smp_processor_id())
3055 return -EAGAIN;
3056
3057 event->pmu->stop(event, PERF_EF_UPDATE);
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068 if (sd->restart)
3069 event->pmu->start(event, 0);
3070
3071 return 0;
3072}
3073
3074static int perf_event_stop(struct perf_event *event, int restart)
3075{
3076 struct stop_event_data sd = {
3077 .event = event,
3078 .restart = restart,
3079 };
3080 int ret = 0;
3081
3082 do {
3083 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3084 return 0;
3085
3086
3087 smp_rmb();
3088
3089
3090
3091
3092
3093
3094 ret = cpu_function_call(READ_ONCE(event->oncpu),
3095 __perf_event_stop, &sd);
3096 } while (ret == -EAGAIN);
3097
3098 return ret;
3099}
3100
3122
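/*
 * Synchronize the PMU's view of this event's address filters with the latest
 * generation computed by the filter update code: if the generations differ,
 * call into the PMU to reprogram the filters, under ifh->lock.
 */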
3123void perf_event_addr_filters_sync(struct perf_event *event)
3124{
3125 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3126
3127 if (!has_addr_filter(event))
3128 return;
3129
3130 raw_spin_lock(&ifh->lock);
3131 if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3132 event->pmu->addr_filters_sync(event);
3133 event->hw.addr_filters_gen = event->addr_filters_gen;
3134 }
3135 raw_spin_unlock(&ifh->lock);
3136}
3137EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
3138
3139static int _perf_event_refresh(struct perf_event *event, int refresh)
3140{
3141
3142
3143
3144 if (event->attr.inherit || !is_sampling_event(event))
3145 return -EINVAL;
3146
3147 atomic_add(refresh, &event->event_limit);
3148 _perf_event_enable(event);
3149
3150 return 0;
3151}
3152
3153
3154
3155
3156int perf_event_refresh(struct perf_event *event, int refresh)
3157{
3158 struct perf_event_context *ctx;
3159 int ret;
3160
3161 ctx = perf_event_ctx_lock(event);
3162 ret = _perf_event_refresh(event, refresh);
3163 perf_event_ctx_unlock(event, ctx);
3164
3165 return ret;
3166}
3167EXPORT_SYMBOL_GPL(perf_event_refresh);
3168
3169static int perf_event_modify_breakpoint(struct perf_event *bp,
3170 struct perf_event_attr *attr)
3171{
3172 int err;
3173
3174 _perf_event_disable(bp);
3175
3176 err = modify_user_hw_breakpoint_check(bp, attr, true);
3177
3178 if (!bp->attr.disabled)
3179 _perf_event_enable(bp);
3180
3181 return err;
3182}
3183
3184static int perf_event_modify_attr(struct perf_event *event,
3185 struct perf_event_attr *attr)
3186{
3187 if (event->attr.type != attr->type)
3188 return -EINVAL;
3189
3190 switch (event->attr.type) {
3191 case PERF_TYPE_BREAKPOINT:
3192 return perf_event_modify_breakpoint(event, attr);
3193 default:
3194
3195 return -EOPNOTSUPP;
3196 }
3197}
3198
3199static void ctx_sched_out(struct perf_event_context *ctx,
3200 struct perf_cpu_context *cpuctx,
3201 enum event_type_t event_type)
3202{
3203 struct perf_event *event, *tmp;
3204 int is_active = ctx->is_active;
3205
3206 lockdep_assert_held(&ctx->lock);
3207
3208 if (likely(!ctx->nr_events)) {
3209
3210
3211
3212 WARN_ON_ONCE(ctx->is_active);
3213 if (ctx->task)
3214 WARN_ON_ONCE(cpuctx->task_ctx);
3215 return;
3216 }
3217
3218 ctx->is_active &= ~event_type;
3219 if (!(ctx->is_active & EVENT_ALL))
3220 ctx->is_active = 0;
3221
3222 if (ctx->task) {
3223 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3224 if (!ctx->is_active)
3225 cpuctx->task_ctx = NULL;
3226 }
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
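	/*
	 * Update context and cgroup time while EVENT_TIME was still set for
	 * this context.
	 */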
3238 if (is_active & EVENT_TIME) {
3239
3240 update_context_time(ctx);
3241 update_cgrp_time_from_cpuctx(cpuctx);
3242 }
3243
3244 is_active ^= ctx->is_active;
3245
3246 if (!ctx->nr_active || !(is_active & EVENT_ALL))
3247 return;
3248
3249 perf_pmu_disable(ctx->pmu);
3250 if (is_active & EVENT_PINNED) {
3251 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3252 group_sched_out(event, cpuctx, ctx);
3253 }
3254
3255 if (is_active & EVENT_FLEXIBLE) {
3256 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3257 group_sched_out(event, cpuctx, ctx);
3258
3259
3260
3261
3262
3263
3264 ctx->rotate_necessary = 0;
3265 }
3266 perf_pmu_enable(ctx->pmu);
3267}
3268
3269
3270
3271
3272
3273
3274
3275
3276
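/*
 * Test whether two contexts are equivalent, i.e. whether they have both been
 * cloned from the same version of the same context.
 */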
3277static int context_equiv(struct perf_event_context *ctx1,
3278 struct perf_event_context *ctx2)
3279{
3280 lockdep_assert_held(&ctx1->lock);
3281 lockdep_assert_held(&ctx2->lock);
3282
3283
3284 if (ctx1->pin_count || ctx2->pin_count)
3285 return 0;
3286
3287
3288 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3289 return 1;
3290
3291
3292 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3293 return 1;
3294
3295
3296
3297
3298
3299 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3300 ctx1->parent_gen == ctx2->parent_gen)
3301 return 1;
3302
3303
3304 return 0;
3305}
3306
3307static void __perf_event_sync_stat(struct perf_event *event,
3308 struct perf_event *next_event)
3309{
3310 u64 value;
3311
3312 if (!event->attr.inherit_stat)
3313 return;
3314
3315
3316
3317
3318
3319
3320
3321
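	/*
	 * Update the event value; we cannot use perf_event_read() here since
	 * we are in the middle of a context switch with IRQs disabled, but an
	 * ACTIVE event is known to be on the current CPU, so ->read() is safe.
	 */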
3322 if (event->state == PERF_EVENT_STATE_ACTIVE)
3323 event->pmu->read(event);
3324
3325 perf_event_update_time(event);
3326
3327
3328
3329
3330
3331 value = local64_read(&next_event->count);
3332 value = local64_xchg(&event->count, value);
3333 local64_set(&next_event->count, value);
3334
3335 swap(event->total_time_enabled, next_event->total_time_enabled);
3336 swap(event->total_time_running, next_event->total_time_running);
3337
3338
3339
3340
3341 perf_event_update_userpage(event);
3342 perf_event_update_userpage(next_event);
3343}
3344
3345static void perf_event_sync_stat(struct perf_event_context *ctx,
3346 struct perf_event_context *next_ctx)
3347{
3348 struct perf_event *event, *next_event;
3349
3350 if (!ctx->nr_stat)
3351 return;
3352
3353 update_context_time(ctx);
3354
3355 event = list_first_entry(&ctx->event_list,
3356 struct perf_event, event_entry);
3357
3358 next_event = list_first_entry(&next_ctx->event_list,
3359 struct perf_event, event_entry);
3360
3361 while (&event->event_entry != &ctx->event_list &&
3362 &next_event->event_entry != &next_ctx->event_list) {
3363
3364 __perf_event_sync_stat(event, next_event);
3365
3366 event = list_next_entry(event, event_entry);
3367 next_event = list_next_entry(next_event, event_entry);
3368 }
3369}
3370
3371static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3372 struct task_struct *next)
3373{
3374 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3375 struct perf_event_context *next_ctx;
3376 struct perf_event_context *parent, *next_parent;
3377 struct perf_cpu_context *cpuctx;
3378 int do_switch = 1;
3379 struct pmu *pmu;
3380
3381 if (likely(!ctx))
3382 return;
3383
3384 pmu = ctx->pmu;
3385 cpuctx = __get_cpu_context(ctx);
3386 if (!cpuctx->task_ctx)
3387 return;
3388
3389 rcu_read_lock();
3390 next_ctx = next->perf_event_ctxp[ctxn];
3391 if (!next_ctx)
3392 goto unlock;
3393
3394 parent = rcu_dereference(ctx->parent_ctx);
3395 next_parent = rcu_dereference(next_ctx->parent_ctx);
3396
3397
3398 if (!parent && !next_parent)
3399 goto unlock;
3400
3401 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3402
3403
3404
3405
3406
3407
3408
3409
3410
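		/*
		 * The contexts look like clones of each other; lock both and
		 * re-check under the locks before swapping them between the
		 * two tasks.
		 */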
3411 raw_spin_lock(&ctx->lock);
3412 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3413 if (context_equiv(ctx, next_ctx)) {
3414
3415 WRITE_ONCE(ctx->task, next);
3416 WRITE_ONCE(next_ctx->task, task);
3417
3418 perf_pmu_disable(pmu);
3419
3420 if (cpuctx->sched_cb_usage && pmu->sched_task)
3421 pmu->sched_task(ctx, false);
3422
3423
3424
3425
3426
3427
3428
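			/*
			 * Let the PMU swap its per-task context data if it
			 * needs to; otherwise just exchange the generic
			 * task_ctx_data pointers.
			 */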
3429 if (pmu->swap_task_ctx)
3430 pmu->swap_task_ctx(ctx, next_ctx);
3431 else
3432 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3433
3434 perf_pmu_enable(pmu);
3435
3436
3437
3438
3439
3440
3441
3442
3443 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3444 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3445
3446 do_switch = 0;
3447
3448 perf_event_sync_stat(ctx, next_ctx);
3449 }
3450 raw_spin_unlock(&next_ctx->lock);
3451 raw_spin_unlock(&ctx->lock);
3452 }
3453unlock:
3454 rcu_read_unlock();
3455
3456 if (do_switch) {
3457 raw_spin_lock(&ctx->lock);
3458 perf_pmu_disable(pmu);
3459
3460 if (cpuctx->sched_cb_usage && pmu->sched_task)
3461 pmu->sched_task(ctx, false);
3462 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3463
3464 perf_pmu_enable(pmu);
3465 raw_spin_unlock(&ctx->lock);
3466 }
3467}
3468
3469void perf_sched_cb_dec(struct pmu *pmu)
3470{
3471 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3472
3473 --cpuctx->sched_cb_usage;
3474}
3475
3476
3477void perf_sched_cb_inc(struct pmu *pmu)
3478{
3479 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3480
3481 cpuctx->sched_cb_usage++;
3482}
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3493{
3494 struct pmu *pmu;
3495
3496 pmu = cpuctx->ctx.pmu;
3497
3498 if (WARN_ON_ONCE(!pmu->sched_task))
3499 return;
3500
3501 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3502 perf_pmu_disable(pmu);
3503
3504 pmu->sched_task(cpuctx->task_ctx, sched_in);
3505
3506 perf_pmu_enable(pmu);
3507 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3508}
3509
3510static void perf_event_switch(struct task_struct *task,
3511 struct task_struct *next_prev, bool sched_in);
3512
3513#define for_each_task_context_nr(ctxn) \
3514 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
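/*
 * Called from the scheduler, with interrupts disabled, to remove the events
 * of the outgoing task (emitting context-switch records and switching cgroup
 * events where needed).
 */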
3527void __perf_event_task_sched_out(struct task_struct *task,
3528 struct task_struct *next)
3529{
3530 int ctxn;
3531
3532 if (atomic_read(&nr_switch_events))
3533 perf_event_switch(task, next, false);
3534
3535 for_each_task_context_nr(ctxn)
3536 perf_event_context_sched_out(task, ctxn, next);
3537
3538
3539
3540
3541
3542
3543 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3544 perf_cgroup_sched_out(task, next);
3545}
3546
3547
3548
3549
3550static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3551 enum event_type_t event_type)
3552{
3553 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3554}
3555
3556static bool perf_less_group_idx(const void *l, const void *r)
3557{
3558 const struct perf_event *le = *(const struct perf_event **)l;
3559 const struct perf_event *re = *(const struct perf_event **)r;
3560
3561 return le->group_index < re->group_index;
3562}
3563
3564static void swap_ptr(void *l, void *r)
3565{
3566 void **lp = l, **rp = r;
3567
3568 swap(*lp, *rp);
3569}
3570
3571static const struct min_heap_callbacks perf_min_heap = {
3572 .elem_size = sizeof(struct perf_event *),
3573 .less = perf_less_group_idx,
3574 .swp = swap_ptr,
3575};
3576
3577static void __heap_add(struct min_heap *heap, struct perf_event *event)
3578{
3579 struct perf_event **itrs = heap->data;
3580
3581 if (event) {
3582 itrs[heap->nr] = event;
3583 heap->nr++;
3584 }
3585}
3586
3587static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3588 struct perf_event_groups *groups, int cpu,
3589 int (*func)(struct perf_event *, void *),
3590 void *data)
3591{
3592#ifdef CONFIG_CGROUP_PERF
3593 struct cgroup_subsys_state *css = NULL;
3594#endif
3595
3596 struct perf_event *itrs[2];
3597 struct min_heap event_heap;
3598 struct perf_event **evt;
3599 int ret;
3600
3601 if (cpuctx) {
3602 event_heap = (struct min_heap){
3603 .data = cpuctx->heap,
3604 .nr = 0,
3605 .size = cpuctx->heap_size,
3606 };
3607
3608 lockdep_assert_held(&cpuctx->ctx.lock);
3609
3610#ifdef CONFIG_CGROUP_PERF
3611 if (cpuctx->cgrp)
3612 css = &cpuctx->cgrp->css;
3613#endif
3614 } else {
3615 event_heap = (struct min_heap){
3616 .data = itrs,
3617 .nr = 0,
3618 .size = ARRAY_SIZE(itrs),
3619 };
3620
3621 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3622 }
3623 evt = event_heap.data;
3624
3625 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3626
3627#ifdef CONFIG_CGROUP_PERF
3628 for (; css; css = css->parent)
3629 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3630#endif
3631
3632 min_heapify_all(&event_heap, &perf_min_heap);
3633
3634 while (event_heap.nr) {
3635 ret = func(*evt, data);
3636 if (ret)
3637 return ret;
3638
3639 *evt = perf_event_groups_next(*evt);
3640 if (*evt)
3641 min_heapify(&event_heap, 0, &perf_min_heap);
3642 else
3643 min_heap_pop(&event_heap, &perf_min_heap);
3644 }
3645
3646 return 0;
3647}
3648
3649static int merge_sched_in(struct perf_event *event, void *data)
3650{
3651 struct perf_event_context *ctx = event->ctx;
3652 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3653 int *can_add_hw = data;
3654
3655 if (event->state <= PERF_EVENT_STATE_OFF)
3656 return 0;
3657
3658 if (!event_filter_match(event))
3659 return 0;
3660
3661 if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3662 if (!group_sched_in(event, cpuctx, ctx))
3663 list_add_tail(&event->active_list, get_event_list(event));
3664 }
3665
3666 if (event->state == PERF_EVENT_STATE_INACTIVE) {
3667 if (event->attr.pinned) {
3668 perf_cgroup_event_disable(event, ctx);
3669 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3670 }
3671
3672 *can_add_hw = 0;
3673 ctx->rotate_necessary = 1;
3674 perf_mux_hrtimer_restart(cpuctx);
3675 }
3676
3677 return 0;
3678}
3679
3680static void
3681ctx_pinned_sched_in(struct perf_event_context *ctx,
3682 struct perf_cpu_context *cpuctx)
3683{
3684 int can_add_hw = 1;
3685
3686 if (ctx != &cpuctx->ctx)
3687 cpuctx = NULL;
3688
3689 visit_groups_merge(cpuctx, &ctx->pinned_groups,
3690 smp_processor_id(),
3691 merge_sched_in, &can_add_hw);
3692}
3693
3694static void
3695ctx_flexible_sched_in(struct perf_event_context *ctx,
3696 struct perf_cpu_context *cpuctx)
3697{
3698 int can_add_hw = 1;
3699
3700 if (ctx != &cpuctx->ctx)
3701 cpuctx = NULL;
3702
3703 visit_groups_merge(cpuctx, &ctx->flexible_groups,
3704 smp_processor_id(),
3705 merge_sched_in, &can_add_hw);
3706}
3707
3708static void
3709ctx_sched_in(struct perf_event_context *ctx,
3710 struct perf_cpu_context *cpuctx,
3711 enum event_type_t event_type,
3712 struct task_struct *task)
3713{
3714 int is_active = ctx->is_active;
3715 u64 now;
3716
3717 lockdep_assert_held(&ctx->lock);
3718
3719 if (likely(!ctx->nr_events))
3720 return;
3721
3722 ctx->is_active |= (event_type | EVENT_TIME);
3723 if (ctx->task) {
3724 if (!is_active)
3725 cpuctx->task_ctx = ctx;
3726 else
3727 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3728 }
3729
3730 is_active ^= ctx->is_active;
3731
3732 if (is_active & EVENT_TIME) {
3733
3734 now = perf_clock();
3735 ctx->timestamp = now;
3736 perf_cgroup_set_timestamp(task, ctx);
3737 }
3738
3739
3740
3741
3742
3743 if (is_active & EVENT_PINNED)
3744 ctx_pinned_sched_in(ctx, cpuctx);
3745
3746
3747 if (is_active & EVENT_FLEXIBLE)
3748 ctx_flexible_sched_in(ctx, cpuctx);
3749}
3750
3751static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3752 enum event_type_t event_type,
3753 struct task_struct *task)
3754{
3755 struct perf_event_context *ctx = &cpuctx->ctx;
3756
3757 ctx_sched_in(ctx, cpuctx, event_type, task);
3758}
3759
3760static void perf_event_context_sched_in(struct perf_event_context *ctx,
3761 struct task_struct *task)
3762{
3763 struct perf_cpu_context *cpuctx;
3764 struct pmu *pmu = ctx->pmu;
3765
3766 cpuctx = __get_cpu_context(ctx);
3767 if (cpuctx->task_ctx == ctx) {
3768 if (cpuctx->sched_cb_usage)
3769 __perf_pmu_sched_task(cpuctx, true);
3770 return;
3771 }
3772
3773 perf_ctx_lock(cpuctx, ctx);
3774
3775
3776
3777
3778 if (!ctx->nr_events)
3779 goto unlock;
3780
3781 perf_pmu_disable(pmu);
3782
3783
3784
3785
3786
3787
3788
3789
3790 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3791 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3792 perf_event_sched_in(cpuctx, ctx, task);
3793
3794 if (cpuctx->sched_cb_usage && pmu->sched_task)
3795 pmu->sched_task(cpuctx->task_ctx, true);
3796
3797 perf_pmu_enable(pmu);
3798
3799unlock:
3800 perf_ctx_unlock(cpuctx, ctx);
3801}
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814void __perf_event_task_sched_in(struct task_struct *prev,
3815 struct task_struct *task)
3816{
3817 struct perf_event_context *ctx;
3818 int ctxn;
3819
3820
3821
3822
3823
3824
3825
3826
3827 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3828 perf_cgroup_sched_in(prev, task);
3829
3830 for_each_task_context_nr(ctxn) {
3831 ctx = task->perf_event_ctxp[ctxn];
3832 if (likely(!ctx))
3833 continue;
3834
3835 perf_event_context_sched_in(ctx, task);
3836 }
3837
3838 if (atomic_read(&nr_switch_events))
3839 perf_event_switch(task, prev, true);
3840}
3841
3842static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3843{
3844 u64 frequency = event->attr.sample_freq;
3845 u64 sec = NSEC_PER_SEC;
3846 u64 divisor, dividend;
3847
3848 int count_fls, nsec_fls, frequency_fls, sec_fls;
3849
3850 count_fls = fls64(count);
3851 nsec_fls = fls64(nsec);
3852 frequency_fls = fls64(frequency);
3853 sec_fls = 30;
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869#define REDUCE_FLS(a, b) \
3870do { \
3871 if (a##_fls > b##_fls) { \
3872 a >>= 1; \
3873 a##_fls--; \
3874 } else { \
3875 b >>= 1; \
3876 b##_fls--; \
3877 } \
3878} while (0)
3879
3880
3881
3882
3883
3884 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3885 REDUCE_FLS(nsec, frequency);
3886 REDUCE_FLS(sec, count);
3887 }
3888
3889 if (count_fls + sec_fls > 64) {
3890 divisor = nsec * frequency;
3891
3892 while (count_fls + sec_fls > 64) {
3893 REDUCE_FLS(count, sec);
3894 divisor >>= 1;
3895 }
3896
3897 dividend = count * sec;
3898 } else {
3899 dividend = count * sec;
3900
3901 while (nsec_fls + frequency_fls > 64) {
3902 REDUCE_FLS(nsec, frequency);
3903 dividend >>= 1;
3904 }
3905
3906 divisor = nsec * frequency;
3907 }
3908
3909 if (!divisor)
3910 return dividend;
3911
3912 return div64_u64(dividend, divisor);
3913}
3914
3915static DEFINE_PER_CPU(int, perf_throttled_count);
3916static DEFINE_PER_CPU(u64, perf_throttled_seq);
3917
3918static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3919{
3920 struct hw_perf_event *hwc = &event->hw;
3921 s64 period, sample_period;
3922 s64 delta;
3923
3924 period = perf_calculate_period(event, nsec, count);
3925
3926 delta = (s64)(period - hwc->sample_period);
3927 delta = (delta + 7) / 8;
3928
3929 sample_period = hwc->sample_period + delta;
3930
3931 if (!sample_period)
3932 sample_period = 1;
3933
3934 hwc->sample_period = sample_period;
3935
3936 if (local64_read(&hwc->period_left) > 8*sample_period) {
3937 if (disable)
3938 event->pmu->stop(event, PERF_EF_UPDATE);
3939
3940 local64_set(&hwc->period_left, 0);
3941
3942 if (disable)
3943 event->pmu->start(event, PERF_EF_RELOAD);
3944 }
3945}
3946
3947
3948
3949
3950
3951
3952static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3953 int needs_unthr)
3954{
3955 struct perf_event *event;
3956 struct hw_perf_event *hwc;
3957 u64 now, period = TICK_NSEC;
3958 s64 delta;
3959
3960
3961
3962
3963
3964
3965 if (!(ctx->nr_freq || needs_unthr))
3966 return;
3967
3968 raw_spin_lock(&ctx->lock);
3969 perf_pmu_disable(ctx->pmu);
3970
3971 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3972 if (event->state != PERF_EVENT_STATE_ACTIVE)
3973 continue;
3974
3975 if (!event_filter_match(event))
3976 continue;
3977
3978 perf_pmu_disable(event->pmu);
3979
3980 hwc = &event->hw;
3981
3982 if (hwc->interrupts == MAX_INTERRUPTS) {
3983 hwc->interrupts = 0;
3984 perf_log_throttle(event, 1);
3985 event->pmu->start(event, 0);
3986 }
3987
3988 if (!event->attr.freq || !event->attr.sample_freq)
3989 goto next;
3990
3991
3992
3993
3994 event->pmu->stop(event, PERF_EF_UPDATE);
3995
3996 now = local64_read(&event->count);
3997 delta = now - hwc->freq_count_stamp;
3998 hwc->freq_count_stamp = now;
3999
4000
4001
4002
4003
4004
4005
4006
4007 if (delta > 0)
4008 perf_adjust_period(event, period, delta, false);
4009
4010 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4011 next:
4012 perf_pmu_enable(event->pmu);
4013 }
4014
4015 perf_pmu_enable(ctx->pmu);
4016 raw_spin_unlock(&ctx->lock);
4017}
4018
4019
4020
4021
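/*
 * Rotate @event to the end of the @ctx's flexible groups by deleting and
 * re-inserting it.
 */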
4022static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4023{
4024
4025
4026
4027
4028 if (ctx->rotate_disable)
4029 return;
4030
4031 perf_event_groups_delete(&ctx->flexible_groups, event);
4032 perf_event_groups_insert(&ctx->flexible_groups, event);
4033}
4034
4035
4036static inline struct perf_event *
4037ctx_event_to_rotate(struct perf_event_context *ctx)
4038{
4039 struct perf_event *event;
4040
4041
4042 event = list_first_entry_or_null(&ctx->flexible_active,
4043 struct perf_event, active_list);
4044
4045
4046 if (!event) {
4047 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4048 typeof(*event), group_node);
4049 }
4050
4051
4052
4053
4054
4055 ctx->rotate_necessary = 0;
4056
4057 return event;
4058}
4059
4060static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
4061{
4062 struct perf_event *cpu_event = NULL, *task_event = NULL;
4063 struct perf_event_context *task_ctx = NULL;
4064 int cpu_rotate, task_rotate;
4065
4066
4067
4068
4069
4070
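	/*
	 * Since this runs from interrupt context, nobody can install new
	 * events under us; the rotate_necessary values read here are stable.
	 */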
4071 cpu_rotate = cpuctx->ctx.rotate_necessary;
4072 task_ctx = cpuctx->task_ctx;
4073 task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
4074
4075 if (!(cpu_rotate || task_rotate))
4076 return false;
4077
4078 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4079 perf_pmu_disable(cpuctx->ctx.pmu);
4080
4081 if (task_rotate)
4082 task_event = ctx_event_to_rotate(task_ctx);
4083 if (cpu_rotate)
4084 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
4085
4086
4087
4088
4089
4090 if (task_event || (task_ctx && cpu_event))
4091 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
4092 if (cpu_event)
4093 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
4094
4095 if (task_event)
4096 rotate_ctx(task_ctx, task_event);
4097 if (cpu_event)
4098 rotate_ctx(&cpuctx->ctx, cpu_event);
4099
4100 perf_event_sched_in(cpuctx, task_ctx, current);
4101
4102 perf_pmu_enable(cpuctx->ctx.pmu);
4103 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4104
4105 return true;
4106}
4107
4108void perf_event_task_tick(void)
4109{
4110 struct list_head *head = this_cpu_ptr(&active_ctx_list);
4111 struct perf_event_context *ctx, *tmp;
4112 int throttled;
4113
4114 lockdep_assert_irqs_disabled();
4115
4116 __this_cpu_inc(perf_throttled_seq);
4117 throttled = __this_cpu_xchg(perf_throttled_count, 0);
4118 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4119
4120 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
4121 perf_adjust_freq_unthr_context(ctx, throttled);
4122}
4123
4124static int event_enable_on_exec(struct perf_event *event,
4125 struct perf_event_context *ctx)
4126{
4127 if (!event->attr.enable_on_exec)
4128 return 0;
4129
4130 event->attr.enable_on_exec = 0;
4131 if (event->state >= PERF_EVENT_STATE_INACTIVE)
4132 return 0;
4133
4134 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4135
4136 return 1;
4137}
4138
4139
4140
4141
4142
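/*
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 */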
4143static void perf_event_enable_on_exec(int ctxn)
4144{
4145 struct perf_event_context *ctx, *clone_ctx = NULL;
4146 enum event_type_t event_type = 0;
4147 struct perf_cpu_context *cpuctx;
4148 struct perf_event *event;
4149 unsigned long flags;
4150 int enabled = 0;
4151
4152 local_irq_save(flags);
4153 ctx = current->perf_event_ctxp[ctxn];
4154 if (!ctx || !ctx->nr_events)
4155 goto out;
4156
4157 cpuctx = __get_cpu_context(ctx);
4158 perf_ctx_lock(cpuctx, ctx);
4159 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
4160 list_for_each_entry(event, &ctx->event_list, event_entry) {
4161 enabled |= event_enable_on_exec(event, ctx);
4162 event_type |= get_event_type(event);
4163 }
4164
4165
4166
4167
4168 if (enabled) {
4169 clone_ctx = unclone_ctx(ctx);
4170 ctx_resched(cpuctx, ctx, event_type);
4171 } else {
4172 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
4173 }
4174 perf_ctx_unlock(cpuctx, ctx);
4175
4176out:
4177 local_irq_restore(flags);
4178
4179 if (clone_ctx)
4180 put_ctx(clone_ctx);
4181}
4182
4183struct perf_read_data {
4184 struct perf_event *event;
4185 bool group;
4186 int ret;
4187};
4188
4189static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4190{
4191 u16 local_pkg, event_pkg;
4192
4193 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4194 int local_cpu = smp_processor_id();
4195
4196 event_pkg = topology_physical_package_id(event_cpu);
4197 local_pkg = topology_physical_package_id(local_cpu);
4198
4199 if (event_pkg == local_pkg)
4200 return local_cpu;
4201 }
4202
4203 return event_cpu;
4204}
4205
4206
4207
4208
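/*
 * Cross CPU call to read the hardware event.
 */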
4209static void __perf_event_read(void *info)
4210{
4211 struct perf_read_data *data = info;
4212 struct perf_event *sub, *event = data->event;
4213 struct perf_event_context *ctx = event->ctx;
4214 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4215 struct pmu *pmu = event->pmu;
4216
4217
4218
4219
4220
4221
4222
4223
4224 if (ctx->task && cpuctx->task_ctx != ctx)
4225 return;
4226
4227 raw_spin_lock(&ctx->lock);
4228 if (ctx->is_active & EVENT_TIME) {
4229 update_context_time(ctx);
4230 update_cgrp_time_from_event(event);
4231 }
4232
4233 perf_event_update_time(event);
4234 if (data->group)
4235 perf_event_update_sibling_time(event);
4236
4237 if (event->state != PERF_EVENT_STATE_ACTIVE)
4238 goto unlock;
4239
4240 if (!data->group) {
4241 pmu->read(event);
4242 data->ret = 0;
4243 goto unlock;
4244 }
4245
4246 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4247
4248 pmu->read(event);
4249
4250 for_each_sibling_event(sub, event) {
4251 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4252
4253
4254
4255
4256 sub->pmu->read(sub);
4257 }
4258 }
4259
4260 data->ret = pmu->commit_txn(pmu);
4261
4262unlock:
4263 raw_spin_unlock(&ctx->lock);
4264}
4265
4266static inline u64 perf_event_count(struct perf_event *event)
4267{
4268 return local64_read(&event->count) + atomic64_read(&event->child_count);
4269}
4270
4271
4272
4273
4274
4275
4276
4277
4278
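/*
 * NMI-safe method to read a local event, that is an event that is:
 *  - either for the current task, or for this CPU
 *  - does not have inherit set, since inherited task events will not be
 *    local and cannot be read atomically
 * Reading is also refused for pinned events not currently on this CPU.
 */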
4279int perf_event_read_local(struct perf_event *event, u64 *value,
4280 u64 *enabled, u64 *running)
4281{
4282 unsigned long flags;
4283 int ret = 0;
4284
4285
4286
4287
4288
4289 local_irq_save(flags);
4290
4291
4292
4293
4294
4295 if (event->attr.inherit) {
4296 ret = -EOPNOTSUPP;
4297 goto out;
4298 }
4299
4300
4301 if ((event->attach_state & PERF_ATTACH_TASK) &&
4302 event->hw.target != current) {
4303 ret = -EINVAL;
4304 goto out;
4305 }
4306
4307
4308 if (!(event->attach_state & PERF_ATTACH_TASK) &&
4309 event->cpu != smp_processor_id()) {
4310 ret = -EINVAL;
4311 goto out;
4312 }
4313
4314
4315 if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4316 ret = -EBUSY;
4317 goto out;
4318 }
4319
4320
4321
4322
4323
4324
4325 if (event->oncpu == smp_processor_id())
4326 event->pmu->read(event);
4327
4328 *value = local64_read(&event->count);
4329 if (enabled || running) {
4330 u64 now = event->shadow_ctx_time + perf_clock();
4331 u64 __enabled, __running;
4332
4333 __perf_update_times(event, now, &__enabled, &__running);
4334 if (enabled)
4335 *enabled = __enabled;
4336 if (running)
4337 *running = __running;
4338 }
4339out:
4340 local_irq_restore(flags);
4341
4342 return ret;
4343}
4344
4345static int perf_event_read(struct perf_event *event, bool group)
4346{
4347 enum perf_event_state state = READ_ONCE(event->state);
4348 int event_cpu, ret = 0;
4349
4350
4351
4352
4353
4354again:
4355 if (state == PERF_EVENT_STATE_ACTIVE) {
4356 struct perf_read_data data;
4357
4358
4359
4360
4361
4362
4363
4364 smp_rmb();
4365
4366 event_cpu = READ_ONCE(event->oncpu);
4367 if ((unsigned)event_cpu >= nr_cpu_ids)
4368 return 0;
4369
4370 data = (struct perf_read_data){
4371 .event = event,
4372 .group = group,
4373 .ret = 0,
4374 };
4375
4376 preempt_disable();
4377 event_cpu = __perf_event_read_cpu(event, event_cpu);
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4390 preempt_enable();
4391 ret = data.ret;
4392
4393 } else if (state == PERF_EVENT_STATE_INACTIVE) {
4394 struct perf_event_context *ctx = event->ctx;
4395 unsigned long flags;
4396
4397 raw_spin_lock_irqsave(&ctx->lock, flags);
4398 state = event->state;
4399 if (state != PERF_EVENT_STATE_INACTIVE) {
4400 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4401 goto again;
4402 }
4403
4404
4405
4406
4407
4408 if (ctx->is_active & EVENT_TIME) {
4409 update_context_time(ctx);
4410 update_cgrp_time_from_event(event);
4411 }
4412
4413 perf_event_update_time(event);
4414 if (group)
4415 perf_event_update_sibling_time(event);
4416 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4417 }
4418
4419 return ret;
4420}
4421
4422
4423
4424
4425static void __perf_event_init_context(struct perf_event_context *ctx)
4426{
4427 raw_spin_lock_init(&ctx->lock);
4428 mutex_init(&ctx->mutex);
4429 INIT_LIST_HEAD(&ctx->active_ctx_list);
4430 perf_event_groups_init(&ctx->pinned_groups);
4431 perf_event_groups_init(&ctx->flexible_groups);
4432 INIT_LIST_HEAD(&ctx->event_list);
4433 INIT_LIST_HEAD(&ctx->pinned_active);
4434 INIT_LIST_HEAD(&ctx->flexible_active);
4435 refcount_set(&ctx->refcount, 1);
4436}
4437
4438static struct perf_event_context *
4439alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4440{
4441 struct perf_event_context *ctx;
4442
4443 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4444 if (!ctx)
4445 return NULL;
4446
4447 __perf_event_init_context(ctx);
4448 if (task)
4449 ctx->task = get_task_struct(task);
4450 ctx->pmu = pmu;
4451
4452 return ctx;
4453}
4454
4455static struct task_struct *
4456find_lively_task_by_vpid(pid_t vpid)
4457{
4458 struct task_struct *task;
4459
4460 rcu_read_lock();
4461 if (!vpid)
4462 task = current;
4463 else
4464 task = find_task_by_vpid(vpid);
4465 if (task)
4466 get_task_struct(task);
4467 rcu_read_unlock();
4468
4469 if (!task)
4470 return ERR_PTR(-ESRCH);
4471
4472 return task;
4473}
4474
4475
4476
4477
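/*
 * Get the target context (task or per-CPU) for an event, with both its
 * refcount and pin_count raised.
 */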
4478static struct perf_event_context *
4479find_get_context(struct pmu *pmu, struct task_struct *task,
4480 struct perf_event *event)
4481{
4482 struct perf_event_context *ctx, *clone_ctx = NULL;
4483 struct perf_cpu_context *cpuctx;
4484 void *task_ctx_data = NULL;
4485 unsigned long flags;
4486 int ctxn, err;
4487 int cpu = event->cpu;
4488
4489 if (!task) {
4490
4491 err = perf_allow_cpu(&event->attr);
4492 if (err)
4493 return ERR_PTR(err);
4494
4495 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4496 ctx = &cpuctx->ctx;
4497 get_ctx(ctx);
4498 ++ctx->pin_count;
4499
4500 return ctx;
4501 }
4502
4503 err = -EINVAL;
4504 ctxn = pmu->task_ctx_nr;
4505 if (ctxn < 0)
4506 goto errout;
4507
4508 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4509 task_ctx_data = alloc_task_ctx_data(pmu);
4510 if (!task_ctx_data) {
4511 err = -ENOMEM;
4512 goto errout;
4513 }
4514 }
4515
4516retry:
4517 ctx = perf_lock_task_context(task, ctxn, &flags);
4518 if (ctx) {
4519 clone_ctx = unclone_ctx(ctx);
4520 ++ctx->pin_count;
4521
4522 if (task_ctx_data && !ctx->task_ctx_data) {
4523 ctx->task_ctx_data = task_ctx_data;
4524 task_ctx_data = NULL;
4525 }
4526 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4527
4528 if (clone_ctx)
4529 put_ctx(clone_ctx);
4530 } else {
4531 ctx = alloc_perf_context(pmu, task);
4532 err = -ENOMEM;
4533 if (!ctx)
4534 goto errout;
4535
4536 if (task_ctx_data) {
4537 ctx->task_ctx_data = task_ctx_data;
4538 task_ctx_data = NULL;
4539 }
4540
4541 err = 0;
4542 mutex_lock(&task->perf_event_mutex);
4543
4544
4545
4546
4547 if (task->flags & PF_EXITING)
4548 err = -ESRCH;
4549 else if (task->perf_event_ctxp[ctxn])
4550 err = -EAGAIN;
4551 else {
4552 get_ctx(ctx);
4553 ++ctx->pin_count;
4554 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4555 }
4556 mutex_unlock(&task->perf_event_mutex);
4557
4558 if (unlikely(err)) {
4559 put_ctx(ctx);
4560
4561 if (err == -EAGAIN)
4562 goto retry;
4563 goto errout;
4564 }
4565 }
4566
4567 free_task_ctx_data(pmu, task_ctx_data);
4568 return ctx;
4569
4570errout:
4571 free_task_ctx_data(pmu, task_ctx_data);
4572 return ERR_PTR(err);
4573}
4574
4575static void perf_event_free_filter(struct perf_event *event);
4576static void perf_event_free_bpf_prog(struct perf_event *event);
4577
4578static void free_event_rcu(struct rcu_head *head)
4579{
4580 struct perf_event *event;
4581
4582 event = container_of(head, struct perf_event, rcu_head);
4583 if (event->ns)
4584 put_pid_ns(event->ns);
4585 perf_event_free_filter(event);
4586 kfree(event);
4587}
4588
4589static void ring_buffer_attach(struct perf_event *event,
4590 struct perf_buffer *rb);
4591
4592static void detach_sb_event(struct perf_event *event)
4593{
4594 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4595
4596 raw_spin_lock(&pel->lock);
4597 list_del_rcu(&event->sb_list);
4598 raw_spin_unlock(&pel->lock);
4599}
4600
4601static bool is_sb_event(struct perf_event *event)
4602{
4603 struct perf_event_attr *attr = &event->attr;
4604
4605 if (event->parent)
4606 return false;
4607
4608 if (event->attach_state & PERF_ATTACH_TASK)
4609 return false;
4610
4611 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4612 attr->comm || attr->comm_exec ||
4613 attr->task || attr->ksymbol ||
4614 attr->context_switch || attr->text_poke ||
4615 attr->bpf_event)
4616 return true;
4617 return false;
4618}
4619
4620static void unaccount_pmu_sb_event(struct perf_event *event)
4621{
4622 if (is_sb_event(event))
4623 detach_sb_event(event);
4624}
4625
4626static void unaccount_event_cpu(struct perf_event *event, int cpu)
4627{
4628 if (event->parent)
4629 return;
4630
4631 if (is_cgroup_event(event))
4632 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4633}
4634
4635#ifdef CONFIG_NO_HZ_FULL
4636static DEFINE_SPINLOCK(nr_freq_lock);
4637#endif
4638
4639static void unaccount_freq_event_nohz(void)
4640{
4641#ifdef CONFIG_NO_HZ_FULL
4642 spin_lock(&nr_freq_lock);
4643 if (atomic_dec_and_test(&nr_freq_events))
4644 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4645 spin_unlock(&nr_freq_lock);
4646#endif
4647}
4648
4649static void unaccount_freq_event(void)
4650{
4651 if (tick_nohz_full_enabled())
4652 unaccount_freq_event_nohz();
4653 else
4654 atomic_dec(&nr_freq_events);
4655}
4656
4657static void unaccount_event(struct perf_event *event)
4658{
4659 bool dec = false;
4660
4661 if (event->parent)
4662 return;
4663
4664 if (event->attach_state & PERF_ATTACH_TASK)
4665 dec = true;
4666 if (event->attr.mmap || event->attr.mmap_data)
4667 atomic_dec(&nr_mmap_events);
4668 if (event->attr.comm)
4669 atomic_dec(&nr_comm_events);
4670 if (event->attr.namespaces)
4671 atomic_dec(&nr_namespaces_events);
4672 if (event->attr.cgroup)
4673 atomic_dec(&nr_cgroup_events);
4674 if (event->attr.task)
4675 atomic_dec(&nr_task_events);
4676 if (event->attr.freq)
4677 unaccount_freq_event();
4678 if (event->attr.context_switch) {
4679 dec = true;
4680 atomic_dec(&nr_switch_events);
4681 }
4682 if (is_cgroup_event(event))
4683 dec = true;
4684 if (has_branch_stack(event))
4685 dec = true;
4686 if (event->attr.ksymbol)
4687 atomic_dec(&nr_ksymbol_events);
4688 if (event->attr.bpf_event)
4689 atomic_dec(&nr_bpf_events);
4690 if (event->attr.text_poke)
4691 atomic_dec(&nr_text_poke_events);
4692
4693 if (dec) {
4694 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4695 schedule_delayed_work(&perf_sched_work, HZ);
4696 }
4697
4698 unaccount_event_cpu(event, event->cpu);
4699
4700 unaccount_pmu_sb_event(event);
4701}
4702
4703static void perf_sched_delayed(struct work_struct *work)
4704{
4705 mutex_lock(&perf_sched_mutex);
4706 if (atomic_dec_and_test(&perf_sched_count))
4707 static_branch_disable(&perf_sched_events);
4708 mutex_unlock(&perf_sched_mutex);
4709}
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
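/*
 * Mutual exclusion for events on "exclusive" PMUs: such PMUs allow only one
 * kind of usage at a time, so per-task and CPU-wide events may not coexist.
 * The exclusive_cnt counter goes positive for per-task events and negative
 * for CPU-wide ones; a request in the opposite direction fails with -EBUSY.
 */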
4723static int exclusive_event_init(struct perf_event *event)
4724{
4725 struct pmu *pmu = event->pmu;
4726
4727 if (!is_exclusive_pmu(pmu))
4728 return 0;
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743 if (event->attach_state & PERF_ATTACH_TASK) {
4744 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4745 return -EBUSY;
4746 } else {
4747 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4748 return -EBUSY;
4749 }
4750
4751 return 0;
4752}
4753
4754static void exclusive_event_destroy(struct perf_event *event)
4755{
4756 struct pmu *pmu = event->pmu;
4757
4758 if (!is_exclusive_pmu(pmu))
4759 return;
4760
4761
4762 if (event->attach_state & PERF_ATTACH_TASK)
4763 atomic_dec(&pmu->exclusive_cnt);
4764 else
4765 atomic_inc(&pmu->exclusive_cnt);
4766}
4767
4768static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4769{
4770 if ((e1->pmu == e2->pmu) &&
4771 (e1->cpu == e2->cpu ||
4772 e1->cpu == -1 ||
4773 e2->cpu == -1))
4774 return true;
4775 return false;
4776}
4777
4778static bool exclusive_event_installable(struct perf_event *event,
4779 struct perf_event_context *ctx)
4780{
4781 struct perf_event *iter_event;
4782 struct pmu *pmu = event->pmu;
4783
4784 lockdep_assert_held(&ctx->mutex);
4785
4786 if (!is_exclusive_pmu(pmu))
4787 return true;
4788
4789 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4790 if (exclusive_event_match(iter_event, event))
4791 return false;
4792 }
4793
4794 return true;
4795}
4796
4797static void perf_addr_filters_splice(struct perf_event *event,
4798 struct list_head *head);
4799
4800static void _free_event(struct perf_event *event)
4801{
4802 irq_work_sync(&event->pending);
4803
4804 unaccount_event(event);
4805
4806 security_perf_event_free(event);
4807
4808 if (event->rb) {
4809
4810
4811
4812
4813
4814
4815 mutex_lock(&event->mmap_mutex);
4816 ring_buffer_attach(event, NULL);
4817 mutex_unlock(&event->mmap_mutex);
4818 }
4819
4820 if (is_cgroup_event(event))
4821 perf_detach_cgroup(event);
4822
4823 if (!event->parent) {
4824 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4825 put_callchain_buffers();
4826 }
4827
4828 perf_event_free_bpf_prog(event);
4829 perf_addr_filters_splice(event, NULL);
4830 kfree(event->addr_filter_ranges);
4831
4832 if (event->destroy)
4833 event->destroy(event);
4834
4835
4836
4837
4838
4839 if (event->hw.target)
4840 put_task_struct(event->hw.target);
4841
4842
4843
4844
4845
4846 if (event->ctx)
4847 put_ctx(event->ctx);
4848
4849 exclusive_event_destroy(event);
4850 module_put(event->pmu->module);
4851
4852 call_rcu(&event->rcu_head, free_event_rcu);
4853}
4854
4855
4856
4857
4858
4859static void free_event(struct perf_event *event)
4860{
4861 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4862 "unexpected event refcount: %ld; ptr=%p\n",
4863 atomic_long_read(&event->refcount), event)) {
4864
4865 return;
4866 }
4867
4868 _free_event(event);
4869}
4870
4871
4872
4873
4874static void perf_remove_from_owner(struct perf_event *event)
4875{
4876 struct task_struct *owner;
4877
4878 rcu_read_lock();
4879
4880
4881
4882
4883
4884
4885 owner = READ_ONCE(event->owner);
4886 if (owner) {
4887
4888
4889
4890
4891
4892 get_task_struct(owner);
4893 }
4894 rcu_read_unlock();
4895
4896 if (owner) {
4897
4898
4899
4900
4901
4902
4903
4904
4905 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4906
4907
4908
4909
4910
4911
4912
4913 if (event->owner) {
4914 list_del_init(&event->owner_entry);
4915 smp_store_release(&event->owner, NULL);
4916 }
4917 mutex_unlock(&owner->perf_event_mutex);
4918 put_task_struct(owner);
4919 }
4920}
4921
4922static void put_event(struct perf_event *event)
4923{
4924 if (!atomic_long_dec_and_test(&event->refcount))
4925 return;
4926
4927 _free_event(event);
4928}
4929
4930
4931
4932
4933
4934
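/*
 * Kill an event dead; while event::refcount will preserve the event object,
 * it will not preserve its functionality. The event is freed once the last
 * reference is dropped.
 */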
4935int perf_event_release_kernel(struct perf_event *event)
4936{
4937 struct perf_event_context *ctx = event->ctx;
4938 struct perf_event *child, *tmp;
4939 LIST_HEAD(free_list);
4940
4941
4942
4943
4944
4945 if (!ctx) {
4946 WARN_ON_ONCE(event->attach_state &
4947 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4948 goto no_ctx;
4949 }
4950
4951 if (!is_kernel_event(event))
4952 perf_remove_from_owner(event);
4953
4954 ctx = perf_event_ctx_lock(event);
4955 WARN_ON_ONCE(ctx->parent_ctx);
4956 perf_remove_from_context(event, DETACH_GROUP);
4957
4958 raw_spin_lock_irq(&ctx->lock);
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970 event->state = PERF_EVENT_STATE_DEAD;
4971 raw_spin_unlock_irq(&ctx->lock);
4972
4973 perf_event_ctx_unlock(event, ctx);
4974
4975again:
4976 mutex_lock(&event->child_mutex);
4977 list_for_each_entry(child, &event->child_list, child_list) {
4978
4979
4980
4981
4982
4983 ctx = READ_ONCE(child->ctx);
4984
4985
4986
4987
4988
4989
4990
4991
4992 get_ctx(ctx);
4993
4994
4995
4996
4997
4998
4999 mutex_unlock(&event->child_mutex);
5000 mutex_lock(&ctx->mutex);
5001 mutex_lock(&event->child_mutex);
5002
5003
5004
5005
5006
5007
5008 tmp = list_first_entry_or_null(&event->child_list,
5009 struct perf_event, child_list);
5010 if (tmp == child) {
5011 perf_remove_from_context(child, DETACH_GROUP);
5012 list_move(&child->child_list, &free_list);
5013
5014
5015
5016
5017 put_event(event);
5018 }
5019
5020 mutex_unlock(&event->child_mutex);
5021 mutex_unlock(&ctx->mutex);
5022 put_ctx(ctx);
5023 goto again;
5024 }
5025 mutex_unlock(&event->child_mutex);
5026
5027 list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5028 void *var = &child->ctx->refcount;
5029
5030 list_del(&child->child_list);
5031 free_event(child);
5032
5033
5034
5035
5036
5037 smp_mb();
5038 wake_up_var(var);
5039 }
5040
5041no_ctx:
5042 put_event(event);
5043 return 0;
5044}
5045EXPORT_SYMBOL_GPL(perf_event_release_kernel);
5046
5047
5048
5049
5050static int perf_release(struct inode *inode, struct file *file)
5051{
5052 perf_event_release_kernel(file->private_data);
5053 return 0;
5054}
5055
5056static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5057{
5058 struct perf_event *child;
5059 u64 total = 0;
5060
5061 *enabled = 0;
5062 *running = 0;
5063
5064 mutex_lock(&event->child_mutex);
5065
5066 (void)perf_event_read(event, false);
5067 total += perf_event_count(event);
5068
5069 *enabled += event->total_time_enabled +
5070 atomic64_read(&event->child_total_time_enabled);
5071 *running += event->total_time_running +
5072 atomic64_read(&event->child_total_time_running);
5073
5074 list_for_each_entry(child, &event->child_list, child_list) {
5075 (void)perf_event_read(child, false);
5076 total += perf_event_count(child);
5077 *enabled += child->total_time_enabled;
5078 *running += child->total_time_running;
5079 }
5080 mutex_unlock(&event->child_mutex);
5081
5082 return total;
5083}
5084
5085u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5086{
5087 struct perf_event_context *ctx;
5088 u64 count;
5089
5090 ctx = perf_event_ctx_lock(event);
5091 count = __perf_event_read_value(event, enabled, running);
5092 perf_event_ctx_unlock(event, ctx);
5093
5094 return count;
5095}
5096EXPORT_SYMBOL_GPL(perf_event_read_value);
5097
5098static int __perf_read_group_add(struct perf_event *leader,
5099 u64 read_format, u64 *values)
5100{
5101 struct perf_event_context *ctx = leader->ctx;
5102 struct perf_event *sub;
5103 unsigned long flags;
5104 int n = 1;
5105 int ret;
5106
5107 ret = perf_event_read(leader, true);
5108 if (ret)
5109 return ret;
5110
5111 raw_spin_lock_irqsave(&ctx->lock, flags);
5112
5113
5114
5115
5116
5117
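	/*
	 * Since the group is co-scheduled, the {enabled,running} times of the
	 * siblings are identical to those of the leader, so only the leader's
	 * times are accumulated here.
	 */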
5118 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5119 values[n++] += leader->total_time_enabled +
5120 atomic64_read(&leader->child_total_time_enabled);
5121 }
5122
5123 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5124 values[n++] += leader->total_time_running +
5125 atomic64_read(&leader->child_total_time_running);
5126 }
5127
5128
5129
5130
5131 values[n++] += perf_event_count(leader);
5132 if (read_format & PERF_FORMAT_ID)
5133 values[n++] = primary_event_id(leader);
5134
5135 for_each_sibling_event(sub, leader) {
5136 values[n++] += perf_event_count(sub);
5137 if (read_format & PERF_FORMAT_ID)
5138 values[n++] = primary_event_id(sub);
5139 }
5140
5141 raw_spin_unlock_irqrestore(&ctx->lock, flags);
5142 return 0;
5143}
5144
5145static int perf_read_group(struct perf_event *event,
5146 u64 read_format, char __user *buf)
5147{
5148 struct perf_event *leader = event->group_leader, *child;
5149 struct perf_event_context *ctx = leader->ctx;
5150 int ret;
5151 u64 *values;
5152
5153 lockdep_assert_held(&ctx->mutex);
5154
5155 values = kzalloc(event->read_size, GFP_KERNEL);
5156 if (!values)
5157 return -ENOMEM;
5158
5159 values[0] = 1 + leader->nr_siblings;
5160
5161
5162
5163
5164
5165 mutex_lock(&leader->child_mutex);
5166
5167 ret = __perf_read_group_add(leader, read_format, values);
5168 if (ret)
5169 goto unlock;
5170
5171 list_for_each_entry(child, &leader->child_list, child_list) {
5172 ret = __perf_read_group_add(child, read_format, values);
5173 if (ret)
5174 goto unlock;
5175 }
5176
5177 mutex_unlock(&leader->child_mutex);
5178
5179 ret = event->read_size;
5180 if (copy_to_user(buf, values, event->read_size))
5181 ret = -EFAULT;
5182 goto out;
5183
5184unlock:
5185 mutex_unlock(&leader->child_mutex);
5186out:
5187 kfree(values);
5188 return ret;
5189}
5190
5191static int perf_read_one(struct perf_event *event,
5192 u64 read_format, char __user *buf)
5193{
5194 u64 enabled, running;
5195 u64 values[4];
5196 int n = 0;
5197
5198 values[n++] = __perf_event_read_value(event, &enabled, &running);
5199 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5200 values[n++] = enabled;
5201 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5202 values[n++] = running;
5203 if (read_format & PERF_FORMAT_ID)
5204 values[n++] = primary_event_id(event);
5205
5206 if (copy_to_user(buf, values, n * sizeof(u64)))
5207 return -EFAULT;
5208
5209 return n * sizeof(u64);
5210}
5211
5212static bool is_event_hup(struct perf_event *event)
5213{
5214 bool no_children;
5215
5216 if (event->state > PERF_EVENT_STATE_EXIT)
5217 return false;
5218
5219 mutex_lock(&event->child_mutex);
5220 no_children = list_empty(&event->child_list);
5221 mutex_unlock(&event->child_mutex);
5222 return no_children;
5223}
5224
5225
5226
5227
5228static ssize_t
5229__perf_read(struct perf_event *event, char __user *buf, size_t count)
5230{
5231 u64 read_format = event->attr.read_format;
5232 int ret;
5233
5234
5235
5236
5237
5238
5239 if (event->state == PERF_EVENT_STATE_ERROR)
5240 return 0;
5241
5242 if (count < event->read_size)
5243 return -ENOSPC;
5244
5245 WARN_ON_ONCE(event->ctx->parent_ctx);
5246 if (read_format & PERF_FORMAT_GROUP)
5247 ret = perf_read_group(event, read_format, buf);
5248 else
5249 ret = perf_read_one(event, read_format, buf);
5250
5251 return ret;
5252}
5253
5254static ssize_t
5255perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5256{
5257 struct perf_event *event = file->private_data;
5258 struct perf_event_context *ctx;
5259 int ret;
5260
5261 ret = security_perf_event_read(event);
5262 if (ret)
5263 return ret;
5264
5265 ctx = perf_event_ctx_lock(event);
5266 ret = __perf_read(event, buf, count);
5267 perf_event_ctx_unlock(event, ctx);
5268
5269 return ret;
5270}
5271
5272static __poll_t perf_poll(struct file *file, poll_table *wait)
5273{
5274 struct perf_event *event = file->private_data;
5275 struct perf_buffer *rb;
5276 __poll_t events = EPOLLHUP;
5277
5278 poll_wait(file, &event->waitq, wait);
5279
5280 if (is_event_hup(event))
5281 return events;
5282
5283
5284
5285
5286
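	/*
	 * Pin the event->rb by taking event->mmap_mutex; otherwise
	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
	 */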
5287 mutex_lock(&event->mmap_mutex);
5288 rb = event->rb;
5289 if (rb)
5290 events = atomic_xchg(&rb->poll, 0);
5291 mutex_unlock(&event->mmap_mutex);
5292 return events;
5293}
5294
5295static void _perf_event_reset(struct perf_event *event)
5296{
5297 (void)perf_event_read(event, false);
5298 local64_set(&event->count, 0);
5299 perf_event_update_userpage(event);
5300}
5301
5302
5303u64 perf_event_pause(struct perf_event *event, bool reset)
5304{
5305 struct perf_event_context *ctx;
5306 u64 count;
5307
5308 ctx = perf_event_ctx_lock(event);
5309 WARN_ON_ONCE(event->attr.inherit);
5310 _perf_event_disable(event);
5311 count = local64_read(&event->count);
5312 if (reset)
5313 local64_set(&event->count, 0);
5314 perf_event_ctx_unlock(event, ctx);
5315
5316 return count;
5317}
5318EXPORT_SYMBOL_GPL(perf_event_pause);
5319
5320
5321
5322
5323
5324
5325
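/*
 * Iterate an event and all of its children (from inheritance) while holding
 * the event's child_mutex, so the child list cannot change underneath us.
 */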
5326static void perf_event_for_each_child(struct perf_event *event,
5327 void (*func)(struct perf_event *))
5328{
5329 struct perf_event *child;
5330
5331 WARN_ON_ONCE(event->ctx->parent_ctx);
5332
5333 mutex_lock(&event->child_mutex);
5334 func(event);
5335 list_for_each_entry(child, &event->child_list, child_list)
5336 func(child);
5337 mutex_unlock(&event->child_mutex);
5338}
5339
5340static void perf_event_for_each(struct perf_event *event,
5341 void (*func)(struct perf_event *))
5342{
5343 struct perf_event_context *ctx = event->ctx;
5344 struct perf_event *sibling;
5345
5346 lockdep_assert_held(&ctx->mutex);
5347
5348 event = event->group_leader;
5349
5350 perf_event_for_each_child(event, func);
5351 for_each_sibling_event(sibling, event)
5352 perf_event_for_each_child(sibling, func);
5353}
5354
5355static void __perf_event_period(struct perf_event *event,
5356 struct perf_cpu_context *cpuctx,
5357 struct perf_event_context *ctx,
5358 void *info)
5359{
5360 u64 value = *((u64 *)info);
5361 bool active;
5362
5363 if (event->attr.freq) {
5364 event->attr.sample_freq = value;
5365 } else {
5366 event->attr.sample_period = value;
5367 event->hw.sample_period = value;
5368 }
5369
5370 active = (event->state == PERF_EVENT_STATE_ACTIVE);
5371 if (active) {
5372 perf_pmu_disable(ctx->pmu);
5373
5374
5375
5376
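		/*
		 * We could be throttled; unthrottle now to avoid the tick
		 * trying to unthrottle while we already re-start the event.
		 */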
5377 if (event->hw.interrupts == MAX_INTERRUPTS) {
5378 event->hw.interrupts = 0;
5379 perf_log_throttle(event, 1);
5380 }
5381 event->pmu->stop(event, PERF_EF_UPDATE);
5382 }
5383
5384 local64_set(&event->hw.period_left, 0);
5385
5386 if (active) {
5387 event->pmu->start(event, PERF_EF_RELOAD);
5388 perf_pmu_enable(ctx->pmu);
5389 }
5390}
5391
5392static int perf_event_check_period(struct perf_event *event, u64 value)
5393{
5394 return event->pmu->check_period(event, value);
5395}
5396
5397static int _perf_event_period(struct perf_event *event, u64 value)
5398{
5399 if (!is_sampling_event(event))
5400 return -EINVAL;
5401
5402 if (!value)
5403 return -EINVAL;
5404
5405 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5406 return -EINVAL;
5407
5408 if (perf_event_check_period(event, value))
5409 return -EINVAL;
5410
5411 if (!event->attr.freq && (value & (1ULL << 63)))
5412 return -EINVAL;
5413
5414 event_function_call(event, __perf_event_period, &value);
5415
5416 return 0;
5417}
5418
5419int perf_event_period(struct perf_event *event, u64 value)
5420{
5421 struct perf_event_context *ctx;
5422 int ret;
5423
5424 ctx = perf_event_ctx_lock(event);
5425 ret = _perf_event_period(event, value);
5426 perf_event_ctx_unlock(event, ctx);
5427
5428 return ret;
5429}
5430EXPORT_SYMBOL_GPL(perf_event_period);
5431
5432static const struct file_operations perf_fops;
5433
5434static inline int perf_fget_light(int fd, struct fd *p)
5435{
5436 struct fd f = fdget(fd);
5437 if (!f.file)
5438 return -EBADF;
5439
5440 if (f.file->f_op != &perf_fops) {
5441 fdput(f);
5442 return -EBADF;
5443 }
5444 *p = f;
5445 return 0;
5446}
5447
5448static int perf_event_set_output(struct perf_event *event,
5449 struct perf_event *output_event);
5450static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5451static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5452static int perf_copy_attr(struct perf_event_attr __user *uattr,
5453 struct perf_event_attr *attr);
5454
5455static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5456{
5457 void (*func)(struct perf_event *);
5458 u32 flags = arg;
5459
5460 switch (cmd) {
5461 case PERF_EVENT_IOC_ENABLE:
5462 func = _perf_event_enable;
5463 break;
5464 case PERF_EVENT_IOC_DISABLE:
5465 func = _perf_event_disable;
5466 break;
5467 case PERF_EVENT_IOC_RESET:
5468 func = _perf_event_reset;
5469 break;
5470
5471 case PERF_EVENT_IOC_REFRESH:
5472 return _perf_event_refresh(event, arg);
5473
5474 case PERF_EVENT_IOC_PERIOD:
5475 {
5476 u64 value;
5477
5478 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5479 return -EFAULT;
5480
5481 return _perf_event_period(event, value);
5482 }
5483 case PERF_EVENT_IOC_ID:
5484 {
5485 u64 id = primary_event_id(event);
5486
5487 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5488 return -EFAULT;
5489 return 0;
5490 }
5491
5492 case PERF_EVENT_IOC_SET_OUTPUT:
5493 {
5494 int ret;
5495 if (arg != -1) {
5496 struct perf_event *output_event;
5497 struct fd output;
5498 ret = perf_fget_light(arg, &output);
5499 if (ret)
5500 return ret;
5501 output_event = output.file->private_data;
5502 ret = perf_event_set_output(event, output_event);
5503 fdput(output);
5504 } else {
5505 ret = perf_event_set_output(event, NULL);
5506 }
5507 return ret;
5508 }
5509
5510 case PERF_EVENT_IOC_SET_FILTER:
5511 return perf_event_set_filter(event, (void __user *)arg);
5512
5513 case PERF_EVENT_IOC_SET_BPF:
5514 return perf_event_set_bpf_prog(event, arg);
5515
5516 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5517 struct perf_buffer *rb;
5518
5519 rcu_read_lock();
5520 rb = rcu_dereference(event->rb);
5521 if (!rb || !rb->nr_pages) {
5522 rcu_read_unlock();
5523 return -EINVAL;
5524 }
5525 rb_toggle_paused(rb, !!arg);
5526 rcu_read_unlock();
5527 return 0;
5528 }
5529
5530 case PERF_EVENT_IOC_QUERY_BPF:
5531 return perf_event_query_prog_array(event, (void __user *)arg);
5532
5533 case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5534 struct perf_event_attr new_attr;
5535 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5536 &new_attr);
5537
5538 if (err)
5539 return err;
5540
5541 return perf_event_modify_attr(event, &new_attr);
5542 }
5543 default:
5544 return -ENOTTY;
5545 }
5546
5547 if (flags & PERF_IOC_FLAG_GROUP)
5548 perf_event_for_each(event, func);
5549 else
5550 perf_event_for_each_child(event, func);
5551
5552 return 0;
5553}
5554
5555static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5556{
5557 struct perf_event *event = file->private_data;
5558 struct perf_event_context *ctx;
5559 long ret;
5560
5561
5562 ret = security_perf_event_write(event);
5563 if (ret)
5564 return ret;
5565
5566 ctx = perf_event_ctx_lock(event);
5567 ret = _perf_ioctl(event, cmd, arg);
5568 perf_event_ctx_unlock(event, ctx);
5569
5570 return ret;
5571}
5572
5573#ifdef CONFIG_COMPAT
5574static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5575 unsigned long arg)
5576{
5577 switch (_IOC_NR(cmd)) {
5578 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5579 case _IOC_NR(PERF_EVENT_IOC_ID):
5580 case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5581 case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5582
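/* Fix up the pointer size: 32-bit callers pass a 4-byte pointer argument. */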
5583 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5584 cmd &= ~IOCSIZE_MASK;
5585 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5586 }
5587 break;
5588 }
5589 return perf_ioctl(file, cmd, arg);
5590}
5591#else
5592# define perf_compat_ioctl NULL
5593#endif
5594
5595int perf_event_task_enable(void)
5596{
5597 struct perf_event_context *ctx;
5598 struct perf_event *event;
5599
5600 mutex_lock(&current->perf_event_mutex);
5601 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5602 ctx = perf_event_ctx_lock(event);
5603 perf_event_for_each_child(event, _perf_event_enable);
5604 perf_event_ctx_unlock(event, ctx);
5605 }
5606 mutex_unlock(&current->perf_event_mutex);
5607
5608 return 0;
5609}
5610
5611int perf_event_task_disable(void)
5612{
5613 struct perf_event_context *ctx;
5614 struct perf_event *event;
5615
5616 mutex_lock(&current->perf_event_mutex);
5617 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5618 ctx = perf_event_ctx_lock(event);
5619 perf_event_for_each_child(event, _perf_event_disable);
5620 perf_event_ctx_unlock(event, ctx);
5621 }
5622 mutex_unlock(&current->perf_event_mutex);
5623
5624 return 0;
5625}
5626
5627static int perf_event_index(struct perf_event *event)
5628{
5629 if (event->hw.state & PERF_HES_STOPPED)
5630 return 0;
5631
5632 if (event->state != PERF_EVENT_STATE_ACTIVE)
5633 return 0;
5634
5635 return event->pmu->event_idx(event);
5636}
5637
5638static void calc_timer_values(struct perf_event *event,
5639 u64 *now,
5640 u64 *enabled,
5641 u64 *running)
5642{
5643 u64 ctx_time;
5644
5645 *now = perf_clock();
5646 ctx_time = event->shadow_ctx_time + *now;
5647 __perf_update_times(event, ctx_time, enabled, running);
5648}
5649
5650static void perf_event_init_userpage(struct perf_event *event)
5651{
5652 struct perf_event_mmap_page *userpg;
5653 struct perf_buffer *rb;
5654
5655 rcu_read_lock();
5656 rb = rcu_dereference(event->rb);
5657 if (!rb)
5658 goto unlock;
5659
5660 userpg = rb->user_page;
5661
5662
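/* Allow new userspace to detect that bit 0 is deprecated. */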
5663 userpg->cap_bit0_is_deprecated = 1;
5664 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5665 userpg->data_offset = PAGE_SIZE;
5666 userpg->data_size = perf_data_size(rb);
5667
5668unlock:
5669 rcu_read_unlock();
5670}
5671
5672void __weak arch_perf_update_userpage(
5673 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5674{
5675}
5676
5677
5678
5679
5680
5681
5682void perf_event_update_userpage(struct perf_event *event)
5683{
5684 struct perf_event_mmap_page *userpg;
5685 struct perf_buffer *rb;
5686 u64 enabled, running, now;
5687
5688 rcu_read_lock();
5689 rb = rcu_dereference(event->rb);
5690 if (!rb)
5691 goto unlock;
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
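/*
 * Compute the enabled/running times from the snapshot taken when the
 * event was last scheduled in; update_context_time() cannot be used
 * here because this path may be reached from NMI context.
 */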
5702 calc_timer_values(event, &now, &enabled, &running);
5703
5704 userpg = rb->user_page;
5705
5706
5707
5708
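/*
 * Disable preemption and bump userpg->lock so that userspace sees an
 * odd sequence count while the index, count and times are updated.
 */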
5709 preempt_disable();
5710 ++userpg->lock;
5711 barrier();
5712 userpg->index = perf_event_index(event);
5713 userpg->offset = perf_event_count(event);
5714 if (userpg->index)
5715 userpg->offset -= local64_read(&event->hw.prev_count);
5716
5717 userpg->time_enabled = enabled +
5718 atomic64_read(&event->child_total_time_enabled);
5719
5720 userpg->time_running = running +
5721 atomic64_read(&event->child_total_time_running);
5722
5723 arch_perf_update_userpage(event, userpg, now);
5724
5725 barrier();
5726 ++userpg->lock;
5727 preempt_enable();
5728unlock:
5729 rcu_read_unlock();
5730}
5731EXPORT_SYMBOL_GPL(perf_event_update_userpage);
5732
5733static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5734{
5735 struct perf_event *event = vmf->vma->vm_file->private_data;
5736 struct perf_buffer *rb;
5737 vm_fault_t ret = VM_FAULT_SIGBUS;
5738
5739 if (vmf->flags & FAULT_FLAG_MKWRITE) {
5740 if (vmf->pgoff == 0)
5741 ret = 0;
5742 return ret;
5743 }
5744
5745 rcu_read_lock();
5746 rb = rcu_dereference(event->rb);
5747 if (!rb)
5748 goto unlock;
5749
5750 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5751 goto unlock;
5752
5753 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5754 if (!vmf->page)
5755 goto unlock;
5756
5757 get_page(vmf->page);
5758 vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5759 vmf->page->index = vmf->pgoff;
5760
5761 ret = 0;
5762unlock:
5763 rcu_read_unlock();
5764
5765 return ret;
5766}
5767
5768static void ring_buffer_attach(struct perf_event *event,
5769 struct perf_buffer *rb)
5770{
5771 struct perf_buffer *old_rb = NULL;
5772 unsigned long flags;
5773
5774 if (event->rb) {
5775
5776
5777
5778
5779 WARN_ON_ONCE(event->rcu_pending);
5780
5781 old_rb = event->rb;
5782 spin_lock_irqsave(&old_rb->event_lock, flags);
5783 list_del_rcu(&event->rb_entry);
5784 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5785
5786 event->rcu_batches = get_state_synchronize_rcu();
5787 event->rcu_pending = 1;
5788 }
5789
5790 if (rb) {
5791 if (event->rcu_pending) {
5792 cond_synchronize_rcu(event->rcu_batches);
5793 event->rcu_pending = 0;
5794 }
5795
5796 spin_lock_irqsave(&rb->event_lock, flags);
5797 list_add_rcu(&event->rb_entry, &rb->event_list);
5798 spin_unlock_irqrestore(&rb->event_lock, flags);
5799 }
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
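/*
 * Avoid racing with perf_mmap_close(AUX): stop the event before
 * swizzling event->rb; if the AUX area is being unmapped, its
 * aux_mmap_count is already zero and the event will not restart.
 */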
5811 if (has_aux(event))
5812 perf_event_stop(event, 0);
5813
5814 rcu_assign_pointer(event->rb, rb);
5815
5816 if (old_rb) {
5817 ring_buffer_put(old_rb);
5818
5819
5820
5821
5822
5823 wake_up_all(&event->waitq);
5824 }
5825}
5826
5827static void ring_buffer_wakeup(struct perf_event *event)
5828{
5829 struct perf_buffer *rb;
5830
5831 rcu_read_lock();
5832 rb = rcu_dereference(event->rb);
5833 if (rb) {
5834 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5835 wake_up_all(&event->waitq);
5836 }
5837 rcu_read_unlock();
5838}
5839
5840struct perf_buffer *ring_buffer_get(struct perf_event *event)
5841{
5842 struct perf_buffer *rb;
5843
5844 rcu_read_lock();
5845 rb = rcu_dereference(event->rb);
5846 if (rb) {
5847 if (!refcount_inc_not_zero(&rb->refcount))
5848 rb = NULL;
5849 }
5850 rcu_read_unlock();
5851
5852 return rb;
5853}
5854
5855void ring_buffer_put(struct perf_buffer *rb)
5856{
5857 if (!refcount_dec_and_test(&rb->refcount))
5858 return;
5859
5860 WARN_ON_ONCE(!list_empty(&rb->event_list));
5861
5862 call_rcu(&rb->rcu_head, rb_free_rcu);
5863}
5864
5865static void perf_mmap_open(struct vm_area_struct *vma)
5866{
5867 struct perf_event *event = vma->vm_file->private_data;
5868
5869 atomic_inc(&event->mmap_count);
5870 atomic_inc(&event->rb->mmap_count);
5871
5872 if (vma->vm_pgoff)
5873 atomic_inc(&event->rb->aux_mmap_count);
5874
5875 if (event->pmu->event_mapped)
5876 event->pmu->event_mapped(event, vma->vm_mm);
5877}
5878
5879static void perf_pmu_output_stop(struct perf_event *event);
5880
5881
5882
5883
5884
5885
5886
5887
5888
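/*
 * A buffer can be mmap()ed multiple times, either through the same event
 * or through other events via perf_event_set_output().  rb->mmap_count
 * counts all of these mappings so the VM accounting done in perf_mmap()
 * is only undone once the last mapping goes away.
 */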
5889static void perf_mmap_close(struct vm_area_struct *vma)
5890{
5891 struct perf_event *event = vma->vm_file->private_data;
5892 struct perf_buffer *rb = ring_buffer_get(event);
5893 struct user_struct *mmap_user = rb->mmap_user;
5894 int mmap_locked = rb->mmap_locked;
5895 unsigned long size = perf_data_size(rb);
5896 bool detach_rest = false;
5897
5898 if (event->pmu->event_unmapped)
5899 event->pmu->event_unmapped(event, vma->vm_mm);
5900
5901
5902
5903
5904
5905
5906 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5907 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5908
5909
5910
5911
5912
5913
5914 perf_pmu_output_stop(event);
5915
5916
5917 atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
5918 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
5919
5920
5921 rb_free_aux(rb);
5922 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
5923
5924 mutex_unlock(&event->mmap_mutex);
5925 }
5926
5927 if (atomic_dec_and_test(&rb->mmap_count))
5928 detach_rest = true;
5929
5930 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5931 goto out_put;
5932
5933 ring_buffer_attach(event, NULL);
5934 mutex_unlock(&event->mmap_mutex);
5935
5936
5937 if (!detach_rest)
5938 goto out_put;
5939
5940
5941
5942
5943
5944
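/*
 * No mmap()s remain: detach every other event that redirects its output
 * into this now-unreachable buffer.  The refcount/retry dance is needed
 * because rb->event_lock otherwise nests inside mmap_mutex.
 */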
5945again:
5946 rcu_read_lock();
5947 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5948 if (!atomic_long_inc_not_zero(&event->refcount)) {
5949
5950
5951
5952
5953 continue;
5954 }
5955 rcu_read_unlock();
5956
5957 mutex_lock(&event->mmap_mutex);
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968 if (event->rb == rb)
5969 ring_buffer_attach(event, NULL);
5970
5971 mutex_unlock(&event->mmap_mutex);
5972 put_event(event);
5973
5974
5975
5976
5977
5978 goto again;
5979 }
5980 rcu_read_unlock();
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
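/*
 * A few 0-ref events may still be on the list; they will be freed
 * elsewhere.  The buffer is now fully detached, so undo the VM
 * accounting done by perf_mmap() (the "+ 1" is the user page).
 */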
5991 atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
5992 &mmap_user->locked_vm);
5993 atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
5994 free_uid(mmap_user);
5995
5996out_put:
5997 ring_buffer_put(rb);
5998}
5999
6000static const struct vm_operations_struct perf_mmap_vmops = {
6001 .open = perf_mmap_open,
6002 .close = perf_mmap_close,
6003 .fault = perf_mmap_fault,
6004 .page_mkwrite = perf_mmap_fault,
6005};
6006
6007static int perf_mmap(struct file *file, struct vm_area_struct *vma)
6008{
6009 struct perf_event *event = file->private_data;
6010 unsigned long user_locked, user_lock_limit;
6011 struct user_struct *user = current_user();
6012 struct perf_buffer *rb = NULL;
6013 unsigned long locked, lock_limit;
6014 unsigned long vma_size;
6015 unsigned long nr_pages;
6016 long user_extra = 0, extra = 0;
6017 int ret = 0, flags = 0;
6018
6019
6020
6021
6022
6023
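/*
 * Don't allow mmap() of inherited per-task counters: every child would
 * end up writing into the same buffer, which does not scale.
 */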
6024 if (event->cpu == -1 && event->attr.inherit)
6025 return -EINVAL;
6026
6027 if (!(vma->vm_flags & VM_SHARED))
6028 return -EINVAL;
6029
6030 ret = security_perf_event_read(event);
6031 if (ret)
6032 return ret;
6033
6034 vma_size = vma->vm_end - vma->vm_start;
6035
6036 if (vma->vm_pgoff == 0) {
6037 nr_pages = (vma_size / PAGE_SIZE) - 1;
6038 } else {
6039
6040
6041
6042
6043
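/*
 * AUX area mapping: if the AUX buffer already exists, subsequent
 * mappings must use the same size and offset, and the area must sit
 * above the normal perf buffer.
 */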
6044 u64 aux_offset, aux_size;
6045
6046 if (!event->rb)
6047 return -EINVAL;
6048
6049 nr_pages = vma_size / PAGE_SIZE;
6050
6051 mutex_lock(&event->mmap_mutex);
6052 ret = -EINVAL;
6053
6054 rb = event->rb;
6055 if (!rb)
6056 goto aux_unlock;
6057
6058 aux_offset = READ_ONCE(rb->user_page->aux_offset);
6059 aux_size = READ_ONCE(rb->user_page->aux_size);
6060
6061 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
6062 goto aux_unlock;
6063
6064 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
6065 goto aux_unlock;
6066
6067
6068 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
6069 goto aux_unlock;
6070
6071 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
6072 goto aux_unlock;
6073
6074
6075 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
6076 goto aux_unlock;
6077
6078 if (!is_power_of_2(nr_pages))
6079 goto aux_unlock;
6080
6081 if (!atomic_inc_not_zero(&rb->mmap_count))
6082 goto aux_unlock;
6083
6084 if (rb_has_aux(rb)) {
6085 atomic_inc(&rb->aux_mmap_count);
6086 ret = 0;
6087 goto unlock;
6088 }
6089
6090 atomic_set(&rb->aux_mmap_count, 1);
6091 user_extra = nr_pages;
6092
6093 goto accounting;
6094 }
6095
6096
6097
6098
6099
6100 if (nr_pages != 0 && !is_power_of_2(nr_pages))
6101 return -EINVAL;
6102
6103 if (vma_size != PAGE_SIZE * (1 + nr_pages))
6104 return -EINVAL;
6105
6106 WARN_ON_ONCE(event->ctx->parent_ctx);
6107again:
6108 mutex_lock(&event->mmap_mutex);
6109 if (event->rb) {
6110 if (event->rb->nr_pages != nr_pages) {
6111 ret = -EINVAL;
6112 goto unlock;
6113 }
6114
6115 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
6116
6117
6118
6119
6120
6121 mutex_unlock(&event->mmap_mutex);
6122 goto again;
6123 }
6124
6125 goto unlock;
6126 }
6127
6128 user_extra = nr_pages + 1;
6129
6130accounting:
6131 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6132
6133
6134
6135
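/*
 * Scale the per-user mlock budget with the number of CPUs: perf
 * normally maps one buffer per online CPU.
 */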
6136 user_lock_limit *= num_online_cpus();
6137
6138 user_locked = atomic_long_read(&user->locked_vm);
6139
6140
6141
6142
6143
6144 if (user_locked > user_lock_limit)
6145 user_locked = user_lock_limit;
6146 user_locked += user_extra;
6147
6148 if (user_locked > user_lock_limit) {
6149
6150
6151
6152
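/*
 * Whatever exceeds the per-user budget is charged against the mm's
 * pinned_vm and RLIMIT_MEMLOCK instead.
 */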
6153 extra = user_locked - user_lock_limit;
6154 user_extra -= extra;
6155 }
6156
6157 lock_limit = rlimit(RLIMIT_MEMLOCK);
6158 lock_limit >>= PAGE_SHIFT;
6159 locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
6160
6161 if ((locked > lock_limit) && perf_is_paranoid() &&
6162 !capable(CAP_IPC_LOCK)) {
6163 ret = -EPERM;
6164 goto unlock;
6165 }
6166
6167 WARN_ON(!rb && event->rb);
6168
6169 if (vma->vm_flags & VM_WRITE)
6170 flags |= RING_BUFFER_WRITABLE;
6171
6172 if (!rb) {
6173 rb = rb_alloc(nr_pages,
6174 event->attr.watermark ? event->attr.wakeup_watermark : 0,
6175 event->cpu, flags);
6176
6177 if (!rb) {
6178 ret = -ENOMEM;
6179 goto unlock;
6180 }
6181
6182 atomic_set(&rb->mmap_count, 1);
6183 rb->mmap_user = get_current_user();
6184 rb->mmap_locked = extra;
6185
6186 ring_buffer_attach(event, rb);
6187
6188 perf_event_init_userpage(event);
6189 perf_event_update_userpage(event);
6190 } else {
6191 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
6192 event->attr.aux_watermark, flags);
6193 if (!ret)
6194 rb->aux_mmap_locked = extra;
6195 }
6196
6197unlock:
6198 if (!ret) {
6199 atomic_long_add(user_extra, &user->locked_vm);
6200 atomic64_add(extra, &vma->vm_mm->pinned_vm);
6201
6202 atomic_inc(&event->mmap_count);
6203 } else if (rb) {
6204 atomic_dec(&rb->mmap_count);
6205 }
6206aux_unlock:
6207 mutex_unlock(&event->mmap_mutex);
6208
6209
6210
6211
6212
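/*
 * Since the pinned accounting is per mm, do not let fork() copy this
 * vma, and keep it from being expanded or dumped.
 */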
6213 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
6214 vma->vm_ops = &perf_mmap_vmops;
6215
6216 if (event->pmu->event_mapped)
6217 event->pmu->event_mapped(event, vma->vm_mm);
6218
6219 return ret;
6220}
6221
6222static int perf_fasync(int fd, struct file *filp, int on)
6223{
6224 struct inode *inode = file_inode(filp);
6225 struct perf_event *event = filp->private_data;
6226 int retval;
6227
6228 inode_lock(inode);
6229 retval = fasync_helper(fd, filp, on, &event->fasync);
6230 inode_unlock(inode);
6231
6232 if (retval < 0)
6233 return retval;
6234
6235 return 0;
6236}
6237
6238static const struct file_operations perf_fops = {
6239 .llseek = no_llseek,
6240 .release = perf_release,
6241 .read = perf_read,
6242 .poll = perf_poll,
6243 .unlocked_ioctl = perf_ioctl,
6244 .compat_ioctl = perf_compat_ioctl,
6245 .mmap = perf_mmap,
6246 .fasync = perf_fasync,
6247};
6248
6249
6250
6251
6252
6253
6254
6255
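/*
 * Perf event wakeup: only the parent event carries fasync state, so
 * inherited (child) events signal through their parent.
 */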
6256static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
6257{
6258
6259 if (event->parent)
6260 event = event->parent;
6261 return &event->fasync;
6262}
6263
6264void perf_event_wakeup(struct perf_event *event)
6265{
6266 ring_buffer_wakeup(event);
6267
6268 if (event->pending_kill) {
6269 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
6270 event->pending_kill = 0;
6271 }
6272}
6273
6274static void perf_pending_event_disable(struct perf_event *event)
6275{
6276 int cpu = READ_ONCE(event->pending_disable);
6277
6278 if (cpu < 0)
6279 return;
6280
6281 if (cpu == smp_processor_id()) {
6282 WRITE_ONCE(event->pending_disable, -1);
6283 perf_event_disable_local(event);
6284 return;
6285 }
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
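/*
 * The event is being disabled from a remote CPU: queue an irq_work on
 * the CPU the event is running on so perf_event_disable_local() runs
 * there.  event->pending_disable holds the target CPU and doubles as
 * the "disable pending" flag.
 */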
6307 irq_work_queue_on(&event->pending, cpu);
6308}
6309
6310static void perf_pending_event(struct irq_work *entry)
6311{
6312 struct perf_event *event = container_of(entry, struct perf_event, pending);
6313 int rctx;
6314
6315 rctx = perf_swevent_get_recursion_context();
6316
6317
6318
6319
6320
6321 perf_pending_event_disable(event);
6322
6323 if (event->pending_wakeup) {
6324 event->pending_wakeup = 0;
6325 perf_event_wakeup(event);
6326 }
6327
6328 if (rctx >= 0)
6329 perf_swevent_put_recursion_context(rctx);
6330}
6331
6332
6333
6334
6335
6336
6337struct perf_guest_info_callbacks *perf_guest_cbs;
6338
6339int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6340{
6341 perf_guest_cbs = cbs;
6342 return 0;
6343}
6344EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
6345
6346int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6347{
6348 perf_guest_cbs = NULL;
6349 return 0;
6350}
6351EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
6352
6353static void
6354perf_output_sample_regs(struct perf_output_handle *handle,
6355 struct pt_regs *regs, u64 mask)
6356{
6357 int bit;
6358 DECLARE_BITMAP(_mask, 64);
6359
6360 bitmap_from_u64(_mask, mask);
6361 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
6362 u64 val;
6363
6364 val = perf_reg_value(regs, bit);
6365 perf_output_put(handle, val);
6366 }
6367}
6368
6369static void perf_sample_regs_user(struct perf_regs *regs_user,
6370 struct pt_regs *regs)
6371{
6372 if (user_mode(regs)) {
6373 regs_user->abi = perf_reg_abi(current);
6374 regs_user->regs = regs;
6375 } else if (!(current->flags & PF_KTHREAD)) {
6376 perf_get_regs_user(regs_user, regs);
6377 } else {
6378 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
6379 regs_user->regs = NULL;
6380 }
6381}
6382
6383static void perf_sample_regs_intr(struct perf_regs *regs_intr,
6384 struct pt_regs *regs)
6385{
6386 regs_intr->regs = regs;
6387 regs_intr->abi = perf_reg_abi(current);
6388}
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398static u64 perf_ustack_task_size(struct pt_regs *regs)
6399{
6400 unsigned long addr = perf_user_stack_pointer(regs);
6401
6402 if (!addr || addr >= TASK_SIZE)
6403 return 0;
6404
6405 return TASK_SIZE - addr;
6406}
6407
6408static u16
6409perf_sample_ustack_size(u16 stack_size, u16 header_size,
6410 struct pt_regs *regs)
6411{
6412 u64 task_size;
6413
6414
6415 if (!regs)
6416 return 0;
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
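/*
 * Clamp the requested dump size so it neither exceeds the space left
 * on the user stack (TASK_SIZE - sp) nor overflows the u16 record
 * size once the header and the two size words are added.
 */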
6428 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6429 stack_size = min(stack_size, (u16) task_size);
6430
6431
6432 header_size += 2 * sizeof(u64);
6433
6434
6435 if ((u16) (header_size + stack_size) < header_size) {
6436
6437
6438
6439
6440 stack_size = USHRT_MAX - header_size - sizeof(u64);
6441 stack_size = round_up(stack_size, sizeof(u64));
6442 }
6443
6444 return stack_size;
6445}
6446
6447static void
6448perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6449 struct pt_regs *regs)
6450{
6451
6452 if (!regs) {
6453 u64 size = 0;
6454 perf_output_put(handle, size);
6455 } else {
6456 unsigned long sp;
6457 unsigned int rem;
6458 u64 dyn_size;
6459 mm_segment_t fs;
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
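/*
 * The record layout is: the static dump size, the copied stack data
 * (with any uncopied remainder skipped), and finally the dynamic size,
 * i.e. how many bytes were actually copied.
 */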
6473 perf_output_put(handle, dump_size);
6474
6475
6476 sp = perf_user_stack_pointer(regs);
6477 fs = force_uaccess_begin();
6478 rem = __output_copy_user(handle, (void *) sp, dump_size);
6479 force_uaccess_end(fs);
6480 dyn_size = dump_size - rem;
6481
6482 perf_output_skip(handle, rem);
6483
6484
6485 perf_output_put(handle, dyn_size);
6486 }
6487}
6488
6489static unsigned long perf_prepare_sample_aux(struct perf_event *event,
6490 struct perf_sample_data *data,
6491 size_t size)
6492{
6493 struct perf_event *sampler = event->aux_event;
6494 struct perf_buffer *rb;
6495
6496 data->aux_size = 0;
6497
6498 if (!sampler)
6499 goto out;
6500
6501 if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
6502 goto out;
6503
6504 if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
6505 goto out;
6506
6507 rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
6508 if (!rb)
6509 goto out;
6510
6511
6512
6513
6514
6515 if (READ_ONCE(rb->aux_in_sampling)) {
6516 data->aux_size = 0;
6517 } else {
6518 size = min_t(size_t, size, perf_aux_size(rb));
6519 data->aux_size = ALIGN(size, sizeof(u64));
6520 }
6521 ring_buffer_put(rb);
6522
6523out:
6524 return data->aux_size;
6525}
6526
6527long perf_pmu_snapshot_aux(struct perf_buffer *rb,
6528 struct perf_event *event,
6529 struct perf_output_handle *handle,
6530 unsigned long size)
6531{
6532 unsigned long flags;
6533 long ret;
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
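/*
 * ->snapshot_aux() deliberately does not change the event state, so it
 * can be called from the overflow path.  IRQs are disabled to keep IPIs
 * from racing with us, and rb->aux_in_sampling marks the buffer as being
 * snapshotted so nested AUX sampling is suppressed.
 */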
6544 local_irq_save(flags);
6545
6546
6547
6548
6549 WRITE_ONCE(rb->aux_in_sampling, 1);
6550 barrier();
6551
6552 ret = event->pmu->snapshot_aux(event, handle, size);
6553
6554 barrier();
6555 WRITE_ONCE(rb->aux_in_sampling, 0);
6556 local_irq_restore(flags);
6557
6558 return ret;
6559}
6560
6561static void perf_aux_sample_output(struct perf_event *event,
6562 struct perf_output_handle *handle,
6563 struct perf_sample_data *data)
6564{
6565 struct perf_event *sampler = event->aux_event;
6566 struct perf_buffer *rb;
6567 unsigned long pad;
6568 long size;
6569
6570 if (WARN_ON_ONCE(!sampler || !data->aux_size))
6571 return;
6572
6573 rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
6574 if (!rb)
6575 return;
6576
6577 size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
6578
6579
6580
6581
6582
6583
6584
6585 if (WARN_ON_ONCE(size < 0))
6586 goto out_put;
6587
6588
6589
6590
6591
6592 pad = data->aux_size - size;
6593 if (WARN_ON_ONCE(pad >= sizeof(u64)))
6594 pad = 8;
6595
6596 if (pad) {
6597 u64 zero = 0;
6598 perf_output_copy(handle, &zero, pad);
6599 }
6600
6601out_put:
6602 ring_buffer_put(rb);
6603}
6604
6605static void __perf_event_header__init_id(struct perf_event_header *header,
6606 struct perf_sample_data *data,
6607 struct perf_event *event)
6608{
6609 u64 sample_type = event->attr.sample_type;
6610
6611 data->type = sample_type;
6612 header->size += event->id_header_size;
6613
6614 if (sample_type & PERF_SAMPLE_TID) {
6615
6616 data->tid_entry.pid = perf_event_pid(event, current);
6617 data->tid_entry.tid = perf_event_tid(event, current);
6618 }
6619
6620 if (sample_type & PERF_SAMPLE_TIME)
6621 data->time = perf_event_clock(event);
6622
6623 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6624 data->id = primary_event_id(event);
6625
6626 if (sample_type & PERF_SAMPLE_STREAM_ID)
6627 data->stream_id = event->id;
6628
6629 if (sample_type & PERF_SAMPLE_CPU) {
6630 data->cpu_entry.cpu = raw_smp_processor_id();
6631 data->cpu_entry.reserved = 0;
6632 }
6633}
6634
6635void perf_event_header__init_id(struct perf_event_header *header,
6636 struct perf_sample_data *data,
6637 struct perf_event *event)
6638{
6639 if (event->attr.sample_id_all)
6640 __perf_event_header__init_id(header, data, event);
6641}
6642
6643static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6644 struct perf_sample_data *data)
6645{
6646 u64 sample_type = data->type;
6647
6648 if (sample_type & PERF_SAMPLE_TID)
6649 perf_output_put(handle, data->tid_entry);
6650
6651 if (sample_type & PERF_SAMPLE_TIME)
6652 perf_output_put(handle, data->time);
6653
6654 if (sample_type & PERF_SAMPLE_ID)
6655 perf_output_put(handle, data->id);
6656
6657 if (sample_type & PERF_SAMPLE_STREAM_ID)
6658 perf_output_put(handle, data->stream_id);
6659
6660 if (sample_type & PERF_SAMPLE_CPU)
6661 perf_output_put(handle, data->cpu_entry);
6662
6663 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6664 perf_output_put(handle, data->id);
6665}
6666
6667void perf_event__output_id_sample(struct perf_event *event,
6668 struct perf_output_handle *handle,
6669 struct perf_sample_data *sample)
6670{
6671 if (event->attr.sample_id_all)
6672 __perf_event__output_id_sample(handle, sample);
6673}
6674
6675static void perf_output_read_one(struct perf_output_handle *handle,
6676 struct perf_event *event,
6677 u64 enabled, u64 running)
6678{
6679 u64 read_format = event->attr.read_format;
6680 u64 values[4];
6681 int n = 0;
6682
6683 values[n++] = perf_event_count(event);
6684 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6685 values[n++] = enabled +
6686 atomic64_read(&event->child_total_time_enabled);
6687 }
6688 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6689 values[n++] = running +
6690 atomic64_read(&event->child_total_time_running);
6691 }
6692 if (read_format & PERF_FORMAT_ID)
6693 values[n++] = primary_event_id(event);
6694
6695 __output_copy(handle, values, n * sizeof(u64));
6696}
6697
6698static void perf_output_read_group(struct perf_output_handle *handle,
6699 struct perf_event *event,
6700 u64 enabled, u64 running)
6701{
6702 struct perf_event *leader = event->group_leader, *sub;
6703 u64 read_format = event->attr.read_format;
6704 u64 values[5];
6705 int n = 0;
6706
6707 values[n++] = 1 + leader->nr_siblings;
6708
6709 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6710 values[n++] = enabled;
6711
6712 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6713 values[n++] = running;
6714
6715 if ((leader != event) &&
6716 (leader->state == PERF_EVENT_STATE_ACTIVE))
6717 leader->pmu->read(leader);
6718
6719 values[n++] = perf_event_count(leader);
6720 if (read_format & PERF_FORMAT_ID)
6721 values[n++] = primary_event_id(leader);
6722
6723 __output_copy(handle, values, n * sizeof(u64));
6724
6725 for_each_sibling_event(sub, leader) {
6726 n = 0;
6727
6728 if ((sub != event) &&
6729 (sub->state == PERF_EVENT_STATE_ACTIVE))
6730 sub->pmu->read(sub);
6731
6732 values[n++] = perf_event_count(sub);
6733 if (read_format & PERF_FORMAT_ID)
6734 values[n++] = primary_event_id(sub);
6735
6736 __output_copy(handle, values, n * sizeof(u64));
6737 }
6738}
6739
6740#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
6741 PERF_FORMAT_TOTAL_TIME_RUNNING)
6742
6743
6744
6745
6746
6747
6748
6749
6750static void perf_output_read(struct perf_output_handle *handle,
6751 struct perf_event *event)
6752{
6753 u64 enabled = 0, running = 0, now;
6754 u64 read_format = event->attr.read_format;
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765 if (read_format & PERF_FORMAT_TOTAL_TIMES)
6766 calc_timer_values(event, &now, &enabled, &running);
6767
6768 if (event->attr.read_format & PERF_FORMAT_GROUP)
6769 perf_output_read_group(handle, event, enabled, running);
6770 else
6771 perf_output_read_one(handle, event, enabled, running);
6772}
6773
6774static inline bool perf_sample_save_hw_index(struct perf_event *event)
6775{
6776 return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
6777}
6778
6779void perf_output_sample(struct perf_output_handle *handle,
6780 struct perf_event_header *header,
6781 struct perf_sample_data *data,
6782 struct perf_event *event)
6783{
6784 u64 sample_type = data->type;
6785
6786 perf_output_put(handle, *header);
6787
6788 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6789 perf_output_put(handle, data->id);
6790
6791 if (sample_type & PERF_SAMPLE_IP)
6792 perf_output_put(handle, data->ip);
6793
6794 if (sample_type & PERF_SAMPLE_TID)
6795 perf_output_put(handle, data->tid_entry);
6796
6797 if (sample_type & PERF_SAMPLE_TIME)
6798 perf_output_put(handle, data->time);
6799
6800 if (sample_type & PERF_SAMPLE_ADDR)
6801 perf_output_put(handle, data->addr);
6802
6803 if (sample_type & PERF_SAMPLE_ID)
6804 perf_output_put(handle, data->id);
6805
6806 if (sample_type & PERF_SAMPLE_STREAM_ID)
6807 perf_output_put(handle, data->stream_id);
6808
6809 if (sample_type & PERF_SAMPLE_CPU)
6810 perf_output_put(handle, data->cpu_entry);
6811
6812 if (sample_type & PERF_SAMPLE_PERIOD)
6813 perf_output_put(handle, data->period);
6814
6815 if (sample_type & PERF_SAMPLE_READ)
6816 perf_output_read(handle, event);
6817
6818 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6819 int size = 1;
6820
6821 size += data->callchain->nr;
6822 size *= sizeof(u64);
6823 __output_copy(handle, data->callchain, size);
6824 }
6825
6826 if (sample_type & PERF_SAMPLE_RAW) {
6827 struct perf_raw_record *raw = data->raw;
6828
6829 if (raw) {
6830 struct perf_raw_frag *frag = &raw->frag;
6831
6832 perf_output_put(handle, raw->size);
6833 do {
6834 if (frag->copy) {
6835 __output_custom(handle, frag->copy,
6836 frag->data, frag->size);
6837 } else {
6838 __output_copy(handle, frag->data,
6839 frag->size);
6840 }
6841 if (perf_raw_frag_last(frag))
6842 break;
6843 frag = frag->next;
6844 } while (1);
6845 if (frag->pad)
6846 __output_skip(handle, NULL, frag->pad);
6847 } else {
6848 struct {
6849 u32 size;
6850 u32 data;
6851 } raw = {
6852 .size = sizeof(u32),
6853 .data = 0,
6854 };
6855 perf_output_put(handle, raw);
6856 }
6857 }
6858
6859 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6860 if (data->br_stack) {
6861 size_t size;
6862
6863 size = data->br_stack->nr
6864 * sizeof(struct perf_branch_entry);
6865
6866 perf_output_put(handle, data->br_stack->nr);
6867 if (perf_sample_save_hw_index(event))
6868 perf_output_put(handle, data->br_stack->hw_idx);
6869 perf_output_copy(handle, data->br_stack->entries, size);
6870 } else {
6871
6872
6873
6874 u64 nr = 0;
6875 perf_output_put(handle, nr);
6876 }
6877 }
6878
6879 if (sample_type & PERF_SAMPLE_REGS_USER) {
6880 u64 abi = data->regs_user.abi;
6881
6882
6883
6884
6885
6886 perf_output_put(handle, abi);
6887
6888 if (abi) {
6889 u64 mask = event->attr.sample_regs_user;
6890 perf_output_sample_regs(handle,
6891 data->regs_user.regs,
6892 mask);
6893 }
6894 }
6895
6896 if (sample_type & PERF_SAMPLE_STACK_USER) {
6897 perf_output_sample_ustack(handle,
6898 data->stack_user_size,
6899 data->regs_user.regs);
6900 }
6901
6902 if (sample_type & PERF_SAMPLE_WEIGHT)
6903 perf_output_put(handle, data->weight);
6904
6905 if (sample_type & PERF_SAMPLE_DATA_SRC)
6906 perf_output_put(handle, data->data_src.val);
6907
6908 if (sample_type & PERF_SAMPLE_TRANSACTION)
6909 perf_output_put(handle, data->txn);
6910
6911 if (sample_type & PERF_SAMPLE_REGS_INTR) {
6912 u64 abi = data->regs_intr.abi;
6913
6914
6915
6916
6917 perf_output_put(handle, abi);
6918
6919 if (abi) {
6920 u64 mask = event->attr.sample_regs_intr;
6921
6922 perf_output_sample_regs(handle,
6923 data->regs_intr.regs,
6924 mask);
6925 }
6926 }
6927
6928 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6929 perf_output_put(handle, data->phys_addr);
6930
6931 if (sample_type & PERF_SAMPLE_CGROUP)
6932 perf_output_put(handle, data->cgroup);
6933
6934 if (sample_type & PERF_SAMPLE_AUX) {
6935 perf_output_put(handle, data->aux_size);
6936
6937 if (data->aux_size)
6938 perf_aux_sample_output(event, handle, data);
6939 }
6940
6941 if (!event->attr.watermark) {
6942 int wakeup_events = event->attr.wakeup_events;
6943
6944 if (wakeup_events) {
6945 struct perf_buffer *rb = handle->rb;
6946 int events = local_inc_return(&rb->events);
6947
6948 if (events >= wakeup_events) {
6949 local_sub(wakeup_events, &rb->events);
6950 local_inc(&rb->wakeup);
6951 }
6952 }
6953 }
6954}
6955
6956static u64 perf_virt_to_phys(u64 virt)
6957{
6958 u64 phys_addr = 0;
6959 struct page *p = NULL;
6960
6961 if (!virt)
6962 return 0;
6963
6964 if (virt >= TASK_SIZE) {
6965
6966 if (virt_addr_valid((void *)(uintptr_t)virt) &&
6967 !(virt >= VMALLOC_START && virt < VMALLOC_END))
6968 phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
6969 } else {
6970
6971
6972
6973
6974
6975
6976
6977 if (current->mm != NULL) {
6978 pagefault_disable();
6979 if (get_user_page_fast_only(virt, 0, &p))
6980 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
6981 pagefault_enable();
6982 }
6983
6984 if (p)
6985 put_page(p);
6986 }
6987
6988 return phys_addr;
6989}
6990
6991static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
6992
6993struct perf_callchain_entry *
6994perf_callchain(struct perf_event *event, struct pt_regs *regs)
6995{
6996 bool kernel = !event->attr.exclude_callchain_kernel;
6997 bool user = !event->attr.exclude_callchain_user;
6998
6999 bool crosstask = event->ctx->task && event->ctx->task != current;
7000 const u32 max_stack = event->attr.sample_max_stack;
7001 struct perf_callchain_entry *callchain;
7002
7003 if (!kernel && !user)
7004 return &__empty_callchain;
7005
7006 callchain = get_perf_callchain(regs, 0, kernel, user,
7007 max_stack, crosstask, true);
7008 return callchain ?: &__empty_callchain;
7009}
7010
7011void perf_prepare_sample(struct perf_event_header *header,
7012 struct perf_sample_data *data,
7013 struct perf_event *event,
7014 struct pt_regs *regs)
7015{
7016 u64 sample_type = event->attr.sample_type;
7017
7018 header->type = PERF_RECORD_SAMPLE;
7019 header->size = sizeof(*header) + event->header_size;
7020
7021 header->misc = 0;
7022 header->misc |= perf_misc_flags(regs);
7023
7024 __perf_event_header__init_id(header, data, event);
7025
7026 if (sample_type & PERF_SAMPLE_IP)
7027 data->ip = perf_instruction_pointer(regs);
7028
7029 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7030 int size = 1;
7031
7032 if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
7033 data->callchain = perf_callchain(event, regs);
7034
7035 size += data->callchain->nr;
7036
7037 header->size += size * sizeof(u64);
7038 }
7039
7040 if (sample_type & PERF_SAMPLE_RAW) {
7041 struct perf_raw_record *raw = data->raw;
7042 int size;
7043
7044 if (raw) {
7045 struct perf_raw_frag *frag = &raw->frag;
7046 u32 sum = 0;
7047
7048 do {
7049 sum += frag->size;
7050 if (perf_raw_frag_last(frag))
7051 break;
7052 frag = frag->next;
7053 } while (1);
7054
7055 size = round_up(sum + sizeof(u32), sizeof(u64));
7056 raw->size = size - sizeof(u32);
7057 frag->pad = raw->size - sum;
7058 } else {
7059 size = sizeof(u64);
7060 }
7061
7062 header->size += size;
7063 }
7064
7065 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7066 int size = sizeof(u64);
7067 if (data->br_stack) {
7068 if (perf_sample_save_hw_index(event))
7069 size += sizeof(u64);
7070
7071 size += data->br_stack->nr
7072 * sizeof(struct perf_branch_entry);
7073 }
7074 header->size += size;
7075 }
7076
7077 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
7078 perf_sample_regs_user(&data->regs_user, regs);
7079
7080 if (sample_type & PERF_SAMPLE_REGS_USER) {
7081
7082 int size = sizeof(u64);
7083
7084 if (data->regs_user.regs) {
7085 u64 mask = event->attr.sample_regs_user;
7086 size += hweight64(mask) * sizeof(u64);
7087 }
7088
7089 header->size += size;
7090 }
7091
7092 if (sample_type & PERF_SAMPLE_STACK_USER) {
7093
7094
7095
7096
7097
7098
7099 u16 stack_size = event->attr.sample_stack_user;
7100 u16 size = sizeof(u64);
7101
7102 stack_size = perf_sample_ustack_size(stack_size, header->size,
7103 data->regs_user.regs);
7104
7105
7106
7107
7108
7109
7110 if (stack_size)
7111 size += sizeof(u64) + stack_size;
7112
7113 data->stack_user_size = stack_size;
7114 header->size += size;
7115 }
7116
7117 if (sample_type & PERF_SAMPLE_REGS_INTR) {
7118
7119 int size = sizeof(u64);
7120
7121 perf_sample_regs_intr(&data->regs_intr, regs);
7122
7123 if (data->regs_intr.regs) {
7124 u64 mask = event->attr.sample_regs_intr;
7125
7126 size += hweight64(mask) * sizeof(u64);
7127 }
7128
7129 header->size += size;
7130 }
7131
7132 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7133 data->phys_addr = perf_virt_to_phys(data->addr);
7134
7135#ifdef CONFIG_CGROUP_PERF
7136 if (sample_type & PERF_SAMPLE_CGROUP) {
7137 struct cgroup *cgrp;
7138
7139
7140 cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7141 data->cgroup = cgroup_id(cgrp);
7142 }
7143#endif
7144
7145 if (sample_type & PERF_SAMPLE_AUX) {
7146 u64 size;
7147
7148 header->size += sizeof(u64);
7149
7150
7151
7152
7153
7154
7155
7156 size = min_t(size_t, U16_MAX - header->size,
7157 event->attr.aux_sample_size);
7158 size = rounddown(size, 8);
7159 size = perf_prepare_sample_aux(event, data, size);
7160
7161 WARN_ON_ONCE(size + header->size > U16_MAX);
7162 header->size += size;
7163 }
7164
7165
7166
7167
7168
7169
7170
7171
7172 WARN_ON_ONCE(header->size & 7);
7173}
7174
7175static __always_inline int
7176__perf_event_output(struct perf_event *event,
7177 struct perf_sample_data *data,
7178 struct pt_regs *regs,
7179 int (*output_begin)(struct perf_output_handle *,
7180 struct perf_sample_data *,
7181 struct perf_event *,
7182 unsigned int))
7183{
7184 struct perf_output_handle handle;
7185 struct perf_event_header header;
7186 int err;
7187
7188
7189 rcu_read_lock();
7190
7191 perf_prepare_sample(&header, data, event, regs);
7192
7193 err = output_begin(&handle, data, event, header.size);
7194 if (err)
7195 goto exit;
7196
7197 perf_output_sample(&handle, &header, data, event);
7198
7199 perf_output_end(&handle);
7200
7201exit:
7202 rcu_read_unlock();
7203 return err;
7204}
7205
7206void
7207perf_event_output_forward(struct perf_event *event,
7208 struct perf_sample_data *data,
7209 struct pt_regs *regs)
7210{
7211 __perf_event_output(event, data, regs, perf_output_begin_forward);
7212}
7213
7214void
7215perf_event_output_backward(struct perf_event *event,
7216 struct perf_sample_data *data,
7217 struct pt_regs *regs)
7218{
7219 __perf_event_output(event, data, regs, perf_output_begin_backward);
7220}
7221
7222int
7223perf_event_output(struct perf_event *event,
7224 struct perf_sample_data *data,
7225 struct pt_regs *regs)
7226{
7227 return __perf_event_output(event, data, regs, perf_output_begin);
7228}
7229
7230
7231
7232
7233
7234struct perf_read_event {
7235 struct perf_event_header header;
7236
7237 u32 pid;
7238 u32 tid;
7239};
7240
7241static void
7242perf_event_read_event(struct perf_event *event,
7243 struct task_struct *task)
7244{
7245 struct perf_output_handle handle;
7246 struct perf_sample_data sample;
7247 struct perf_read_event read_event = {
7248 .header = {
7249 .type = PERF_RECORD_READ,
7250 .misc = 0,
7251 .size = sizeof(read_event) + event->read_size,
7252 },
7253 .pid = perf_event_pid(event, task),
7254 .tid = perf_event_tid(event, task),
7255 };
7256 int ret;
7257
7258 perf_event_header__init_id(&read_event.header, &sample, event);
7259 ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
7260 if (ret)
7261 return;
7262
7263 perf_output_put(&handle, read_event);
7264 perf_output_read(&handle, event);
7265 perf_event__output_id_sample(event, &handle, &sample);
7266
7267 perf_output_end(&handle);
7268}
7269
7270typedef void (perf_iterate_f)(struct perf_event *event, void *data);
7271
7272static void
7273perf_iterate_ctx(struct perf_event_context *ctx,
7274 perf_iterate_f output,
7275 void *data, bool all)
7276{
7277 struct perf_event *event;
7278
7279 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7280 if (!all) {
7281 if (event->state < PERF_EVENT_STATE_INACTIVE)
7282 continue;
7283 if (!event_filter_match(event))
7284 continue;
7285 }
7286
7287 output(event, data);
7288 }
7289}
7290
7291static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
7292{
7293 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
7294 struct perf_event *event;
7295
7296 list_for_each_entry_rcu(event, &pel->list, sb_list) {
7297
7298
7299
7300
7301
7302 if (!smp_load_acquire(&event->ctx))
7303 continue;
7304
7305 if (event->state < PERF_EVENT_STATE_INACTIVE)
7306 continue;
7307 if (!event_filter_match(event))
7308 continue;
7309 output(event, data);
7310 }
7311}
7312
7313
7314
7315
7316
7317
7318
7319static void
7320perf_iterate_sb(perf_iterate_f output, void *data,
7321 struct perf_event_context *task_ctx)
7322{
7323 struct perf_event_context *ctx;
7324 int ctxn;
7325
7326 rcu_read_lock();
7327 preempt_disable();
7328
7329
7330
7331
7332
7333
7334 if (task_ctx) {
7335 perf_iterate_ctx(task_ctx, output, data, false);
7336 goto done;
7337 }
7338
7339 perf_iterate_sb_cpu(output, data);
7340
7341 for_each_task_context_nr(ctxn) {
7342 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7343 if (ctx)
7344 perf_iterate_ctx(ctx, output, data, false);
7345 }
7346done:
7347 preempt_enable();
7348 rcu_read_unlock();
7349}
7350
7351
7352
7353
7354
7355static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
7356{
7357 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7358 struct perf_addr_filter *filter;
7359 unsigned int restart = 0, count = 0;
7360 unsigned long flags;
7361
7362 if (!has_addr_filter(event))
7363 return;
7364
7365 raw_spin_lock_irqsave(&ifh->lock, flags);
7366 list_for_each_entry(filter, &ifh->list, entry) {
7367 if (filter->path.dentry) {
7368 event->addr_filter_ranges[count].start = 0;
7369 event->addr_filter_ranges[count].size = 0;
7370 restart++;
7371 }
7372
7373 count++;
7374 }
7375
7376 if (restart)
7377 event->addr_filters_gen++;
7378 raw_spin_unlock_irqrestore(&ifh->lock, flags);
7379
7380 if (restart)
7381 perf_event_stop(event, 1);
7382}
7383
7384void perf_event_exec(void)
7385{
7386 struct perf_event_context *ctx;
7387 int ctxn;
7388
7389 rcu_read_lock();
7390 for_each_task_context_nr(ctxn) {
7391 ctx = current->perf_event_ctxp[ctxn];
7392 if (!ctx)
7393 continue;
7394
7395 perf_event_enable_on_exec(ctxn);
7396
7397 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
7398 true);
7399 }
7400 rcu_read_unlock();
7401}
7402
7403struct remote_output {
7404 struct perf_buffer *rb;
7405 int err;
7406};
7407
7408static void __perf_event_output_stop(struct perf_event *event, void *data)
7409{
7410 struct perf_event *parent = event->parent;
7411 struct remote_output *ro = data;
7412 struct perf_buffer *rb = ro->rb;
7413 struct stop_event_data sd = {
7414 .event = event,
7415 };
7416
7417 if (!has_aux(event))
7418 return;
7419
7420 if (!parent)
7421 parent = event;
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
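/*
 * With inheritance the parent owns the ring buffer while the child does
 * the writing, so compare against the parent's rb pointer when deciding
 * whether this event needs to be stopped.
 */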
7433 if (rcu_dereference(parent->rb) == rb)
7434 ro->err = __perf_event_stop(&sd);
7435}
7436
7437static int __perf_pmu_output_stop(void *info)
7438{
7439 struct perf_event *event = info;
7440 struct pmu *pmu = event->ctx->pmu;
7441 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7442 struct remote_output ro = {
7443 .rb = event->rb,
7444 };
7445
7446 rcu_read_lock();
7447 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
7448 if (cpuctx->task_ctx)
7449 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
7450 &ro, false);
7451 rcu_read_unlock();
7452
7453 return ro.err;
7454}
7455
7456static void perf_pmu_output_stop(struct perf_event *event)
7457{
7458 struct perf_event *iter;
7459 int err, cpu;
7460
7461restart:
7462 rcu_read_lock();
7463 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
7464
7465
7466
7467
7468
7469
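/*
 * Per-CPU events have a fixed CPU to signal; per-task events (cpu == -1)
 * are signalled on the CPU they are currently running on, or skipped
 * entirely if they are not running anywhere.
 */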
7470 cpu = iter->cpu;
7471 if (cpu == -1)
7472 cpu = READ_ONCE(iter->oncpu);
7473
7474 if (cpu == -1)
7475 continue;
7476
7477 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
7478 if (err == -EAGAIN) {
7479 rcu_read_unlock();
7480 goto restart;
7481 }
7482 }
7483 rcu_read_unlock();
7484}
7485
7486
7487
7488
7489
7490
7491
7492struct perf_task_event {
7493 struct task_struct *task;
7494 struct perf_event_context *task_ctx;
7495
7496 struct {
7497 struct perf_event_header header;
7498
7499 u32 pid;
7500 u32 ppid;
7501 u32 tid;
7502 u32 ptid;
7503 u64 time;
7504 } event_id;
7505};
7506
7507static int perf_event_task_match(struct perf_event *event)
7508{
7509 return event->attr.comm || event->attr.mmap ||
7510 event->attr.mmap2 || event->attr.mmap_data ||
7511 event->attr.task;
7512}
7513
7514static void perf_event_task_output(struct perf_event *event,
7515 void *data)
7516{
7517 struct perf_task_event *task_event = data;
7518 struct perf_output_handle handle;
7519 struct perf_sample_data sample;
7520 struct task_struct *task = task_event->task;
7521 int ret, size = task_event->event_id.header.size;
7522
7523 if (!perf_event_task_match(event))
7524 return;
7525
7526 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
7527
7528 ret = perf_output_begin(&handle, &sample, event,
7529 task_event->event_id.header.size);
7530 if (ret)
7531 goto out;
7532
7533 task_event->event_id.pid = perf_event_pid(event, task);
7534 task_event->event_id.tid = perf_event_tid(event, task);
7535
7536 if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
7537 task_event->event_id.ppid = perf_event_pid(event,
7538 task->real_parent);
7539 task_event->event_id.ptid = perf_event_pid(event,
7540 task->real_parent);
7541 } else {
7542 task_event->event_id.ppid = perf_event_pid(event, current);
7543 task_event->event_id.ptid = perf_event_tid(event, current);
7544 }
7545
7546 task_event->event_id.time = perf_event_clock(event);
7547
7548 perf_output_put(&handle, task_event->event_id);
7549
7550 perf_event__output_id_sample(event, &handle, &sample);
7551
7552 perf_output_end(&handle);
7553out:
7554 task_event->event_id.header.size = size;
7555}
7556
7557static void perf_event_task(struct task_struct *task,
7558 struct perf_event_context *task_ctx,
7559 int new)
7560{
7561 struct perf_task_event task_event;
7562
7563 if (!atomic_read(&nr_comm_events) &&
7564 !atomic_read(&nr_mmap_events) &&
7565 !atomic_read(&nr_task_events))
7566 return;
7567
7568 task_event = (struct perf_task_event){
7569 .task = task,
7570 .task_ctx = task_ctx,
7571 .event_id = {
7572 .header = {
7573 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
7574 .misc = 0,
7575 .size = sizeof(task_event.event_id),
7576 },
7577
7578
7579
7580
7581
7582 },
7583 };
7584
7585 perf_iterate_sb(perf_event_task_output,
7586 &task_event,
7587 task_ctx);
7588}
7589
7590void perf_event_fork(struct task_struct *task)
7591{
7592 perf_event_task(task, NULL, 1);
7593 perf_event_namespaces(task);
7594}
7595
7596
7597
7598
7599
7600struct perf_comm_event {
7601 struct task_struct *task;
7602 char *comm;
7603 int comm_size;
7604
7605 struct {
7606 struct perf_event_header header;
7607
7608 u32 pid;
7609 u32 tid;
7610 } event_id;
7611};
7612
7613static int perf_event_comm_match(struct perf_event *event)
7614{
7615 return event->attr.comm;
7616}
7617
7618static void perf_event_comm_output(struct perf_event *event,
7619 void *data)
7620{
7621 struct perf_comm_event *comm_event = data;
7622 struct perf_output_handle handle;
7623 struct perf_sample_data sample;
7624 int size = comm_event->event_id.header.size;
7625 int ret;
7626
7627 if (!perf_event_comm_match(event))
7628 return;
7629
7630 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7631 ret = perf_output_begin(&handle, &sample, event,
7632 comm_event->event_id.header.size);
7633
7634 if (ret)
7635 goto out;
7636
7637 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
7638 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
7639
7640 perf_output_put(&handle, comm_event->event_id);
7641 __output_copy(&handle, comm_event->comm,
7642 comm_event->comm_size);
7643
7644 perf_event__output_id_sample(event, &handle, &sample);
7645
7646 perf_output_end(&handle);
7647out:
7648 comm_event->event_id.header.size = size;
7649}
7650
7651static void perf_event_comm_event(struct perf_comm_event *comm_event)
7652{
7653 char comm[TASK_COMM_LEN];
7654 unsigned int size;
7655
7656 memset(comm, 0, sizeof(comm));
7657 strlcpy(comm, comm_event->task->comm, sizeof(comm));
7658 size = ALIGN(strlen(comm)+1, sizeof(u64));
7659
7660 comm_event->comm = comm;
7661 comm_event->comm_size = size;
7662
7663 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
7664
7665 perf_iterate_sb(perf_event_comm_output,
7666 comm_event,
7667 NULL);
7668}
7669
7670void perf_event_comm(struct task_struct *task, bool exec)
7671{
7672 struct perf_comm_event comm_event;
7673
7674 if (!atomic_read(&nr_comm_events))
7675 return;
7676
7677 comm_event = (struct perf_comm_event){
7678 .task = task,
7679
7680
7681 .event_id = {
7682 .header = {
7683 .type = PERF_RECORD_COMM,
7684 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
7685
7686 },
7687
7688
7689 },
7690 };
7691
7692 perf_event_comm_event(&comm_event);
7693}
7694
7695
7696
7697
7698
7699struct perf_namespaces_event {
7700 struct task_struct *task;
7701
7702 struct {
7703 struct perf_event_header header;
7704
7705 u32 pid;
7706 u32 tid;
7707 u64 nr_namespaces;
7708 struct perf_ns_link_info link_info[NR_NAMESPACES];
7709 } event_id;
7710};
7711
7712static int perf_event_namespaces_match(struct perf_event *event)
7713{
7714 return event->attr.namespaces;
7715}
7716
7717static void perf_event_namespaces_output(struct perf_event *event,
7718 void *data)
7719{
7720 struct perf_namespaces_event *namespaces_event = data;
7721 struct perf_output_handle handle;
7722 struct perf_sample_data sample;
7723 u16 header_size = namespaces_event->event_id.header.size;
7724 int ret;
7725
7726 if (!perf_event_namespaces_match(event))
7727 return;
7728
7729 perf_event_header__init_id(&namespaces_event->event_id.header,
7730 &sample, event);
7731 ret = perf_output_begin(&handle, &sample, event,
7732 namespaces_event->event_id.header.size);
7733 if (ret)
7734 goto out;
7735
7736 namespaces_event->event_id.pid = perf_event_pid(event,
7737 namespaces_event->task);
7738 namespaces_event->event_id.tid = perf_event_tid(event,
7739 namespaces_event->task);
7740
7741 perf_output_put(&handle, namespaces_event->event_id);
7742
7743 perf_event__output_id_sample(event, &handle, &sample);
7744
7745 perf_output_end(&handle);
7746out:
7747 namespaces_event->event_id.header.size = header_size;
7748}
7749
7750static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
7751 struct task_struct *task,
7752 const struct proc_ns_operations *ns_ops)
7753{
7754 struct path ns_path;
7755 struct inode *ns_inode;
7756 int error;
7757
7758 error = ns_get_path(&ns_path, task, ns_ops);
7759 if (!error) {
7760 ns_inode = ns_path.dentry->d_inode;
7761 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
7762 ns_link_info->ino = ns_inode->i_ino;
7763 path_put(&ns_path);
7764 }
7765}
7766
7767void perf_event_namespaces(struct task_struct *task)
7768{
7769 struct perf_namespaces_event namespaces_event;
7770 struct perf_ns_link_info *ns_link_info;
7771
7772 if (!atomic_read(&nr_namespaces_events))
7773 return;
7774
7775 namespaces_event = (struct perf_namespaces_event){
7776 .task = task,
7777 .event_id = {
7778 .header = {
7779 .type = PERF_RECORD_NAMESPACES,
7780 .misc = 0,
7781 .size = sizeof(namespaces_event.event_id),
7782 },
7783
7784
7785 .nr_namespaces = NR_NAMESPACES,
7786
7787 },
7788 };
7789
7790 ns_link_info = namespaces_event.event_id.link_info;
7791
7792 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
7793 task, &mntns_operations);
7794
7795#ifdef CONFIG_USER_NS
7796 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
7797 task, &userns_operations);
7798#endif
7799#ifdef CONFIG_NET_NS
7800 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
7801 task, &netns_operations);
7802#endif
7803#ifdef CONFIG_UTS_NS
7804 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
7805 task, &utsns_operations);
7806#endif
7807#ifdef CONFIG_IPC_NS
7808 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
7809 task, &ipcns_operations);
7810#endif
7811#ifdef CONFIG_PID_NS
7812 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
7813 task, &pidns_operations);
7814#endif
7815#ifdef CONFIG_CGROUPS
7816 perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
7817 task, &cgroupns_operations);
7818#endif
7819
7820 perf_iterate_sb(perf_event_namespaces_output,
7821 &namespaces_event,
7822 NULL);
7823}
7824
7825
7826
7827
7828#ifdef CONFIG_CGROUP_PERF
7829
7830struct perf_cgroup_event {
7831 char *path;
7832 int path_size;
7833 struct {
7834 struct perf_event_header header;
7835 u64 id;
7836 char path[];
7837 } event_id;
7838};
7839
7840static int perf_event_cgroup_match(struct perf_event *event)
7841{
7842 return event->attr.cgroup;
7843}
7844
7845static void perf_event_cgroup_output(struct perf_event *event, void *data)
7846{
7847 struct perf_cgroup_event *cgroup_event = data;
7848 struct perf_output_handle handle;
7849 struct perf_sample_data sample;
7850 u16 header_size = cgroup_event->event_id.header.size;
7851 int ret;
7852
7853 if (!perf_event_cgroup_match(event))
7854 return;
7855
7856 perf_event_header__init_id(&cgroup_event->event_id.header,
7857 &sample, event);
7858 ret = perf_output_begin(&handle, &sample, event,
7859 cgroup_event->event_id.header.size);
7860 if (ret)
7861 goto out;
7862
7863 perf_output_put(&handle, cgroup_event->event_id);
7864 __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
7865
7866 perf_event__output_id_sample(event, &handle, &sample);
7867
7868 perf_output_end(&handle);
7869out:
7870 cgroup_event->event_id.header.size = header_size;
7871}
7872
7873static void perf_event_cgroup(struct cgroup *cgrp)
7874{
7875 struct perf_cgroup_event cgroup_event;
7876 char path_enomem[16] = "//enomem";
7877 char *pathname;
7878 size_t size;
7879
7880 if (!atomic_read(&nr_cgroup_events))
7881 return;
7882
7883 cgroup_event = (struct perf_cgroup_event){
7884 .event_id = {
7885 .header = {
7886 .type = PERF_RECORD_CGROUP,
7887 .misc = 0,
7888 .size = sizeof(cgroup_event.event_id),
7889 },
7890 .id = cgroup_id(cgrp),
7891 },
7892 };
7893
7894 pathname = kmalloc(PATH_MAX, GFP_KERNEL);
7895 if (pathname == NULL) {
7896 cgroup_event.path = path_enomem;
7897 } else {
7898 /* just to be sure to have enough space for the alignment padding below */
7899 cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
7900 cgroup_event.path = pathname;
7901 }
7902
7903 /*
7904 * Since the output buffer works in 8 byte units we need to align the
7905 * string size to a multiple of 8. The tail end must be zeroed so we
7906 * don't leak random bits to userspace.
7907 */
7908 size = strlen(cgroup_event.path) + 1;
7909 while (!IS_ALIGNED(size, sizeof(u64)))
7910 cgroup_event.path[size++] = '\0';
7911
7912 cgroup_event.event_id.header.size += size;
7913 cgroup_event.path_size = size;
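 /*
 * Worked example (illustrative, not from the original source): for the
 * path "/foo", strlen() + 1 = 5, so the loop above appends three more
 * NUL bytes and size becomes 8; header.size grows by those 8 bytes and
 * path_size tells the output callback how much of the buffer to copy.
 */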
7914
7915 perf_iterate_sb(perf_event_cgroup_output,
7916 &cgroup_event,
7917 NULL);
7918
7919 kfree(pathname);
7920}
7921
7922#endif
7923
7924
7925
7926
7927
7928struct perf_mmap_event {
7929 struct vm_area_struct *vma;
7930
7931 const char *file_name;
7932 int file_size;
7933 int maj, min;
7934 u64 ino;
7935 u64 ino_generation;
7936 u32 prot, flags;
7937
7938 struct {
7939 struct perf_event_header header;
7940
7941 u32 pid;
7942 u32 tid;
7943 u64 start;
7944 u64 len;
7945 u64 pgoff;
7946 } event_id;
7947};
7948
7949static int perf_event_mmap_match(struct perf_event *event,
7950 void *data)
7951{
7952 struct perf_mmap_event *mmap_event = data;
7953 struct vm_area_struct *vma = mmap_event->vma;
7954 int executable = vma->vm_flags & VM_EXEC;
7955
7956 return (!executable && event->attr.mmap_data) ||
7957 (executable && (event->attr.mmap || event->attr.mmap2));
7958}
7959
7960static void perf_event_mmap_output(struct perf_event *event,
7961 void *data)
7962{
7963 struct perf_mmap_event *mmap_event = data;
7964 struct perf_output_handle handle;
7965 struct perf_sample_data sample;
7966 int size = mmap_event->event_id.header.size;
7967 u32 type = mmap_event->event_id.header.type;
7968 int ret;
7969
7970 if (!perf_event_mmap_match(event, data))
7971 return;
7972
7973 if (event->attr.mmap2) {
7974 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
7975 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
7976 mmap_event->event_id.header.size += sizeof(mmap_event->min);
7977 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
7978 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
7979 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
7980 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
7981 }
7982
7983 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
7984 ret = perf_output_begin(&handle, &sample, event,
7985 mmap_event->event_id.header.size);
7986 if (ret)
7987 goto out;
7988
7989 mmap_event->event_id.pid = perf_event_pid(event, current);
7990 mmap_event->event_id.tid = perf_event_tid(event, current);
7991
7992 perf_output_put(&handle, mmap_event->event_id);
7993
7994 if (event->attr.mmap2) {
7995 perf_output_put(&handle, mmap_event->maj);
7996 perf_output_put(&handle, mmap_event->min);
7997 perf_output_put(&handle, mmap_event->ino);
7998 perf_output_put(&handle, mmap_event->ino_generation);
7999 perf_output_put(&handle, mmap_event->prot);
8000 perf_output_put(&handle, mmap_event->flags);
8001 }
8002
8003 __output_copy(&handle, mmap_event->file_name,
8004 mmap_event->file_size);
8005
8006 perf_event__output_id_sample(event, &handle, &sample);
8007
8008 perf_output_end(&handle);
8009out:
8010 mmap_event->event_id.header.size = size;
8011 mmap_event->event_id.header.type = type;
8012}
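/*
 * Editorial note: PERF_RECORD_MMAP2 extends PERF_RECORD_MMAP with the
 * maj/min/ino/ino_generation/prot/flags fields emitted between event_id
 * and the file name, which is why the header size is grown field by field
 * above and both size and type are restored from the saved values on the
 * way out.
 */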
8013
8014static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
8015{
8016 struct vm_area_struct *vma = mmap_event->vma;
8017 struct file *file = vma->vm_file;
8018 int maj = 0, min = 0;
8019 u64 ino = 0, gen = 0;
8020 u32 prot = 0, flags = 0;
8021 unsigned int size;
8022 char tmp[16];
8023 char *buf = NULL;
8024 char *name;
8025
8026 if (vma->vm_flags & VM_READ)
8027 prot |= PROT_READ;
8028 if (vma->vm_flags & VM_WRITE)
8029 prot |= PROT_WRITE;
8030 if (vma->vm_flags & VM_EXEC)
8031 prot |= PROT_EXEC;
8032
8033 if (vma->vm_flags & VM_MAYSHARE)
8034 flags = MAP_SHARED;
8035 else
8036 flags = MAP_PRIVATE;
8037
8038 if (vma->vm_flags & VM_DENYWRITE)
8039 flags |= MAP_DENYWRITE;
8040 if (vma->vm_flags & VM_MAYEXEC)
8041 flags |= MAP_EXECUTABLE;
8042 if (vma->vm_flags & VM_LOCKED)
8043 flags |= MAP_LOCKED;
8044 if (is_vm_hugetlb_page(vma))
8045 flags |= MAP_HUGETLB;
8046
8047 if (file) {
8048 struct inode *inode;
8049 dev_t dev;
8050
8051 buf = kmalloc(PATH_MAX, GFP_KERNEL);
8052 if (!buf) {
8053 name = "//enomem";
8054 goto cpy_name;
8055 }
8056
8057 /*
8058 * d_path() works from the end of the buffer backwards, so leave
8059 * enough zero bytes after the string for the 64-bit alignment done later.
8060 */
8061 name = file_path(file, buf, PATH_MAX - sizeof(u64));
8062 if (IS_ERR(name)) {
8063 name = "//toolong";
8064 goto cpy_name;
8065 }
8066 inode = file_inode(vma->vm_file);
8067 dev = inode->i_sb->s_dev;
8068 ino = inode->i_ino;
8069 gen = inode->i_generation;
8070 maj = MAJOR(dev);
8071 min = MINOR(dev);
8072
8073 goto got_name;
8074 } else {
8075 if (vma->vm_ops && vma->vm_ops->name) {
8076 name = (char *) vma->vm_ops->name(vma);
8077 if (name)
8078 goto cpy_name;
8079 }
8080
8081 name = (char *)arch_vma_name(vma);
8082 if (name)
8083 goto cpy_name;
8084
8085 if (vma->vm_start <= vma->vm_mm->start_brk &&
8086 vma->vm_end >= vma->vm_mm->brk) {
8087 name = "[heap]";
8088 goto cpy_name;
8089 }
8090 if (vma->vm_start <= vma->vm_mm->start_stack &&
8091 vma->vm_end >= vma->vm_mm->start_stack) {
8092 name = "[stack]";
8093 goto cpy_name;
8094 }
8095
8096 name = "//anon";
8097 goto cpy_name;
8098 }
8099
8100cpy_name:
8101 strlcpy(tmp, name, sizeof(tmp));
8102 name = tmp;
8103got_name:
8104 /*
8105 * Since the output buffer works in 8 byte units we need to align the
8106 * string size to a multiple of 8. The tail end must be zeroed so we
8107 * don't leak random bits to userspace.
8108 */
8109 size = strlen(name)+1;
8110 while (!IS_ALIGNED(size, sizeof(u64)))
8111 name[size++] = '\0';
8112
8113 mmap_event->file_name = name;
8114 mmap_event->file_size = size;
8115 mmap_event->maj = maj;
8116 mmap_event->min = min;
8117 mmap_event->ino = ino;
8118 mmap_event->ino_generation = gen;
8119 mmap_event->prot = prot;
8120 mmap_event->flags = flags;
8121
8122 if (!(vma->vm_flags & VM_EXEC))
8123 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
8124
8125 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
8126
8127 perf_iterate_sb(perf_event_mmap_output,
8128 mmap_event,
8129 NULL);
8130
8131 kfree(buf);
8132}
8133
8134
8135
8136/* Check whether inode and address range match filter criteria. */
8137static bool perf_addr_filter_match(struct perf_addr_filter *filter,
8138 struct file *file, unsigned long offset,
8139 unsigned long size)
8140{
8141 /* d_inode(NULL) won't be equal to any mapped user-space file */
8142 if (!filter->path.dentry)
8143 return false;
8144
8145 if (d_inode(filter->path.dentry) != file_inode(file))
8146 return false;
8147
8148 if (filter->offset > offset + size)
8149 return false;
8150
8151 if (filter->offset + filter->size < offset)
8152 return false;
8153
8154 return true;
8155}
8156
8157static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
8158 struct vm_area_struct *vma,
8159 struct perf_addr_filter_range *fr)
8160{
8161 unsigned long vma_size = vma->vm_end - vma->vm_start;
8162 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8163 struct file *file = vma->vm_file;
8164
8165 if (!perf_addr_filter_match(filter, file, off, vma_size))
8166 return false;
8167
8168 if (filter->offset < off) {
8169 fr->start = vma->vm_start;
8170 fr->size = min(vma_size, filter->size - (off - filter->offset));
8171 } else {
8172 fr->start = vma->vm_start + filter->offset - off;
8173 fr->size = min(vma->vm_end - fr->start, filter->size);
8174 }
8175
8176 return true;
8177}
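/*
 * Illustrative example (values invented for illustration): with a filter
 * of offset 0x3000 and size 0x1000 on some object, and a VMA mapping file
 * offset 0x2000 at vm_start 0x7f0000002000 for 0x4000 bytes, the second
 * branch above yields fr->start = vm_start + 0x3000 - 0x2000 =
 * 0x7f0000003000 and fr->size = min(vm_end - fr->start, 0x1000) = 0x1000.
 */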
8178
8179static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
8180{
8181 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8182 struct vm_area_struct *vma = data;
8183 struct perf_addr_filter *filter;
8184 unsigned int restart = 0, count = 0;
8185 unsigned long flags;
8186
8187 if (!has_addr_filter(event))
8188 return;
8189
8190 if (!vma->vm_file)
8191 return;
8192
8193 raw_spin_lock_irqsave(&ifh->lock, flags);
8194 list_for_each_entry(filter, &ifh->list, entry) {
8195 if (perf_addr_filter_vma_adjust(filter, vma,
8196 &event->addr_filter_ranges[count]))
8197 restart++;
8198
8199 count++;
8200 }
8201
8202 if (restart)
8203 event->addr_filters_gen++;
8204 raw_spin_unlock_irqrestore(&ifh->lock, flags);
8205
8206 if (restart)
8207 perf_event_stop(event, 1);
8208}
8209
8210
8211
8212/* Adjust all task's events' address filters to the newly mapped vma. */
8213static void perf_addr_filters_adjust(struct vm_area_struct *vma)
8214{
8215 struct perf_event_context *ctx;
8216 int ctxn;
8217
8218 /*
8219 * Data tracing isn't supported yet and as such there is no need
8220 * to keep track of anything that isn't related to executable code:
8221 */
8222 if (!(vma->vm_flags & VM_EXEC))
8223 return;
8224
8225 rcu_read_lock();
8226 for_each_task_context_nr(ctxn) {
8227 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
8228 if (!ctx)
8229 continue;
8230
8231 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
8232 }
8233 rcu_read_unlock();
8234}
8235
8236void perf_event_mmap(struct vm_area_struct *vma)
8237{
8238 struct perf_mmap_event mmap_event;
8239
8240 if (!atomic_read(&nr_mmap_events))
8241 return;
8242
8243 mmap_event = (struct perf_mmap_event){
8244 .vma = vma,
8245
8246
8247 .event_id = {
8248 .header = {
8249 .type = PERF_RECORD_MMAP,
8250 .misc = PERF_RECORD_MISC_USER,
8251
8252 },
8253
8254
8255 .start = vma->vm_start,
8256 .len = vma->vm_end - vma->vm_start,
8257 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
8258 },
8259 /* .maj            (attr_mmap2 only) */
8260 /* .min            (attr_mmap2 only) */
8261 /* .ino            (attr_mmap2 only) */
8262 /* .ino_generation (attr_mmap2 only) */
8263 /* .prot           (attr_mmap2 only) */
8264 /* .flags          (attr_mmap2 only) */
8265 };
8266
8267 perf_addr_filters_adjust(vma);
8268 perf_event_mmap_event(&mmap_event);
8269}
8270
8271void perf_event_aux_event(struct perf_event *event, unsigned long head,
8272 unsigned long size, u64 flags)
8273{
8274 struct perf_output_handle handle;
8275 struct perf_sample_data sample;
8276 struct perf_aux_event {
8277 struct perf_event_header header;
8278 u64 offset;
8279 u64 size;
8280 u64 flags;
8281 } rec = {
8282 .header = {
8283 .type = PERF_RECORD_AUX,
8284 .misc = 0,
8285 .size = sizeof(rec),
8286 },
8287 .offset = head,
8288 .size = size,
8289 .flags = flags,
8290 };
8291 int ret;
8292
8293 perf_event_header__init_id(&rec.header, &sample, event);
8294 ret = perf_output_begin(&handle, &sample, event, rec.header.size);
8295
8296 if (ret)
8297 return;
8298
8299 perf_output_put(&handle, rec);
8300 perf_event__output_id_sample(event, &handle, &sample);
8301
8302 perf_output_end(&handle);
8303}
8304
8305
8306
8307
8308void perf_log_lost_samples(struct perf_event *event, u64 lost)
8309{
8310 struct perf_output_handle handle;
8311 struct perf_sample_data sample;
8312 int ret;
8313
8314 struct {
8315 struct perf_event_header header;
8316 u64 lost;
8317 } lost_samples_event = {
8318 .header = {
8319 .type = PERF_RECORD_LOST_SAMPLES,
8320 .misc = 0,
8321 .size = sizeof(lost_samples_event),
8322 },
8323 .lost = lost,
8324 };
8325
8326 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
8327
8328 ret = perf_output_begin(&handle, &sample, event,
8329 lost_samples_event.header.size);
8330 if (ret)
8331 return;
8332
8333 perf_output_put(&handle, lost_samples_event);
8334 perf_event__output_id_sample(event, &handle, &sample);
8335 perf_output_end(&handle);
8336}
8337
8338
8339
8340
8341
8342struct perf_switch_event {
8343 struct task_struct *task;
8344 struct task_struct *next_prev;
8345
8346 struct {
8347 struct perf_event_header header;
8348 u32 next_prev_pid;
8349 u32 next_prev_tid;
8350 } event_id;
8351};
8352
8353static int perf_event_switch_match(struct perf_event *event)
8354{
8355 return event->attr.context_switch;
8356}
8357
8358static void perf_event_switch_output(struct perf_event *event, void *data)
8359{
8360 struct perf_switch_event *se = data;
8361 struct perf_output_handle handle;
8362 struct perf_sample_data sample;
8363 int ret;
8364
8365 if (!perf_event_switch_match(event))
8366 return;
8367
8368 /* Only CPU-wide events are allowed to see next/prev pid/tid */
8369 if (event->ctx->task) {
8370 se->event_id.header.type = PERF_RECORD_SWITCH;
8371 se->event_id.header.size = sizeof(se->event_id.header);
8372 } else {
8373 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
8374 se->event_id.header.size = sizeof(se->event_id);
8375 se->event_id.next_prev_pid =
8376 perf_event_pid(event, se->next_prev);
8377 se->event_id.next_prev_tid =
8378 perf_event_tid(event, se->next_prev);
8379 }
8380
8381 perf_event_header__init_id(&se->event_id.header, &sample, event);
8382
8383 ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
8384 if (ret)
8385 return;
8386
8387 if (event->ctx->task)
8388 perf_output_put(&handle, se->event_id.header);
8389 else
8390 perf_output_put(&handle, se->event_id);
8391
8392 perf_event__output_id_sample(event, &handle, &sample);
8393
8394 perf_output_end(&handle);
8395}
8396
8397static void perf_event_switch(struct task_struct *task,
8398 struct task_struct *next_prev, bool sched_in)
8399{
8400 struct perf_switch_event switch_event;
8401
8402 /* N.B. caller checks nr_switch_events != 0 */
8403
8404 switch_event = (struct perf_switch_event){
8405 .task = task,
8406 .next_prev = next_prev,
8407 .event_id = {
8408 .header = {
8409
8410 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
8411
8412 },
8413
8414
8415 },
8416 };
8417
8418 if (!sched_in && task->state == TASK_RUNNING)
8419 switch_event.event_id.header.misc |=
8420 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
8421
8422 perf_iterate_sb(perf_event_switch_output,
8423 &switch_event,
8424 NULL);
8425}
8426
8427
8428
8429
8430
8431static void perf_log_throttle(struct perf_event *event, int enable)
8432{
8433 struct perf_output_handle handle;
8434 struct perf_sample_data sample;
8435 int ret;
8436
8437 struct {
8438 struct perf_event_header header;
8439 u64 time;
8440 u64 id;
8441 u64 stream_id;
8442 } throttle_event = {
8443 .header = {
8444 .type = PERF_RECORD_THROTTLE,
8445 .misc = 0,
8446 .size = sizeof(throttle_event),
8447 },
8448 .time = perf_event_clock(event),
8449 .id = primary_event_id(event),
8450 .stream_id = event->id,
8451 };
8452
8453 if (enable)
8454 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
8455
8456 perf_event_header__init_id(&throttle_event.header, &sample, event);
8457
8458 ret = perf_output_begin(&handle, &sample, event,
8459 throttle_event.header.size);
8460 if (ret)
8461 return;
8462
8463 perf_output_put(&handle, throttle_event);
8464 perf_event__output_id_sample(event, &handle, &sample);
8465 perf_output_end(&handle);
8466}
8467
8468
8469
8470
8471
8472struct perf_ksymbol_event {
8473 const char *name;
8474 int name_len;
8475 struct {
8476 struct perf_event_header header;
8477 u64 addr;
8478 u32 len;
8479 u16 ksym_type;
8480 u16 flags;
8481 } event_id;
8482};
8483
8484static int perf_event_ksymbol_match(struct perf_event *event)
8485{
8486 return event->attr.ksymbol;
8487}
8488
8489static void perf_event_ksymbol_output(struct perf_event *event, void *data)
8490{
8491 struct perf_ksymbol_event *ksymbol_event = data;
8492 struct perf_output_handle handle;
8493 struct perf_sample_data sample;
8494 int ret;
8495
8496 if (!perf_event_ksymbol_match(event))
8497 return;
8498
8499 perf_event_header__init_id(&ksymbol_event->event_id.header,
8500 &sample, event);
8501 ret = perf_output_begin(&handle, &sample, event,
8502 ksymbol_event->event_id.header.size);
8503 if (ret)
8504 return;
8505
8506 perf_output_put(&handle, ksymbol_event->event_id);
8507 __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
8508 perf_event__output_id_sample(event, &handle, &sample);
8509
8510 perf_output_end(&handle);
8511}
8512
8513void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
8514 const char *sym)
8515{
8516 struct perf_ksymbol_event ksymbol_event;
8517 char name[KSYM_NAME_LEN];
8518 u16 flags = 0;
8519 int name_len;
8520
8521 if (!atomic_read(&nr_ksymbol_events))
8522 return;
8523
8524 if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
8525 ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
8526 goto err;
8527
8528 strlcpy(name, sym, KSYM_NAME_LEN);
8529 name_len = strlen(name) + 1;
8530 while (!IS_ALIGNED(name_len, sizeof(u64)))
8531 name[name_len++] = '\0';
8532 BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
8533
8534 if (unregister)
8535 flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
8536
8537 ksymbol_event = (struct perf_ksymbol_event){
8538 .name = name,
8539 .name_len = name_len,
8540 .event_id = {
8541 .header = {
8542 .type = PERF_RECORD_KSYMBOL,
8543 .size = sizeof(ksymbol_event.event_id) +
8544 name_len,
8545 },
8546 .addr = addr,
8547 .len = len,
8548 .ksym_type = ksym_type,
8549 .flags = flags,
8550 },
8551 };
8552
8553 perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
8554 return;
8555err:
8556 WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
8557}
8558
8559
8560
8561
8562
8563struct perf_bpf_event {
8564 struct bpf_prog *prog;
8565 struct {
8566 struct perf_event_header header;
8567 u16 type;
8568 u16 flags;
8569 u32 id;
8570 u8 tag[BPF_TAG_SIZE];
8571 } event_id;
8572};
8573
8574static int perf_event_bpf_match(struct perf_event *event)
8575{
8576 return event->attr.bpf_event;
8577}
8578
8579static void perf_event_bpf_output(struct perf_event *event, void *data)
8580{
8581 struct perf_bpf_event *bpf_event = data;
8582 struct perf_output_handle handle;
8583 struct perf_sample_data sample;
8584 int ret;
8585
8586 if (!perf_event_bpf_match(event))
8587 return;
8588
8589 perf_event_header__init_id(&bpf_event->event_id.header,
8590 &sample, event);
8591 ret = perf_output_begin(&handle, &sample, event,
8592 bpf_event->event_id.header.size);
8593 if (ret)
8594 return;
8595
8596 perf_output_put(&handle, bpf_event->event_id);
8597 perf_event__output_id_sample(event, &handle, &sample);
8598
8599 perf_output_end(&handle);
8600}
8601
8602static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
8603 enum perf_bpf_event_type type)
8604{
8605 bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
8606 int i;
8607
8608 if (prog->aux->func_cnt == 0) {
8609 perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
8610 (u64)(unsigned long)prog->bpf_func,
8611 prog->jited_len, unregister,
8612 prog->aux->ksym.name);
8613 } else {
8614 for (i = 0; i < prog->aux->func_cnt; i++) {
8615 struct bpf_prog *subprog = prog->aux->func[i];
8616
8617 perf_event_ksymbol(
8618 PERF_RECORD_KSYMBOL_TYPE_BPF,
8619 (u64)(unsigned long)subprog->bpf_func,
8620 subprog->jited_len, unregister,
8621 prog->aux->ksym.name);
8622 }
8623 }
8624}
8625
8626void perf_event_bpf_event(struct bpf_prog *prog,
8627 enum perf_bpf_event_type type,
8628 u16 flags)
8629{
8630 struct perf_bpf_event bpf_event;
8631
8632 if (type <= PERF_BPF_EVENT_UNKNOWN ||
8633 type >= PERF_BPF_EVENT_MAX)
8634 return;
8635
8636 switch (type) {
8637 case PERF_BPF_EVENT_PROG_LOAD:
8638 case PERF_BPF_EVENT_PROG_UNLOAD:
8639 if (atomic_read(&nr_ksymbol_events))
8640 perf_event_bpf_emit_ksymbols(prog, type);
8641 break;
8642 default:
8643 break;
8644 }
8645
8646 if (!atomic_read(&nr_bpf_events))
8647 return;
8648
8649 bpf_event = (struct perf_bpf_event){
8650 .prog = prog,
8651 .event_id = {
8652 .header = {
8653 .type = PERF_RECORD_BPF_EVENT,
8654 .size = sizeof(bpf_event.event_id),
8655 },
8656 .type = type,
8657 .flags = flags,
8658 .id = prog->aux->id,
8659 },
8660 };
8661
8662 BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
8663
8664 memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
8665 perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
8666}
8667
8668struct perf_text_poke_event {
8669 const void *old_bytes;
8670 const void *new_bytes;
8671 size_t pad;
8672 u16 old_len;
8673 u16 new_len;
8674
8675 struct {
8676 struct perf_event_header header;
8677
8678 u64 addr;
8679 } event_id;
8680};
8681
8682static int perf_event_text_poke_match(struct perf_event *event)
8683{
8684 return event->attr.text_poke;
8685}
8686
8687static void perf_event_text_poke_output(struct perf_event *event, void *data)
8688{
8689 struct perf_text_poke_event *text_poke_event = data;
8690 struct perf_output_handle handle;
8691 struct perf_sample_data sample;
8692 u64 padding = 0;
8693 int ret;
8694
8695 if (!perf_event_text_poke_match(event))
8696 return;
8697
8698 perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
8699
8700 ret = perf_output_begin(&handle, &sample, event,
8701 text_poke_event->event_id.header.size);
8702 if (ret)
8703 return;
8704
8705 perf_output_put(&handle, text_poke_event->event_id);
8706 perf_output_put(&handle, text_poke_event->old_len);
8707 perf_output_put(&handle, text_poke_event->new_len);
8708
8709 __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
8710 __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
8711
8712 if (text_poke_event->pad)
8713 __output_copy(&handle, &padding, text_poke_event->pad);
8714
8715 perf_event__output_id_sample(event, &handle, &sample);
8716
8717 perf_output_end(&handle);
8718}
8719
8720void perf_event_text_poke(const void *addr, const void *old_bytes,
8721 size_t old_len, const void *new_bytes, size_t new_len)
8722{
8723 struct perf_text_poke_event text_poke_event;
8724 size_t tot, pad;
8725
8726 if (!atomic_read(&nr_text_poke_events))
8727 return;
8728
8729 tot = sizeof(text_poke_event.old_len) + old_len;
8730 tot += sizeof(text_poke_event.new_len) + new_len;
8731 pad = ALIGN(tot, sizeof(u64)) - tot;
8732
8733 text_poke_event = (struct perf_text_poke_event){
8734 .old_bytes = old_bytes,
8735 .new_bytes = new_bytes,
8736 .pad = pad,
8737 .old_len = old_len,
8738 .new_len = new_len,
8739 .event_id = {
8740 .header = {
8741 .type = PERF_RECORD_TEXT_POKE,
8742 .misc = PERF_RECORD_MISC_KERNEL,
8743 .size = sizeof(text_poke_event.event_id) + tot + pad,
8744 },
8745 .addr = (unsigned long)addr,
8746 },
8747 };
8748
8749 perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
8750}
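/*
 * Sizing example (illustrative, not from the original source): with
 * old_len = 5 and new_len = 6, tot = 2 + 5 + 2 + 6 = 15 (two u16 lengths
 * plus the byte streams), so pad = ALIGN(15, 8) - 15 = 1 and the record
 * payload stays u64-aligned.
 */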
8751
8752void perf_event_itrace_started(struct perf_event *event)
8753{
8754 event->attach_state |= PERF_ATTACH_ITRACE;
8755}
8756
8757static void perf_log_itrace_start(struct perf_event *event)
8758{
8759 struct perf_output_handle handle;
8760 struct perf_sample_data sample;
8761 struct perf_aux_event {
8762 struct perf_event_header header;
8763 u32 pid;
8764 u32 tid;
8765 } rec;
8766 int ret;
8767
8768 if (event->parent)
8769 event = event->parent;
8770
8771 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
8772 event->attach_state & PERF_ATTACH_ITRACE)
8773 return;
8774
8775 rec.header.type = PERF_RECORD_ITRACE_START;
8776 rec.header.misc = 0;
8777 rec.header.size = sizeof(rec);
8778 rec.pid = perf_event_pid(event, current);
8779 rec.tid = perf_event_tid(event, current);
8780
8781 perf_event_header__init_id(&rec.header, &sample, event);
8782 ret = perf_output_begin(&handle, &sample, event, rec.header.size);
8783
8784 if (ret)
8785 return;
8786
8787 perf_output_put(&handle, rec);
8788 perf_event__output_id_sample(event, &handle, &sample);
8789
8790 perf_output_end(&handle);
8791}
8792
8793static int
8794__perf_event_account_interrupt(struct perf_event *event, int throttle)
8795{
8796 struct hw_perf_event *hwc = &event->hw;
8797 int ret = 0;
8798 u64 seq;
8799
8800 seq = __this_cpu_read(perf_throttled_seq);
8801 if (seq != hwc->interrupts_seq) {
8802 hwc->interrupts_seq = seq;
8803 hwc->interrupts = 1;
8804 } else {
8805 hwc->interrupts++;
8806 if (unlikely(throttle
8807 && hwc->interrupts >= max_samples_per_tick)) {
8808 __this_cpu_inc(perf_throttled_count);
8809 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
8810 hwc->interrupts = MAX_INTERRUPTS;
8811 perf_log_throttle(event, 0);
8812 ret = 1;
8813 }
8814 }
8815
8816 if (event->attr.freq) {
8817 u64 now = perf_clock();
8818 s64 delta = now - hwc->freq_time_stamp;
8819
8820 hwc->freq_time_stamp = now;
8821
8822 if (delta > 0 && delta < 2*TICK_NSEC)
8823 perf_adjust_period(event, delta, hwc->last_period, true);
8824 }
8825
8826 return ret;
8827}
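/*
 * Editorial note: the MAX_INTERRUPTS marker set above is what the
 * tick-driven unthrottling path looks for, and tick_dep_set_cpu() keeps
 * the tick running so that unthrottling actually happens; the marker also
 * makes perf_swevent_overflow() below bail out, so a throttled software
 * event stops raising overflows until it is unthrottled.
 */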
8828
8829int perf_event_account_interrupt(struct perf_event *event)
8830{
8831 return __perf_event_account_interrupt(event, 1);
8832}
8833
8834
8835
8836
8837
8838static int __perf_event_overflow(struct perf_event *event,
8839 int throttle, struct perf_sample_data *data,
8840 struct pt_regs *regs)
8841{
8842 int events = atomic_read(&event->event_limit);
8843 int ret = 0;
8844
8845 /*
8846 * Non-sampling counters might still use the PMI to fold short
8847 * hardware counters, ignore those.
8848 */
8849 if (unlikely(!is_sampling_event(event)))
8850 return 0;
8851
8852 ret = __perf_event_account_interrupt(event, throttle);
8853
8854 /*
8855 * XXX event_limit might not quite work as expected on inherited
8856 * events
8857 */
8858
8859 event->pending_kill = POLL_IN;
8860 if (events && atomic_dec_and_test(&event->event_limit)) {
8861 ret = 1;
8862 event->pending_kill = POLL_HUP;
8863
8864 perf_event_disable_inatomic(event);
8865 }
8866
8867 READ_ONCE(event->overflow_handler)(event, data, regs);
8868
8869 if (*perf_event_fasync(event) && event->pending_kill) {
8870 event->pending_wakeup = 1;
8871 irq_work_queue(&event->pending);
8872 }
8873
8874 return ret;
8875}
8876
8877int perf_event_overflow(struct perf_event *event,
8878 struct perf_sample_data *data,
8879 struct pt_regs *regs)
8880{
8881 return __perf_event_overflow(event, 1, data, regs);
8882}
8883
8884
8885
8886
8887
8888struct swevent_htable {
8889 struct swevent_hlist *swevent_hlist;
8890 struct mutex hlist_mutex;
8891 int hlist_refcount;
8892
8893
8894 int recursion[PERF_NR_CONTEXTS];
8895};
8896
8897static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
8898
8899/*
8900 * We directly increment event->count and keep a second value in
8901 * event->hw.period_left to count intervals. This period event
8902 * is kept in the range [-sample_period, 0] so that we can use the
8903 * sign as trigger.
8904 */
8905
8906u64 perf_swevent_set_period(struct perf_event *event)
8907{
8908 struct hw_perf_event *hwc = &event->hw;
8909 u64 period = hwc->last_period;
8910 u64 nr, offset;
8911 s64 old, val;
8912
8913 hwc->last_period = hwc->sample_period;
8914
8915again:
8916 old = val = local64_read(&hwc->period_left);
8917 if (val < 0)
8918 return 0;
8919
8920 nr = div64_u64(period + val, period);
8921 offset = nr * period;
8922 val -= offset;
8923 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
8924 goto again;
8925
8926 return nr;
8927}
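/*
 * Worked example (illustrative): with period = 4 and period_left = 10,
 * nr = (4 + 10) / 4 = 3 elapsed periods are reported and period_left
 * becomes 10 - 3 * 4 = -2, i.e. two more increments are needed before
 * the next overflow.
 */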
8928
8929static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
8930 struct perf_sample_data *data,
8931 struct pt_regs *regs)
8932{
8933 struct hw_perf_event *hwc = &event->hw;
8934 int throttle = 0;
8935
8936 if (!overflow)
8937 overflow = perf_swevent_set_period(event);
8938
8939 if (hwc->interrupts == MAX_INTERRUPTS)
8940 return;
8941
8942 for (; overflow; overflow--) {
8943 if (__perf_event_overflow(event, throttle,
8944 data, regs)) {
8945 /*
8946 * We inhibit the overflow from happening when
8947 * hwc->interrupts == MAX_INTERRUPTS.
8948 */
8949 break;
8950 }
8951 throttle = 1;
8952 }
8953}
8954
8955static void perf_swevent_event(struct perf_event *event, u64 nr,
8956 struct perf_sample_data *data,
8957 struct pt_regs *regs)
8958{
8959 struct hw_perf_event *hwc = &event->hw;
8960
8961 local64_add(nr, &event->count);
8962
8963 if (!regs)
8964 return;
8965
8966 if (!is_sampling_event(event))
8967 return;
8968
8969 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
8970 data->period = nr;
8971 return perf_swevent_overflow(event, 1, data, regs);
8972 } else
8973 data->period = event->hw.last_period;
8974
8975 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
8976 return perf_swevent_overflow(event, 1, data, regs);
8977
8978 if (local64_add_negative(nr, &hwc->period_left))
8979 return;
8980
8981 perf_swevent_overflow(event, 0, data, regs);
8982}
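/*
 * Editorial note: the two early perf_swevent_overflow(event, 1, ...)
 * calls above are fast paths - an explicit PERIOD sample type without
 * freq, or a period-1 event incremented by 1, overflows on every hit -
 * while the final call only fires once period_left (kept negative between
 * overflows) accumulates back up to zero, and then lets
 * perf_swevent_set_period() work out how many whole periods elapsed.
 */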
8983
8984static int perf_exclude_event(struct perf_event *event,
8985 struct pt_regs *regs)
8986{
8987 if (event->hw.state & PERF_HES_STOPPED)
8988 return 1;
8989
8990 if (regs) {
8991 if (event->attr.exclude_user && user_mode(regs))
8992 return 1;
8993
8994 if (event->attr.exclude_kernel && !user_mode(regs))
8995 return 1;
8996 }
8997
8998 return 0;
8999}
9000
9001static int perf_swevent_match(struct perf_event *event,
9002 enum perf_type_id type,
9003 u32 event_id,
9004 struct perf_sample_data *data,
9005 struct pt_regs *regs)
9006{
9007 if (event->attr.type != type)
9008 return 0;
9009
9010 if (event->attr.config != event_id)
9011 return 0;
9012
9013 if (perf_exclude_event(event, regs))
9014 return 0;
9015
9016 return 1;
9017}
9018
9019static inline u64 swevent_hash(u64 type, u32 event_id)
9020{
9021 u64 val = event_id | (type << 32);
9022
9023 return hash_64(val, SWEVENT_HLIST_BITS);
9024}
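/*
 * Editorial note: type and event_id are folded into a single u64 key
 * (event_id in the low 32 bits, type in the upper bits) and hashed into
 * SWEVENT_HLIST_BITS buckets, so all software events with the same
 * type/config land on the same per-CPU hlist bucket.
 */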
9025
9026static inline struct hlist_head *
9027__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
9028{
9029 u64 hash = swevent_hash(type, event_id);
9030
9031 return &hlist->heads[hash];
9032}
9033
9034
9035static inline struct hlist_head *
9036find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
9037{
9038 struct swevent_hlist *hlist;
9039
9040 hlist = rcu_dereference(swhash->swevent_hlist);
9041 if (!hlist)
9042 return NULL;
9043
9044 return __find_swevent_head(hlist, type, event_id);
9045}
9046
9047
9048static inline struct hlist_head *
9049find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
9050{
9051 struct swevent_hlist *hlist;
9052 u32 event_id = event->attr.config;
9053 u64 type = event->attr.type;
9054
9055 /*
9056 * Event scheduling is always serialized against hlist allocation
9057 * and release. Which makes the protected version suitable here.
9058 * The context lock guarantees that.
9059 */
9060 hlist = rcu_dereference_protected(swhash->swevent_hlist,
9061 lockdep_is_held(&event->ctx->lock));
9062 if (!hlist)
9063 return NULL;
9064
9065 return __find_swevent_head(hlist, type, event_id);
9066}
9067
9068static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
9069 u64 nr,
9070 struct perf_sample_data *data,
9071 struct pt_regs *regs)
9072{
9073 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9074 struct perf_event *event;
9075 struct hlist_head *head;
9076
9077 rcu_read_lock();
9078 head = find_swevent_head_rcu(swhash, type, event_id);
9079 if (!head)
9080 goto end;
9081
9082 hlist_for_each_entry_rcu(event, head, hlist_entry) {
9083 if (perf_swevent_match(event, type, event_id, data, regs))
9084 perf_swevent_event(event, nr, data, regs);
9085 }
9086end:
9087 rcu_read_unlock();
9088}
9089
9090DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
9091
9092int perf_swevent_get_recursion_context(void)
9093{
9094 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9095
9096 return get_recursion_context(swhash->recursion);
9097}
9098EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
9099
9100void perf_swevent_put_recursion_context(int rctx)
9101{
9102 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9103
9104 put_recursion_context(swhash->recursion, rctx);
9105}
9106
9107void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9108{
9109 struct perf_sample_data data;
9110
9111 if (WARN_ON_ONCE(!regs))
9112 return;
9113
9114 perf_sample_data_init(&data, addr, 0);
9115 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
9116}
9117
9118void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9119{
9120 int rctx;
9121
9122 preempt_disable_notrace();
9123 rctx = perf_swevent_get_recursion_context();
9124 if (unlikely(rctx < 0))
9125 goto fail;
9126
9127 ___perf_sw_event(event_id, nr, regs, addr);
9128
9129 perf_swevent_put_recursion_context(rctx);
9130fail:
9131 preempt_enable_notrace();
9132}
9133
9134static void perf_swevent_read(struct perf_event *event)
9135{
9136}
9137
9138static int perf_swevent_add(struct perf_event *event, int flags)
9139{
9140 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9141 struct hw_perf_event *hwc = &event->hw;
9142 struct hlist_head *head;
9143
9144 if (is_sampling_event(event)) {
9145 hwc->last_period = hwc->sample_period;
9146 perf_swevent_set_period(event);
9147 }
9148
9149 hwc->state = !(flags & PERF_EF_START);
9150
9151 head = find_swevent_head(swhash, event);
9152 if (WARN_ON_ONCE(!head))
9153 return -EINVAL;
9154
9155 hlist_add_head_rcu(&event->hlist_entry, head);
9156 perf_event_update_userpage(event);
9157
9158 return 0;
9159}
9160
9161static void perf_swevent_del(struct perf_event *event, int flags)
9162{
9163 hlist_del_rcu(&event->hlist_entry);
9164}
9165
9166static void perf_swevent_start(struct perf_event *event, int flags)
9167{
9168 event->hw.state = 0;
9169}
9170
9171static void perf_swevent_stop(struct perf_event *event, int flags)
9172{
9173 event->hw.state = PERF_HES_STOPPED;
9174}
9175
9176
9177static inline struct swevent_hlist *
9178swevent_hlist_deref(struct swevent_htable *swhash)
9179{
9180 return rcu_dereference_protected(swhash->swevent_hlist,
9181 lockdep_is_held(&swhash->hlist_mutex));
9182}
9183
9184static void swevent_hlist_release(struct swevent_htable *swhash)
9185{
9186 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
9187
9188 if (!hlist)
9189 return;
9190
9191 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
9192 kfree_rcu(hlist, rcu_head);
9193}
9194
9195static void swevent_hlist_put_cpu(int cpu)
9196{
9197 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9198
9199 mutex_lock(&swhash->hlist_mutex);
9200
9201 if (!--swhash->hlist_refcount)
9202 swevent_hlist_release(swhash);
9203
9204 mutex_unlock(&swhash->hlist_mutex);
9205}
9206
9207static void swevent_hlist_put(void)
9208{
9209 int cpu;
9210
9211 for_each_possible_cpu(cpu)
9212 swevent_hlist_put_cpu(cpu);
9213}
9214
9215static int swevent_hlist_get_cpu(int cpu)
9216{
9217 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9218 int err = 0;
9219
9220 mutex_lock(&swhash->hlist_mutex);
9221 if (!swevent_hlist_deref(swhash) &&
9222 cpumask_test_cpu(cpu, perf_online_mask)) {
9223 struct swevent_hlist *hlist;
9224
9225 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
9226 if (!hlist) {
9227 err = -ENOMEM;
9228 goto exit;
9229 }
9230 rcu_assign_pointer(swhash->swevent_hlist, hlist);
9231 }
9232 swhash->hlist_refcount++;
9233exit:
9234 mutex_unlock(&swhash->hlist_mutex);
9235
9236 return err;
9237}
9238
9239static int swevent_hlist_get(void)
9240{
9241 int err, cpu, failed_cpu;
9242
9243 mutex_lock(&pmus_lock);
9244 for_each_possible_cpu(cpu) {
9245 err = swevent_hlist_get_cpu(cpu);
9246 if (err) {
9247 failed_cpu = cpu;
9248 goto fail;
9249 }
9250 }
9251 mutex_unlock(&pmus_lock);
9252 return 0;
9253fail:
9254 for_each_possible_cpu(cpu) {
9255 if (cpu == failed_cpu)
9256 break;
9257 swevent_hlist_put_cpu(cpu);
9258 }
9259 mutex_unlock(&pmus_lock);
9260 return err;
9261}
9262
9263struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
9264
9265static void sw_perf_event_destroy(struct perf_event *event)
9266{
9267 u64 event_id = event->attr.config;
9268
9269 WARN_ON(event->parent);
9270
9271 static_key_slow_dec(&perf_swevent_enabled[event_id]);
9272 swevent_hlist_put();
9273}
9274
9275static int perf_swevent_init(struct perf_event *event)
9276{
9277 u64 event_id = event->attr.config;
9278
9279 if (event->attr.type != PERF_TYPE_SOFTWARE)
9280 return -ENOENT;
9281
9282
9283
9284
9285 if (has_branch_stack(event))
9286 return -EOPNOTSUPP;
9287
9288 switch (event_id) {
9289 case PERF_COUNT_SW_CPU_CLOCK:
9290 case PERF_COUNT_SW_TASK_CLOCK:
9291 return -ENOENT;
9292
9293 default:
9294 break;
9295 }
9296
9297 if (event_id >= PERF_COUNT_SW_MAX)
9298 return -ENOENT;
9299
9300 if (!event->parent) {
9301 int err;
9302
9303 err = swevent_hlist_get();
9304 if (err)
9305 return err;
9306
9307 static_key_slow_inc(&perf_swevent_enabled[event_id]);
9308 event->destroy = sw_perf_event_destroy;
9309 }
9310
9311 return 0;
9312}
9313
9314static struct pmu perf_swevent = {
9315 .task_ctx_nr = perf_sw_context,
9316
9317 .capabilities = PERF_PMU_CAP_NO_NMI,
9318
9319 .event_init = perf_swevent_init,
9320 .add = perf_swevent_add,
9321 .del = perf_swevent_del,
9322 .start = perf_swevent_start,
9323 .stop = perf_swevent_stop,
9324 .read = perf_swevent_read,
9325};
9326
9327#ifdef CONFIG_EVENT_TRACING
9328
9329static int perf_tp_filter_match(struct perf_event *event,
9330 struct perf_sample_data *data)
9331{
9332 void *record = data->raw->frag.data;
9333
9334
9335 if (event->parent)
9336 event = event->parent;
9337
9338 if (likely(!event->filter) || filter_match_preds(event->filter, record))
9339 return 1;
9340 return 0;
9341}
9342
9343static int perf_tp_event_match(struct perf_event *event,
9344 struct perf_sample_data *data,
9345 struct pt_regs *regs)
9346{
9347 if (event->hw.state & PERF_HES_STOPPED)
9348 return 0;
9349
9350
9351
9352 if (event->attr.exclude_kernel && !user_mode(regs))
9353 return 0;
9354
9355 if (!perf_tp_filter_match(event, data))
9356 return 0;
9357
9358 return 1;
9359}
9360
9361void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
9362 struct trace_event_call *call, u64 count,
9363 struct pt_regs *regs, struct hlist_head *head,
9364 struct task_struct *task)
9365{
9366 if (bpf_prog_array_valid(call)) {
9367 *(struct pt_regs **)raw_data = regs;
9368 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
9369 perf_swevent_put_recursion_context(rctx);
9370 return;
9371 }
9372 }
9373 perf_tp_event(call->event.type, count, raw_data, size, regs, head,
9374 rctx, task);
9375}
9376EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
9377
9378void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
9379 struct pt_regs *regs, struct hlist_head *head, int rctx,
9380 struct task_struct *task)
9381{
9382 struct perf_sample_data data;
9383 struct perf_event *event;
9384
9385 struct perf_raw_record raw = {
9386 .frag = {
9387 .size = entry_size,
9388 .data = record,
9389 },
9390 };
9391
9392 perf_sample_data_init(&data, 0, 0);
9393 data.raw = &raw;
9394
9395 perf_trace_buf_update(record, event_type);
9396
9397 hlist_for_each_entry_rcu(event, head, hlist_entry) {
9398 if (perf_tp_event_match(event, &data, regs))
9399 perf_swevent_event(event, count, &data, regs);
9400 }
9401
9402 /*
9403 * If we got specified a target task, also iterate its context and
9404 * deliver this event there too.
9405 */
9406 if (task && task != current) {
9407 struct perf_event_context *ctx;
9408 struct trace_entry *entry = record;
9409
9410 rcu_read_lock();
9411 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
9412 if (!ctx)
9413 goto unlock;
9414
9415 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
9416 if (event->cpu != smp_processor_id())
9417 continue;
9418 if (event->attr.type != PERF_TYPE_TRACEPOINT)
9419 continue;
9420 if (event->attr.config != entry->type)
9421 continue;
9422 if (perf_tp_event_match(event, &data, regs))
9423 perf_swevent_event(event, count, &data, regs);
9424 }
9425unlock:
9426 rcu_read_unlock();
9427 }
9428
9429 perf_swevent_put_recursion_context(rctx);
9430}
9431EXPORT_SYMBOL_GPL(perf_tp_event);
9432
9433static void tp_perf_event_destroy(struct perf_event *event)
9434{
9435 perf_trace_destroy(event);
9436}
9437
9438static int perf_tp_event_init(struct perf_event *event)
9439{
9440 int err;
9441
9442 if (event->attr.type != PERF_TYPE_TRACEPOINT)
9443 return -ENOENT;
9444
9445
9446
9447
9448 if (has_branch_stack(event))
9449 return -EOPNOTSUPP;
9450
9451 err = perf_trace_init(event);
9452 if (err)
9453 return err;
9454
9455 event->destroy = tp_perf_event_destroy;
9456
9457 return 0;
9458}
9459
9460static struct pmu perf_tracepoint = {
9461 .task_ctx_nr = perf_sw_context,
9462
9463 .event_init = perf_tp_event_init,
9464 .add = perf_trace_add,
9465 .del = perf_trace_del,
9466 .start = perf_swevent_start,
9467 .stop = perf_swevent_stop,
9468 .read = perf_swevent_read,
9469};
9470
9471#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
9472/*
9473 * Flags in config, used by dynamic PMU kprobe and uprobe.
9474 * The flags should match the following PMU_FORMAT_ATTR().
9475 *
9476 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
9477 *                               if not set, create kprobe/uprobe
9478 *
9479 * The following values specify a reference counter (or semaphore in the
9480 * terminology of tools like dtrace, systemtap, etc.) for Userspace
9481 * Statically Defined Tracepoints (USDT); the offset lives in the upper
9482 * 32 bits of config:
9483 * PERF_UPROBE_REF_CTR_OFFSET_BITS   # of config bits used for the offset
9484 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT  # of bits to shift the offset left
9485 */
9486enum perf_probe_config {
9487 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,
9488 PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
9489 PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
9490};
9491
9492PMU_FORMAT_ATTR(retprobe, "config:0");
9493#endif
9494
9495#ifdef CONFIG_KPROBE_EVENTS
9496static struct attribute *kprobe_attrs[] = {
9497 &format_attr_retprobe.attr,
9498 NULL,
9499};
9500
9501static struct attribute_group kprobe_format_group = {
9502 .name = "format",
9503 .attrs = kprobe_attrs,
9504};
9505
9506static const struct attribute_group *kprobe_attr_groups[] = {
9507 &kprobe_format_group,
9508 NULL,
9509};
9510
9511static int perf_kprobe_event_init(struct perf_event *event);
9512static struct pmu perf_kprobe = {
9513 .task_ctx_nr = perf_sw_context,
9514 .event_init = perf_kprobe_event_init,
9515 .add = perf_trace_add,
9516 .del = perf_trace_del,
9517 .start = perf_swevent_start,
9518 .stop = perf_swevent_stop,
9519 .read = perf_swevent_read,
9520 .attr_groups = kprobe_attr_groups,
9521};
9522
9523static int perf_kprobe_event_init(struct perf_event *event)
9524{
9525 int err;
9526 bool is_retprobe;
9527
9528 if (event->attr.type != perf_kprobe.type)
9529 return -ENOENT;
9530
9531 if (!perfmon_capable())
9532 return -EACCES;
9533
9534
9535
9536
9537 if (has_branch_stack(event))
9538 return -EOPNOTSUPP;
9539
9540 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9541 err = perf_kprobe_init(event, is_retprobe);
9542 if (err)
9543 return err;
9544
9545 event->destroy = perf_kprobe_destroy;
9546
9547 return 0;
9548}
9549#endif
9550
9551#ifdef CONFIG_UPROBE_EVENTS
9552PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
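/*
 * Illustrative example (offset value invented, not from the original
 * source): to create a uretprobe with a reference counter at file offset
 * 0x1d8, userspace would set
 *   attr.config = (0x1d8ULL << PERF_UPROBE_REF_CTR_OFFSET_SHIFT) |
 *                 PERF_PROBE_CONFIG_IS_RETPROBE;
 * matching the "retprobe" and "ref_ctr_offset" format attributes above.
 */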
9553
9554static struct attribute *uprobe_attrs[] = {
9555 &format_attr_retprobe.attr,
9556 &format_attr_ref_ctr_offset.attr,
9557 NULL,
9558};
9559
9560static struct attribute_group uprobe_format_group = {
9561 .name = "format",
9562 .attrs = uprobe_attrs,
9563};
9564
9565static const struct attribute_group *uprobe_attr_groups[] = {
9566 &uprobe_format_group,
9567 NULL,
9568};
9569
9570static int perf_uprobe_event_init(struct perf_event *event);
9571static struct pmu perf_uprobe = {
9572 .task_ctx_nr = perf_sw_context,
9573 .event_init = perf_uprobe_event_init,
9574 .add = perf_trace_add,
9575 .del = perf_trace_del,
9576 .start = perf_swevent_start,
9577 .stop = perf_swevent_stop,
9578 .read = perf_swevent_read,
9579 .attr_groups = uprobe_attr_groups,
9580};
9581
9582static int perf_uprobe_event_init(struct perf_event *event)
9583{
9584 int err;
9585 unsigned long ref_ctr_offset;
9586 bool is_retprobe;
9587
9588 if (event->attr.type != perf_uprobe.type)
9589 return -ENOENT;
9590
9591 if (!perfmon_capable())
9592 return -EACCES;
9593
9594
9595
9596
9597 if (has_branch_stack(event))
9598 return -EOPNOTSUPP;
9599
9600 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9601 ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
9602 err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
9603 if (err)
9604 return err;
9605
9606 event->destroy = perf_uprobe_destroy;
9607
9608 return 0;
9609}
9610#endif
9611
9612static inline void perf_tp_register(void)
9613{
9614 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
9615#ifdef CONFIG_KPROBE_EVENTS
9616 perf_pmu_register(&perf_kprobe, "kprobe", -1);
9617#endif
9618#ifdef CONFIG_UPROBE_EVENTS
9619 perf_pmu_register(&perf_uprobe, "uprobe", -1);
9620#endif
9621}
9622
9623static void perf_event_free_filter(struct perf_event *event)
9624{
9625 ftrace_profile_free_filter(event);
9626}
9627
9628#ifdef CONFIG_BPF_SYSCALL
9629static void bpf_overflow_handler(struct perf_event *event,
9630 struct perf_sample_data *data,
9631 struct pt_regs *regs)
9632{
9633 struct bpf_perf_event_data_kern ctx = {
9634 .data = data,
9635 .event = event,
9636 };
9637 int ret = 0;
9638
9639 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
9640 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
9641 goto out;
9642 rcu_read_lock();
9643 ret = BPF_PROG_RUN(event->prog, &ctx);
9644 rcu_read_unlock();
9645out:
9646 __this_cpu_dec(bpf_prog_active);
9647 if (!ret)
9648 return;
9649
9650 event->orig_overflow_handler(event, data, regs);
9651}
9652
9653static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
9654{
9655 struct bpf_prog *prog;
9656
9657 if (event->overflow_handler_context)
9658 /* hw breakpoint or kernel counter */
9659 return -EINVAL;
9660
9661 if (event->prog)
9662 return -EEXIST;
9663
9664 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
9665 if (IS_ERR(prog))
9666 return PTR_ERR(prog);
9667
9668 if (event->attr.precise_ip &&
9669 prog->call_get_stack &&
9670 (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
9671 event->attr.exclude_callchain_kernel ||
9672 event->attr.exclude_callchain_user)) {
9673 /*
9674 * On perf_event with precise_ip, calling bpf_get_stack()
9675 * may trigger unwinder warnings and occasional crashes.
9676 * bpf_get_[stack|stackid] works around this issue by using
9677 * the callchain attached to perf_sample_data; that requires
9678 * __PERF_SAMPLE_CALLCHAIN_EARLY with both kernel and user
9679 * callchains included. If those conditions aren't met, reject
9680 * bpf programs that call get_stack on this event.
9681 */
9682 bpf_prog_put(prog);
9683 return -EPROTO;
9684 }
9685
9686 event->prog = prog;
9687 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
9688 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
9689 return 0;
9690}
9691
9692static void perf_event_free_bpf_handler(struct perf_event *event)
9693{
9694 struct bpf_prog *prog = event->prog;
9695
9696 if (!prog)
9697 return;
9698
9699 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
9700 event->prog = NULL;
9701 bpf_prog_put(prog);
9702}
9703#else
9704static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
9705{
9706 return -EOPNOTSUPP;
9707}
9708static void perf_event_free_bpf_handler(struct perf_event *event)
9709{
9710}
9711#endif
9712
9713/*
9714 * Returns true if the event is a tracepoint, or a kprobe/uprobe created
9715 * with perf_kprobe or perf_uprobe.
9716 */
9717static inline bool perf_event_is_tracing(struct perf_event *event)
9718{
9719 if (event->pmu == &perf_tracepoint)
9720 return true;
9721#ifdef CONFIG_KPROBE_EVENTS
9722 if (event->pmu == &perf_kprobe)
9723 return true;
9724#endif
9725#ifdef CONFIG_UPROBE_EVENTS
9726 if (event->pmu == &perf_uprobe)
9727 return true;
9728#endif
9729 return false;
9730}
9731
9732static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
9733{
9734 bool is_kprobe, is_tracepoint, is_syscall_tp;
9735 struct bpf_prog *prog;
9736 int ret;
9737
9738 if (!perf_event_is_tracing(event))
9739 return perf_event_set_bpf_handler(event, prog_fd);
9740
9741 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
9742 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
9743 is_syscall_tp = is_syscall_trace_event(event->tp_event);
9744 if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
9745
9746 return -EINVAL;
9747
9748 prog = bpf_prog_get(prog_fd);
9749 if (IS_ERR(prog))
9750 return PTR_ERR(prog);
9751
9752 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
9753 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
9754 (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
9755 /* valid fd, but invalid bpf program type */
9756 bpf_prog_put(prog);
9757 return -EINVAL;
9758 }
9759
9760 /* Kprobe override only works for kprobes, not uprobes. */
9761 if (prog->kprobe_override &&
9762 !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
9763 bpf_prog_put(prog);
9764 return -EINVAL;
9765 }
9766
9767 if (is_tracepoint || is_syscall_tp) {
9768 int off = trace_event_get_offsets(event->tp_event);
9769
9770 if (prog->aux->max_ctx_offset > off) {
9771 bpf_prog_put(prog);
9772 return -EACCES;
9773 }
9774 }
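 /*
 * Editorial note: trace_event_get_offsets() gives the size of the raw
 * tracepoint record, so the check above rejects programs whose
 * verifier-computed maximum context offset would read past the sample.
 */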
9775
9776 ret = perf_event_attach_bpf_prog(event, prog);
9777 if (ret)
9778 bpf_prog_put(prog);
9779 return ret;
9780}
9781
9782static void perf_event_free_bpf_prog(struct perf_event *event)
9783{
9784 if (!perf_event_is_tracing(event)) {
9785 perf_event_free_bpf_handler(event);
9786 return;
9787 }
9788 perf_event_detach_bpf_prog(event);
9789}
9790
9791#else
9792
9793static inline void perf_tp_register(void)
9794{
9795}
9796
9797static void perf_event_free_filter(struct perf_event *event)
9798{
9799}
9800
9801static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
9802{
9803 return -ENOENT;
9804}
9805
9806static void perf_event_free_bpf_prog(struct perf_event *event)
9807{
9808}
9809#endif
9810
9811#ifdef CONFIG_HAVE_HW_BREAKPOINT
9812void perf_bp_event(struct perf_event *bp, void *data)
9813{
9814 struct perf_sample_data sample;
9815 struct pt_regs *regs = data;
9816
9817 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
9818
9819 if (!bp->hw.state && !perf_exclude_event(bp, regs))
9820 perf_swevent_event(bp, 1, &sample, regs);
9821}
9822#endif
9823
9824
9825
9826
9827static struct perf_addr_filter *
9828perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
9829{
9830 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
9831 struct perf_addr_filter *filter;
9832
9833 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
9834 if (!filter)
9835 return NULL;
9836
9837 INIT_LIST_HEAD(&filter->entry);
9838 list_add_tail(&filter->entry, filters);
9839
9840 return filter;
9841}
9842
9843static void free_filters_list(struct list_head *filters)
9844{
9845 struct perf_addr_filter *filter, *iter;
9846
9847 list_for_each_entry_safe(filter, iter, filters, entry) {
9848 path_put(&filter->path);
9849 list_del(&filter->entry);
9850 kfree(filter);
9851 }
9852}
9853
9854
9855
9856
9857static void perf_addr_filters_splice(struct perf_event *event,
9858 struct list_head *head)
9859{
9860 unsigned long flags;
9861 LIST_HEAD(list);
9862
9863 if (!has_addr_filter(event))
9864 return;
9865
9866 /* don't bother with children, they don't have their own filters */
9867 if (event->parent)
9868 return;
9869
9870 raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
9871
9872 list_splice_init(&event->addr_filters.list, &list);
9873 if (head)
9874 list_splice(head, &event->addr_filters.list);
9875
9876 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
9877
9878 free_filters_list(&list);
9879}
9880
9881/*
9882 * Scan through mm's vmas and see if one of them matches the
9883 * @filter; if so, adjust filter's address range.
9884 * Called with mm::mmap_lock down for reading.
9885 */
9886static void perf_addr_filter_apply(struct perf_addr_filter *filter,
9887 struct mm_struct *mm,
9888 struct perf_addr_filter_range *fr)
9889{
9890 struct vm_area_struct *vma;
9891
9892 for (vma = mm->mmap; vma; vma = vma->vm_next) {
9893 if (!vma->vm_file)
9894 continue;
9895
9896 if (perf_addr_filter_vma_adjust(filter, vma, fr))
9897 return;
9898 }
9899}
9900
9901/*
9902 * Update event's address range filters based on the
9903 * task's existing mappings, if any.
9904 */
9905static void perf_event_addr_filters_apply(struct perf_event *event)
9906{
9907 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
9908 struct task_struct *task = READ_ONCE(event->ctx->task);
9909 struct perf_addr_filter *filter;
9910 struct mm_struct *mm = NULL;
9911 unsigned int count = 0;
9912 unsigned long flags;
9913
9914 /*
9915 * We may observe TASK_TOMBSTONE, which means that the event tear-down
9916 * will stop on the parent's child_mutex that our caller is also holding.
9917 */
9918 if (task == TASK_TOMBSTONE)
9919 return;
9920
9921 if (ifh->nr_file_filters) {
9922 mm = get_task_mm(event->ctx->task);
9923 if (!mm)
9924 goto restart;
9925
9926 mmap_read_lock(mm);
9927 }
9928
9929 raw_spin_lock_irqsave(&ifh->lock, flags);
9930 list_for_each_entry(filter, &ifh->list, entry) {
9931 if (filter->path.dentry) {
9932 /*
9933 * Adjust base offset if the filter is associated to a
9934 * binary that needs to be mapped:
9935 */
9936 event->addr_filter_ranges[count].start = 0;
9937 event->addr_filter_ranges[count].size = 0;
9938
9939 perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
9940 } else {
9941 event->addr_filter_ranges[count].start = filter->offset;
9942 event->addr_filter_ranges[count].size = filter->size;
9943 }
9944
9945 count++;
9946 }
9947
9948 event->addr_filters_gen++;
9949 raw_spin_unlock_irqrestore(&ifh->lock, flags);
9950
9951 if (ifh->nr_file_filters) {
9952 mmap_read_unlock(mm);
9953
9954 mmput(mm);
9955 }
9956
9957restart:
9958 perf_event_stop(event, 1);
9959}
9960
9961/*
9962 * Address range filtering: limiting the data to certain
9963 * instruction address ranges. Filters are ioctl()ed to us from
9964 * userspace as ascii strings.
9965 *
9966 * Filter string format:
9967 *
9968 * ACTION RANGE_SPEC
9969 * where ACTION is one of the
9970 *  * "filter": limit the trace to this region
9971 *  * "start": start tracing from this address
9972 *  * "stop": stop tracing at this address/region;
9973 * RANGE_SPEC is
9974 *  * for kernel addresses: <start address>[/<size>]
9975 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
9976 *
9977 * if <size> is not specified or is zero, the range is treated as a single
9978 * address; that is not valid for ACTION=="filter".
9979 */
9980enum {
9981 IF_ACT_NONE = -1,
9982 IF_ACT_FILTER,
9983 IF_ACT_START,
9984 IF_ACT_STOP,
9985 IF_SRC_FILE,
9986 IF_SRC_KERNEL,
9987 IF_SRC_FILEADDR,
9988 IF_SRC_KERNELADDR,
9989};
9990
9991enum {
9992 IF_STATE_ACTION = 0,
9993 IF_STATE_SOURCE,
9994 IF_STATE_END,
9995};
9996
9997static const match_table_t if_tokens = {
9998 { IF_ACT_FILTER, "filter" },
9999 { IF_ACT_START, "start" },
10000 { IF_ACT_STOP, "stop" },
10001 { IF_SRC_FILE, "%u/%u@%s" },
10002 { IF_SRC_KERNEL, "%u/%u" },
10003 { IF_SRC_FILEADDR, "%u@%s" },
10004 { IF_SRC_KERNELADDR, "%u" },
10005 { IF_ACT_NONE, NULL },
10006};
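/*
 * Illustrative example strings (invented for illustration) accepted by the
 * parser below:
 *
 *   "filter 0x1000/0x2000@/usr/lib/libfoo.so"  - limit trace to a file range
 *   "start 0xffffffff81000000/0x4000"          - kernel address range
 *   "stop 0xffffffff81234567"                  - single kernel address
 *
 * Multiple filters may be separated by spaces, commas or newlines; hex is
 * accepted because the %u conversions and kstrtoul() both use base 0.
 */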
10007
10008
10009
10010
10011static int
10012perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
10013 struct list_head *filters)
10014{
10015 struct perf_addr_filter *filter = NULL;
10016 char *start, *orig, *filename = NULL;
10017 substring_t args[MAX_OPT_ARGS];
10018 int state = IF_STATE_ACTION, token;
10019 unsigned int kernel = 0;
10020 int ret = -EINVAL;
10021
10022 orig = fstr = kstrdup(fstr, GFP_KERNEL);
10023 if (!fstr)
10024 return -ENOMEM;
10025
10026 while ((start = strsep(&fstr, " ,\n")) != NULL) {
10027 static const enum perf_addr_filter_action_t actions[] = {
10028 [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
10029 [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
10030 [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
10031 };
10032 ret = -EINVAL;
10033
10034 if (!*start)
10035 continue;
10036
10037
10038 if (state == IF_STATE_ACTION) {
10039 filter = perf_addr_filter_new(event, filters);
10040 if (!filter)
10041 goto fail;
10042 }
10043
10044 token = match_token(start, if_tokens, args);
10045 switch (token) {
10046 case IF_ACT_FILTER:
10047 case IF_ACT_START:
10048 case IF_ACT_STOP:
10049 if (state != IF_STATE_ACTION)
10050 goto fail;
10051
10052 filter->action = actions[token];
10053 state = IF_STATE_SOURCE;
10054 break;
10055
10056 case IF_SRC_KERNELADDR:
10057 case IF_SRC_KERNEL:
10058 kernel = 1;
10059 fallthrough;
10060
10061 case IF_SRC_FILEADDR:
10062 case IF_SRC_FILE:
10063 if (state != IF_STATE_SOURCE)
10064 goto fail;
10065
10066 *args[0].to = 0;
10067 ret = kstrtoul(args[0].from, 0, &filter->offset);
10068 if (ret)
10069 goto fail;
10070
10071 if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
10072 *args[1].to = 0;
10073 ret = kstrtoul(args[1].from, 0, &filter->size);
10074 if (ret)
10075 goto fail;
10076 }
10077
10078 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
10079 int fpos = token == IF_SRC_FILE ? 2 : 1;
10080
10081 kfree(filename);
10082 filename = match_strdup(&args[fpos]);
10083 if (!filename) {
10084 ret = -ENOMEM;
10085 goto fail;
10086 }
10087 }
10088
10089 state = IF_STATE_END;
10090 break;
10091
10092 default:
10093 goto fail;
10094 }
10095
10096 /*
10097 * Filter definition is fully parsed, validate and install it.
10098 * Make sure that it doesn't contradict itself or the event's
10099 * attribute.
10100 */
10101 if (state == IF_STATE_END) {
10102 ret = -EINVAL;
10103 if (kernel && event->attr.exclude_kernel)
10104 goto fail;
10105
10106 /*
10107 * ACTION "filter" must specify a non-zero address range;
10108 * "start" and "stop" may be used without a size.
10109 */
10110 if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
10111 !filter->size)
10112 goto fail;
10113
10114 if (!kernel) {
10115 if (!filename)
10116 goto fail;
10117
10118 /*
10119 * For now, we only support file-based filters
10120 * in per-task events; doing so for CPU-wide
10121 * events requires additional context switching
10122 * trickery, since the same object code will be
10123 * mapped at different virtual addresses in
10124 * different processes.
10125 */
10126 ret = -EOPNOTSUPP;
10127 if (!event->ctx->task)
10128 goto fail;
10129
10130 /* look up the path and grab its inode */
10131 ret = kern_path(filename, LOOKUP_FOLLOW,
10132 &filter->path);
10133 if (ret)
10134 goto fail;
10135
10136 ret = -EINVAL;
10137 if (!filter->path.dentry ||
10138 !S_ISREG(d_inode(filter->path.dentry)
10139 ->i_mode))
10140 goto fail;
10141
10142 event->addr_filters.nr_file_filters++;
10143 }
10144
10145
10146 state = IF_STATE_ACTION;
10147 filter = NULL;
10148 }
10149 }
10150
10151 if (state != IF_STATE_ACTION)
10152 goto fail;
10153
10154 kfree(filename);
10155 kfree(orig);
10156
10157 return 0;
10158
10159fail:
10160 kfree(filename);
10161 free_filters_list(filters);
10162 kfree(orig);
10163
10164 return ret;
10165}
10166
10167static int
10168perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
10169{
10170 LIST_HEAD(filters);
10171 int ret;
10172
10173 /*
10174 * Since this is called in the perf_ioctl() path, we're already
10175 * holding ctx::mutex.
10176 */
10177 lockdep_assert_held(&event->ctx->mutex);
10178
10179 if (WARN_ON_ONCE(event->parent))
10180 return -EINVAL;
10181
10182 ret = perf_event_parse_addr_filter(event, filter_str, &filters);
10183 if (ret)
10184 goto fail_clear_files;
10185
10186 ret = event->pmu->addr_filters_validate(&filters);
10187 if (ret)
10188 goto fail_free_filters;
10189
10190
10191 perf_addr_filters_splice(event, &filters);
10192
10193
10194 perf_event_for_each_child(event, perf_event_addr_filters_apply);
10195
10196 return ret;
10197
10198fail_free_filters:
10199 free_filters_list(&filters);
10200
10201fail_clear_files:
10202 event->addr_filters.nr_file_filters = 0;
10203
10204 return ret;
10205}
10206
10207static int perf_event_set_filter(struct perf_event *event, void __user *arg)
10208{
10209 int ret = -EINVAL;
10210 char *filter_str;
10211
10212 filter_str = strndup_user(arg, PAGE_SIZE);
10213 if (IS_ERR(filter_str))
10214 return PTR_ERR(filter_str);
10215
10216#ifdef CONFIG_EVENT_TRACING
10217 if (perf_event_is_tracing(event)) {
10218 struct perf_event_context *ctx = event->ctx;
10219
10220 /*
10221 * Beware, here be dragons!!
10222 *
10223 * the tracepoint muck will deadlock against ctx->mutex, but
10224 * the tracepoint stuff does not actually need it. So
10225 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
10226 * already have a reference on ctx.
10227 *
10228 * This can result in event getting moved to a different ctx,
10229 * but that does not affect the tracepoint state.
10230 */
10231 mutex_unlock(&ctx->mutex);
10232 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
10233 mutex_lock(&ctx->mutex);
10234 } else
10235#endif
10236 if (has_addr_filter(event))
10237 ret = perf_event_set_addr_filter(event, filter_str);
10238
10239 kfree(filter_str);
10240 return ret;
10241}
10242
10243
10244
10245
10246
10247static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
10248{
10249 enum hrtimer_restart ret = HRTIMER_RESTART;
10250 struct perf_sample_data data;
10251 struct pt_regs *regs;
10252 struct perf_event *event;
10253 u64 period;
10254
10255 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
10256
10257 if (event->state != PERF_EVENT_STATE_ACTIVE)
10258 return HRTIMER_NORESTART;
10259
10260 event->pmu->read(event);
10261
10262 perf_sample_data_init(&data, 0, event->hw.last_period);
10263 regs = get_irq_regs();
10264
10265 if (regs && !perf_exclude_event(event, regs)) {
10266 if (!(event->attr.exclude_idle && is_idle_task(current)))
10267 if (__perf_event_overflow(event, 1, &data, regs))
10268 ret = HRTIMER_NORESTART;
10269 }
10270
10271 period = max_t(u64, 10000, event->hw.sample_period);
10272 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
10273
10274 return ret;
10275}
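/*
 * Editorial note: the 10000 ns clamp above keeps a misconfigured or
 * frequency-driven event from re-arming the hrtimer at an interval so
 * short that the CPU would do little but service the timer.
 */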
10276
10277static void perf_swevent_start_hrtimer(struct perf_event *event)
10278{
10279 struct hw_perf_event *hwc = &event->hw;
10280 s64 period;
10281
10282 if (!is_sampling_event(event))
10283 return;
10284
10285 period = local64_read(&hwc->period_left);
10286 if (period) {
10287 if (period < 0)
10288 period = 10000;
10289
10290 local64_set(&hwc->period_left, 0);
10291 } else {
10292 period = max_t(u64, 10000, hwc->sample_period);
10293 }
10294 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
10295 HRTIMER_MODE_REL_PINNED_HARD);
10296}
10297
10298static void perf_swevent_cancel_hrtimer(struct perf_event *event)
10299{
10300 struct hw_perf_event *hwc = &event->hw;
10301
10302 if (is_sampling_event(event)) {
10303 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
10304 local64_set(&hwc->period_left, ktime_to_ns(remaining));
10305
10306 hrtimer_cancel(&hwc->hrtimer);
10307 }
10308}
10309
10310static void perf_swevent_init_hrtimer(struct perf_event *event)
10311{
10312 struct hw_perf_event *hwc = &event->hw;
10313
10314 if (!is_sampling_event(event))
10315 return;
10316
10317 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
10318 hwc->hrtimer.function = perf_swevent_hrtimer;
10319
10320
10321
10322
10323
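	/*
	 * Since hrtimers have a fixed rate, we can do a static freq->period
	 * mapping here and avoid the whole period-adjustment feedback loop.
	 */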
10324 if (event->attr.freq) {
10325 long freq = event->attr.sample_freq;
10326
10327 event->attr.sample_period = NSEC_PER_SEC / freq;
10328 hwc->sample_period = event->attr.sample_period;
10329 local64_set(&hwc->period_left, hwc->sample_period);
10330 hwc->last_period = hwc->sample_period;
10331 event->attr.freq = 0;
10332 }
10333}
10334
10335
10336
10337
10338
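/*
 * Software event: cpu wall time clock
 */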
10339static void cpu_clock_event_update(struct perf_event *event)
10340{
10341 s64 prev;
10342 u64 now;
10343
10344 now = local_clock();
10345 prev = local64_xchg(&event->hw.prev_count, now);
10346 local64_add(now - prev, &event->count);
10347}
10348
10349static void cpu_clock_event_start(struct perf_event *event, int flags)
10350{
10351 local64_set(&event->hw.prev_count, local_clock());
10352 perf_swevent_start_hrtimer(event);
10353}
10354
10355static void cpu_clock_event_stop(struct perf_event *event, int flags)
10356{
10357 perf_swevent_cancel_hrtimer(event);
10358 cpu_clock_event_update(event);
10359}
10360
10361static int cpu_clock_event_add(struct perf_event *event, int flags)
10362{
10363 if (flags & PERF_EF_START)
10364 cpu_clock_event_start(event, flags);
10365 perf_event_update_userpage(event);
10366
10367 return 0;
10368}
10369
10370static void cpu_clock_event_del(struct perf_event *event, int flags)
10371{
10372 cpu_clock_event_stop(event, flags);
10373}
10374
10375static void cpu_clock_event_read(struct perf_event *event)
10376{
10377 cpu_clock_event_update(event);
10378}
10379
10380static int cpu_clock_event_init(struct perf_event *event)
10381{
10382 if (event->attr.type != PERF_TYPE_SOFTWARE)
10383 return -ENOENT;
10384
10385 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
10386 return -ENOENT;
10387
10388
10389
10390
10391 if (has_branch_stack(event))
10392 return -EOPNOTSUPP;
10393
10394 perf_swevent_init_hrtimer(event);
10395
10396 return 0;
10397}
10398
10399static struct pmu perf_cpu_clock = {
10400 .task_ctx_nr = perf_sw_context,
10401
10402 .capabilities = PERF_PMU_CAP_NO_NMI,
10403
10404 .event_init = cpu_clock_event_init,
10405 .add = cpu_clock_event_add,
10406 .del = cpu_clock_event_del,
10407 .start = cpu_clock_event_start,
10408 .stop = cpu_clock_event_stop,
10409 .read = cpu_clock_event_read,
10410};
10411
10412
10413
10414
10415
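/*
 * Software event: task time clock
 */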
10416static void task_clock_event_update(struct perf_event *event, u64 now)
10417{
10418 u64 prev;
10419 s64 delta;
10420
10421 prev = local64_xchg(&event->hw.prev_count, now);
10422 delta = now - prev;
10423 local64_add(delta, &event->count);
10424}
10425
10426static void task_clock_event_start(struct perf_event *event, int flags)
10427{
10428 local64_set(&event->hw.prev_count, event->ctx->time);
10429 perf_swevent_start_hrtimer(event);
10430}
10431
10432static void task_clock_event_stop(struct perf_event *event, int flags)
10433{
10434 perf_swevent_cancel_hrtimer(event);
10435 task_clock_event_update(event, event->ctx->time);
10436}
10437
10438static int task_clock_event_add(struct perf_event *event, int flags)
10439{
10440 if (flags & PERF_EF_START)
10441 task_clock_event_start(event, flags);
10442 perf_event_update_userpage(event);
10443
10444 return 0;
10445}
10446
10447static void task_clock_event_del(struct perf_event *event, int flags)
10448{
10449 task_clock_event_stop(event, PERF_EF_UPDATE);
10450}
10451
10452static void task_clock_event_read(struct perf_event *event)
10453{
10454 u64 now = perf_clock();
10455 u64 delta = now - event->ctx->timestamp;
10456 u64 time = event->ctx->time + delta;
10457
10458 task_clock_event_update(event, time);
10459}
10460
10461static int task_clock_event_init(struct perf_event *event)
10462{
10463 if (event->attr.type != PERF_TYPE_SOFTWARE)
10464 return -ENOENT;
10465
10466 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
10467 return -ENOENT;
10468
10469
10470
10471
10472 if (has_branch_stack(event))
10473 return -EOPNOTSUPP;
10474
10475 perf_swevent_init_hrtimer(event);
10476
10477 return 0;
10478}
10479
10480static struct pmu perf_task_clock = {
10481 .task_ctx_nr = perf_sw_context,
10482
10483 .capabilities = PERF_PMU_CAP_NO_NMI,
10484
10485 .event_init = task_clock_event_init,
10486 .add = task_clock_event_add,
10487 .del = task_clock_event_del,
10488 .start = task_clock_event_start,
10489 .stop = task_clock_event_stop,
10490 .read = task_clock_event_read,
10491};
10492
10493static void perf_pmu_nop_void(struct pmu *pmu)
10494{
10495}
10496
10497static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
10498{
10499}
10500
10501static int perf_pmu_nop_int(struct pmu *pmu)
10502{
10503 return 0;
10504}
10505
10506static int perf_event_nop_int(struct perf_event *event, u64 value)
10507{
10508 return 0;
10509}
10510
10511static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
10512
10513static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
10514{
10515 __this_cpu_write(nop_txn_flags, flags);
10516
10517 if (flags & ~PERF_PMU_TXN_ADD)
10518 return;
10519
10520 perf_pmu_disable(pmu);
10521}
10522
10523static int perf_pmu_commit_txn(struct pmu *pmu)
10524{
10525 unsigned int flags = __this_cpu_read(nop_txn_flags);
10526
10527 __this_cpu_write(nop_txn_flags, 0);
10528
10529 if (flags & ~PERF_PMU_TXN_ADD)
10530 return 0;
10531
10532 perf_pmu_enable(pmu);
10533 return 0;
10534}
10535
10536static void perf_pmu_cancel_txn(struct pmu *pmu)
10537{
10538 unsigned int flags = __this_cpu_read(nop_txn_flags);
10539
10540 __this_cpu_write(nop_txn_flags, 0);
10541
10542 if (flags & ~PERF_PMU_TXN_ADD)
10543 return;
10544
10545 perf_pmu_enable(pmu);
10546}
10547
10548static int perf_event_idx_default(struct perf_event *event)
10549{
10550 return 0;
10551}
10552
10553
10554
10555
10556
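/*
 * Ensure that all PMUs sharing the same task_ctx_nr also share the
 * same pmu_cpu_context.
 */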
10557static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
10558{
10559 struct pmu *pmu;
10560
10561 if (ctxn < 0)
10562 return NULL;
10563
10564 list_for_each_entry(pmu, &pmus, entry) {
10565 if (pmu->task_ctx_nr == ctxn)
10566 return pmu->pmu_cpu_context;
10567 }
10568
10569 return NULL;
10570}
10571
10572static void free_pmu_context(struct pmu *pmu)
10573{
10574
10575
10576
10577
10578
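	/*
	 * Static contexts such as perf_sw_context have a global lifetime
	 * and may be shared by several PMUs; don't free those here.
	 */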
10579 if (pmu->task_ctx_nr > perf_invalid_context)
10580 return;
10581
10582 free_percpu(pmu->pmu_cpu_context);
10583}
10584
10585
10586
10587
10588static ssize_t nr_addr_filters_show(struct device *dev,
10589 struct device_attribute *attr,
10590 char *page)
10591{
10592 struct pmu *pmu = dev_get_drvdata(dev);
10593
10594 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
10595}
10596DEVICE_ATTR_RO(nr_addr_filters);
10597
10598static struct idr pmu_idr;
10599
10600static ssize_t
10601type_show(struct device *dev, struct device_attribute *attr, char *page)
10602{
10603 struct pmu *pmu = dev_get_drvdata(dev);
10604
10605 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
10606}
10607static DEVICE_ATTR_RO(type);
10608
10609static ssize_t
10610perf_event_mux_interval_ms_show(struct device *dev,
10611 struct device_attribute *attr,
10612 char *page)
10613{
10614 struct pmu *pmu = dev_get_drvdata(dev);
10615
10616 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
10617}
10618
10619static DEFINE_MUTEX(mux_interval_mutex);
10620
10621static ssize_t
10622perf_event_mux_interval_ms_store(struct device *dev,
10623 struct device_attribute *attr,
10624 const char *buf, size_t count)
10625{
10626 struct pmu *pmu = dev_get_drvdata(dev);
10627 int timer, cpu, ret;
10628
10629 ret = kstrtoint(buf, 0, &timer);
10630 if (ret)
10631 return ret;
10632
10633 if (timer < 1)
10634 return -EINVAL;
10635
10636
10637 if (timer == pmu->hrtimer_interval_ms)
10638 return count;
10639
10640 mutex_lock(&mux_interval_mutex);
10641 pmu->hrtimer_interval_ms = timer;
10642
10643
10644 cpus_read_lock();
10645 for_each_online_cpu(cpu) {
10646 struct perf_cpu_context *cpuctx;
10647 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
10648 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
10649
10650 cpu_function_call(cpu,
10651 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
10652 }
10653 cpus_read_unlock();
10654 mutex_unlock(&mux_interval_mutex);
10655
10656 return count;
10657}
10658static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
10659
10660static struct attribute *pmu_dev_attrs[] = {
10661 &dev_attr_type.attr,
10662 &dev_attr_perf_event_mux_interval_ms.attr,
10663 NULL,
10664};
10665ATTRIBUTE_GROUPS(pmu_dev);
10666
10667static int pmu_bus_running;
10668static struct bus_type pmu_bus = {
10669 .name = "event_source",
10670 .dev_groups = pmu_dev_groups,
10671};
10672
10673static void pmu_dev_release(struct device *dev)
10674{
10675 kfree(dev);
10676}
10677
10678static int pmu_dev_alloc(struct pmu *pmu)
10679{
10680 int ret = -ENOMEM;
10681
10682 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
10683 if (!pmu->dev)
10684 goto out;
10685
10686 pmu->dev->groups = pmu->attr_groups;
10687 device_initialize(pmu->dev);
10688 ret = dev_set_name(pmu->dev, "%s", pmu->name);
10689 if (ret)
10690 goto free_dev;
10691
10692 dev_set_drvdata(pmu->dev, pmu);
10693 pmu->dev->bus = &pmu_bus;
10694 pmu->dev->release = pmu_dev_release;
10695 ret = device_add(pmu->dev);
10696 if (ret)
10697 goto free_dev;
10698
10699
10700 if (pmu->nr_addr_filters)
10701 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
10702
10703 if (ret)
10704 goto del_dev;
10705
10706 if (pmu->attr_update)
10707 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
10708
10709 if (ret)
10710 goto del_dev;
10711
10712out:
10713 return ret;
10714
10715del_dev:
10716 device_del(pmu->dev);
10717
10718free_dev:
10719 put_device(pmu->dev);
10720 goto out;
10721}
10722
10723static struct lock_class_key cpuctx_mutex;
10724static struct lock_class_key cpuctx_lock;
10725
10726int perf_pmu_register(struct pmu *pmu, const char *name, int type)
10727{
10728 int cpu, ret, max = PERF_TYPE_MAX;
10729
10730 mutex_lock(&pmus_lock);
10731 ret = -ENOMEM;
10732 pmu->pmu_disable_count = alloc_percpu(int);
10733 if (!pmu->pmu_disable_count)
10734 goto unlock;
10735
10736 pmu->type = -1;
10737 if (!name)
10738 goto skip_type;
10739 pmu->name = name;
10740
10741 if (type != PERF_TYPE_SOFTWARE) {
10742 if (type >= 0)
10743 max = type;
10744
10745 ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
10746 if (ret < 0)
10747 goto free_pdc;
10748
10749 WARN_ON(type >= 0 && ret != type);
10750
10751 type = ret;
10752 }
10753 pmu->type = type;
10754
10755 if (pmu_bus_running) {
10756 ret = pmu_dev_alloc(pmu);
10757 if (ret)
10758 goto free_idr;
10759 }
10760
10761skip_type:
10762 if (pmu->task_ctx_nr == perf_hw_context) {
10763 static int hw_context_taken = 0;
10764
10765
10766
10767
10768
10769
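		/*
		 * Other than on systems with heterogeneous CPUs, it never
		 * makes sense for two PMUs to share perf_hw_context; demote
		 * later claimants to perf_invalid_context.
		 */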
10770 if (WARN_ON_ONCE(hw_context_taken &&
10771 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
10772 pmu->task_ctx_nr = perf_invalid_context;
10773
10774 hw_context_taken = 1;
10775 }
10776
10777 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
10778 if (pmu->pmu_cpu_context)
10779 goto got_cpu_context;
10780
10781 ret = -ENOMEM;
10782 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
10783 if (!pmu->pmu_cpu_context)
10784 goto free_dev;
10785
10786 for_each_possible_cpu(cpu) {
10787 struct perf_cpu_context *cpuctx;
10788
10789 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
10790 __perf_event_init_context(&cpuctx->ctx);
10791 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
10792 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
10793 cpuctx->ctx.pmu = pmu;
10794 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
10795
10796 __perf_mux_hrtimer_init(cpuctx, cpu);
10797
10798 cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
10799 cpuctx->heap = cpuctx->heap_default;
10800 }
10801
10802got_cpu_context:
10803 if (!pmu->start_txn) {
10804 if (pmu->pmu_enable) {
10805
10806
10807
10808
10809
10810 pmu->start_txn = perf_pmu_start_txn;
10811 pmu->commit_txn = perf_pmu_commit_txn;
10812 pmu->cancel_txn = perf_pmu_cancel_txn;
10813 } else {
10814 pmu->start_txn = perf_pmu_nop_txn;
10815 pmu->commit_txn = perf_pmu_nop_int;
10816 pmu->cancel_txn = perf_pmu_nop_void;
10817 }
10818 }
10819
10820 if (!pmu->pmu_enable) {
10821 pmu->pmu_enable = perf_pmu_nop_void;
10822 pmu->pmu_disable = perf_pmu_nop_void;
10823 }
10824
10825 if (!pmu->check_period)
10826 pmu->check_period = perf_event_nop_int;
10827
10828 if (!pmu->event_idx)
10829 pmu->event_idx = perf_event_idx_default;
10830
10831
10832
10833
10834
10835
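	/*
	 * Keep the PERF_TYPE_SOFTWARE PMUs (which cannot live in the IDR)
	 * at the head of the list so the linear search in perf_init_event()
	 * finds them quickly.
	 */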
10836 if (type == PERF_TYPE_SOFTWARE || !name)
10837 list_add_rcu(&pmu->entry, &pmus);
10838 else
10839 list_add_tail_rcu(&pmu->entry, &pmus);
10840
10841 atomic_set(&pmu->exclusive_cnt, 0);
10842 ret = 0;
10843unlock:
10844 mutex_unlock(&pmus_lock);
10845
10846 return ret;
10847
10848free_dev:
10849 device_del(pmu->dev);
10850 put_device(pmu->dev);
10851
10852free_idr:
10853 if (pmu->type != PERF_TYPE_SOFTWARE)
10854 idr_remove(&pmu_idr, pmu->type);
10855
10856free_pdc:
10857 free_percpu(pmu->pmu_disable_count);
10858 goto unlock;
10859}
10860EXPORT_SYMBOL_GPL(perf_pmu_register);
10861
10862void perf_pmu_unregister(struct pmu *pmu)
10863{
10864 mutex_lock(&pmus_lock);
10865 list_del_rcu(&pmu->entry);
10866
10867
10868
10869
10870
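	/*
	 * The pmus list is walked under both SRCU and regular RCU, so
	 * synchronize against both before tearing the PMU down.
	 */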
10871 synchronize_srcu(&pmus_srcu);
10872 synchronize_rcu();
10873
10874 free_percpu(pmu->pmu_disable_count);
10875 if (pmu->type != PERF_TYPE_SOFTWARE)
10876 idr_remove(&pmu_idr, pmu->type);
10877 if (pmu_bus_running) {
10878 if (pmu->nr_addr_filters)
10879 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
10880 device_del(pmu->dev);
10881 put_device(pmu->dev);
10882 }
10883 free_pmu_context(pmu);
10884 mutex_unlock(&pmus_lock);
10885}
10886EXPORT_SYMBOL_GPL(perf_pmu_unregister);
10887
10888static inline bool has_extended_regs(struct perf_event *event)
10889{
10890 return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
10891 (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
10892}
10893
10894static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
10895{
10896 struct perf_event_context *ctx = NULL;
10897 int ret;
10898
10899 if (!try_module_get(pmu->module))
10900 return -ENODEV;
10901
10902
10903
10904
10905
10906
10907
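	/*
	 * Some pmu->event_init() implementations walk the sibling_list to
	 * validate the group, so for a non-leader event take the group
	 * leader's ctx::mutex to protect that list.
	 */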
10908 if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
10909
10910
10911
10912
10913 ctx = perf_event_ctx_lock_nested(event->group_leader,
10914 SINGLE_DEPTH_NESTING);
10915 BUG_ON(!ctx);
10916 }
10917
10918 event->pmu = pmu;
10919 ret = pmu->event_init(event);
10920
10921 if (ctx)
10922 perf_event_ctx_unlock(event->group_leader, ctx);
10923
10924 if (!ret) {
10925 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
10926 has_extended_regs(event))
10927 ret = -EOPNOTSUPP;
10928
10929 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
10930 event_has_any_exclude_flag(event))
10931 ret = -EINVAL;
10932
10933 if (ret && event->destroy)
10934 event->destroy(event);
10935 }
10936
10937 if (ret)
10938 module_put(pmu->module);
10939
10940 return ret;
10941}
10942
10943static struct pmu *perf_init_event(struct perf_event *event)
10944{
10945 int idx, type, ret;
10946 struct pmu *pmu;
10947
10948 idx = srcu_read_lock(&pmus_srcu);
10949
10950
10951 if (event->parent && event->parent->pmu) {
10952 pmu = event->parent->pmu;
10953 ret = perf_try_init_event(pmu, event);
10954 if (!ret)
10955 goto unlock;
10956 }
10957
10958
10959
10960
10961
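	/*
	 * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE are typically aliases
	 * for PERF_TYPE_RAW, so try that PMU first and fall back below.
	 */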
10962 type = event->attr.type;
10963 if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
10964 type = PERF_TYPE_RAW;
10965
10966again:
10967 rcu_read_lock();
10968 pmu = idr_find(&pmu_idr, type);
10969 rcu_read_unlock();
10970 if (pmu) {
10971 ret = perf_try_init_event(pmu, event);
10972 if (ret == -ENOENT && event->attr.type != type) {
10973 type = event->attr.type;
10974 goto again;
10975 }
10976
10977 if (ret)
10978 pmu = ERR_PTR(ret);
10979
10980 goto unlock;
10981 }
10982
10983 list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
10984 ret = perf_try_init_event(pmu, event);
10985 if (!ret)
10986 goto unlock;
10987
10988 if (ret != -ENOENT) {
10989 pmu = ERR_PTR(ret);
10990 goto unlock;
10991 }
10992 }
10993 pmu = ERR_PTR(-ENOENT);
10994unlock:
10995 srcu_read_unlock(&pmus_srcu, idx);
10996
10997 return pmu;
10998}
10999
11000static void attach_sb_event(struct perf_event *event)
11001{
11002 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
11003
11004 raw_spin_lock(&pel->lock);
11005 list_add_rcu(&event->sb_list, &pel->list);
11006 raw_spin_unlock(&pel->lock);
11007}
11008
11009
11010
11011
11012
11013
11014
11015
11016static void account_pmu_sb_event(struct perf_event *event)
11017{
11018 if (is_sb_event(event))
11019 attach_sb_event(event);
11020}
11021
11022static void account_event_cpu(struct perf_event *event, int cpu)
11023{
11024 if (event->parent)
11025 return;
11026
11027 if (is_cgroup_event(event))
11028 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
11029}
11030
11031
11032static void account_freq_event_nohz(void)
11033{
11034#ifdef CONFIG_NO_HZ_FULL
11035
11036 spin_lock(&nr_freq_lock);
11037 if (atomic_inc_return(&nr_freq_events) == 1)
11038 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
11039 spin_unlock(&nr_freq_lock);
11040#endif
11041}
11042
11043static void account_freq_event(void)
11044{
11045 if (tick_nohz_full_enabled())
11046 account_freq_event_nohz();
11047 else
11048 atomic_inc(&nr_freq_events);
11049}
11050
11051
11052static void account_event(struct perf_event *event)
11053{
11054 bool inc = false;
11055
11056 if (event->parent)
11057 return;
11058
11059 if (event->attach_state & PERF_ATTACH_TASK)
11060 inc = true;
11061 if (event->attr.mmap || event->attr.mmap_data)
11062 atomic_inc(&nr_mmap_events);
11063 if (event->attr.comm)
11064 atomic_inc(&nr_comm_events);
11065 if (event->attr.namespaces)
11066 atomic_inc(&nr_namespaces_events);
11067 if (event->attr.cgroup)
11068 atomic_inc(&nr_cgroup_events);
11069 if (event->attr.task)
11070 atomic_inc(&nr_task_events);
11071 if (event->attr.freq)
11072 account_freq_event();
11073 if (event->attr.context_switch) {
11074 atomic_inc(&nr_switch_events);
11075 inc = true;
11076 }
11077 if (has_branch_stack(event))
11078 inc = true;
11079 if (is_cgroup_event(event))
11080 inc = true;
11081 if (event->attr.ksymbol)
11082 atomic_inc(&nr_ksymbol_events);
11083 if (event->attr.bpf_event)
11084 atomic_inc(&nr_bpf_events);
11085 if (event->attr.text_poke)
11086 atomic_inc(&nr_text_poke_events);
11087
11088 if (inc) {
11089
11090
11091
11092
11093
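		/*
		 * The mutex ensures that static_branch_enable() completes
		 * *before* the perf_sched_count increment becomes visible.
		 */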
11094 if (atomic_inc_not_zero(&perf_sched_count))
11095 goto enabled;
11096
11097 mutex_lock(&perf_sched_mutex);
11098 if (!atomic_read(&perf_sched_count)) {
11099 static_branch_enable(&perf_sched_events);
11100
11101
11102
11103
11104
11105 synchronize_rcu();
11106 }
11107
11108
11109
11110
11111 atomic_inc(&perf_sched_count);
11112 mutex_unlock(&perf_sched_mutex);
11113 }
11114enabled:
11115
11116 account_event_cpu(event, event->cpu);
11117
11118 account_pmu_sb_event(event);
11119}
11120
11121
11122
11123
11124static struct perf_event *
11125perf_event_alloc(struct perf_event_attr *attr, int cpu,
11126 struct task_struct *task,
11127 struct perf_event *group_leader,
11128 struct perf_event *parent_event,
11129 perf_overflow_handler_t overflow_handler,
11130 void *context, int cgroup_fd)
11131{
11132 struct pmu *pmu;
11133 struct perf_event *event;
11134 struct hw_perf_event *hwc;
11135 long err = -EINVAL;
11136
11137 if ((unsigned)cpu >= nr_cpu_ids) {
11138 if (!task || cpu != -1)
11139 return ERR_PTR(-EINVAL);
11140 }
11141
11142 event = kzalloc(sizeof(*event), GFP_KERNEL);
11143 if (!event)
11144 return ERR_PTR(-ENOMEM);
11145
11146
11147
11148
11149
11150 if (!group_leader)
11151 group_leader = event;
11152
11153 mutex_init(&event->child_mutex);
11154 INIT_LIST_HEAD(&event->child_list);
11155
11156 INIT_LIST_HEAD(&event->event_entry);
11157 INIT_LIST_HEAD(&event->sibling_list);
11158 INIT_LIST_HEAD(&event->active_list);
11159 init_event_group(event);
11160 INIT_LIST_HEAD(&event->rb_entry);
11161 INIT_LIST_HEAD(&event->active_entry);
11162 INIT_LIST_HEAD(&event->addr_filters.list);
11163 INIT_HLIST_NODE(&event->hlist_entry);
11164
11165
11166 init_waitqueue_head(&event->waitq);
11167 event->pending_disable = -1;
11168 init_irq_work(&event->pending, perf_pending_event);
11169
11170 mutex_init(&event->mmap_mutex);
11171 raw_spin_lock_init(&event->addr_filters.lock);
11172
11173 atomic_long_set(&event->refcount, 1);
11174 event->cpu = cpu;
11175 event->attr = *attr;
11176 event->group_leader = group_leader;
11177 event->pmu = NULL;
11178 event->oncpu = -1;
11179
11180 event->parent = parent_event;
11181
11182 event->ns = get_pid_ns(task_active_pid_ns(current));
11183 event->id = atomic64_inc_return(&perf_event_id);
11184
11185 event->state = PERF_EVENT_STATE_INACTIVE;
11186
11187 if (task) {
11188 event->attach_state = PERF_ATTACH_TASK;
11189
11190
11191
11192
11193
11194 event->hw.target = get_task_struct(task);
11195 }
11196
11197 event->clock = &local_clock;
11198 if (parent_event)
11199 event->clock = parent_event->clock;
11200
11201 if (!overflow_handler && parent_event) {
11202 overflow_handler = parent_event->overflow_handler;
11203 context = parent_event->overflow_handler_context;
11204#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
11205 if (overflow_handler == bpf_overflow_handler) {
11206 struct bpf_prog *prog = parent_event->prog;
11207
11208 bpf_prog_inc(prog);
11209 event->prog = prog;
11210 event->orig_overflow_handler =
11211 parent_event->orig_overflow_handler;
11212 }
11213#endif
11214 }
11215
11216 if (overflow_handler) {
11217 event->overflow_handler = overflow_handler;
11218 event->overflow_handler_context = context;
11219	} else if (is_write_backward(event)) {
11220 event->overflow_handler = perf_event_output_backward;
11221 event->overflow_handler_context = NULL;
11222 } else {
11223 event->overflow_handler = perf_event_output_forward;
11224 event->overflow_handler_context = NULL;
11225 }
11226
11227 perf_event__state_init(event);
11228
11229 pmu = NULL;
11230
11231 hwc = &event->hw;
11232 hwc->sample_period = attr->sample_period;
11233 if (attr->freq && attr->sample_freq)
11234 hwc->sample_period = 1;
11235 hwc->last_period = hwc->sample_period;
11236
11237 local64_set(&hwc->period_left, hwc->sample_period);
11238
11239
11240
11241
11242
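	/*
	 * PERF_SAMPLE_READ is not supported on inherited events; see
	 * perf_output_read().
	 */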
11243 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
11244 goto err_ns;
11245
11246 if (!has_branch_stack(event))
11247 event->attr.branch_sample_type = 0;
11248
11249 pmu = perf_init_event(event);
11250 if (IS_ERR(pmu)) {
11251 err = PTR_ERR(pmu);
11252 goto err_ns;
11253 }
11254
11255
11256
11257
11258
11259 if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
11260 err = -EINVAL;
11261 goto err_pmu;
11262 }
11263
11264 if (event->attr.aux_output &&
11265 !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
11266 err = -EOPNOTSUPP;
11267 goto err_pmu;
11268 }
11269
11270 if (cgroup_fd != -1) {
11271 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
11272 if (err)
11273 goto err_pmu;
11274 }
11275
11276 err = exclusive_event_init(event);
11277 if (err)
11278 goto err_pmu;
11279
11280 if (has_addr_filter(event)) {
11281 event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
11282 sizeof(struct perf_addr_filter_range),
11283 GFP_KERNEL);
11284 if (!event->addr_filter_ranges) {
11285 err = -ENOMEM;
11286 goto err_per_task;
11287 }
11288
11289
11290
11291
11292
11293 if (event->parent) {
11294 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
11295
11296 raw_spin_lock_irq(&ifh->lock);
11297 memcpy(event->addr_filter_ranges,
11298 event->parent->addr_filter_ranges,
11299 pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
11300 raw_spin_unlock_irq(&ifh->lock);
11301 }
11302
11303
11304 event->addr_filters_gen = 1;
11305 }
11306
11307 if (!event->parent) {
11308 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
11309 err = get_callchain_buffers(attr->sample_max_stack);
11310 if (err)
11311 goto err_addr_filters;
11312 }
11313 }
11314
11315 err = security_perf_event_alloc(event);
11316 if (err)
11317 goto err_callchain_buffer;
11318
11319
11320 account_event(event);
11321
11322 return event;
11323
11324err_callchain_buffer:
11325 if (!event->parent) {
11326 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
11327 put_callchain_buffers();
11328 }
11329err_addr_filters:
11330 kfree(event->addr_filter_ranges);
11331
11332err_per_task:
11333 exclusive_event_destroy(event);
11334
11335err_pmu:
11336 if (is_cgroup_event(event))
11337 perf_detach_cgroup(event);
11338 if (event->destroy)
11339 event->destroy(event);
11340 module_put(pmu->module);
11341err_ns:
11342 if (event->ns)
11343 put_pid_ns(event->ns);
11344 if (event->hw.target)
11345 put_task_struct(event->hw.target);
11346 kfree(event);
11347
11348 return ERR_PTR(err);
11349}
11350
11351static int perf_copy_attr(struct perf_event_attr __user *uattr,
11352 struct perf_event_attr *attr)
11353{
11354 u32 size;
11355 int ret;
11356
11357
11358 memset(attr, 0, sizeof(*attr));
11359
11360 ret = get_user(size, &uattr->size);
11361 if (ret)
11362 return ret;
11363
11364
11365 if (!size)
11366 size = PERF_ATTR_SIZE_VER0;
11367 if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
11368 goto err_size;
11369
11370 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
11371 if (ret) {
11372 if (ret == -E2BIG)
11373 goto err_size;
11374 return ret;
11375 }
11376
11377 attr->size = size;
11378
11379 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
11380 return -EINVAL;
11381
11382 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
11383 return -EINVAL;
11384
11385 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
11386 return -EINVAL;
11387
11388 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
11389 u64 mask = attr->branch_sample_type;
11390
11391
11392 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
11393 return -EINVAL;
11394
11395
11396 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
11397 return -EINVAL;
11398
11399
11400 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
11401
11402
11403 if (!attr->exclude_kernel)
11404 mask |= PERF_SAMPLE_BRANCH_KERNEL;
11405
11406 if (!attr->exclude_user)
11407 mask |= PERF_SAMPLE_BRANCH_USER;
11408
11409 if (!attr->exclude_hv)
11410 mask |= PERF_SAMPLE_BRANCH_HV;
11411
11412
11413
11414 attr->branch_sample_type = mask;
11415 }
11416
11417 if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
11418 ret = perf_allow_kernel(attr);
11419 if (ret)
11420 return ret;
11421 }
11422 }
11423
11424 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
11425 ret = perf_reg_validate(attr->sample_regs_user);
11426 if (ret)
11427 return ret;
11428 }
11429
11430 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
11431 if (!arch_perf_have_user_stack_dump())
11432 return -ENOSYS;
11433
11434
11435
11436
11437
11438
11439 if (attr->sample_stack_user >= USHRT_MAX)
11440 return -EINVAL;
11441 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
11442 return -EINVAL;
11443 }
11444
11445 if (!attr->sample_max_stack)
11446 attr->sample_max_stack = sysctl_perf_event_max_stack;
11447
11448 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
11449 ret = perf_reg_validate(attr->sample_regs_intr);
11450
11451#ifndef CONFIG_CGROUP_PERF
11452 if (attr->sample_type & PERF_SAMPLE_CGROUP)
11453 return -EINVAL;
11454#endif
11455
11456out:
11457 return ret;
11458
11459err_size:
11460 put_user(sizeof(*attr), &uattr->size);
11461 ret = -E2BIG;
11462 goto out;
11463}
11464
11465static int
11466perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
11467{
11468 struct perf_buffer *rb = NULL;
11469 int ret = -EINVAL;
11470
11471 if (!output_event)
11472 goto set;
11473
11474
11475 if (event == output_event)
11476 goto out;
11477
11478
11479
11480
11481 if (output_event->cpu != event->cpu)
11482 goto out;
11483
11484
11485
11486
11487 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
11488 goto out;
11489
11490
11491
11492
11493 if (output_event->clock != event->clock)
11494 goto out;
11495
11496
11497
11498
11499
11500 if (is_write_backward(output_event) != is_write_backward(event))
11501 goto out;
11502
11503
11504
11505
11506 if (has_aux(event) && has_aux(output_event) &&
11507 event->pmu != output_event->pmu)
11508 goto out;
11509
11510set:
11511 mutex_lock(&event->mmap_mutex);
11512
11513 if (atomic_read(&event->mmap_count))
11514 goto unlock;
11515
11516 if (output_event) {
11517
11518 rb = ring_buffer_get(output_event);
11519 if (!rb)
11520 goto unlock;
11521 }
11522
11523 ring_buffer_attach(event, rb);
11524
11525 ret = 0;
11526unlock:
11527 mutex_unlock(&event->mmap_mutex);
11528
11529out:
11530 return ret;
11531}
11532
11533static void mutex_lock_double(struct mutex *a, struct mutex *b)
11534{
11535 if (b < a)
11536 swap(a, b);
11537
11538 mutex_lock(a);
11539 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
11540}
11541
11542static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
11543{
11544 bool nmi_safe = false;
11545
11546 switch (clk_id) {
11547 case CLOCK_MONOTONIC:
11548 event->clock = &ktime_get_mono_fast_ns;
11549 nmi_safe = true;
11550 break;
11551
11552 case CLOCK_MONOTONIC_RAW:
11553 event->clock = &ktime_get_raw_fast_ns;
11554 nmi_safe = true;
11555 break;
11556
11557 case CLOCK_REALTIME:
11558 event->clock = &ktime_get_real_ns;
11559 break;
11560
11561 case CLOCK_BOOTTIME:
11562 event->clock = &ktime_get_boottime_ns;
11563 break;
11564
11565 case CLOCK_TAI:
11566 event->clock = &ktime_get_clocktai_ns;
11567 break;
11568
11569 default:
11570 return -EINVAL;
11571 }
11572
11573 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
11574 return -EINVAL;
11575
11576 return 0;
11577}
11578
11579
11580
11581
11582
11583static struct perf_event_context *
11584__perf_event_ctx_lock_double(struct perf_event *group_leader,
11585 struct perf_event_context *ctx)
11586{
11587 struct perf_event_context *gctx;
11588
11589again:
11590 rcu_read_lock();
11591 gctx = READ_ONCE(group_leader->ctx);
11592 if (!refcount_inc_not_zero(&gctx->refcount)) {
11593 rcu_read_unlock();
11594 goto again;
11595 }
11596 rcu_read_unlock();
11597
11598 mutex_lock_double(&gctx->mutex, &ctx->mutex);
11599
11600 if (group_leader->ctx != gctx) {
11601 mutex_unlock(&ctx->mutex);
11602 mutex_unlock(&gctx->mutex);
11603 put_ctx(gctx);
11604 goto again;
11605 }
11606
11607 return gctx;
11608}
11609
11610
11611
11612
11613
11614
11615
11616
11617
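/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:	target pid
 * @cpu:	target cpu
 * @group_fd:	group leader event fd
 * @flags:	perf event open flags
 */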
11618SYSCALL_DEFINE5(perf_event_open,
11619 struct perf_event_attr __user *, attr_uptr,
11620 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
11621{
11622 struct perf_event *group_leader = NULL, *output_event = NULL;
11623 struct perf_event *event, *sibling;
11624 struct perf_event_attr attr;
11625 struct perf_event_context *ctx, *gctx;
11626 struct file *event_file = NULL;
11627 struct fd group = {NULL, 0};
11628 struct task_struct *task = NULL;
11629 struct pmu *pmu;
11630 int event_fd;
11631 int move_group = 0;
11632 int err;
11633 int f_flags = O_RDWR;
11634 int cgroup_fd = -1;
11635
11636
11637 if (flags & ~PERF_FLAG_ALL)
11638 return -EINVAL;
11639
11640
11641 err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
11642 if (err)
11643 return err;
11644
11645 err = perf_copy_attr(attr_uptr, &attr);
11646 if (err)
11647 return err;
11648
11649 if (!attr.exclude_kernel) {
11650 err = perf_allow_kernel(&attr);
11651 if (err)
11652 return err;
11653 }
11654
11655 if (attr.namespaces) {
11656 if (!perfmon_capable())
11657 return -EACCES;
11658 }
11659
11660 if (attr.freq) {
11661 if (attr.sample_freq > sysctl_perf_event_sample_rate)
11662 return -EINVAL;
11663 } else {
11664 if (attr.sample_period & (1ULL << 63))
11665 return -EINVAL;
11666 }
11667
11668
11669 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
11670 err = perf_allow_kernel(&attr);
11671 if (err)
11672 return err;
11673 }
11674
11675 err = security_locked_down(LOCKDOWN_PERF);
11676 if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
11677
11678 return err;
11679
11680 err = 0;
11681
11682
11683
11684
11685
11686
11687
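	/*
	 * In cgroup mode the pid argument carries the fd of the cgroup
	 * directory in cgroupfs and the cpu argument selects the CPU to
	 * monitor, so both must be valid.
	 */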
11688 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
11689 return -EINVAL;
11690
11691 if (flags & PERF_FLAG_FD_CLOEXEC)
11692 f_flags |= O_CLOEXEC;
11693
11694 event_fd = get_unused_fd_flags(f_flags);
11695 if (event_fd < 0)
11696 return event_fd;
11697
11698 if (group_fd != -1) {
11699 err = perf_fget_light(group_fd, &group);
11700 if (err)
11701 goto err_fd;
11702 group_leader = group.file->private_data;
11703 if (flags & PERF_FLAG_FD_OUTPUT)
11704 output_event = group_leader;
11705 if (flags & PERF_FLAG_FD_NO_GROUP)
11706 group_leader = NULL;
11707 }
11708
11709 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
11710 task = find_lively_task_by_vpid(pid);
11711 if (IS_ERR(task)) {
11712 err = PTR_ERR(task);
11713 goto err_group_fd;
11714 }
11715 }
11716
11717 if (task && group_leader &&
11718 group_leader->attr.inherit != attr.inherit) {
11719 err = -EINVAL;
11720 goto err_task;
11721 }
11722
11723 if (task) {
11724 err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
11725 if (err)
11726 goto err_task;
11727
11728
11729
11730
11731
11732
11733
11734
11735
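		/*
		 * Preserve the ptrace permission check for backwards
		 * compatibility. exec_update_mutex is held across this and
		 * the later perf_install_in_context() to serialize against
		 * exec() changing the task's credentials.
		 */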
11736 err = -EACCES;
11737 if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
11738 goto err_cred;
11739 }
11740
11741 if (flags & PERF_FLAG_PID_CGROUP)
11742 cgroup_fd = pid;
11743
11744 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
11745 NULL, NULL, cgroup_fd);
11746 if (IS_ERR(event)) {
11747 err = PTR_ERR(event);
11748 goto err_cred;
11749 }
11750
11751 if (is_sampling_event(event)) {
11752 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
11753 err = -EOPNOTSUPP;
11754 goto err_alloc;
11755 }
11756 }
11757
11758
11759
11760
11761
11762 pmu = event->pmu;
11763
11764 if (attr.use_clockid) {
11765 err = perf_event_set_clock(event, attr.clockid);
11766 if (err)
11767 goto err_alloc;
11768 }
11769
11770 if (pmu->task_ctx_nr == perf_sw_context)
11771 event->event_caps |= PERF_EV_CAP_SOFTWARE;
11772
11773 if (group_leader) {
11774 if (is_software_event(event) &&
11775 !in_software_context(group_leader)) {
11776
11777
11778
11779
11780
11781
11782
11783
11784 pmu = group_leader->ctx->pmu;
11785 } else if (!is_software_event(event) &&
11786 is_software_event(group_leader) &&
11787 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
11788
11789
11790
11791
11792
11793 move_group = 1;
11794 }
11795 }
11796
11797
11798
11799
11800 ctx = find_get_context(pmu, task, event);
11801 if (IS_ERR(ctx)) {
11802 err = PTR_ERR(ctx);
11803 goto err_alloc;
11804 }
11805
11806
11807
11808
11809 if (group_leader) {
11810 err = -EINVAL;
11811
11812
11813
11814
11815
11816 if (group_leader->group_leader != group_leader)
11817 goto err_context;
11818
11819
11820 if (group_leader->clock != event->clock)
11821 goto err_context;
11822
11823
11824
11825
11826
11827
11828 if (group_leader->cpu != event->cpu)
11829 goto err_context;
11830
11831
11832
11833
11834
11835 if (group_leader->ctx->task != ctx->task)
11836 goto err_context;
11837
11838
11839
11840
11841
11842
11843 if (!move_group && group_leader->ctx != ctx)
11844 goto err_context;
11845
11846
11847
11848
11849 if (attr.exclusive || attr.pinned)
11850 goto err_context;
11851 }
11852
11853 if (output_event) {
11854 err = perf_event_set_output(event, output_event);
11855 if (err)
11856 goto err_context;
11857 }
11858
11859 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
11860 f_flags);
11861 if (IS_ERR(event_file)) {
11862 err = PTR_ERR(event_file);
11863 event_file = NULL;
11864 goto err_context;
11865 }
11866
11867 if (move_group) {
11868 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
11869
11870 if (gctx->task == TASK_TOMBSTONE) {
11871 err = -ESRCH;
11872 goto err_locked;
11873 }
11874
11875
11876
11877
11878
11879 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
11880
11881
11882
11883
11884
11885 if (gctx != ctx) {
11886 err = -EINVAL;
11887 goto err_locked;
11888 } else {
11889 perf_event_ctx_unlock(group_leader, gctx);
11890 move_group = 0;
11891 }
11892 }
11893
11894
11895
11896
11897 err = -EBUSY;
11898 if (!exclusive_event_installable(group_leader, ctx))
11899 goto err_locked;
11900
11901 for_each_sibling_event(sibling, group_leader) {
11902 if (!exclusive_event_installable(sibling, ctx))
11903 goto err_locked;
11904 }
11905 } else {
11906 mutex_lock(&ctx->mutex);
11907 }
11908
11909 if (ctx->task == TASK_TOMBSTONE) {
11910 err = -ESRCH;
11911 goto err_locked;
11912 }
11913
11914 if (!perf_event_validate_size(event)) {
11915 err = -E2BIG;
11916 goto err_locked;
11917 }
11918
11919 if (!task) {
11920
11921
11922
11923
11924
11925
11926 struct perf_cpu_context *cpuctx =
11927 container_of(ctx, struct perf_cpu_context, ctx);
11928
11929 if (!cpuctx->online) {
11930 err = -ENODEV;
11931 goto err_locked;
11932 }
11933 }
11934
11935 if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
11936 err = -EINVAL;
11937 goto err_locked;
11938 }
11939
11940
11941
11942
11943
11944 if (!exclusive_event_installable(event, ctx)) {
11945 err = -EBUSY;
11946 goto err_locked;
11947 }
11948
11949 WARN_ON_ONCE(ctx->parent_ctx);
11950
11951
11952
11953
11954
11955
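	/*
	 * This is the point of no return; we cannot fail hereafter. This is
	 * where we start modifying current state.
	 */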
11956 if (move_group) {
11957
11958
11959
11960
11961 perf_remove_from_context(group_leader, 0);
11962 put_ctx(gctx);
11963
11964 for_each_sibling_event(sibling, group_leader) {
11965 perf_remove_from_context(sibling, 0);
11966 put_ctx(gctx);
11967 }
11968
11969
11970
11971
11972
11973 synchronize_rcu();
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985 for_each_sibling_event(sibling, group_leader) {
11986 perf_event__state_init(sibling);
11987 perf_install_in_context(ctx, sibling, sibling->cpu);
11988 get_ctx(ctx);
11989 }
11990
11991
11992
11993
11994
11995
11996 perf_event__state_init(group_leader);
11997 perf_install_in_context(ctx, group_leader, group_leader->cpu);
11998 get_ctx(ctx);
11999 }
12000
12001
12002
12003
12004
12005
12006
12007 perf_event__header_size(event);
12008 perf_event__id_header_size(event);
12009
12010 event->owner = current;
12011
12012 perf_install_in_context(ctx, event, event->cpu);
12013 perf_unpin_context(ctx);
12014
12015 if (move_group)
12016 perf_event_ctx_unlock(group_leader, gctx);
12017 mutex_unlock(&ctx->mutex);
12018
12019 if (task) {
12020 mutex_unlock(&task->signal->exec_update_mutex);
12021 put_task_struct(task);
12022 }
12023
12024	mutex_lock(&current->perf_event_mutex);
12025	list_add_tail(&event->owner_entry, &current->perf_event_list);
12026	mutex_unlock(&current->perf_event_mutex);
12027
12028
12029
12030
12031
12032
12033
12034 fdput(group);
12035 fd_install(event_fd, event_file);
12036 return event_fd;
12037
12038err_locked:
12039 if (move_group)
12040 perf_event_ctx_unlock(group_leader, gctx);
12041 mutex_unlock(&ctx->mutex);
12042
12043 fput(event_file);
12044err_context:
12045 perf_unpin_context(ctx);
12046 put_ctx(ctx);
12047err_alloc:
12048
12049
12050
12051
12052 if (!event_file)
12053 free_event(event);
12054err_cred:
12055 if (task)
12056 mutex_unlock(&task->signal->exec_update_mutex);
12057err_task:
12058 if (task)
12059 put_task_struct(task);
12060err_group_fd:
12061 fdput(group);
12062err_fd:
12063 put_unused_fd(event_fd);
12064 return err;
12065}
12066
12067
12068
12069
12070
12071
12072
12073
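/**
 * perf_event_create_kernel_counter - create an in-kernel event
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu on which the counter is bound
 * @task: task to profile (NULL for per-cpu)
 * @overflow_handler: callback to invoke on event overflow
 * @context: opaque context passed to the overflow handler
 */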
12074struct perf_event *
12075perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
12076 struct task_struct *task,
12077 perf_overflow_handler_t overflow_handler,
12078 void *context)
12079{
12080 struct perf_event_context *ctx;
12081 struct perf_event *event;
12082 int err;
12083
12084
12085
12086
12087
12088 if (attr->aux_output)
12089 return ERR_PTR(-EINVAL);
12090
12091 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
12092 overflow_handler, context, -1);
12093 if (IS_ERR(event)) {
12094 err = PTR_ERR(event);
12095 goto err;
12096 }
12097
12098
12099 event->owner = TASK_TOMBSTONE;
12100
12101
12102
12103
12104 ctx = find_get_context(event->pmu, task, event);
12105 if (IS_ERR(ctx)) {
12106 err = PTR_ERR(ctx);
12107 goto err_free;
12108 }
12109
12110 WARN_ON_ONCE(ctx->parent_ctx);
12111 mutex_lock(&ctx->mutex);
12112 if (ctx->task == TASK_TOMBSTONE) {
12113 err = -ESRCH;
12114 goto err_unlock;
12115 }
12116
12117 if (!task) {
12118
12119
12120
12121
12122
12123
12124 struct perf_cpu_context *cpuctx =
12125 container_of(ctx, struct perf_cpu_context, ctx);
12126 if (!cpuctx->online) {
12127 err = -ENODEV;
12128 goto err_unlock;
12129 }
12130 }
12131
12132 if (!exclusive_event_installable(event, ctx)) {
12133 err = -EBUSY;
12134 goto err_unlock;
12135 }
12136
12137 perf_install_in_context(ctx, event, event->cpu);
12138 perf_unpin_context(ctx);
12139 mutex_unlock(&ctx->mutex);
12140
12141 return event;
12142
12143err_unlock:
12144 mutex_unlock(&ctx->mutex);
12145 perf_unpin_context(ctx);
12146 put_ctx(ctx);
12147err_free:
12148 free_event(event);
12149err:
12150 return ERR_PTR(err);
12151}
12152EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
12153
12154void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
12155{
12156 struct perf_event_context *src_ctx;
12157 struct perf_event_context *dst_ctx;
12158 struct perf_event *event, *tmp;
12159 LIST_HEAD(events);
12160
12161 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
12162 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
12163
12164
12165
12166
12167
12168 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
12169 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
12170 event_entry) {
12171 perf_remove_from_context(event, 0);
12172 unaccount_event_cpu(event, src_cpu);
12173 put_ctx(src_ctx);
12174 list_add(&event->migrate_entry, &events);
12175 }
12176
12177
12178
12179
12180 synchronize_rcu();
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12191 if (event->group_leader == event)
12192 continue;
12193
12194 list_del(&event->migrate_entry);
12195 if (event->state >= PERF_EVENT_STATE_OFF)
12196 event->state = PERF_EVENT_STATE_INACTIVE;
12197 account_event_cpu(event, dst_cpu);
12198 perf_install_in_context(dst_ctx, event, dst_cpu);
12199 get_ctx(dst_ctx);
12200 }
12201
12202
12203
12204
12205
12206 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12207 list_del(&event->migrate_entry);
12208 if (event->state >= PERF_EVENT_STATE_OFF)
12209 event->state = PERF_EVENT_STATE_INACTIVE;
12210 account_event_cpu(event, dst_cpu);
12211 perf_install_in_context(dst_ctx, event, dst_cpu);
12212 get_ctx(dst_ctx);
12213 }
12214 mutex_unlock(&dst_ctx->mutex);
12215 mutex_unlock(&src_ctx->mutex);
12216}
12217EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
12218
12219static void sync_child_event(struct perf_event *child_event,
12220 struct task_struct *child)
12221{
12222 struct perf_event *parent_event = child_event->parent;
12223 u64 child_val;
12224
12225 if (child_event->attr.inherit_stat)
12226 perf_event_read_event(child_event, child);
12227
12228 child_val = perf_event_count(child_event);
12229
12230
12231
12232
12233 atomic64_add(child_val, &parent_event->child_count);
12234 atomic64_add(child_event->total_time_enabled,
12235 &parent_event->child_total_time_enabled);
12236 atomic64_add(child_event->total_time_running,
12237 &parent_event->child_total_time_running);
12238}
12239
12240static void
12241perf_event_exit_event(struct perf_event *child_event,
12242 struct perf_event_context *child_ctx,
12243 struct task_struct *child)
12244{
12245 struct perf_event *parent_event = child_event->parent;
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259 raw_spin_lock_irq(&child_ctx->lock);
12260 WARN_ON_ONCE(child_ctx->is_active);
12261
12262 if (parent_event)
12263 perf_group_detach(child_event);
12264 list_del_event(child_event, child_ctx);
12265 perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT);
12266 raw_spin_unlock_irq(&child_ctx->lock);
12267
12268
12269
12270
12271 if (!parent_event) {
12272 perf_event_wakeup(child_event);
12273 return;
12274 }
12275
12276
12277
12278
12279 sync_child_event(child_event, child);
12280
12281
12282
12283
12284 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
12285 mutex_lock(&parent_event->child_mutex);
12286 list_del_init(&child_event->child_list);
12287 mutex_unlock(&parent_event->child_mutex);
12288
12289
12290
12291
12292 perf_event_wakeup(parent_event);
12293 free_event(child_event);
12294 put_event(parent_event);
12295}
12296
12297static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
12298{
12299 struct perf_event_context *child_ctx, *clone_ctx = NULL;
12300 struct perf_event *child_event, *next;
12301
12302 WARN_ON_ONCE(child != current);
12303
12304 child_ctx = perf_pin_task_context(child, ctxn);
12305 if (!child_ctx)
12306 return;
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318 mutex_lock(&child_ctx->mutex);
12319
12320
12321
12322
12323
12324
12325 raw_spin_lock_irq(&child_ctx->lock);
12326 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
12327
12328
12329
12330
12331
12332 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
12333 put_ctx(child_ctx);
12334 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
12335 put_task_struct(current);
12336
12337 clone_ctx = unclone_ctx(child_ctx);
12338 raw_spin_unlock_irq(&child_ctx->lock);
12339
12340 if (clone_ctx)
12341 put_ctx(clone_ctx);
12342
12343
12344
12345
12346
12347
12348 perf_event_task(child, child_ctx, 0);
12349
12350 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
12351 perf_event_exit_event(child_event, child_ctx, child);
12352
12353 mutex_unlock(&child_ctx->mutex);
12354
12355 put_ctx(child_ctx);
12356}
12357
12358
12359
12360
12361
12362
12363
12364void perf_event_exit_task(struct task_struct *child)
12365{
12366 struct perf_event *event, *tmp;
12367 int ctxn;
12368
12369 mutex_lock(&child->perf_event_mutex);
12370 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
12371 owner_entry) {
12372 list_del_init(&event->owner_entry);
12373
12374
12375
12376
12377
12378
12379 smp_store_release(&event->owner, NULL);
12380 }
12381 mutex_unlock(&child->perf_event_mutex);
12382
12383 for_each_task_context_nr(ctxn)
12384 perf_event_exit_task_context(child, ctxn);
12385
12386
12387
12388
12389
12390
12391
12392 perf_event_task(child, NULL, 0);
12393}
12394
12395static void perf_free_event(struct perf_event *event,
12396 struct perf_event_context *ctx)
12397{
12398 struct perf_event *parent = event->parent;
12399
12400 if (WARN_ON_ONCE(!parent))
12401 return;
12402
12403 mutex_lock(&parent->child_mutex);
12404 list_del_init(&event->child_list);
12405 mutex_unlock(&parent->child_mutex);
12406
12407 put_event(parent);
12408
12409 raw_spin_lock_irq(&ctx->lock);
12410 perf_group_detach(event);
12411 list_del_event(event, ctx);
12412 raw_spin_unlock_irq(&ctx->lock);
12413 free_event(event);
12414}
12415
12416
12417
12418
12419
12420
12421
12422
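/*
 * Free a context as created by inheritance by perf_event_init_task() below,
 * used by fork() in case of failure.
 *
 * Even though the task has never lived, the context and events have been
 * exposed via the child_list, so we must take care tearing it all down.
 */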
12423void perf_event_free_task(struct task_struct *task)
12424{
12425 struct perf_event_context *ctx;
12426 struct perf_event *event, *tmp;
12427 int ctxn;
12428
12429 for_each_task_context_nr(ctxn) {
12430 ctx = task->perf_event_ctxp[ctxn];
12431 if (!ctx)
12432 continue;
12433
12434 mutex_lock(&ctx->mutex);
12435 raw_spin_lock_irq(&ctx->lock);
12436
12437
12438
12439
12440
12441
12442 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
12443 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
12444 put_task_struct(task);
12445 raw_spin_unlock_irq(&ctx->lock);
12446
12447 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
12448 perf_free_event(event, ctx);
12449
12450 mutex_unlock(&ctx->mutex);
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466 wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
12467 put_ctx(ctx);
12468 }
12469}
12470
12471void perf_event_delayed_put(struct task_struct *task)
12472{
12473 int ctxn;
12474
12475 for_each_task_context_nr(ctxn)
12476 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
12477}
12478
12479struct file *perf_event_get(unsigned int fd)
12480{
12481 struct file *file = fget(fd);
12482 if (!file)
12483 return ERR_PTR(-EBADF);
12484
12485 if (file->f_op != &perf_fops) {
12486 fput(file);
12487 return ERR_PTR(-EBADF);
12488 }
12489
12490 return file;
12491}
12492
12493const struct perf_event *perf_get_event(struct file *file)
12494{
12495 if (file->f_op != &perf_fops)
12496 return ERR_PTR(-EINVAL);
12497
12498 return file->private_data;
12499}
12500
12501const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
12502{
12503 if (!event)
12504 return ERR_PTR(-EINVAL);
12505
12506 return &event->attr;
12507}
12508
12509
12510
12511
12512
12513
12514
12515
12516
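/*
 * Inherit an event from parent task to child task.
 *
 * Returns:
 *  - a valid pointer on success
 *  - NULL if the parent event was orphaned meanwhile
 *  - an ERR_PTR() on error
 */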
12517static struct perf_event *
12518inherit_event(struct perf_event *parent_event,
12519 struct task_struct *parent,
12520 struct perf_event_context *parent_ctx,
12521 struct task_struct *child,
12522 struct perf_event *group_leader,
12523 struct perf_event_context *child_ctx)
12524{
12525 enum perf_event_state parent_state = parent_event->state;
12526 struct perf_event *child_event;
12527 unsigned long flags;
12528
12529
12530
12531
12532
12533
12534
12535 if (parent_event->parent)
12536 parent_event = parent_event->parent;
12537
12538 child_event = perf_event_alloc(&parent_event->attr,
12539 parent_event->cpu,
12540 child,
12541 group_leader, parent_event,
12542 NULL, NULL, -1);
12543 if (IS_ERR(child_event))
12544 return child_event;
12545
12546
12547 if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
12548 !child_ctx->task_ctx_data) {
12549 struct pmu *pmu = child_event->pmu;
12550
12551 child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
12552 if (!child_ctx->task_ctx_data) {
12553 free_event(child_event);
12554 return ERR_PTR(-ENOMEM);
12555 }
12556 }
12557
12558
12559
12560
12561
12562
12563
12564 mutex_lock(&parent_event->child_mutex);
12565 if (is_orphaned_event(parent_event) ||
12566 !atomic_long_inc_not_zero(&parent_event->refcount)) {
12567 mutex_unlock(&parent_event->child_mutex);
12568
12569 free_event(child_event);
12570 return NULL;
12571 }
12572
12573 get_ctx(child_ctx);
12574
12575
12576
12577
12578
12579
12580 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
12581 child_event->state = PERF_EVENT_STATE_INACTIVE;
12582 else
12583 child_event->state = PERF_EVENT_STATE_OFF;
12584
12585 if (parent_event->attr.freq) {
12586 u64 sample_period = parent_event->hw.sample_period;
12587 struct hw_perf_event *hwc = &child_event->hw;
12588
12589 hwc->sample_period = sample_period;
12590 hwc->last_period = sample_period;
12591
12592 local64_set(&hwc->period_left, sample_period);
12593 }
12594
12595 child_event->ctx = child_ctx;
12596 child_event->overflow_handler = parent_event->overflow_handler;
12597 child_event->overflow_handler_context
12598 = parent_event->overflow_handler_context;
12599
12600
12601
12602
12603 perf_event__header_size(child_event);
12604 perf_event__id_header_size(child_event);
12605
12606
12607
12608
12609 raw_spin_lock_irqsave(&child_ctx->lock, flags);
12610 add_event_to_ctx(child_event, child_ctx);
12611 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
12612
12613
12614
12615
12616 list_add_tail(&child_event->child_list, &parent_event->child_list);
12617 mutex_unlock(&parent_event->child_mutex);
12618
12619 return child_event;
12620}
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632static int inherit_group(struct perf_event *parent_event,
12633 struct task_struct *parent,
12634 struct perf_event_context *parent_ctx,
12635 struct task_struct *child,
12636 struct perf_event_context *child_ctx)
12637{
12638 struct perf_event *leader;
12639 struct perf_event *sub;
12640 struct perf_event *child_ctr;
12641
12642 leader = inherit_event(parent_event, parent, parent_ctx,
12643 child, NULL, child_ctx);
12644 if (IS_ERR(leader))
12645 return PTR_ERR(leader);
12646
12647
12648
12649
12650
12651 for_each_sibling_event(sub, parent_event) {
12652 child_ctr = inherit_event(sub, parent, parent_ctx,
12653 child, leader, child_ctx);
12654 if (IS_ERR(child_ctr))
12655 return PTR_ERR(child_ctr);
12656
12657 if (sub->aux_event == parent_event && child_ctr &&
12658 !perf_get_aux_event(child_ctr, leader))
12659 return -EINVAL;
12660 }
12661 return 0;
12662}
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675static int
12676inherit_task_group(struct perf_event *event, struct task_struct *parent,
12677 struct perf_event_context *parent_ctx,
12678 struct task_struct *child, int ctxn,
12679 int *inherited_all)
12680{
12681 int ret;
12682 struct perf_event_context *child_ctx;
12683
12684 if (!event->attr.inherit) {
12685 *inherited_all = 0;
12686 return 0;
12687 }
12688
12689 child_ctx = child->perf_event_ctxp[ctxn];
12690 if (!child_ctx) {
12691
12692
12693
12694
12695
12696
12697 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
12698 if (!child_ctx)
12699 return -ENOMEM;
12700
12701 child->perf_event_ctxp[ctxn] = child_ctx;
12702 }
12703
12704 ret = inherit_group(event, parent, parent_ctx,
12705 child, child_ctx);
12706
12707 if (ret)
12708 *inherited_all = 0;
12709
12710 return ret;
12711}
12712
12713
12714
12715
12716static int perf_event_init_context(struct task_struct *child, int ctxn)
12717{
12718 struct perf_event_context *child_ctx, *parent_ctx;
12719 struct perf_event_context *cloned_ctx;
12720 struct perf_event *event;
12721 struct task_struct *parent = current;
12722 int inherited_all = 1;
12723 unsigned long flags;
12724 int ret = 0;
12725
12726 if (likely(!parent->perf_event_ctxp[ctxn]))
12727 return 0;
12728
12729
12730
12731
12732
12733 parent_ctx = perf_pin_task_context(parent, ctxn);
12734 if (!parent_ctx)
12735 return 0;
12736
12737
12738
12739
12740
12741
12742
12743
12744
12745
12746
12747
12748 mutex_lock(&parent_ctx->mutex);
12749
12750
12751
12752
12753
12754 perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
12755 ret = inherit_task_group(event, parent, parent_ctx,
12756 child, ctxn, &inherited_all);
12757 if (ret)
12758 goto out_unlock;
12759 }
12760
12761
12762
12763
12764
12765
12766 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
12767 parent_ctx->rotate_disable = 1;
12768 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
12769
12770 perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
12771 ret = inherit_task_group(event, parent, parent_ctx,
12772 child, ctxn, &inherited_all);
12773 if (ret)
12774 goto out_unlock;
12775 }
12776
12777 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
12778 parent_ctx->rotate_disable = 0;
12779
12780 child_ctx = child->perf_event_ctxp[ctxn];
12781
12782 if (child_ctx && inherited_all) {
12783
12784
12785
12786
12787
12788
12789
12790 cloned_ctx = parent_ctx->parent_ctx;
12791 if (cloned_ctx) {
12792 child_ctx->parent_ctx = cloned_ctx;
12793 child_ctx->parent_gen = parent_ctx->parent_gen;
12794 } else {
12795 child_ctx->parent_ctx = parent_ctx;
12796 child_ctx->parent_gen = parent_ctx->generation;
12797 }
12798 get_ctx(child_ctx->parent_ctx);
12799 }
12800
12801 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
12802out_unlock:
12803 mutex_unlock(&parent_ctx->mutex);
12804
12805 perf_unpin_context(parent_ctx);
12806 put_ctx(parent_ctx);
12807
12808 return ret;
12809}
12810
12811
12812
12813
int perf_event_init_task(struct task_struct *child)
{
	int ctxn, ret;

	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	for_each_task_context_nr(ctxn) {
		ret = perf_event_init_context(child, ctxn);
		if (ret) {
			perf_event_free_task(child);
			return ret;
		}
	}

	return 0;
}

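/*
 * Boot-time setup: allocate the online-CPU mask and initialize, for every
 * possible CPU, the software-event hash table, the active-context list and
 * the side-band event list (plus the cgroup context list when enabled).
 */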
static void __init perf_event_init_all_cpus(void)
{
	struct swevent_htable *swhash;
	int cpu;

	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);

	for_each_possible_cpu(cpu) {
		swhash = &per_cpu(swevent_htable, cpu);
		mutex_init(&swhash->hlist_mutex);
		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));

		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

#ifdef CONFIG_CGROUP_PERF
		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
	}
}

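/*
 * Allocate the software-event hlist for @cpu if software events still hold
 * a reference to it; run when the CPU is brought online.
 */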
static void perf_swevent_init_cpu(unsigned int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
		struct swevent_hlist *hlist;

		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
		WARN_ON(!hlist);
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
	}
	mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
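/*
 * Runs via IPI on the CPU that is going down: update the context time and
 * detach every event from the CPU context so the CPU can go offline.
 */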
static void __perf_event_exit_context(void *__info)
{
	struct perf_event_context *ctx = __info;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event *event;

	raw_spin_lock(&ctx->lock);
	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
	list_for_each_entry(event, &ctx->event_list, event_entry)
		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
	raw_spin_unlock(&ctx->lock);
}

static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;

	mutex_lock(&pmus_lock);
	list_for_each_entry(pmu, &pmus, entry) {
		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		mutex_lock(&ctx->mutex);
		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
		cpuctx->online = 0;
		mutex_unlock(&ctx->mutex);
	}
	cpumask_clear_cpu(cpu, perf_online_mask);
	mutex_unlock(&pmus_lock);
}
#else

static void perf_event_exit_cpu_context(int cpu) { }

#endif

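/*
 * CPU hotplug online callback: set up the software-event state for @cpu and
 * mark each PMU's per-CPU context as online.
 */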
int perf_event_init_cpu(unsigned int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;

	perf_swevent_init_cpu(cpu);

	mutex_lock(&pmus_lock);
	cpumask_set_cpu(cpu, perf_online_mask);
	list_for_each_entry(pmu, &pmus, entry) {
		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		mutex_lock(&ctx->mutex);
		cpuctx->online = 1;
		mutex_unlock(&ctx->mutex);
	}
	mutex_unlock(&pmus_lock);

	return 0;
}

int perf_event_exit_cpu(unsigned int cpu)
{
	perf_event_exit_cpu_context(cpu);
	return 0;
}

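/*
 * Reboot notifier callback: run perf_event_exit_cpu() on every online CPU
 * before the machine goes down.
 */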
static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
	int cpu;

	for_each_online_cpu(cpu)
		perf_event_exit_cpu(cpu);

	return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
	.notifier_call = perf_reboot,
	.priority = INT_MIN,
};

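/*
 * Core boot-time initialization: register the built-in software, cpu-clock,
 * task-clock and tracepoint PMUs, bring up the boot CPU's contexts and hook
 * up the reboot notifier and hardware-breakpoint support.
 */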
void __init perf_event_init(void)
{
	int ret;

	idr_init(&pmu_idr);

	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
	perf_pmu_register(&perf_cpu_clock, NULL, -1);
	perf_pmu_register(&perf_task_clock, NULL, -1);
	perf_tp_register();
	perf_event_init_cpu(smp_processor_id());
	register_reboot_notifier(&perf_reboot_notifier);

	ret = init_hw_breakpoint();
	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

	/*
	 * Build time assertion that we keep the data_head at the intended
	 * location.  IOW, validation we got the __reserved[] size right.
	 */
	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
		     != 1024);
}

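/*
 * sysfs ->show() handler used for PMU "events" attributes. As an
 * illustrative sketch only (assuming the PMU_EVENT_ATTR_STRING() helper
 * from <linux/perf_event.h>; the variable name below is made up for the
 * example), a driver that declares
 *
 *	PMU_EVENT_ATTR_STRING(cycles, my_pmu_attr_cycles, "event=0x3c");
 *
 * ends up with "event=0x3c" printed by this routine when userspace reads
 * the attribute.
 */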
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s\n", pmu_attr->event_str);

	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);

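/*
 * Device initcall: register the PMU device bus and create a sysfs device
 * for every PMU that was registered before the bus came up.
 */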
static int __init perf_event_sysfs_init(void)
{
	struct pmu *pmu;
	int ret;

	mutex_lock(&pmus_lock);

	ret = bus_register(&pmu_bus);
	if (ret)
		goto unlock;

	list_for_each_entry(pmu, &pmus, entry) {
		if (!pmu->name || pmu->type < 0)
			continue;

		ret = pmu_dev_alloc(pmu);
		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
	}
	pmu_bus_running = 1;
	ret = 0;

unlock:
	mutex_unlock(&pmus_lock);

	return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
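/*
 * Allocate the per-cgroup perf state, including the per-CPU time-tracking
 * info, when a new perf_event cgroup is created.
 */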
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct perf_cgroup *jc;

	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
	if (!jc)
		return ERR_PTR(-ENOMEM);

	jc->info = alloc_percpu(struct perf_cgroup_info);
	if (!jc->info) {
		kfree(jc);
		return ERR_PTR(-ENOMEM);
	}

	return &jc->css;
}

static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);

	free_percpu(jc->info);
	kfree(jc);
}

static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
	perf_event_cgroup(css->cgroup);
	return 0;
}

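/*
 * When tasks are attached to the cgroup, switch their cgroup events out and
 * back in on the task's CPU so the new cgroup membership takes effect.
 */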
static int __perf_cgroup_move(void *info)
{
	struct task_struct *task = info;

	rcu_read_lock();
	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
	rcu_read_unlock();
	return 0;
}

static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		task_function_call(task, __perf_cgroup_move, task);
}

struct cgroup_subsys perf_event_cgrp_subsys = {
	.css_alloc = perf_cgroup_css_alloc,
	.css_free = perf_cgroup_css_free,
	.css_online = perf_cgroup_css_online,
	.attach = perf_cgroup_attach,

	/*
	 * Implicitly enable on the default hierarchy so that perf events can
	 * always be filtered by cgroup v2 path as long as the perf_event
	 * controller is not mounted on a legacy hierarchy.
	 */
	.implicit_on_dfl = true,
	.threaded = true,
};
#endif