// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>

#include "internal.h"

#include <asm/irq_regs.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		/* -EAGAIN */
		if (task_cpu(p) != smp_processor_id())
			return;

		/*
		 * Now that we're on right CPU with IRQs disabled, we can test
		 * if we hit the right task without races.
		 */
		tfc->ret = -ESRCH; /* No such (running) process */
		if (p != current)
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.
 *
 * returns: @func return value, or
 *	    -ESRCH  - when the process isn't running
 *	    -EAGAIN - when the process moved away
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,
	};
	int ret;

	do {
		ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
		if (!ret)
			ret = data.ret;
	} while (ret == -EAGAIN);

	return ret;
}

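/*
 * Illustrative sketch (not part of the original file): a typical caller
 * wraps the work in an int (*)(void *) callback and lets the helper above
 * retry until the IPI lands while the task is still on that CPU:
 *
 *	static int do_poke(void *info)
 *	{
 *		struct perf_event *event = info;
 *		...	// runs on the task's CPU with IRQs disabled
 *		return 0;
 *	}
 *
 *	err = task_function_call(task, do_poke, event);
 *
 * do_poke() and its use here are hypothetical names for illustration only.
 */
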
/**
 * cpu_function_call - call a function on the cpu
 * @cpu:	target cpu to queue this function
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx)
		raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		raw_spin_unlock(&ctx->lock);
	raw_spin_unlock(&cpuctx->ctx.lock);
}

#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in an extra reference count, and as such we cannot
 * rely on ctx->is_active and therefore cannot use ctx->is_active in
 * event_function_call() et al.
 *
 * Instead, modifications to an event are funneled through
 * event_function_call() below, which either IPIs the CPU the event's task
 * is running on, or, when the task is not running, performs the change
 * under ctx->lock; either way the modification is serialized against
 * context switches of that task.
 */

typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
			struct perf_event_context *, void *);

struct event_function_struct {
	struct perf_event *event;
	event_f func;
	void *data;
};

static int event_function(void *info)
{
	struct event_function_struct *efs = info;
	struct perf_event *event = efs->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	int ret = 0;

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, task_ctx);
	/*
	 * Since we do the IPI call without holding ctx->lock things can have
	 * changed, double check we hit the task we set out to hit.
	 */
	if (ctx->task) {
		if (ctx->task != current) {
			ret = -ESRCH;
			goto unlock;
		}

		/*
		 * We only use event_function_call() on established contexts,
		 * and event_function() is only ever called when active (or
		 * rather, we'll have bailed in task_function_call() or the
		 * above ctx->task != current test), therefore we must have
		 * ctx->is_active here.
		 */
		WARN_ON_ONCE(!ctx->is_active);
		/*
		 * And since we have ctx->is_active, cpuctx->task_ctx must
		 * match.
		 */
		WARN_ON_ONCE(task_ctx != ctx);
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	efs->func(event, cpuctx, ctx, efs->data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}

static void event_function_call(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
	struct event_function_struct efs = {
		.event = event,
		.func = func,
		.data = data,
	};

	if (!event->parent) {
		/*
		 * If this is a !child event, we must hold ctx::mutex to
		 * stabilize the event->ctx relation. See
		 * perf_event_ctx_lock().
		 */
		lockdep_assert_held(&ctx->mutex);
	}

	if (!task) {
		cpu_function_call(event->cpu, event_function, &efs);
		return;
	}

	if (task == TASK_TOMBSTONE)
		return;

again:
	if (!task_function_call(task, event_function, &efs))
		return;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * Reload the task pointer, it might have been changed by
	 * a concurrent perf_event_context_sched_out().
	 */
	task = ctx->task;
	if (task == TASK_TOMBSTONE) {
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	if (ctx->is_active) {
		raw_spin_unlock_irq(&ctx->lock);
		goto again;
	}
	func(event, NULL, ctx, data);
	raw_spin_unlock_irq(&ctx->lock);
}

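/*
 * Illustrative note (not from the original file): callers hand
 * event_function_call() an event_f and it picks the delivery mechanism.
 * A hypothetical sketch:
 *
 *	static void __do_disable(struct perf_event *event,
 *				 struct perf_cpu_context *cpuctx,
 *				 struct perf_event_context *ctx, void *info)
 *	{ ... }
 *
 *	event_function_call(event, __do_disable, NULL);
 *
 * For a CPU event this IPIs event->cpu; for a running task event it IPIs
 * the task's CPU and retries; for a non-running task it calls func()
 * directly with cpuctx == NULL under ctx->lock.
 */
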
/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 */
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct task_struct *task = READ_ONCE(ctx->task);
	struct perf_event_context *task_ctx = NULL;

	lockdep_assert_irqs_disabled();

	if (task) {
		if (task == TASK_TOMBSTONE)
			return;

		task_ctx = ctx;
	}

	perf_ctx_lock(cpuctx, task_ctx);

	task = ctx->task;
	if (task == TASK_TOMBSTONE)
		goto unlock;

	if (task) {
		/*
		 * We must be either inactive or active and the right task,
		 * otherwise we're screwed, since we cannot IPI to somewhere
		 * else.
		 */
		if (ctx->is_active) {
			if (WARN_ON_ONCE(task != current))
				goto unlock;

			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
				goto unlock;
		}
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	func(event, cpuctx, ctx, data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);
}

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
		       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
	(PERF_SAMPLE_BRANCH_KERNEL |\
	 PERF_SAMPLE_BRANCH_HV)

enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_TIME = 0x4,
	/* see ctx_resched() for details */
	EVENT_CPU = 0x8,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */
static void perf_sched_delayed(struct work_struct *work);
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 field */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;

static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;

static int perf_sample_allowed_ns __read_mostly =
	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

static void update_perf_cpu_limits(void)
{
	u64 tmp = perf_sample_period_ns;

	tmp *= sysctl_perf_cpu_time_max_percent;
	tmp = div_u64(tmp, 100);
	if (!tmp)
		tmp = 1;

	WRITE_ONCE(perf_sample_allowed_ns, tmp);
}

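/*
 * Worked example (illustrative, not from the original file): with the
 * defaults above, perf_sample_period_ns = 10^9 / 100000 = 10000ns and
 * sysctl_perf_cpu_time_max_percent = 25, so
 *
 *	perf_sample_allowed_ns = 10000 * 25 / 100 = 2500ns
 *
 * i.e. on average a single sample may burn at most 2.5us of CPU time
 * before perf_sample_event_took() starts lowering the sample rate.
 */
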
static bool perf_rotate_context(struct perf_cpu_context *cpuctx);

int perf_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;
	int perf_cpu = sysctl_perf_cpu_time_max_percent;
	/*
	 * If throttling is disabled don't allow the write:
	 */
	if (write && (perf_cpu == 100 || perf_cpu == 0))
		return -EINVAL;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret || !write)
		return ret;

	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
	update_perf_cpu_limits();

	return 0;
}

int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp,
				loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	if (sysctl_perf_cpu_time_max_percent == 100 ||
	    sysctl_perf_cpu_time_max_percent == 0) {
		printk(KERN_WARNING
		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
		WRITE_ONCE(perf_sample_allowed_ns, 0);
	} else {
		update_perf_cpu_limits();
	}

	return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);

static u64 __report_avg;
static u64 __report_allowed;

static void perf_duration_warn(struct irq_work *w)
{
	printk_ratelimited(KERN_INFO
		"perf: interrupt took too long (%lld > %lld), lowering "
		"kernel.perf_event_max_sample_rate to %d\n",
		__report_avg, __report_allowed,
		sysctl_perf_event_sample_rate);
}

static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

void perf_sample_event_took(u64 sample_len_ns)
{
	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
	u64 running_len;
	u64 avg_len;
	u32 max;

	if (max_len == 0)
		return;

	/* Decay the counter by 1 average sample. */
	running_len = __this_cpu_read(running_sample_length);
	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
	running_len += sample_len_ns;
	__this_cpu_write(running_sample_length, running_len);

	/*
	 * Note: this will be biased artificially low until we have
	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
	 * from having to maintain a count.
	 */
	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
	if (avg_len <= max_len)
		return;

	__report_avg = avg_len;
	__report_allowed = max_len;

	/*
	 * Compute a throttle threshold 25% below the current duration.
	 */
	avg_len += avg_len / 4;
	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
	if (avg_len < max)
		max /= (u32)avg_len;
	else
		max = 1;

	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
	WRITE_ONCE(max_samples_per_tick, max);

	sysctl_perf_event_sample_rate = max * HZ;
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

	if (!irq_work_queue(&perf_duration_work)) {
		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
			     "kernel.perf_event_max_sample_rate to %d\n",
			     __report_avg, __report_allowed,
			     sysctl_perf_event_sample_rate);
	}
}

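/*
 * Worked example (illustrative, not from the original file): the running
 * counter above approximates 128 * avg(sample_len). If samples settle at
 * 4000ns, running_len converges to ~512000ns and
 *
 *	avg_len = 512000 / 128 = 4000ns
 *
 * With HZ=1000 (TICK_NSEC = 10^6) and the default 25%,
 * max = (10^6 / 100) * 25 = 250000, so max /= avg_len gives 62 samples
 * per tick, and the new ceiling becomes 62 * HZ = 62000 samples/sec.
 */
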
static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type,
			     struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void)	{ }

extern __weak const char *perf_pmu_name(void)
{
	return "pmu";
}

static inline u64 perf_clock(void)
{
	return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
	return event->clock();
}

/*
 * State based event timekeeping...
 *
 * The basic idea is to use event->state to determine which (if any) time
 * fields to increment with the current delta. This means we only need to
 * update timestamps when we change state or when they are explicitly
 * requested (read).
 *
 * Event groups make things a little more complicated, but not terribly so.
 * The rules for a group are that if the group leader is OFF the entire group
 * is OFF, irrespective of what the group member states are. This results in
 * __perf_effective_state().
 *
 * A further ramification is that when a group leader flips between OFF and
 * !OFF, we need to update all group member times.
 *
 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 * need to make sure the relevant context time is updated before we try and
 * update our timestamps.
 */

static __always_inline enum perf_event_state
__perf_effective_state(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;

	if (leader->state <= PERF_EVENT_STATE_OFF)
		return leader->state;

	return event->state;
}

static __always_inline void
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
{
	enum perf_event_state state = __perf_effective_state(event);
	u64 delta = now - event->tstamp;

	*enabled = event->total_time_enabled;
	if (state >= PERF_EVENT_STATE_INACTIVE)
		*enabled += delta;

	*running = event->total_time_running;
	if (state >= PERF_EVENT_STATE_ACTIVE)
		*running += delta;
}

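/*
 * Worked example (illustrative, not from the original file): suppose an
 * event went ACTIVE at tstamp=1000 with total_time_enabled=500 and
 * total_time_running=300, and we read at now=1400 (delta=400). Since
 * ACTIVE satisfies both thresholds, the reader sees
 *
 *	enabled = 500 + 400 = 900,	running = 300 + 400 = 700
 *
 * Had the event been INACTIVE over that window, only 'enabled' would have
 * accrued the delta: enabled = 900, running = 300.
 */
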
static void perf_event_update_time(struct perf_event *event)
{
	u64 now = perf_event_time(event);

	__perf_update_times(event, now, &event->total_time_enabled,
					&event->total_time_running);
	event->tstamp = now;
}

static void perf_event_update_sibling_time(struct perf_event *leader)
{
	struct perf_event *sibling;

	for_each_sibling_event(sibling, leader)
		perf_event_update_time(sibling);
}

static void
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
{
	if (event->state == state)
		return;

	perf_event_update_time(event);
	/*
	 * If a group leader gets enabled/disabled all its siblings
	 * are affected too.
	 */
	if ((event->state < 0) ^ (state < 0))
		perf_event_update_sibling_time(event);

	WRITE_ONCE(event->state, state);
}

#ifdef CONFIG_CGROUP_PERF

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/* @event doesn't care about cgroup */
	if (!event->cgrp)
		return true;

	/* wants specific cgroup scope but @cpuctx isn't associated with any */
	if (!cpuctx->cgrp)
		return false;

	/*
	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
	 * also enabled for all its descendant cgroups.  If @cpuctx's
	 * cgroup is a descendant of @event's (the test covers identity
	 * case), it's a match.
	 */
	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
				    event->cgrp->css.cgroup);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
	css_put(&event->cgrp->css);
	event->cgrp = NULL;
}

static inline int is_cgroup_event(struct perf_event *event)
{
	return event->cgrp != NULL;
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	struct perf_cgroup_info *t;

	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	return t->time;
}

static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
	struct perf_cgroup_info *info;
	u64 now;

	now = perf_clock();

	info = this_cpu_ptr(cgrp->info);

	info->time += now - info->timestamp;
	info->timestamp = now;
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
	struct perf_cgroup *cgrp = cpuctx->cgrp;
	struct cgroup_subsys_state *css;

	if (cgrp) {
		for (css = &cgrp->css; css; css = css->parent) {
			cgrp = container_of(css, struct perf_cgroup, css);
			__update_cgrp_time(cgrp);
		}
	}
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
	struct perf_cgroup *cgrp;

	/*
	 * ensure we access cgroup data only when needed and
	 * when we know the cgroup is pinned (css_get)
	 */
	if (!is_cgroup_event(event))
		return;

	cgrp = perf_cgroup_from_task(current, event->ctx);
	/*
	 * Do not update time when cgroup is not active
	 */
	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
		__update_cgrp_time(event->cgrp);
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
	struct perf_cgroup *cgrp;
	struct perf_cgroup_info *info;
	struct cgroup_subsys_state *css;

	/*
	 * ctx->lock held by caller
	 * ensure we do not access cgroup data
	 * unless we have the cgroup pinned (css_get)
	 */
	if (!task || !ctx->nr_cgroups)
		return;

	cgrp = perf_cgroup_from_task(task, ctx);

	for (css = &cgrp->css; css; css = css->parent) {
		cgrp = container_of(css, struct perf_cgroup, css);
		info = this_cpu_ptr(cgrp->info);
		info->timestamp = ctx->timestamp;
	}
}

static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);

#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN : schedule in based on cgroup for next
 */
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
	struct perf_cpu_context *cpuctx;
	struct list_head *list;
	unsigned long flags;

	/*
	 * Disable interrupts and preemption to avoid this CPU's
	 * cgrp_cpuctx_entry to change under us.
	 */
	local_irq_save(flags);

	list = this_cpu_ptr(&cgrp_cpuctx_list);
	list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
		perf_pmu_disable(cpuctx->ctx.pmu);

		if (mode & PERF_CGROUP_SWOUT) {
			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
			/*
			 * must not be done before ctxswout due
			 * to event_filter_match() in event_sched_out()
			 */
			cpuctx->cgrp = NULL;
		}

		if (mode & PERF_CGROUP_SWIN) {
			WARN_ON_ONCE(cpuctx->cgrp);
			/*
			 * set cgrp before ctxsw in to allow
			 * event_filter_match() to not have to pass
			 * task around
			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
			 * because cgroup events are only per-cpu
			 */
			cpuctx->cgrp = perf_cgroup_from_task(task,
							     &cpuctx->ctx);
			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
		}
		perf_pmu_enable(cpuctx->ctx.pmu);
		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
	}

	local_irq_restore(flags);
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);
	cgrp2 = perf_cgroup_from_task(next, NULL);

	/*
	 * only schedule out current cgroup events if we know
	 * that we are switching to a different cgroup. Otherwise,
	 * do not touch the cgroup events.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);

	rcu_read_unlock();
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);
	cgrp2 = perf_cgroup_from_task(prev, NULL);

	/*
	 * only need to schedule in cgroup events if we are changing
	 * the cgroup during ctxsw. Cgroup events were not scheduled
	 * out of ctxsw out if that was not the case.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWIN);

	rcu_read_unlock();
}

static inline int perf_cgroup_connect(int fd, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	struct perf_cgroup *cgrp;
	struct cgroup_subsys_state *css;
	struct fd f = fdget(fd);
	int ret = 0;

	if (!f.file)
		return -EBADF;

	css = css_tryget_online_from_dir(f.file->f_path.dentry,
					 &perf_event_cgrp_subsys);
	if (IS_ERR(css)) {
		ret = PTR_ERR(css);
		goto out;
	}

	cgrp = container_of(css, struct perf_cgroup, css);
	event->cgrp = cgrp;

	/*
	 * all events in a group must monitor
	 * the same cgroup because a task belongs
	 * to only one perf cgroup at a time
	 */
	if (group_leader && group_leader->cgrp != cgrp) {
		perf_detach_cgroup(event);
		ret = -EINVAL;
	}
out:
	fdput(f);
	return ret;
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
	struct perf_cgroup_info *t;
	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	event->shadow_ctx_time = now - t->timestamp;
}

/*
 * Update cpuctx->cgrp so that it is set when first cgroup event is added and
 * cleared when last cgroup event is removed.
 */
static inline void
list_update_cgroup_event(struct perf_event *event,
			 struct perf_event_context *ctx, bool add)
{
	struct perf_cpu_context *cpuctx;
	struct list_head *cpuctx_entry;

	if (!is_cgroup_event(event))
		return;

	/*
	 * Because cgroup events are always per-cpu events,
	 * this will always be called from the right CPU.
	 */
	cpuctx = __get_cpu_context(ctx);

	/*
	 * Since setting cpuctx->cgrp is conditional on the current @cgrp
	 * matching the event's cgroup, we must do this for every new event,
	 * because if the first would mismatch, the second would not try again
	 * and we would leave the cpuctx.cgrp unset.
	 */
	if (add && !cpuctx->cgrp) {
		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);

		if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
			cpuctx->cgrp = cgrp;
	}

	if (add && ctx->nr_cgroups++)
		return;
	else if (!add && --ctx->nr_cgroups)
		return;

	/* no cgroup running */
	if (!add)
		cpuctx->cgrp = NULL;

	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
	if (add)
		list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
	else
		list_del(cpuctx_entry);
}

#else /* !CONFIG_CGROUP_PERF */

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

static inline int is_cgroup_event(struct perf_event *event)
{
	return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
}

static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
}

void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	return 0;
}

static inline void
list_update_cgroup_event(struct perf_event *event,
			 struct perf_event_context *ctx, bool add)
{
}

#endif

/*
 * set default to be dependent on timer tick just
 * like original code
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * function must be called with interrupts disabled
 */
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
	struct perf_cpu_context *cpuctx;
	bool rotations;

	lockdep_assert_irqs_disabled();

	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
	rotations = perf_rotate_context(cpuctx);

	raw_spin_lock(&cpuctx->hrtimer_lock);
	if (rotations)
		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
	else
		cpuctx->hrtimer_active = 0;
	raw_spin_unlock(&cpuctx->hrtimer_lock);

	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}

static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	u64 interval;

	/* no multiplexing needed for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return;

	/*
	 * check default is sane, if not set then force to
	 * default interval (1/tick)
	 */
	interval = pmu->hrtimer_interval_ms;
	if (interval < 1)
		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;

	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);

	raw_spin_lock_init(&cpuctx->hrtimer_lock);
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	timer->function = perf_mux_hrtimer_handler;
}

static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	unsigned long flags;

	/* not for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return 0;

	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
	if (!cpuctx->hrtimer_active) {
		cpuctx->hrtimer_active = 1;
		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
	}
	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);

	return 0;
}

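/*
 * Worked example (illustrative, not from the original file): with HZ=250,
 * PERF_CPU_HRTIMER = 1000/250 = 4, so a PMU that does not set
 * hrtimer_interval_ms gets a 4ms multiplexing interval:
 *
 *	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * 4);
 *
 * i.e. overcommitted (flexible) event groups on that PMU are rotated at
 * most every 4ms, matching the tick frequency by default.
 */
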
void perf_pmu_disable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!(*count)++)
		pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!--(*count))
		pmu->pmu_enable(pmu);
}

static DEFINE_PER_CPU(struct list_head, active_ctx_list);

/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 * disabled, while perf_event_task_tick is called from IRQ context.
 */
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
	struct list_head *head = this_cpu_ptr(&active_ctx_list);

	lockdep_assert_irqs_disabled();

	WARN_ON(!list_empty(&ctx->active_ctx_list));

	list_add(&ctx->active_ctx_list, head);
}

static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
	lockdep_assert_irqs_disabled();

	WARN_ON(list_empty(&ctx->active_ctx_list));

	list_del_init(&ctx->active_ctx_list);
}

static void get_ctx(struct perf_event_context *ctx)
{
	refcount_inc(&ctx->refcount);
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx->task_ctx_data);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task && ctx->task != TASK_TOMBSTONE)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address.
 *
 * Since perf_event::ctx can change while an event exists, a reference on the
 * context is not enough: take ctx->refcount under RCU, then acquire
 * ctx->mutex and re-check that event->ctx did not change underneath us,
 * retrying if it did. Once ctx->mutex is held the event->ctx relation is
 * stable; that is what perf_event_ctx_lock_nested() implements.
 */
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
	struct perf_event_context *ctx;

again:
	rcu_read_lock();
	ctx = READ_ONCE(event->ctx);
	if (!refcount_inc_not_zero(&ctx->refcount)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	mutex_lock_nested(&ctx->mutex, nesting);
	if (event->ctx != ctx) {
		mutex_unlock(&ctx->mutex);
		put_ctx(ctx);
		goto again;
	}

	return ctx;
}

static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
	return perf_event_ctx_lock_nested(event, 0);
}

static void perf_event_ctx_unlock(struct perf_event *event,
				  struct perf_event_context *ctx)
{
	mutex_unlock(&ctx->mutex);
	put_ctx(ctx);
}

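/*
 * Illustrative sketch (not from the original file): the intended pairing
 * for the helpers above is
 *
 *	struct perf_event_context *ctx = perf_event_ctx_lock(event);
 *	...	// event->ctx is now stable, ctx->mutex is held
 *	perf_event_ctx_unlock(event, ctx);
 *
 * Note that the ctx returned must be passed back to the unlock side: by
 * the time we unlock, event->ctx may already point somewhere else.
 */
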
/*
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(), therefore we cannot call put_ctx() since that might end up
 * calling scheduler related locks and ctx->lock nests inside those.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
	struct perf_event_context *parent_ctx = ctx->parent_ctx;

	lockdep_assert_held(&ctx->lock);

	if (parent_ctx)
		ctx->parent_ctx = NULL;
	ctx->generation++;

	return parent_ctx;
}

static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
				enum pid_type type)
{
	u32 nr;
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	nr = __task_pid_nr_ns(p, type, event->ns);
	/* avoid -1 if it is idle thread or runs in another ns */
	if (!nr && !pid_alive(p))
		nr = -1;
	return nr;
}

static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
	return perf_event_pid_type(event, p, PIDTYPE_TGID);
}

static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
	return perf_event_pid_type(event, p, PIDTYPE_PID);
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
	struct perf_event_context *ctx;

retry:
	/*
	 * One of the few rules of preemptible RCU is that one cannot do
	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
	 * part of the read side critical section was irqs-enabled -- see
	 * rcu_read_unlock_special().
	 *
	 * Since ctx->lock nests under rq->lock we must ensure the entire read
	 * side critical section is irqs-off, hence the local_irq_save()
	 * before rcu_read_lock().
	 */
	local_irq_save(*flags);
	rcu_read_lock();
	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		raw_spin_lock(&ctx->lock);
		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
			raw_spin_unlock(&ctx->lock);
			rcu_read_unlock();
			local_irq_restore(*flags);
			goto retry;
		}

		if (ctx->task == TASK_TOMBSTONE ||
		    !refcount_inc_not_zero(&ctx->refcount)) {
			raw_spin_unlock(&ctx->lock);
			ctx = NULL;
		} else {
			WARN_ON_ONCE(ctx->task != task);
		}
	}
	rcu_read_unlock();
	if (!ctx)
		local_irq_restore(*flags);
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, ctxn, &flags);
	if (ctx) {
		++ctx->pin_count;
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

static u64 perf_event_time(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	if (is_cgroup_event(event))
		return perf_cgroup_event_time(event);

	return ctx ? ctx->time : 0;
}

static enum event_type_t get_event_type(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	enum event_type_t event_type;

	lockdep_assert_held(&ctx->lock);

	/*
	 * It's 'group type', really, because if our group leader is
	 * pinned, so are we.
	 */
	if (event->group_leader != event)
		event = event->group_leader;

	event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
	if (!ctx->task)
		event_type |= EVENT_CPU;

	return event_type;
}

/*
 * Helper function to initialize event group nodes.
 */
static void init_event_group(struct perf_event *event)
{
	RB_CLEAR_NODE(&event->group_node);
	event->group_index = 0;
}

/*
 * Extract pinned or flexible groups from the context
 * based on event attrs bits.
 */
static struct perf_event_groups *
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
	if (event->attr.pinned)
		return &ctx->pinned_groups;
	else
		return &ctx->flexible_groups;
}

/*
 * Helper function to initialize event group trees.
 */
static void perf_event_groups_init(struct perf_event_groups *groups)
{
	groups->tree = RB_ROOT;
	groups->index = 0;
}

/*
 * Compare function for event groups;
 *
 * Implements complex key that first sorts by CPU and then by the
 * group index.
 */
static bool
perf_event_groups_less(struct perf_event *left, struct perf_event *right)
{
	if (left->cpu < right->cpu)
		return true;
	if (left->cpu > right->cpu)
		return false;

	if (left->group_index < right->group_index)
		return true;
	if (left->group_index > right->group_index)
		return false;

	return false;
}

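/*
 * Illustrative note (not from the original file): the key is effectively
 * the pair (cpu, group_index). Since group_index comes from an
 * ever-increasing per-tree counter, equal-CPU groups sort in insertion
 * order; e.g. with events A(cpu=0,idx=1), B(cpu=1,idx=2), C(cpu=0,idx=3),
 * an in-order walk of the tree yields A, C, B: the cpu==0 subtree first,
 * oldest first within it.
 */
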
/*
 * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
 * key (see perf_event_groups_less). This places it last inside the cpu
 * subtree.
 */
static void
perf_event_groups_insert(struct perf_event_groups *groups,
			 struct perf_event *event)
{
	struct perf_event *node_event;
	struct rb_node *parent;
	struct rb_node **node;

	event->group_index = ++groups->index;

	node = &groups->tree.rb_node;
	parent = *node;

	while (*node) {
		parent = *node;
		node_event = container_of(*node, struct perf_event, group_node);

		if (perf_event_groups_less(event, node_event))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}

	rb_link_node(&event->group_node, parent, node);
	rb_insert_color(&event->group_node, &groups->tree);
}

/*
 * Helper function to insert event into the pinned or flexible groups.
 */
static void
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event_groups *groups;

	groups = get_event_groups(event, ctx);
	perf_event_groups_insert(groups, event);
}

/*
 * Delete a group from a tree.
 */
static void
perf_event_groups_delete(struct perf_event_groups *groups,
			 struct perf_event *event)
{
	WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
		     RB_EMPTY_ROOT(&groups->tree));

	rb_erase(&event->group_node, &groups->tree);
	init_event_group(event);
}

/*
 * Helper function to delete event from its groups.
 */
static void
del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event_groups *groups;

	groups = get_event_groups(event, ctx);
	perf_event_groups_delete(groups, event);
}

/*
 * Get the leftmost event in the @cpu subtree.
 */
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu)
{
	struct perf_event *node_event = NULL, *match = NULL;
	struct rb_node *node = groups->tree.rb_node;

	while (node) {
		node_event = container_of(node, struct perf_event, group_node);

		if (cpu < node_event->cpu) {
			node = node->rb_left;
		} else if (cpu > node_event->cpu) {
			node = node->rb_right;
		} else {
			match = node_event;
			node = node->rb_left;
		}
	}

	return match;
}

/*
 * Like rb_entry_next_safe() for the @cpu subtree.
 */
static struct perf_event *
perf_event_groups_next(struct perf_event *event)
{
	struct perf_event *next;

	next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
	if (next && next->cpu == event->cpu)
		return next;

	return NULL;
}

/*
 * Iterate through the whole groups tree.
 */
#define perf_event_groups_for_each(event, groups)			\
	for (event = rb_entry_safe(rb_first(&((groups)->tree)),	\
				typeof(*event), group_node); event;	\
		event = rb_entry_safe(rb_next(&event->group_node),	\
				typeof(*event), group_node))

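/*
 * Illustrative sketch (not from the original file): walking one CPU's
 * subtree combines perf_event_groups_first() and _next():
 *
 *	struct perf_event *evt;
 *
 *	for (evt = perf_event_groups_first(&ctx->pinned_groups, cpu);
 *	     evt; evt = perf_event_groups_next(evt)) {
 *		...	// group leaders for this cpu, oldest first
 *	}
 *
 * whereas perf_event_groups_for_each() visits every group leader in the
 * tree regardless of CPU.
 */
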
/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	lockdep_assert_held(&ctx->lock);

	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
	event->attach_state |= PERF_ATTACH_CONTEXT;

	event->tstamp = perf_event_time(event);

	/*
	 * If we're a stand alone event or group leader, we go to the context
	 * list, group events are kept attached to the group so that
	 * perf_group_detach can, at all times, locate all siblings.
	 */
	if (event->group_leader == event) {
		event->group_caps = event->event_caps;
		add_event_to_groups(event, ctx);
	}

	list_update_cgroup_event(event, ctx, true);

	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;

	ctx->generation++;
}

/*
 * Initialize event state based on the perf_event_attr::disabled.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
					      PERF_EVENT_STATE_INACTIVE;
}

static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_ID)
		entry += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_GROUP) {
		nr += nr_siblings;
		size += sizeof(u64);
	}

	size += entry * nr;
	event->read_size = size;
}

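/*
 * Worked example (illustrative, not from the original file): for a group
 * leader with 2 siblings and read_format =
 * PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED:
 *
 *	entry = 8 + 8 = 16	(value + id per event)
 *	nr    = 1 + 2  = 3
 *	size  = 8 (time_enabled) + 8 (nr field for GROUP) + 16*3 = 64 bytes
 *
 * which matches the layout userspace sees from read(2) on the group fd.
 */
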
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
	struct perf_sample_data *data;
	u16 size = 0;

	if (sample_type & PERF_SAMPLE_IP)
		size += sizeof(data->ip);

	if (sample_type & PERF_SAMPLE_ADDR)
		size += sizeof(data->addr);

	if (sample_type & PERF_SAMPLE_PERIOD)
		size += sizeof(data->period);

	if (sample_type & PERF_SAMPLE_WEIGHT)
		size += sizeof(data->weight);

	if (sample_type & PERF_SAMPLE_READ)
		size += event->read_size;

	if (sample_type & PERF_SAMPLE_DATA_SRC)
		size += sizeof(data->data_src.val);

	if (sample_type & PERF_SAMPLE_TRANSACTION)
		size += sizeof(data->txn);

	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
		size += sizeof(data->phys_addr);

	event->header_size = size;
}

/*
 * Called at perf_event creation and when events are attached/detached from a
 * group.
 */
static void perf_event__header_size(struct perf_event *event)
{
	__perf_event_read_size(event,
			       event->group_leader->nr_siblings);
	__perf_event_header_size(event, event->attr.sample_type);
}

static void perf_event__id_header_size(struct perf_event *event)
{
	struct perf_sample_data *data;
	u64 sample_type = event->attr.sample_type;
	u16 size = 0;

	if (sample_type & PERF_SAMPLE_TID)
		size += sizeof(data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		size += sizeof(data->time);

	if (sample_type & PERF_SAMPLE_IDENTIFIER)
		size += sizeof(data->id);

	if (sample_type & PERF_SAMPLE_ID)
		size += sizeof(data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		size += sizeof(data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		size += sizeof(data->cpu_entry);

	event->id_header_size = size;
}

static bool perf_event_validate_size(struct perf_event *event)
{
	/*
	 * The values computed below will be over-written when we actually
	 * attach the event.
	 */
	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
	perf_event__id_header_size(event);

	/*
	 * Sum the lot; should not exceed the 64k limit we have on records.
	 * Conservative limit to allow for callchains and other variable fields.
	 */
	if (event->read_size + event->header_size +
	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
		return false;

	return true;
}

static void perf_group_attach(struct perf_event *event)
{
	struct perf_event *group_leader = event->group_leader, *pos;

	lockdep_assert_held(&event->ctx->lock);

	/*
	 * We can have double attach due to group movement in perf_event_open.
	 */
	if (event->attach_state & PERF_ATTACH_GROUP)
		return;

	event->attach_state |= PERF_ATTACH_GROUP;

	if (group_leader == event)
		return;

	WARN_ON_ONCE(group_leader->ctx != event->ctx);

	group_leader->group_caps &= event->event_caps;

	list_add_tail(&event->sibling_list, &group_leader->sibling_list);
	group_leader->nr_siblings++;

	perf_event__header_size(group_leader);

	for_each_sibling_event(pos, group_leader)
		perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	WARN_ON_ONCE(event->ctx != ctx);
	lockdep_assert_held(&ctx->lock);

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
		return;

	event->attach_state &= ~PERF_ATTACH_CONTEXT;

	list_update_cgroup_event(event, ctx, false);

	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_rcu(&event->event_entry);

	if (event->group_leader == event)
		del_event_from_groups(event, ctx);

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF)
		perf_event_set_state(event, PERF_EVENT_STATE_OFF);

	ctx->generation++;
}

static void perf_group_detach(struct perf_event *event)
{
	struct perf_event *sibling, *tmp;
	struct perf_event_context *ctx = event->ctx;

	lockdep_assert_held(&ctx->lock);

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_GROUP))
		return;

	event->attach_state &= ~PERF_ATTACH_GROUP;

	/*
	 * If this is a sibling, remove it from its group.
	 */
	if (event->group_leader != event) {
		list_del_init(&event->sibling_list);
		event->group_leader->nr_siblings--;
		goto out;
	}

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to whatever list we are on.
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {

		sibling->group_leader = sibling;
		list_del_init(&sibling->sibling_list);

		/* Inherit group flags from the previous leader */
		sibling->group_caps = event->group_caps;

		if (!RB_EMPTY_NODE(&event->group_node)) {
			add_event_to_groups(sibling, event->ctx);

			if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
				struct list_head *list = sibling->attr.pinned ?
					&ctx->pinned_active : &ctx->flexible_active;

				list_add_tail(&sibling->active_list, list);
			}
		}

		WARN_ON_ONCE(sibling->ctx != event->ctx);
	}

out:
	perf_event__header_size(event->group_leader);

	for_each_sibling_event(tmp, event->group_leader)
		perf_event__header_size(tmp);
}

static bool is_orphaned_event(struct perf_event *event)
{
	return event->state == PERF_EVENT_STATE_DEAD;
}

static inline int __pmu_filter_match(struct perf_event *event)
{
	struct pmu *pmu = event->pmu;
	return pmu->filter_match ? pmu->filter_match(event) : 1;
}

/*
 * Check whether we should attempt to schedule an event group based on
 * PMU-specific filtering. An event group can consist of HW and SW events,
 * potentially with a SW leader, so we must check all the filters, to
 * determine whether a group is schedulable:
 */
static inline int pmu_filter_match(struct perf_event *event)
{
	struct perf_event *sibling;

	if (!__pmu_filter_match(event))
		return 0;

	for_each_sibling_event(sibling, event) {
		if (!__pmu_filter_match(sibling))
			return 0;
	}

	return 1;
}

static inline int
event_filter_match(struct perf_event *event)
{
	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
	       perf_cgroup_match(event) && pmu_filter_match(event);
}

static void
event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;

	WARN_ON_ONCE(event->ctx != ctx);
	lockdep_assert_held(&ctx->lock);

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	/*
	 * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
	 * we can schedule events _OUT_ individually through things like
	 * __perf_remove_from_context().
	 */
	list_del_init(&event->active_list);

	perf_pmu_disable(event->pmu);

	event->pmu->del(event, 0);
	event->oncpu = -1;

	if (READ_ONCE(event->pending_disable) >= 0) {
		WRITE_ONCE(event->pending_disable, -1);
		state = PERF_EVENT_STATE_OFF;
	}
	perf_event_set_state(event, state);

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	if (!--ctx->nr_active)
		perf_event_ctx_deactivate(ctx);
	if (event->attr.freq && event->attr.sample_freq)
		ctx->nr_freq--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;

	perf_pmu_enable(event->pmu);
}

static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;

	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	perf_pmu_disable(ctx->pmu);

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	for_each_sibling_event(event, group_event)
		event_sched_out(event, cpuctx, ctx);

	perf_pmu_enable(ctx->pmu);

	if (group_event->attr.exclusive)
		cpuctx->exclusive = 0;
}

#define DETACH_GROUP	0x01UL

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static void
__perf_remove_from_context(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   struct perf_event_context *ctx,
			   void *info)
{
	unsigned long flags = (unsigned long)info;

	if (ctx->is_active & EVENT_TIME) {
		update_context_time(ctx);
		update_cgrp_time_from_cpuctx(cpuctx);
	}

	event_sched_out(event, cpuctx, ctx);
	if (flags & DETACH_GROUP)
		perf_group_detach(event);
	list_del_event(event, ctx);

	if (!ctx->nr_events && ctx->is_active) {
		ctx->is_active = 0;
		if (ctx->task) {
			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
			cpuctx->task_ctx = NULL;
		}
	}
}

/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
	struct perf_event_context *ctx = event->ctx;

	lockdep_assert_held(&ctx->mutex);

	event_function_call(event, __perf_remove_from_context, (void *)flags);

	/*
	 * The above event_function_call() can NO-OP when it hits
	 * TASK_TOMBSTONE. In that case we must already have been detached
	 * from the context (by perf_event_exit_event()) but the grouping
	 * might still be in-tact.
	 */
	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
	if ((flags & DETACH_GROUP) &&
	    (event->attach_state & PERF_ATTACH_GROUP)) {
		/*
		 * Since in that case we cannot possibly be scheduled, simply
		 * detach now.
		 */
		raw_spin_lock_irq(&ctx->lock);
		perf_group_detach(event);
		raw_spin_unlock_irq(&ctx->lock);
	}
}

/*
 * Cross CPU call to disable a performance event
 */
static void __perf_event_disable(struct perf_event *event,
				 struct perf_cpu_context *cpuctx,
				 struct perf_event_context *ctx,
				 void *info)
{
	if (event->state < PERF_EVENT_STATE_INACTIVE)
		return;

	if (ctx->is_active & EVENT_TIME) {
		update_context_time(ctx);
		update_cgrp_time_from_event(event);
	}

	if (event == event->group_leader)
		group_sched_out(event, cpuctx, ctx);
	else
		event_sched_out(event, cpuctx, ctx);

	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in perf_event_exit_event().
 *
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
static void _perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	raw_spin_lock_irq(&ctx->lock);
	if (event->state <= PERF_EVENT_STATE_OFF) {
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	raw_spin_unlock_irq(&ctx->lock);

	event_function_call(event, __perf_event_disable, NULL);
}

void perf_event_disable_local(struct perf_event *event)
{
	event_function_local(event, __perf_event_disable, NULL);
}

/*
 * Strictly speaking kernel users cannot create groups and therefore this
 * interface does not need the perf_event_ctx_lock() magic.
 */
void perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx;

	ctx = perf_event_ctx_lock(event);
	_perf_event_disable(event);
	perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);

void perf_event_disable_inatomic(struct perf_event *event)
{
	WRITE_ONCE(event->pending_disable, smp_processor_id());
	/* can fail, see perf_pending_event_disable() */
	irq_work_queue(&event->pending);
}

static void perf_set_shadow_time(struct perf_event *event,
				 struct perf_event_context *ctx)
{
	/*
	 * use the correct time source for the time snapshot
	 *
	 * We could get by without this by leveraging the
	 * fact that to get to this function, the caller
	 * has most likely already called update_context_time()
	 * and update_cgrp_time_xx() and thus both timestamps
	 * are identical (or very close). Given that tstamp is,
	 * already adjusted for cgroup, we could say that:
	 *    tstamp - ctx->timestamp
	 * is equivalent to
	 *    tstamp - cgrp->timestamp.
	 *
	 * Then, in perf_output_read(), the calculation would
	 * work with no changes because:
	 * - event is guaranteed scheduled in
	 * - no scheduled out in between
	 * - thus the timestamp would be the same
	 *
	 * But this is a bit hairy.
	 *
	 * So instead, we have an explicit cgroup call to remain
	 * within the time source all along. We believe it
	 * is cleaner and simpler to understand.
	 */
	if (is_cgroup_event(event))
		perf_cgroup_set_shadow_time(event, event->tstamp);
	else
		event->shadow_ctx_time = event->tstamp - ctx->timestamp;
}

#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);

static int
event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx)
{
	int ret = 0;

	lockdep_assert_held(&ctx->lock);

	if (event->state <= PERF_EVENT_STATE_OFF)
		return 0;

	WRITE_ONCE(event->oncpu, smp_processor_id());
	/*
	 * Order event::oncpu write to happen before the ACTIVE state is
	 * visible. This allows perf_event_{stop,read}() to observe the
	 * correct ->oncpu if it sees ACTIVE.
	 */
	smp_wmb();
	perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);

	/*
	 * Unthrottle events, since we scheduled we might have missed several
	 * ticks. A throttled event that is scheduled back in gets its
	 * interrupt count reset here and the unthrottle logged.
	 */
	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
		perf_log_throttle(event, 1);
		event->hw.interrupts = 0;
	}

	perf_pmu_disable(event->pmu);

	perf_set_shadow_time(event, ctx);

	perf_log_itrace_start(event);

	if (event->pmu->add(event, PERF_EF_START)) {
		perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
		event->oncpu = -1;
		ret = -EAGAIN;
		goto out;
	}

	if (!is_software_event(event))
		cpuctx->active_oncpu++;
	if (!ctx->nr_active++)
		perf_event_ctx_activate(ctx);
	if (event->attr.freq && event->attr.sample_freq)
		ctx->nr_freq++;

	if (event->attr.exclusive)
		cpuctx->exclusive = 1;

out:
	perf_pmu_enable(event->pmu);

	return ret;
}

static int
group_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct perf_event *event, *partial_group = NULL;
	struct pmu *pmu = ctx->pmu;

	if (group_event->state == PERF_EVENT_STATE_OFF)
		return 0;

	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);

	if (event_sched_in(group_event, cpuctx, ctx)) {
		pmu->cancel_txn(pmu);
		perf_mux_hrtimer_restart(cpuctx);
		return -EAGAIN;
	}

	/*
	 * Schedule in siblings as one group (if any):
	 */
	for_each_sibling_event(event, group_event) {
		if (event_sched_in(event, cpuctx, ctx)) {
			partial_group = event;
			goto group_error;
		}
	}

	if (!pmu->commit_txn(pmu))
		return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo the
	 * partial group before returning:
	 * The events up to the failed event are scheduled out normally.
	 */
	for_each_sibling_event(event, group_event) {
		if (event == partial_group)
			break;

		event_sched_out(event, cpuctx, ctx);
	}
	event_sched_out(group_event, cpuctx, ctx);

	pmu->cancel_txn(pmu);

	perf_mux_hrtimer_restart(cpuctx);

	return -EAGAIN;
}

/*
 * Work out whether we can put this event group on the CPU now.
 */
static int group_can_go_on(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
	 * Groups consisting entirely of software events can always go on.
	 */
	if (event->group_caps & PERF_EV_CAP_SOFTWARE)
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * events can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * events on the CPU, it can't go on.
	 */
	if (event->attr.exclusive && cpuctx->active_oncpu)
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}

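/*
 * Illustrative summary (not from the original file) of the checks above:
 *
 *	group_caps & SOFTWARE	cpuctx->exclusive	attr.exclusive	result
 *	yes			-			-		1
 *	no			set			-		0
 *	no			clear			set + busy	0
 *	no			clear			otherwise	can_add_hw
 *
 * i.e. pure software groups always fit; an exclusive PMU group forbids
 * mixing with anything else already counting on the CPU.
 */
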
static void add_event_to_ctx(struct perf_event *event,
			       struct perf_event_context *ctx)
{
	list_add_event(event, ctx);
	perf_group_attach(event);
}

static void ctx_sched_out(struct perf_event_context *ctx,
			  struct perf_cpu_context *cpuctx,
			  enum event_type_t event_type);
static void
ctx_sched_in(struct perf_event_context *ctx,
	     struct perf_cpu_context *cpuctx,
	     enum event_type_t event_type,
	     struct task_struct *task);

static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
			       struct perf_event_context *ctx,
			       enum event_type_t event_type)
{
	if (!cpuctx->task_ctx)
		return;

	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
		return;

	ctx_sched_out(ctx, cpuctx, event_type);
}

static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
				struct perf_event_context *ctx,
				struct task_struct *task)
{
	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
	if (ctx)
		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
	if (ctx)
		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}

/*
 * We want to maintain the following priority of scheduling:
 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
 *  - task pinned (EVENT_PINNED)
 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
 *  - task flexible (EVENT_FLEXIBLE).
 *
 * In order to avoid unscheduling and scheduling back in everything every
 * time an event is added, only do it for the groups of equal priority and
 * below.
 *
 * This can be called after a batch operation on task events, in which case
 * event_type is a bit mask of the types of events involved. For CPU events,
 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
 */
static void ctx_resched(struct perf_cpu_context *cpuctx,
			struct perf_event_context *task_ctx,
			enum event_type_t event_type)
{
	enum event_type_t ctx_event_type;
	bool cpu_event = !!(event_type & EVENT_CPU);

	/*
	 * If pinned groups are involved, flexible groups also need to be
	 * scheduled out.
	 */
	if (event_type & EVENT_PINNED)
		event_type |= EVENT_FLEXIBLE;

	ctx_event_type = event_type & EVENT_ALL;

	perf_pmu_disable(cpuctx->ctx.pmu);
	if (task_ctx)
		task_ctx_sched_out(cpuctx, task_ctx, event_type);

	/*
	 * Decide which cpu ctx groups to schedule out based on the types
	 * of events that caused rescheduling:
	 *  - EVENT_CPU: schedule out corresponding groups;
	 *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
	 *  - otherwise, do nothing more.
	 */
	if (cpu_event)
		cpu_ctx_sched_out(cpuctx, ctx_event_type);
	else if (ctx_event_type & EVENT_PINNED)
		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);

	perf_event_sched_in(cpuctx, task_ctx, current);
	perf_pmu_enable(cpuctx->ctx.pmu);
}

void perf_pmu_resched(struct pmu *pmu)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;

	perf_ctx_lock(cpuctx, task_ctx);
	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
	perf_ctx_unlock(cpuctx, task_ctx);
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Very similar to remote_function() + event_function() but cannot assume that
 * things like ctx->is_active and cpuctx->task_ctx are set.
 */
static int __perf_install_in_context(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	bool reprogram = true;
	int ret = 0;

	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx->task) {
		raw_spin_lock(&ctx->lock);
		task_ctx = ctx;

		reprogram = (ctx->task == current);

		/*
		 * If the task is running, it must be running on this CPU,
		 * otherwise we cannot reprogram things.
		 *
		 * If its not running, we don't care, ctx->lock will
		 * serialize against it becoming runnable.
		 */
		if (task_curr(ctx->task) && !reprogram) {
			ret = -ESRCH;
			goto unlock;
		}

		WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
	} else if (task_ctx) {
		raw_spin_lock(&task_ctx->lock);
	}

#ifdef CONFIG_CGROUP_PERF
	if (is_cgroup_event(event)) {
		/*
		 * If the current cgroup doesn't match the event's
		 * cgroup, we should not try to schedule it.
		 */
		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
		reprogram = cgroup_is_descendant(cgrp->css.cgroup,
					event->cgrp->css.cgroup);
	}
#endif

	if (reprogram) {
		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
		add_event_to_ctx(event, ctx);
		ctx_resched(cpuctx, task_ctx, get_event_type(event));
	} else {
		add_event_to_ctx(event, ctx);
	}

unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}

static bool exclusive_event_installable(struct perf_event *event,
					struct perf_event_context *ctx);

/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call, see comment there.
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
			int cpu)
{
	struct task_struct *task = READ_ONCE(ctx->task);

	lockdep_assert_held(&ctx->mutex);

	WARN_ON_ONCE(!exclusive_event_installable(event, ctx));

	if (event->cpu != -1)
		event->cpu = cpu;

	/*
	 * Ensures that if we can observe event->ctx, both the event and ctx
	 * will be 'complete'. See perf_iterate_sb_cpu().
	 */
	smp_store_release(&event->ctx, ctx);

	if (!task) {
		cpu_function_call(cpu, __perf_install_in_context, event);
		return;
	}

	/*
	 * Should not happen, we validate the ctx is still alive before calling.
	 */
	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
		return;

	/*
	 * Installing events is tricky because we cannot rely on ctx->is_active
	 * to be set in case this is the nr_events 0 -> 1 transition.
	 *
	 * Instead we use task_curr(), which tells us if the task is running.
	 * However task_curr() can lie, as it will report the task is running
	 * a short while before/after the actual context switch.
	 *
	 * If it lies and says the task is not running, we install the event
	 * directly under ctx->lock below; ctx->lock serializes against the
	 * task becoming runnable, so the event will be picked up by the next
	 * context switch.
	 *
	 * If it lies and says the task is running, the IPI simply misses
	 * (__perf_install_in_context() fails the ctx->task == current test
	 * with -ESRCH) and we retry.
	 *
	 * This smp_mb() orders the preceding stores against the task_curr()
	 * loads in the retry loop, such that if the IPI does not find the
	 * task running, a future context switch of that task must observe
	 * the event.
	 */
	smp_mb();
again:
	if (!task_function_call(task, __perf_install_in_context, event))
		return;

	raw_spin_lock_irq(&ctx->lock);
	task = ctx->task;
	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
		/*
		 * Cannot happen because we already checked above (which also
		 * cannot happen), and we hold ctx->mutex, which serializes us
		 * against perf_event_exit_task_context().
		 */
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	/*
	 * If the task is not running, ctx->lock will avoid it becoming so,
	 * thus we can safely install the event.
	 */
	if (task_curr(task)) {
		raw_spin_unlock_irq(&ctx->lock);
		goto again;
	}
	add_event_to_ctx(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to enable a performance event
 */
static void __perf_event_enable(struct perf_event *event,
				struct perf_cpu_context *cpuctx,
				struct perf_event_context *ctx,
				void *info)
{
	struct perf_event *leader = event->group_leader;
	struct perf_event_context *task_ctx;

	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
	    event->state <= PERF_EVENT_STATE_ERROR)
		return;

	if (ctx->is_active)
		ctx_sched_out(ctx, cpuctx, EVENT_TIME);

	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);

	if (!ctx->is_active)
		return;

	if (!event_filter_match(event)) {
		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
		return;
	}

	/*
	 * If the event is in a group and isn't the group leader,
	 * then don't put it on unless the group is on.
	 */
	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
		return;
	}

	task_ctx = cpuctx->task_ctx;
	if (ctx->task)
		WARN_ON_ONCE(task_ctx != ctx);

	ctx_resched(cpuctx, task_ctx, get_event_type(event));
}

/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
static void _perf_event_enable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	raw_spin_lock_irq(&ctx->lock);
	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
	    event->state <  PERF_EVENT_STATE_ERROR) {
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}

	/*
	 * If the event is in error state, clear that first.
	 *
	 * That way, if we see the event in error state below, we know that it
	 * has gone back into error state, as distinct from the task having
	 * been scheduled away before the cross-call arrived.
	 */
	if (event->state == PERF_EVENT_STATE_ERROR)
		event->state = PERF_EVENT_STATE_OFF;
	raw_spin_unlock_irq(&ctx->lock);

	event_function_call(event, __perf_event_enable, NULL);
}

/*
 * See perf_event_disable();
 */
void perf_event_enable(struct perf_event *event)
{
	struct perf_event_context *ctx;

	ctx = perf_event_ctx_lock(event);
	_perf_event_enable(event);
	perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_enable);

2744struct stop_event_data {
2745 struct perf_event *event;
2746 unsigned int restart;
2747};
2748
2749static int __perf_event_stop(void *info)
2750{
2751 struct stop_event_data *sd = info;
2752 struct perf_event *event = sd->event;
2753
2754
2755 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2756 return 0;
2757
 /* matches smp_wmb() in event_sched_in() */
2759 smp_rmb();
2760
 /*
  * There is a window with interrupts enabled before we get here,
  * so we need to check again lest we try to stop another CPU's event.
  */
2765 if (READ_ONCE(event->oncpu) != smp_processor_id())
2766 return -EAGAIN;
2767
2768 event->pmu->stop(event, PERF_EF_UPDATE);
2769
 /*
  * May race with the actual stop (through perf_pmu_output_stop()),
  * but it is only used for events with AUX ring buffer, and such
  * events will refuse to restart because of rb::aux_mmap_count==0,
  * see comments in perf_aux_output_begin().
  *
  * Since this is happening on an event-local CPU, no trace is lost
  * while restarting.
  */
2779 if (sd->restart)
2780 event->pmu->start(event, 0);
2781
2782 return 0;
2783}
2784
2785static int perf_event_stop(struct perf_event *event, int restart)
2786{
2787 struct stop_event_data sd = {
2788 .event = event,
2789 .restart = restart,
2790 };
2791 int ret = 0;
2792
2793 do {
2794 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2795 return 0;
2796
 /* matches smp_wmb() in event_sched_in() */
2798 smp_rmb();
2799
 /*
  * We only want to restart ACTIVE events, so if the event goes
  * inactive here (event->oncpu==-1), there's nothing more to do;
  * fall through with ret==-ENXIO.
  */
2805 ret = cpu_function_call(READ_ONCE(event->oncpu),
2806 __perf_event_stop, &sd);
2807 } while (ret == -EAGAIN);
2808
2809 return ret;
2810}
2811
/*
 * In order to contain the amount of racy and tricky in the address filter
 * configuration management, it is a two part process:
 *
 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
 *      we update the addresses of corresponding vmas in
 *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
 * (p2) when an event is scheduled in (pmu::add), it calls
 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
 *      if the generation has changed since the previous call.
 *
 * If (p1) happens while the event is active, we restart it to force (p2).
 *
 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
 *     pre-existing mappings, called once when new filters arrive via
 *     SET_FILTER ioctl;
 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
 *     registered mappings, called for every new mmap(), with mm::mmap_sem
 *     down for reading;
 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the
 *     process of exec.
 */
2834void perf_event_addr_filters_sync(struct perf_event *event)
2835{
2836 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2837
2838 if (!has_addr_filter(event))
2839 return;
2840
2841 raw_spin_lock(&ifh->lock);
2842 if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2843 event->pmu->addr_filters_sync(event);
2844 event->hw.addr_filters_gen = event->addr_filters_gen;
2845 }
2846 raw_spin_unlock(&ifh->lock);
2847}
2848EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2849
2850static int _perf_event_refresh(struct perf_event *event, int refresh)
2851{
 /*
  * Not supported on inherited events, and only sensible for
  * sampling events.
  */
2855 if (event->attr.inherit || !is_sampling_event(event))
2856 return -EINVAL;
2857
2858 atomic_add(refresh, &event->event_limit);
2859 _perf_event_enable(event);
2860
2861 return 0;
2862}
2863
/*
 * See perf_event_disable().
 */
2867int perf_event_refresh(struct perf_event *event, int refresh)
2868{
2869 struct perf_event_context *ctx;
2870 int ret;
2871
2872 ctx = perf_event_ctx_lock(event);
2873 ret = _perf_event_refresh(event, refresh);
2874 perf_event_ctx_unlock(event, ctx);
2875
2876 return ret;
2877}
2878EXPORT_SYMBOL_GPL(perf_event_refresh);
2879
2880static int perf_event_modify_breakpoint(struct perf_event *bp,
2881 struct perf_event_attr *attr)
2882{
2883 int err;
2884
2885 _perf_event_disable(bp);
2886
2887 err = modify_user_hw_breakpoint_check(bp, attr, true);
2888
2889 if (!bp->attr.disabled)
2890 _perf_event_enable(bp);
2891
2892 return err;
2893}
2894
2895static int perf_event_modify_attr(struct perf_event *event,
2896 struct perf_event_attr *attr)
2897{
2898 if (event->attr.type != attr->type)
2899 return -EINVAL;
2900
2901 switch (event->attr.type) {
2902 case PERF_TYPE_BREAKPOINT:
2903 return perf_event_modify_breakpoint(event, attr);
2904 default:
  /* Place holder for future additions. */
2906 return -EOPNOTSUPP;
2907 }
2908}
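/*
 * Illustrative sketch: the only event type currently accepted above is
 * PERF_TYPE_BREAKPOINT, which lets a ptracer move a breakpoint without
 * tearing the event down (new_addr is hypothetical):
 *
 *	attr.bp_addr = new_addr;
 *	ioctl(fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &attr);
 */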
2909
2910static void ctx_sched_out(struct perf_event_context *ctx,
2911 struct perf_cpu_context *cpuctx,
2912 enum event_type_t event_type)
2913{
2914 struct perf_event *event, *tmp;
2915 int is_active = ctx->is_active;
2916
2917 lockdep_assert_held(&ctx->lock);
2918
2919 if (likely(!ctx->nr_events)) {
  /*
   * See __perf_remove_from_context().
   */
2923 WARN_ON_ONCE(ctx->is_active);
2924 if (ctx->task)
2925 WARN_ON_ONCE(cpuctx->task_ctx);
2926 return;
2927 }
2928
2929 ctx->is_active &= ~event_type;
2930 if (!(ctx->is_active & EVENT_ALL))
2931 ctx->is_active = 0;
2932
2933 if (ctx->task) {
2934 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2935 if (!ctx->is_active)
2936 cpuctx->task_ctx = NULL;
2937 }
2938
 /*
  * Always update time if it was set; not only when it changes.
  * Otherwise we can 'forget' to update time for any but the last
  * context we sched out. For example:
  *
  *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
  *   ctx_sched_out(.event_type = EVENT_PINNED)
  *
  * would only update time for the pinned events.
  */
2949 if (is_active & EVENT_TIME) {
  /* update (and stop) ctx time */
2951 update_context_time(ctx);
2952 update_cgrp_time_from_cpuctx(cpuctx);
2953 }
2954
 is_active ^= ctx->is_active; /* changed bits */
2956
2957 if (!ctx->nr_active || !(is_active & EVENT_ALL))
2958 return;
2959
 /*
  * If we had been multiplexing, no rotations are necessary, now no
  * events are active.
  */
2964 ctx->rotate_necessary = 0;
2965
2966 perf_pmu_disable(ctx->pmu);
2967 if (is_active & EVENT_PINNED) {
2968 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
2969 group_sched_out(event, cpuctx, ctx);
2970 }
2971
2972 if (is_active & EVENT_FLEXIBLE) {
2973 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
2974 group_sched_out(event, cpuctx, ctx);
2975 }
2976 perf_pmu_enable(ctx->pmu);
2977}
2978
/*
 * Test whether two contexts are equivalent, i.e. whether they have both
 * been cloned from the same version of the same context.
 *
 * Equivalence is measured using a generation number in the context that is
 * incremented on each modification to it; see unclone_ctx(), list_add_event()
 * and list_del_event().
 */
2987static int context_equiv(struct perf_event_context *ctx1,
2988 struct perf_event_context *ctx2)
2989{
2990 lockdep_assert_held(&ctx1->lock);
2991 lockdep_assert_held(&ctx2->lock);
2992
 /* Pinning disables the swap optimization */
2994 if (ctx1->pin_count || ctx2->pin_count)
2995 return 0;
2996
 /* If ctx1 is the parent of ctx2 */
2998 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2999 return 1;
3000
 /* If ctx2 is the parent of ctx1 */
3002 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3003 return 1;
3004
 /*
  * If ctx1 and ctx2 have the same parent; we flatten the parent
  * hierarchy, see perf_event_init_context().
  */
3009 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3010 ctx1->parent_gen == ctx2->parent_gen)
3011 return 1;
3012
 /* Unmatched */
3014 return 0;
3015}
3016
3017static void __perf_event_sync_stat(struct perf_event *event,
3018 struct perf_event *next_event)
3019{
3020 u64 value;
3021
3022 if (!event->attr.inherit_stat)
3023 return;
3024
 /*
  * Update the event value, we cannot use perf_event_read() because
  * we're in the middle of a context switch and have IRQs disabled,
  * which upsets smp_call_function_single(); however we know the event
  * must be on the current CPU, therefore we can call pmu->read()
  * directly.
  */
3032 if (event->state == PERF_EVENT_STATE_ACTIVE)
3033 event->pmu->read(event);
3034
3035 perf_event_update_time(event);
3036
 /*
  * In order to keep per-task stats reliable we need to flip the event
  * values when we flip the contexts.
  */
3041 value = local64_read(&next_event->count);
3042 value = local64_xchg(&event->count, value);
3043 local64_set(&next_event->count, value);
3044
3045 swap(event->total_time_enabled, next_event->total_time_enabled);
3046 swap(event->total_time_running, next_event->total_time_running);
3047
 /*
  * Since we swizzled the values, update the user visible data too.
  */
3051 perf_event_update_userpage(event);
3052 perf_event_update_userpage(next_event);
3053}
3054
3055static void perf_event_sync_stat(struct perf_event_context *ctx,
3056 struct perf_event_context *next_ctx)
3057{
3058 struct perf_event *event, *next_event;
3059
3060 if (!ctx->nr_stat)
3061 return;
3062
3063 update_context_time(ctx);
3064
3065 event = list_first_entry(&ctx->event_list,
3066 struct perf_event, event_entry);
3067
3068 next_event = list_first_entry(&next_ctx->event_list,
3069 struct perf_event, event_entry);
3070
3071 while (&event->event_entry != &ctx->event_list &&
3072 &next_event->event_entry != &next_ctx->event_list) {
3073
3074 __perf_event_sync_stat(event, next_event);
3075
3076 event = list_next_entry(event, event_entry);
3077 next_event = list_next_entry(next_event, event_entry);
3078 }
3079}
3080
3081static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3082 struct task_struct *next)
3083{
3084 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3085 struct perf_event_context *next_ctx;
3086 struct perf_event_context *parent, *next_parent;
3087 struct perf_cpu_context *cpuctx;
3088 int do_switch = 1;
3089
3090 if (likely(!ctx))
3091 return;
3092
3093 cpuctx = __get_cpu_context(ctx);
3094 if (!cpuctx->task_ctx)
3095 return;
3096
3097 rcu_read_lock();
3098 next_ctx = next->perf_event_ctxp[ctxn];
3099 if (!next_ctx)
3100 goto unlock;
3101
3102 parent = rcu_dereference(ctx->parent_ctx);
3103 next_parent = rcu_dereference(next_ctx->parent_ctx);
3104
 /* If neither context has a parent context, they cannot be clones. */
3106 if (!parent && !next_parent)
3107 goto unlock;
3108
3109 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
  /*
   * Looks like the two contexts are clones, so we might be able to
   * optimize the context switch.  We lock both contexts and check
   * that they are clones under the lock (including re-checking that
   * neither context has been uncloned in the meantime).  It doesn't
   * matter which lock we take first to lock both contexts at once.
   */
3119 raw_spin_lock(&ctx->lock);
3120 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3121 if (context_equiv(ctx, next_ctx)) {
3122 WRITE_ONCE(ctx->task, next);
3123 WRITE_ONCE(next_ctx->task, task);
3124
3125 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
   /*
    * RCU_INIT_POINTER here is safe because we've not modified the
    * ctx, and the above modifications of ctx->task and
    * ctx->task_ctx_data are both done under ctx->lock. If we find
    * that we can't do the swizzle, we can't touch
    * perf_event_ctxp[ctxn] either.
    */
3134 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3135 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3136
3137 do_switch = 0;
3138
3139 perf_event_sync_stat(ctx, next_ctx);
3140 }
3141 raw_spin_unlock(&next_ctx->lock);
3142 raw_spin_unlock(&ctx->lock);
3143 }
3144unlock:
3145 rcu_read_unlock();
3146
3147 if (do_switch) {
3148 raw_spin_lock(&ctx->lock);
3149 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3150 raw_spin_unlock(&ctx->lock);
3151 }
3152}
3153
3154static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3155
3156void perf_sched_cb_dec(struct pmu *pmu)
3157{
3158 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3159
3160 this_cpu_dec(perf_sched_cb_usages);
3161
3162 if (!--cpuctx->sched_cb_usage)
3163 list_del(&cpuctx->sched_cb_entry);
3164}
3165
3166
3167void perf_sched_cb_inc(struct pmu *pmu)
3168{
3169 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3170
3171 if (!cpuctx->sched_cb_usage++)
3172 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3173
3174 this_cpu_inc(perf_sched_cb_usages);
3175}
3176
/*
 * This function provides the context switch callback to the lower code
 * layers. It is invoked ONLY when the context switch callback is enabled.
 *
 * This callback is relevant even to per-cpu events; for example multi-event
 * PEBS requires this to provide PID/TID information. This requires we flush
 * all queued PEBS records before we context switch to a new task.
 */
3185static void perf_pmu_sched_task(struct task_struct *prev,
3186 struct task_struct *next,
3187 bool sched_in)
3188{
3189 struct perf_cpu_context *cpuctx;
3190 struct pmu *pmu;
3191
3192 if (prev == next)
3193 return;
3194
3195 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3196 pmu = cpuctx->ctx.pmu;
3197
3198 if (WARN_ON_ONCE(!pmu->sched_task))
3199 continue;
3200
3201 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3202 perf_pmu_disable(pmu);
3203
3204 pmu->sched_task(cpuctx->task_ctx, sched_in);
3205
3206 perf_pmu_enable(pmu);
3207 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3208 }
3209}
3210
3211static void perf_event_switch(struct task_struct *task,
3212 struct task_struct *next_prev, bool sched_in);
3213
3214#define for_each_task_context_nr(ctxn) \
3215 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3216
/*
 * Called from the scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable() sets the disabled
 * bit in the control field of the event _before_ we remove the event from
 * the context, after having disabled the counter in the hardware if the
 * event is on.
 */
3228void __perf_event_task_sched_out(struct task_struct *task,
3229 struct task_struct *next)
3230{
3231 int ctxn;
3232
3233 if (__this_cpu_read(perf_sched_cb_usages))
3234 perf_pmu_sched_task(task, next, false);
3235
3236 if (atomic_read(&nr_switch_events))
3237 perf_event_switch(task, next, false);
3238
3239 for_each_task_context_nr(ctxn)
3240 perf_event_context_sched_out(task, ctxn, next);
3241
 /*
  * If cgroup events exist on this CPU, then we need to check if we
  * have to switch out PMU state; cgroup events are system-wide mode
  * only.
  */
3247 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3248 perf_cgroup_sched_out(task, next);
3249}
3250
/*
 * Called with IRQs disabled.
 */
3254static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3255 enum event_type_t event_type)
3256{
3257 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3258}
3259
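/*
 * Iterate the events of @groups eligible for @cpu: the CPU == -1 (any CPU)
 * subtree merged with the CPU == @cpu subtree, visiting events in ascending
 * group_index (i.e. insertion) order. @func is called for each event; a
 * non-zero return value stops the iteration.
 */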
3260static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
3261 int (*func)(struct perf_event *, void *), void *data)
3262{
3263 struct perf_event **evt, *evt1, *evt2;
3264 int ret;
3265
3266 evt1 = perf_event_groups_first(groups, -1);
3267 evt2 = perf_event_groups_first(groups, cpu);
3268
3269 while (evt1 || evt2) {
3270 if (evt1 && evt2) {
3271 if (evt1->group_index < evt2->group_index)
3272 evt = &evt1;
3273 else
3274 evt = &evt2;
3275 } else if (evt1) {
3276 evt = &evt1;
3277 } else {
3278 evt = &evt2;
3279 }
3280
3281 ret = func(*evt, data);
3282 if (ret)
3283 return ret;
3284
3285 *evt = perf_event_groups_next(*evt);
3286 }
3287
3288 return 0;
3289}
3290
3291struct sched_in_data {
3292 struct perf_event_context *ctx;
3293 struct perf_cpu_context *cpuctx;
3294 int can_add_hw;
3295};
3296
3297static int pinned_sched_in(struct perf_event *event, void *data)
3298{
3299 struct sched_in_data *sid = data;
3300
3301 if (event->state <= PERF_EVENT_STATE_OFF)
3302 return 0;
3303
3304 if (!event_filter_match(event))
3305 return 0;
3306
3307 if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3308 if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3309 list_add_tail(&event->active_list, &sid->ctx->pinned_active);
3310 }
3311
 /*
  * If this pinned group hasn't been scheduled,
  * put it in error state.
  */
3316 if (event->state == PERF_EVENT_STATE_INACTIVE)
3317 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3318
3319 return 0;
3320}
3321
3322static int flexible_sched_in(struct perf_event *event, void *data)
3323{
3324 struct sched_in_data *sid = data;
3325
3326 if (event->state <= PERF_EVENT_STATE_OFF)
3327 return 0;
3328
3329 if (!event_filter_match(event))
3330 return 0;
3331
3332 if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3333 int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
3334 if (ret) {
3335 sid->can_add_hw = 0;
3336 sid->ctx->rotate_necessary = 1;
3337 return 0;
3338 }
3339 list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3340 }
3341
3342 return 0;
3343}
3344
3345static void
3346ctx_pinned_sched_in(struct perf_event_context *ctx,
3347 struct perf_cpu_context *cpuctx)
3348{
3349 struct sched_in_data sid = {
3350 .ctx = ctx,
3351 .cpuctx = cpuctx,
3352 .can_add_hw = 1,
3353 };
3354
3355 visit_groups_merge(&ctx->pinned_groups,
3356 smp_processor_id(),
3357 pinned_sched_in, &sid);
3358}
3359
3360static void
3361ctx_flexible_sched_in(struct perf_event_context *ctx,
3362 struct perf_cpu_context *cpuctx)
3363{
3364 struct sched_in_data sid = {
3365 .ctx = ctx,
3366 .cpuctx = cpuctx,
3367 .can_add_hw = 1,
3368 };
3369
3370 visit_groups_merge(&ctx->flexible_groups,
3371 smp_processor_id(),
3372 flexible_sched_in, &sid);
3373}
3374
3375static void
3376ctx_sched_in(struct perf_event_context *ctx,
3377 struct perf_cpu_context *cpuctx,
3378 enum event_type_t event_type,
3379 struct task_struct *task)
3380{
3381 int is_active = ctx->is_active;
3382 u64 now;
3383
3384 lockdep_assert_held(&ctx->lock);
3385
3386 if (likely(!ctx->nr_events))
3387 return;
3388
3389 ctx->is_active |= (event_type | EVENT_TIME);
3390 if (ctx->task) {
3391 if (!is_active)
3392 cpuctx->task_ctx = ctx;
3393 else
3394 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3395 }
3396
 is_active ^= ctx->is_active; /* changed bits */
3398
3399 if (is_active & EVENT_TIME) {
  /* start ctx time */
3401 now = perf_clock();
3402 ctx->timestamp = now;
3403 perf_cgroup_set_timestamp(task, ctx);
3404 }
3405
 /*
  * First go through the list and put on any pinned groups
  * in order to give them the best chance of going on.
  */
3410 if (is_active & EVENT_PINNED)
3411 ctx_pinned_sched_in(ctx, cpuctx);
3412
 /* Then walk through the lower prio flexible groups */
3414 if (is_active & EVENT_FLEXIBLE)
3415 ctx_flexible_sched_in(ctx, cpuctx);
3416}
3417
3418static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3419 enum event_type_t event_type,
3420 struct task_struct *task)
3421{
3422 struct perf_event_context *ctx = &cpuctx->ctx;
3423
3424 ctx_sched_in(ctx, cpuctx, event_type, task);
3425}
3426
3427static void perf_event_context_sched_in(struct perf_event_context *ctx,
3428 struct task_struct *task)
3429{
3430 struct perf_cpu_context *cpuctx;
3431
3432 cpuctx = __get_cpu_context(ctx);
3433 if (cpuctx->task_ctx == ctx)
3434 return;
3435
3436 perf_ctx_lock(cpuctx, ctx);
 /*
  * We must check ctx->nr_events while holding ctx->lock, such
  * that we serialize against perf_install_in_context().
  */
3441 if (!ctx->nr_events)
3442 goto unlock;
3443
3444 perf_pmu_disable(ctx->pmu);
 /*
  * We want to keep the following priority order:
  * cpu pinned (that don't need to move), task pinned,
  * cpu flexible, task flexible.
  *
  * However, if the task's ctx is not carrying any pinned
  * events, no need to flip the cpuctx's events around.
  */
3453 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3454 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3455 perf_event_sched_in(cpuctx, ctx, task);
3456 perf_pmu_enable(ctx->pmu);
3457
3458unlock:
3459 perf_ctx_unlock(cpuctx, ctx);
3460}
3461
/*
 * Called from the scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable() sets the enabled
 * bit in the control field of the event _before_ we add the event to
 * the context, after having enabled the counter in the hardware.
 */
3473void __perf_event_task_sched_in(struct task_struct *prev,
3474 struct task_struct *task)
3475{
3476 struct perf_event_context *ctx;
3477 int ctxn;
3478
 /*
  * If cgroup events exist on this CPU, then we need to check if we
  * have to switch in PMU state; cgroup events are system-wide mode
  * only.
  *
  * Since cgroup events are CPU events, we must schedule these in
  * before we schedule in the task events.
  */
3486 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3487 perf_cgroup_sched_in(prev, task);
3488
3489 for_each_task_context_nr(ctxn) {
3490 ctx = task->perf_event_ctxp[ctxn];
3491 if (likely(!ctx))
3492 continue;
3493
3494 perf_event_context_sched_in(ctx, task);
3495 }
3496
3497 if (atomic_read(&nr_switch_events))
3498 perf_event_switch(task, prev, true);
3499
3500 if (__this_cpu_read(perf_sched_cb_usages))
3501 perf_pmu_sched_task(prev, task, true);
3502}
3503
3504static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3505{
3506 u64 frequency = event->attr.sample_freq;
3507 u64 sec = NSEC_PER_SEC;
3508 u64 divisor, dividend;
3509
3510 int count_fls, nsec_fls, frequency_fls, sec_fls;
3511
3512 count_fls = fls64(count);
3513 nsec_fls = fls64(nsec);
3514 frequency_fls = fls64(frequency);
3515 sec_fls = 30;
3516
/*
 * We got @count in @nsec, with a target of sample_freq HZ the target
 * period becomes:
 *
 *             @count * 10^9
 * period = -------------------
 *          @nsec * sample_freq
 *
 * To avoid 64-bit overflow in the multiplications we work on the
 * most-significant bits: REDUCE_FLS() below shifts whichever of its two
 * arguments currently has more significant bits right by one, so that
 * repeated application shrinks a product until it fits in 64 bits while
 * losing as little precision as possible.
 */
3531#define REDUCE_FLS(a, b) \
3532do { \
3533 if (a##_fls > b##_fls) { \
3534 a >>= 1; \
3535 a##_fls--; \
3536 } else { \
3537 b >>= 1; \
3538 b##_fls--; \
3539 } \
3540} while (0)
3541
 /*
  * Reduce accuracy until either term fits in a u64, then proceed with
  * the other, so that neither of them overflows.
  */
3546 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3547 REDUCE_FLS(nsec, frequency);
3548 REDUCE_FLS(sec, count);
3549 }
3550
3551 if (count_fls + sec_fls > 64) {
3552 divisor = nsec * frequency;
3553
3554 while (count_fls + sec_fls > 64) {
3555 REDUCE_FLS(count, sec);
3556 divisor >>= 1;
3557 }
3558
3559 dividend = count * sec;
3560 } else {
3561 dividend = count * sec;
3562
3563 while (nsec_fls + frequency_fls > 64) {
3564 REDUCE_FLS(nsec, frequency);
3565 dividend >>= 1;
3566 }
3567
3568 divisor = nsec * frequency;
3569 }
3570
3571 if (!divisor)
3572 return dividend;
3573
3574 return div64_u64(dividend, divisor);
3575}
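/*
 * Worked example (illustrative numbers only): @count = 4,000,000 events
 * over @nsec = 10,000,000 (10ms) with sample_freq = 1000 gives an event
 * rate of 4e8/sec, so the target period becomes
 * 4e6 * 1e9 / (1e7 * 1e3) = 400,000 events between samples, i.e. 1000
 * samples per second.
 */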
3576
3577static DEFINE_PER_CPU(int, perf_throttled_count);
3578static DEFINE_PER_CPU(u64, perf_throttled_seq);
3579
3580static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3581{
3582 struct hw_perf_event *hwc = &event->hw;
3583 s64 period, sample_period;
3584 s64 delta;
3585
3586 period = perf_calculate_period(event, nsec, count);
3587
3588 delta = (s64)(period - hwc->sample_period);
3589 delta = (delta + 7) / 8;
3590
3591 sample_period = hwc->sample_period + delta;
3592
3593 if (!sample_period)
3594 sample_period = 1;
3595
3596 hwc->sample_period = sample_period;
3597
3598 if (local64_read(&hwc->period_left) > 8*sample_period) {
3599 if (disable)
3600 event->pmu->stop(event, PERF_EF_UPDATE);
3601
3602 local64_set(&hwc->period_left, 0);
3603
3604 if (disable)
3605 event->pmu->start(event, PERF_EF_RELOAD);
3606 }
3607}
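/*
 * Note the delta/8 above: the period moves only an eighth of the way
 * toward the newly computed target on each adjustment, damping
 * oscillation. For example, a computed period of 180,000 against a
 * current hwc->sample_period of 100,000 yields a new sample_period of
 * 110,000.
 */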
3608
/*
 * Combine freq adjustment with unthrottling to avoid two passes over the
 * events. At the same time, make sure that having freq events does not
 * change the rate of unthrottling, as that would introduce bias.
 */
3614static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3615 int needs_unthr)
3616{
3617 struct perf_event *event;
3618 struct hw_perf_event *hwc;
3619 u64 now, period = TICK_NSEC;
3620 s64 delta;
3621
 /*
  * Only need to iterate over all events iff:
  * - the context has events in frequency mode (needs freq adjust),
  * - there are events to unthrottle on this cpu.
  */
3627 if (!(ctx->nr_freq || needs_unthr))
3628 return;
3629
3630 raw_spin_lock(&ctx->lock);
3631 perf_pmu_disable(ctx->pmu);
3632
3633 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3634 if (event->state != PERF_EVENT_STATE_ACTIVE)
3635 continue;
3636
3637 if (!event_filter_match(event))
3638 continue;
3639
3640 perf_pmu_disable(event->pmu);
3641
3642 hwc = &event->hw;
3643
3644 if (hwc->interrupts == MAX_INTERRUPTS) {
3645 hwc->interrupts = 0;
3646 perf_log_throttle(event, 1);
3647 event->pmu->start(event, 0);
3648 }
3649
3650 if (!event->attr.freq || !event->attr.sample_freq)
3651 goto next;
3652
  /*
   * Stop the event and update event->count.
   */
3656 event->pmu->stop(event, PERF_EF_UPDATE);
3657
3658 now = local64_read(&event->count);
3659 delta = now - hwc->freq_count_stamp;
3660 hwc->freq_count_stamp = now;
3661
  /*
   * Restart the event; reload only if the value has changed.
   * We have already stopped the event, so tell
   * perf_adjust_period() not to stop/start it again
   * (disable == false).
   */
3669 if (delta > 0)
3670 perf_adjust_period(event, period, delta, false);
3671
3672 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3673 next:
3674 perf_pmu_enable(event->pmu);
3675 }
3676
3677 perf_pmu_enable(ctx->pmu);
3678 raw_spin_unlock(&ctx->lock);
3679}
3680
/*
 * Move @event to the tail of the @ctx's eligible events.
 */
3684static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
3685{
 /*
  * Rotate the first entry last of non-pinned groups. Rotation might be
  * disabled by the inheritance code.
  */
3690 if (ctx->rotate_disable)
3691 return;
3692
3693 perf_event_groups_delete(&ctx->flexible_groups, event);
3694 perf_event_groups_insert(&ctx->flexible_groups, event);
3695}
3696
3697static inline struct perf_event *
3698ctx_first_active(struct perf_event_context *ctx)
3699{
3700 return list_first_entry_or_null(&ctx->flexible_active,
3701 struct perf_event, active_list);
3702}
3703
3704static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3705{
3706 struct perf_event *cpu_event = NULL, *task_event = NULL;
3707 struct perf_event_context *task_ctx = NULL;
3708 int cpu_rotate, task_rotate;
3709
 /*
  * Since we run this from IRQ context, nobody can install new
  * events, thus the event count values are stable.
  */
3715 cpu_rotate = cpuctx->ctx.rotate_necessary;
3716 task_ctx = cpuctx->task_ctx;
3717 task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
3718
3719 if (!(cpu_rotate || task_rotate))
3720 return false;
3721
3722 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3723 perf_pmu_disable(cpuctx->ctx.pmu);
3724
3725 if (task_rotate)
3726 task_event = ctx_first_active(task_ctx);
3727 if (cpu_rotate)
3728 cpu_event = ctx_first_active(&cpuctx->ctx);
3729
 /*
  * As per the order given at ctx_resched() first 'pop' task flexible
  * and then, if needed, CPU flexible.
  */
3734 if (task_event || (task_ctx && cpu_event))
3735 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
3736 if (cpu_event)
3737 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3738
3739 if (task_event)
3740 rotate_ctx(task_ctx, task_event);
3741 if (cpu_event)
3742 rotate_ctx(&cpuctx->ctx, cpu_event);
3743
3744 perf_event_sched_in(cpuctx, task_ctx, current);
3745
3746 perf_pmu_enable(cpuctx->ctx.pmu);
3747 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3748
3749 return true;
3750}
3751
3752void perf_event_task_tick(void)
3753{
3754 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3755 struct perf_event_context *ctx, *tmp;
3756 int throttled;
3757
3758 lockdep_assert_irqs_disabled();
3759
3760 __this_cpu_inc(perf_throttled_seq);
3761 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3762 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3763
3764 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3765 perf_adjust_freq_unthr_context(ctx, throttled);
3766}
3767
3768static int event_enable_on_exec(struct perf_event *event,
3769 struct perf_event_context *ctx)
3770{
3771 if (!event->attr.enable_on_exec)
3772 return 0;
3773
3774 event->attr.enable_on_exec = 0;
3775 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3776 return 0;
3777
3778 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
3779
3780 return 1;
3781}
3782
/*
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 */
3787static void perf_event_enable_on_exec(int ctxn)
3788{
3789 struct perf_event_context *ctx, *clone_ctx = NULL;
3790 enum event_type_t event_type = 0;
3791 struct perf_cpu_context *cpuctx;
3792 struct perf_event *event;
3793 unsigned long flags;
3794 int enabled = 0;
3795
3796 local_irq_save(flags);
3797 ctx = current->perf_event_ctxp[ctxn];
3798 if (!ctx || !ctx->nr_events)
3799 goto out;
3800
3801 cpuctx = __get_cpu_context(ctx);
3802 perf_ctx_lock(cpuctx, ctx);
3803 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3804 list_for_each_entry(event, &ctx->event_list, event_entry) {
3805 enabled |= event_enable_on_exec(event, ctx);
3806 event_type |= get_event_type(event);
3807 }
3808
 /*
  * Unclone and reschedule this context if we enabled any event.
  */
3812 if (enabled) {
3813 clone_ctx = unclone_ctx(ctx);
3814 ctx_resched(cpuctx, ctx, event_type);
3815 } else {
3816 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
3817 }
3818 perf_ctx_unlock(cpuctx, ctx);
3819
3820out:
3821 local_irq_restore(flags);
3822
3823 if (clone_ctx)
3824 put_ctx(clone_ctx);
3825}
3826
3827struct perf_read_data {
3828 struct perf_event *event;
3829 bool group;
3830 int ret;
3831};
3832
3833static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3834{
3835 u16 local_pkg, event_pkg;
3836
3837 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3838 int local_cpu = smp_processor_id();
3839
3840 event_pkg = topology_physical_package_id(event_cpu);
3841 local_pkg = topology_physical_package_id(local_cpu);
3842
3843 if (event_pkg == local_pkg)
3844 return local_cpu;
3845 }
3846
3847 return event_cpu;
3848}
3849
/*
 * Cross CPU call to read the hardware event.
 */
3853static void __perf_event_read(void *info)
3854{
3855 struct perf_read_data *data = info;
3856 struct perf_event *sub, *event = data->event;
3857 struct perf_event_context *ctx = event->ctx;
3858 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3859 struct pmu *pmu = event->pmu;
3860
 /*
  * If this is a task context, we need to check whether it is
  * the current task context of this cpu.  If not it has been
  * scheduled out before the smp call arrived.  In that case
  * event->count would have been updated to a recent sample
  * when the event was scheduled out.
  */
3868 if (ctx->task && cpuctx->task_ctx != ctx)
3869 return;
3870
3871 raw_spin_lock(&ctx->lock);
3872 if (ctx->is_active & EVENT_TIME) {
3873 update_context_time(ctx);
3874 update_cgrp_time_from_event(event);
3875 }
3876
3877 perf_event_update_time(event);
3878 if (data->group)
3879 perf_event_update_sibling_time(event);
3880
3881 if (event->state != PERF_EVENT_STATE_ACTIVE)
3882 goto unlock;
3883
3884 if (!data->group) {
3885 pmu->read(event);
3886 data->ret = 0;
3887 goto unlock;
3888 }
3889
3890 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3891
3892 pmu->read(event);
3893
3894 for_each_sibling_event(sub, event) {
3895 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
   /*
    * Use the sibling's PMU rather than @event's, since the
    * sibling could be on a different (eg: software) PMU.
    */
3900 sub->pmu->read(sub);
3901 }
3902 }
3903
3904 data->ret = pmu->commit_txn(pmu);
3905
3906unlock:
3907 raw_spin_unlock(&ctx->lock);
3908}
3909
3910static inline u64 perf_event_count(struct perf_event *event)
3911{
3912 return local64_read(&event->count) + atomic64_read(&event->child_count);
3913}
3914
/*
 * NMI-safe method to read a local event, that is an event that
 * is:
 *   - either for the current task, or for this CPU
 *   - does not have inherit set, for inherited task events
 *     will not be local and we cannot read them atomically
 *   - must not have a pmu::count method
 */
3923int perf_event_read_local(struct perf_event *event, u64 *value,
3924 u64 *enabled, u64 *running)
3925{
3926 unsigned long flags;
3927 int ret = 0;
3928
 /*
  * Disabling interrupts avoids all counter scheduling (context
  * switches, timer based rotation and IPIs).
  */
3933 local_irq_save(flags);
3934
 /*
  * It must not be an event with inherit set, we cannot read
  * all child counters from atomic context.
  */
3939 if (event->attr.inherit) {
3940 ret = -EOPNOTSUPP;
3941 goto out;
3942 }
3943
 /* If this is a per-task event, it must be for current */
3945 if ((event->attach_state & PERF_ATTACH_TASK) &&
3946 event->hw.target != current) {
3947 ret = -EINVAL;
3948 goto out;
3949 }
3950
 /* If this is a per-CPU event, it must be for this CPU */
3952 if (!(event->attach_state & PERF_ATTACH_TASK) &&
3953 event->cpu != smp_processor_id()) {
3954 ret = -EINVAL;
3955 goto out;
3956 }
3957
 /* If this is a pinned event, it must be running on this CPU */
3959 if (event->attr.pinned && event->oncpu != smp_processor_id()) {
3960 ret = -EBUSY;
3961 goto out;
3962 }
3963
 /*
  * If the event is currently on this CPU, it is either a per-task
  * event or local to this CPU. Furthermore it means it is ACTIVE
  * (otherwise oncpu == -1).
  */
3969 if (event->oncpu == smp_processor_id())
3970 event->pmu->read(event);
3971
3972 *value = local64_read(&event->count);
3973 if (enabled || running) {
3974 u64 now = event->shadow_ctx_time + perf_clock();
3975 u64 __enabled, __running;
3976
3977 __perf_update_times(event, now, &__enabled, &__running);
3978 if (enabled)
3979 *enabled = __enabled;
3980 if (running)
3981 *running = __running;
3982 }
3983out:
3984 local_irq_restore(flags);
3985
3986 return ret;
3987}
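/*
 * In-kernel users (for instance the bpf_perf_event_read_value() helper)
 * call the above along these lines; a minimal sketch with illustrative
 * error handling:
 *
 *	u64 value, enabled, running;
 *	int err;
 *
 *	err = perf_event_read_local(event, &value, &enabled, &running);
 *	if (err)
 *		return err;
 */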
3988
3989static int perf_event_read(struct perf_event *event, bool group)
3990{
3991 enum perf_event_state state = READ_ONCE(event->state);
3992 int event_cpu, ret = 0;
3993
 /*
  * If the event is enabled and currently active on a CPU, update the
  * value in the event structure:
  */
3998again:
3999 if (state == PERF_EVENT_STATE_ACTIVE) {
4000 struct perf_read_data data;
4001
  /*
   * Orders the ->state and ->oncpu loads such that if we see
   * ACTIVE we must also see the right ->oncpu.
   *
   * Matches the smp_wmb() from event_sched_in().
   */
4008 smp_rmb();
4009
4010 event_cpu = READ_ONCE(event->oncpu);
4011 if ((unsigned)event_cpu >= nr_cpu_ids)
4012 return 0;
4013
4014 data = (struct perf_read_data){
4015 .event = event,
4016 .group = group,
4017 .ret = 0,
4018 };
4019
4020 preempt_disable();
4021 event_cpu = __perf_event_read_cpu(event, event_cpu);
4022
  /*
   * Purposely ignore the smp_call_function_single() return
   * value.
   *
   * If event_cpu isn't a valid CPU it means the event got
   * scheduled out and that will have updated the event count.
   *
   * Therefore, either way, we'll have an up-to-date event count
   * after this.
   */
4033 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4034 preempt_enable();
4035 ret = data.ret;
4036
4037 } else if (state == PERF_EVENT_STATE_INACTIVE) {
4038 struct perf_event_context *ctx = event->ctx;
4039 unsigned long flags;
4040
4041 raw_spin_lock_irqsave(&ctx->lock, flags);
4042 state = event->state;
4043 if (state != PERF_EVENT_STATE_INACTIVE) {
4044 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4045 goto again;
4046 }
4047
  /*
   * May read while the context is not active (e.g., the thread is
   * blocked); in that case we cannot update context time.
   */
4052 if (ctx->is_active & EVENT_TIME) {
4053 update_context_time(ctx);
4054 update_cgrp_time_from_event(event);
4055 }
4056
4057 perf_event_update_time(event);
4058 if (group)
4059 perf_event_update_sibling_time(event);
4060 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4061 }
4062
4063 return ret;
4064}
4065
/*
 * Initialize the perf_event context in a task_struct:
 */
4069static void __perf_event_init_context(struct perf_event_context *ctx)
4070{
4071 raw_spin_lock_init(&ctx->lock);
4072 mutex_init(&ctx->mutex);
4073 INIT_LIST_HEAD(&ctx->active_ctx_list);
4074 perf_event_groups_init(&ctx->pinned_groups);
4075 perf_event_groups_init(&ctx->flexible_groups);
4076 INIT_LIST_HEAD(&ctx->event_list);
4077 INIT_LIST_HEAD(&ctx->pinned_active);
4078 INIT_LIST_HEAD(&ctx->flexible_active);
4079 refcount_set(&ctx->refcount, 1);
4080}
4081
4082static struct perf_event_context *
4083alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4084{
4085 struct perf_event_context *ctx;
4086
4087 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4088 if (!ctx)
4089 return NULL;
4090
4091 __perf_event_init_context(ctx);
4092 if (task) {
4093 ctx->task = task;
4094 get_task_struct(task);
4095 }
4096 ctx->pmu = pmu;
4097
4098 return ctx;
4099}
4100
4101static struct task_struct *
4102find_lively_task_by_vpid(pid_t vpid)
4103{
4104 struct task_struct *task;
4105
4106 rcu_read_lock();
4107 if (!vpid)
4108 task = current;
4109 else
4110 task = find_task_by_vpid(vpid);
4111 if (task)
4112 get_task_struct(task);
4113 rcu_read_unlock();
4114
4115 if (!task)
4116 return ERR_PTR(-ESRCH);
4117
4118 return task;
4119}
4120
/*
 * Returns a matching context with refcount and pincount.
 */
4124static struct perf_event_context *
4125find_get_context(struct pmu *pmu, struct task_struct *task,
4126 struct perf_event *event)
4127{
4128 struct perf_event_context *ctx, *clone_ctx = NULL;
4129 struct perf_cpu_context *cpuctx;
4130 void *task_ctx_data = NULL;
4131 unsigned long flags;
4132 int ctxn, err;
4133 int cpu = event->cpu;
4134
4135 if (!task) {
  /* Must be root to operate on a CPU event: */
4137 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
4138 return ERR_PTR(-EACCES);
4139
4140 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4141 ctx = &cpuctx->ctx;
4142 get_ctx(ctx);
4143 ++ctx->pin_count;
4144
4145 return ctx;
4146 }
4147
4148 err = -EINVAL;
4149 ctxn = pmu->task_ctx_nr;
4150 if (ctxn < 0)
4151 goto errout;
4152
4153 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4154 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
4155 if (!task_ctx_data) {
4156 err = -ENOMEM;
4157 goto errout;
4158 }
4159 }
4160
4161retry:
4162 ctx = perf_lock_task_context(task, ctxn, &flags);
4163 if (ctx) {
4164 clone_ctx = unclone_ctx(ctx);
4165 ++ctx->pin_count;
4166
4167 if (task_ctx_data && !ctx->task_ctx_data) {
4168 ctx->task_ctx_data = task_ctx_data;
4169 task_ctx_data = NULL;
4170 }
4171 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4172
4173 if (clone_ctx)
4174 put_ctx(clone_ctx);
4175 } else {
4176 ctx = alloc_perf_context(pmu, task);
4177 err = -ENOMEM;
4178 if (!ctx)
4179 goto errout;
4180
4181 if (task_ctx_data) {
4182 ctx->task_ctx_data = task_ctx_data;
4183 task_ctx_data = NULL;
4184 }
4185
4186 err = 0;
4187 mutex_lock(&task->perf_event_mutex);
  /*
   * If the task has already passed perf_event_exit_task() we
   * cannot attach a new context; if someone else beat us to
   * installing one, -EAGAIN retries with the winning context.
   */
4192 if (task->flags & PF_EXITING)
4193 err = -ESRCH;
4194 else if (task->perf_event_ctxp[ctxn])
4195 err = -EAGAIN;
4196 else {
4197 get_ctx(ctx);
4198 ++ctx->pin_count;
4199 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4200 }
4201 mutex_unlock(&task->perf_event_mutex);
4202
4203 if (unlikely(err)) {
4204 put_ctx(ctx);
4205
4206 if (err == -EAGAIN)
4207 goto retry;
4208 goto errout;
4209 }
4210 }
4211
4212 kfree(task_ctx_data);
4213 return ctx;
4214
4215errout:
4216 kfree(task_ctx_data);
4217 return ERR_PTR(err);
4218}
4219
4220static void perf_event_free_filter(struct perf_event *event);
4221static void perf_event_free_bpf_prog(struct perf_event *event);
4222
4223static void free_event_rcu(struct rcu_head *head)
4224{
4225 struct perf_event *event;
4226
4227 event = container_of(head, struct perf_event, rcu_head);
4228 if (event->ns)
4229 put_pid_ns(event->ns);
4230 perf_event_free_filter(event);
4231 kfree(event);
4232}
4233
4234static void ring_buffer_attach(struct perf_event *event,
4235 struct ring_buffer *rb);
4236
4237static void detach_sb_event(struct perf_event *event)
4238{
4239 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4240
4241 raw_spin_lock(&pel->lock);
4242 list_del_rcu(&event->sb_list);
4243 raw_spin_unlock(&pel->lock);
4244}
4245
4246static bool is_sb_event(struct perf_event *event)
4247{
4248 struct perf_event_attr *attr = &event->attr;
4249
4250 if (event->parent)
4251 return false;
4252
4253 if (event->attach_state & PERF_ATTACH_TASK)
4254 return false;
4255
4256 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4257 attr->comm || attr->comm_exec ||
4258 attr->task || attr->ksymbol ||
4259 attr->context_switch ||
4260 attr->bpf_event)
4261 return true;
4262 return false;
4263}
4264
4265static void unaccount_pmu_sb_event(struct perf_event *event)
4266{
4267 if (is_sb_event(event))
4268 detach_sb_event(event);
4269}
4270
4271static void unaccount_event_cpu(struct perf_event *event, int cpu)
4272{
4273 if (event->parent)
4274 return;
4275
4276 if (is_cgroup_event(event))
4277 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4278}
4279
4280#ifdef CONFIG_NO_HZ_FULL
4281static DEFINE_SPINLOCK(nr_freq_lock);
4282#endif
4283
4284static void unaccount_freq_event_nohz(void)
4285{
4286#ifdef CONFIG_NO_HZ_FULL
4287 spin_lock(&nr_freq_lock);
4288 if (atomic_dec_and_test(&nr_freq_events))
4289 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4290 spin_unlock(&nr_freq_lock);
4291#endif
4292}
4293
4294static void unaccount_freq_event(void)
4295{
4296 if (tick_nohz_full_enabled())
4297 unaccount_freq_event_nohz();
4298 else
4299 atomic_dec(&nr_freq_events);
4300}
4301
4302static void unaccount_event(struct perf_event *event)
4303{
4304 bool dec = false;
4305
4306 if (event->parent)
4307 return;
4308
4309 if (event->attach_state & PERF_ATTACH_TASK)
4310 dec = true;
4311 if (event->attr.mmap || event->attr.mmap_data)
4312 atomic_dec(&nr_mmap_events);
4313 if (event->attr.comm)
4314 atomic_dec(&nr_comm_events);
4315 if (event->attr.namespaces)
4316 atomic_dec(&nr_namespaces_events);
4317 if (event->attr.task)
4318 atomic_dec(&nr_task_events);
4319 if (event->attr.freq)
4320 unaccount_freq_event();
4321 if (event->attr.context_switch) {
4322 dec = true;
4323 atomic_dec(&nr_switch_events);
4324 }
4325 if (is_cgroup_event(event))
4326 dec = true;
4327 if (has_branch_stack(event))
4328 dec = true;
4329 if (event->attr.ksymbol)
4330 atomic_dec(&nr_ksymbol_events);
4331 if (event->attr.bpf_event)
4332 atomic_dec(&nr_bpf_events);
4333
4334 if (dec) {
4335 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4336 schedule_delayed_work(&perf_sched_work, HZ);
4337 }
4338
4339 unaccount_event_cpu(event, event->cpu);
4340
4341 unaccount_pmu_sb_event(event);
4342}
4343
4344static void perf_sched_delayed(struct work_struct *work)
4345{
4346 mutex_lock(&perf_sched_mutex);
4347 if (atomic_dec_and_test(&perf_sched_count))
4348 static_branch_disable(&perf_sched_events);
4349 mutex_unlock(&perf_sched_mutex);
4350}
4351
/*
 * The following implement mutual exclusion of events on "exclusive" pmus
 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
 * at a time, so we disallow creating events that might conflict, namely:
 *
 *  1) cpu-wide events in the presence of per-task events,
 *  2) per-task events in the presence of cpu-wide events,
 *  3) two matching events on the same context.
 *
 * The former two cases are handled in the allocation path (perf_event_alloc(),
 * _free_event()), the latter -- before the first perf_install_in_context().
 */
4364static int exclusive_event_init(struct perf_event *event)
4365{
4366 struct pmu *pmu = event->pmu;
4367
4368 if (!is_exclusive_pmu(pmu))
4369 return 0;
4370
 /*
  * Prevent co-existence of per-task and cpu-wide events on the
  * same exclusive pmu.
  *
  * Negative pmu::exclusive_cnt means there are cpu-wide
  * events on this "exclusive" pmu, positive means there are
  * per-task events.
  *
  * Since this is called in the perf_event_alloc() path, event::ctx
  * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
  * to mean "per-task event", because unlike other attach states it
  * never gets set in a detach path.
  */
4384 if (event->attach_state & PERF_ATTACH_TASK) {
4385 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4386 return -EBUSY;
4387 } else {
4388 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4389 return -EBUSY;
4390 }
4391
4392 return 0;
4393}
4394
4395static void exclusive_event_destroy(struct perf_event *event)
4396{
4397 struct pmu *pmu = event->pmu;
4398
4399 if (!is_exclusive_pmu(pmu))
4400 return;
4401
 /* see comment in exclusive_event_init() */
4403 if (event->attach_state & PERF_ATTACH_TASK)
4404 atomic_dec(&pmu->exclusive_cnt);
4405 else
4406 atomic_inc(&pmu->exclusive_cnt);
4407}
4408
4409static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4410{
4411 if ((e1->pmu == e2->pmu) &&
4412 (e1->cpu == e2->cpu ||
4413 e1->cpu == -1 ||
4414 e2->cpu == -1))
4415 return true;
4416 return false;
4417}
4418
4419static bool exclusive_event_installable(struct perf_event *event,
4420 struct perf_event_context *ctx)
4421{
4422 struct perf_event *iter_event;
4423 struct pmu *pmu = event->pmu;
4424
4425 lockdep_assert_held(&ctx->mutex);
4426
4427 if (!is_exclusive_pmu(pmu))
4428 return true;
4429
4430 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4431 if (exclusive_event_match(iter_event, event))
4432 return false;
4433 }
4434
4435 return true;
4436}
4437
4438static void perf_addr_filters_splice(struct perf_event *event,
4439 struct list_head *head);
4440
4441static void _free_event(struct perf_event *event)
4442{
4443 irq_work_sync(&event->pending);
4444
4445 unaccount_event(event);
4446
4447 if (event->rb) {
  /*
   * Can happen when we close an event with re-directed output.
   *
   * Since we have a 0 refcount, perf_mmap_close() will skip
   * over us; possibly making our ring_buffer_put() the last.
   */
4454 mutex_lock(&event->mmap_mutex);
4455 ring_buffer_attach(event, NULL);
4456 mutex_unlock(&event->mmap_mutex);
4457 }
4458
4459 if (is_cgroup_event(event))
4460 perf_detach_cgroup(event);
4461
4462 if (!event->parent) {
4463 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4464 put_callchain_buffers();
4465 }
4466
4467 perf_event_free_bpf_prog(event);
4468 perf_addr_filters_splice(event, NULL);
4469 kfree(event->addr_filter_ranges);
4470
4471 if (event->destroy)
4472 event->destroy(event);
4473
4474
4475
4476
4477
4478 if (event->hw.target)
4479 put_task_struct(event->hw.target);
4480
 /*
  * perf_event_free_task() relies on put_ctx() being 'last', in
  * particular all task references must be cleaned up.
  */
4485 if (event->ctx)
4486 put_ctx(event->ctx);
4487
4488 exclusive_event_destroy(event);
4489 module_put(event->pmu->module);
4490
4491 call_rcu(&event->rcu_head, free_event_rcu);
4492}
4493
/*
 * Used to free events which have a known refcount of 1, such as in error
 * paths where the event isn't exposed yet and inherited events.
 */
4498static void free_event(struct perf_event *event)
4499{
4500 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4501 "unexpected event refcount: %ld; ptr=%p\n",
4502 atomic_long_read(&event->refcount), event)) {
  /* leak to avoid use-after-free */
4504 return;
4505 }
4506
4507 _free_event(event);
4508}
4509
/*
 * Remove a user event from the owner task.
 */
4513static void perf_remove_from_owner(struct perf_event *event)
4514{
4515 struct task_struct *owner;
4516
4517 rcu_read_lock();
 /*
  * Matches the smp_store_release() in perf_event_exit_task(). If we
  * observe !owner it means the list deletion is complete and we can
  * indeed free this event; otherwise we need to serialize on
  * owner->perf_event_mutex.
  */
4524 owner = READ_ONCE(event->owner);
4525 if (owner) {
  /*
   * Since delayed_put_task_struct() also drops the last
   * task reference, we can safely take a new reference
   * while holding the rcu_read_lock().
   */
4531 get_task_struct(owner);
4532 }
4533 rcu_read_unlock();
4534
4535 if (owner) {
4536
4537
4538
4539
4540
4541
4542
4543
4544 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4545
  /*
   * We have to re-check the event->owner field; if it is cleared
   * we raced with perf_event_exit_task(), and acquiring the mutex
   * ensured they're done, so we can proceed with freeing the
   * event.
   */
4552 if (event->owner) {
4553 list_del_init(&event->owner_entry);
4554 smp_store_release(&event->owner, NULL);
4555 }
4556 mutex_unlock(&owner->perf_event_mutex);
4557 put_task_struct(owner);
4558 }
4559}
4560
4561static void put_event(struct perf_event *event)
4562{
4563 if (!atomic_long_dec_and_test(&event->refcount))
4564 return;
4565
4566 _free_event(event);
4567}
4568
/*
 * Kill an event dead; while event:refcount will preserve the event
 * object, it will not preserve its functionality. Once the last 'user'
 * gives up the object, we'll destroy the thing.
 */
4574int perf_event_release_kernel(struct perf_event *event)
4575{
4576 struct perf_event_context *ctx = event->ctx;
4577 struct perf_event *child, *tmp;
4578 LIST_HEAD(free_list);
4579
 /*
  * If we got here through err_file: fput(event_file); we will not
  * have attached to a context yet.
  */
4584 if (!ctx) {
4585 WARN_ON_ONCE(event->attach_state &
4586 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4587 goto no_ctx;
4588 }
4589
4590 if (!is_kernel_event(event))
4591 perf_remove_from_owner(event);
4592
4593 ctx = perf_event_ctx_lock(event);
4594 WARN_ON_ONCE(ctx->parent_ctx);
4595 perf_remove_from_context(event, DETACH_GROUP);
4596
4597 raw_spin_lock_irq(&ctx->lock);
 /*
  * Mark this event as STATE_DEAD, there is no external reference to it
  * anymore.
  *
  * Anybody acquiring event->child_mutex after the below loop _must_
  * also see this, most importantly inherit_event() which will avoid
  * placing more children on the list.
  *
  * Thus this guarantees that we will in fact observe and kill _ALL_
  * child events.
  */
4609 event->state = PERF_EVENT_STATE_DEAD;
4610 raw_spin_unlock_irq(&ctx->lock);
4611
4612 perf_event_ctx_unlock(event, ctx);
4613
4614again:
4615 mutex_lock(&event->child_mutex);
4616 list_for_each_entry(child, &event->child_list, child_list) {
  /*
   * Cannot change, child events are not migrated, see the
   * comment with perf_event_ctx_lock_nested().
   */
4622 ctx = READ_ONCE(child->ctx);
4623
  /*
   * Since child_mutex nests inside ctx::mutex, we must jump
   * through hoops. We start by grabbing a reference on the ctx.
   *
   * Since the event cannot get freed while we hold the
   * child_mutex, the context must also exist and have a !0
   * reference count.
   */
4631 get_ctx(ctx);
4632
  /*
   * Now that we have a ctx ref, we can drop child_mutex, and
   * acquire ctx::mutex without fear of it going away; then we
   * can re-acquire child_mutex.
   */
4638 mutex_unlock(&event->child_mutex);
4639 mutex_lock(&ctx->mutex);
4640 mutex_lock(&event->child_mutex);
4641
  /*
   * Now that we hold ctx::mutex and child_mutex, revalidate our
   * state; if child is still the first entry it didn't get freed
   * and we can continue.
   */
4647 tmp = list_first_entry_or_null(&event->child_list,
4648 struct perf_event, child_list);
4649 if (tmp == child) {
4650 perf_remove_from_context(child, DETACH_GROUP);
4651 list_move(&child->child_list, &free_list);
   /*
    * This matches the refcount bump in inherit_event();
    * this can't be the last reference.
    */
4656 put_event(event);
4657 }
4658
4659 mutex_unlock(&event->child_mutex);
4660 mutex_unlock(&ctx->mutex);
4661 put_ctx(ctx);
4662 goto again;
4663 }
4664 mutex_unlock(&event->child_mutex);
4665
4666 list_for_each_entry_safe(child, tmp, &free_list, child_list) {
4667 void *var = &child->ctx->refcount;
4668
4669 list_del(&child->child_list);
4670 free_event(child);
4671
  /*
   * Wake any perf_event_free_task() waiting for this event to be
   * freed; the barrier pairs with wait_var_event().
   */
4676 smp_mb();
4677 wake_up_var(var);
4678 }
4679
4680no_ctx:
4681 put_event(event);
4682 return 0;
4683}
4684EXPORT_SYMBOL_GPL(perf_event_release_kernel);
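/*
 * A minimal in-kernel usage sketch (error handling elided): kernel users
 * pair this with perf_event_create_kernel_counter(), e.g.:
 *
 *	struct perf_event *ev;
 *	u64 val, enabled, running;
 *
 *	ev = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL);
 *	...
 *	val = perf_event_read_value(ev, &enabled, &running);
 *	perf_event_release_kernel(ev);
 */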
4685
/*
 * Called when the last reference to the file is gone.
 */
4689static int perf_release(struct inode *inode, struct file *file)
4690{
4691 perf_event_release_kernel(file->private_data);
4692 return 0;
4693}
4694
4695static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4696{
4697 struct perf_event *child;
4698 u64 total = 0;
4699
4700 *enabled = 0;
4701 *running = 0;
4702
4703 mutex_lock(&event->child_mutex);
4704
4705 (void)perf_event_read(event, false);
4706 total += perf_event_count(event);
4707
4708 *enabled += event->total_time_enabled +
4709 atomic64_read(&event->child_total_time_enabled);
4710 *running += event->total_time_running +
4711 atomic64_read(&event->child_total_time_running);
4712
4713 list_for_each_entry(child, &event->child_list, child_list) {
4714 (void)perf_event_read(child, false);
4715 total += perf_event_count(child);
4716 *enabled += child->total_time_enabled;
4717 *running += child->total_time_running;
4718 }
4719 mutex_unlock(&event->child_mutex);
4720
4721 return total;
4722}
4723
4724u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4725{
4726 struct perf_event_context *ctx;
4727 u64 count;
4728
4729 ctx = perf_event_ctx_lock(event);
4730 count = __perf_event_read_value(event, enabled, running);
4731 perf_event_ctx_unlock(event, ctx);
4732
4733 return count;
4734}
4735EXPORT_SYMBOL_GPL(perf_event_read_value);
4736
4737static int __perf_read_group_add(struct perf_event *leader,
4738 u64 read_format, u64 *values)
4739{
4740 struct perf_event_context *ctx = leader->ctx;
4741 struct perf_event *sub;
4742 unsigned long flags;
4743 int n = 1;
4744 int ret;
4745
4746 ret = perf_event_read(leader, true);
4747 if (ret)
4748 return ret;
4749
4750 raw_spin_lock_irqsave(&ctx->lock, flags);
4751
 /*
  * Since we co-schedule groups, {enabled,running} times of siblings
  * will be identical to those of the group leader, and so we only
  * need to compute these once.
  */
4757 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4758 values[n++] += leader->total_time_enabled +
4759 atomic64_read(&leader->child_total_time_enabled);
4760 }
4761
4762 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4763 values[n++] += leader->total_time_running +
4764 atomic64_read(&leader->child_total_time_running);
4765 }
4766
 /*
  * Write {count,id} tuples for every sibling.
  */
4770 values[n++] += perf_event_count(leader);
4771 if (read_format & PERF_FORMAT_ID)
4772 values[n++] = primary_event_id(leader);
4773
4774 for_each_sibling_event(sub, leader) {
4775 values[n++] += perf_event_count(sub);
4776 if (read_format & PERF_FORMAT_ID)
4777 values[n++] = primary_event_id(sub);
4778 }
4779
4780 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4781 return 0;
4782}
4783
4784static int perf_read_group(struct perf_event *event,
4785 u64 read_format, char __user *buf)
4786{
4787 struct perf_event *leader = event->group_leader, *child;
4788 struct perf_event_context *ctx = leader->ctx;
4789 int ret;
4790 u64 *values;
4791
4792 lockdep_assert_held(&ctx->mutex);
4793
4794 values = kzalloc(event->read_size, GFP_KERNEL);
4795 if (!values)
4796 return -ENOMEM;
4797
4798 values[0] = 1 + leader->nr_siblings;
4799
 /*
  * By locking the child_mutex of the leader we effectively
  * lock the child list of all siblings.
  */
4804 mutex_lock(&leader->child_mutex);
4805
4806 ret = __perf_read_group_add(leader, read_format, values);
4807 if (ret)
4808 goto unlock;
4809
4810 list_for_each_entry(child, &leader->child_list, child_list) {
4811 ret = __perf_read_group_add(child, read_format, values);
4812 if (ret)
4813 goto unlock;
4814 }
4815
4816 mutex_unlock(&leader->child_mutex);
4817
4818 ret = event->read_size;
4819 if (copy_to_user(buf, values, event->read_size))
4820 ret = -EFAULT;
4821 goto out;
4822
4823unlock:
4824 mutex_unlock(&leader->child_mutex);
4825out:
4826 kfree(values);
4827 return ret;
4828}
4829
4830static int perf_read_one(struct perf_event *event,
4831 u64 read_format, char __user *buf)
4832{
4833 u64 enabled, running;
4834 u64 values[4];
4835 int n = 0;
4836
4837 values[n++] = __perf_event_read_value(event, &enabled, &running);
4838 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4839 values[n++] = enabled;
4840 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4841 values[n++] = running;
4842 if (read_format & PERF_FORMAT_ID)
4843 values[n++] = primary_event_id(event);
4844
4845 if (copy_to_user(buf, values, n * sizeof(u64)))
4846 return -EFAULT;
4847
4848 return n * sizeof(u64);
4849}
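/*
 * For reference, the layouts produced by perf_read_group() and
 * perf_read_one() above correspond to the read_format documented in
 * perf_event_open(2). Without PERF_FORMAT_GROUP:
 *
 *	struct read_format {
 *		u64 value;
 *		u64 time_enabled;	// PERF_FORMAT_TOTAL_TIME_ENABLED
 *		u64 time_running;	// PERF_FORMAT_TOTAL_TIME_RUNNING
 *		u64 id;			// PERF_FORMAT_ID
 *	};
 *
 * and with PERF_FORMAT_GROUP:
 *
 *	struct read_format {
 *		u64 nr;
 *		u64 time_enabled;
 *		u64 time_running;
 *		struct { u64 value; u64 id; } values[nr];
 *	};
 */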
4850
4851static bool is_event_hup(struct perf_event *event)
4852{
4853 bool no_children;
4854
4855 if (event->state > PERF_EVENT_STATE_EXIT)
4856 return false;
4857
4858 mutex_lock(&event->child_mutex);
4859 no_children = list_empty(&event->child_list);
4860 mutex_unlock(&event->child_mutex);
4861 return no_children;
4862}
4863
/*
 * Read the performance event - simple non blocking version for now.
 */
4867static ssize_t
4868__perf_read(struct perf_event *event, char __user *buf, size_t count)
4869{
4870 u64 read_format = event->attr.read_format;
4871 int ret;
4872
 /*
  * Return end-of-file for a read on an event that is in
  * error state (i.e. because it was pinned but it couldn't be
  * scheduled on to the CPU at some point).
  */
4878 if (event->state == PERF_EVENT_STATE_ERROR)
4879 return 0;
4880
4881 if (count < event->read_size)
4882 return -ENOSPC;
4883
4884 WARN_ON_ONCE(event->ctx->parent_ctx);
4885 if (read_format & PERF_FORMAT_GROUP)
4886 ret = perf_read_group(event, read_format, buf);
4887 else
4888 ret = perf_read_one(event, read_format, buf);
4889
4890 return ret;
4891}
4892
4893static ssize_t
4894perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4895{
4896 struct perf_event *event = file->private_data;
4897 struct perf_event_context *ctx;
4898 int ret;
4899
4900 ctx = perf_event_ctx_lock(event);
4901 ret = __perf_read(event, buf, count);
4902 perf_event_ctx_unlock(event, ctx);
4903
4904 return ret;
4905}
4906
4907static __poll_t perf_poll(struct file *file, poll_table *wait)
4908{
4909 struct perf_event *event = file->private_data;
4910 struct ring_buffer *rb;
4911 __poll_t events = EPOLLHUP;
4912
4913 poll_wait(file, &event->waitq, wait);
4914
4915 if (is_event_hup(event))
4916 return events;
4917
 /*
  * Pin the event->rb by taking event->mmap_mutex; otherwise
  * perf_event_set_output() can swizzle our rb and make us miss wakeups.
  */
4922 mutex_lock(&event->mmap_mutex);
4923 rb = event->rb;
4924 if (rb)
4925 events = atomic_xchg(&rb->poll, 0);
4926 mutex_unlock(&event->mmap_mutex);
4927 return events;
4928}
4929
4930static void _perf_event_reset(struct perf_event *event)
4931{
4932 (void)perf_event_read(event, false);
4933 local64_set(&event->count, 0);
4934 perf_event_update_userpage(event);
4935}
4936
/*
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in perf_event_exit_event() if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
 */
4943static void perf_event_for_each_child(struct perf_event *event,
4944 void (*func)(struct perf_event *))
4945{
4946 struct perf_event *child;
4947
4948 WARN_ON_ONCE(event->ctx->parent_ctx);
4949
4950 mutex_lock(&event->child_mutex);
4951 func(event);
4952 list_for_each_entry(child, &event->child_list, child_list)
4953 func(child);
4954 mutex_unlock(&event->child_mutex);
4955}
4956
4957static void perf_event_for_each(struct perf_event *event,
4958 void (*func)(struct perf_event *))
4959{
4960 struct perf_event_context *ctx = event->ctx;
4961 struct perf_event *sibling;
4962
4963 lockdep_assert_held(&ctx->mutex);
4964
4965 event = event->group_leader;
4966
4967 perf_event_for_each_child(event, func);
4968 for_each_sibling_event(sibling, event)
4969 perf_event_for_each_child(sibling, func);
4970}
4971
4972static void __perf_event_period(struct perf_event *event,
4973 struct perf_cpu_context *cpuctx,
4974 struct perf_event_context *ctx,
4975 void *info)
4976{
4977 u64 value = *((u64 *)info);
4978 bool active;
4979
4980 if (event->attr.freq) {
4981 event->attr.sample_freq = value;
4982 } else {
4983 event->attr.sample_period = value;
4984 event->hw.sample_period = value;
4985 }
4986
4987 active = (event->state == PERF_EVENT_STATE_ACTIVE);
4988 if (active) {
4989 perf_pmu_disable(ctx->pmu);
  /*
   * We could be throttled; unthrottle now to avoid the tick
   * trying to unthrottle while we already re-started the event.
   */
4994 if (event->hw.interrupts == MAX_INTERRUPTS) {
4995 event->hw.interrupts = 0;
4996 perf_log_throttle(event, 1);
4997 }
4998 event->pmu->stop(event, PERF_EF_UPDATE);
4999 }
5000
5001 local64_set(&event->hw.period_left, 0);
5002
5003 if (active) {
5004 event->pmu->start(event, PERF_EF_RELOAD);
5005 perf_pmu_enable(ctx->pmu);
5006 }
5007}
5008
5009static int perf_event_check_period(struct perf_event *event, u64 value)
5010{
5011 return event->pmu->check_period(event, value);
5012}
5013
5014static int perf_event_period(struct perf_event *event, u64 __user *arg)
5015{
5016 u64 value;
5017
5018 if (!is_sampling_event(event))
5019 return -EINVAL;
5020
5021 if (copy_from_user(&value, arg, sizeof(value)))
5022 return -EFAULT;
5023
5024 if (!value)
5025 return -EINVAL;
5026
5027 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5028 return -EINVAL;
5029
5030 if (perf_event_check_period(event, value))
5031 return -EINVAL;
5032
5033 if (!event->attr.freq && (value & (1ULL << 63)))
5034 return -EINVAL;
5035
5036 event_function_call(event, __perf_event_period, &value);
5037
5038 return 0;
5039}
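/*
 * From userspace the period is updated with, for example:
 *
 *	u64 period = 100000;
 *	ioctl(fd, PERF_EVENT_IOC_PERIOD, &period);
 *
 * and, via the event_function_call() above, takes effect right away rather
 * than only after the next overflow.
 */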
5040
5041static const struct file_operations perf_fops;
5042
5043static inline int perf_fget_light(int fd, struct fd *p)
5044{
5045 struct fd f = fdget(fd);
5046 if (!f.file)
5047 return -EBADF;
5048
5049 if (f.file->f_op != &perf_fops) {
5050 fdput(f);
5051 return -EBADF;
5052 }
5053 *p = f;
5054 return 0;
5055}
5056
5057static int perf_event_set_output(struct perf_event *event,
5058 struct perf_event *output_event);
5059static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5060static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5061static int perf_copy_attr(struct perf_event_attr __user *uattr,
5062 struct perf_event_attr *attr);
5063
5064static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5065{
5066 void (*func)(struct perf_event *);
5067 u32 flags = arg;
5068
5069 switch (cmd) {
5070 case PERF_EVENT_IOC_ENABLE:
5071 func = _perf_event_enable;
5072 break;
5073 case PERF_EVENT_IOC_DISABLE:
5074 func = _perf_event_disable;
5075 break;
5076 case PERF_EVENT_IOC_RESET:
5077 func = _perf_event_reset;
5078 break;
5079
5080 case PERF_EVENT_IOC_REFRESH:
5081 return _perf_event_refresh(event, arg);
5082
5083 case PERF_EVENT_IOC_PERIOD:
5084 return perf_event_period(event, (u64 __user *)arg);
5085
5086 case PERF_EVENT_IOC_ID:
5087 {
5088 u64 id = primary_event_id(event);
5089
5090 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5091 return -EFAULT;
5092 return 0;
5093 }
5094
5095 case PERF_EVENT_IOC_SET_OUTPUT:
5096 {
5097 int ret;
5098 if (arg != -1) {
5099 struct perf_event *output_event;
5100 struct fd output;
5101 ret = perf_fget_light(arg, &output);
5102 if (ret)
5103 return ret;
5104 output_event = output.file->private_data;
5105 ret = perf_event_set_output(event, output_event);
5106 fdput(output);
5107 } else {
5108 ret = perf_event_set_output(event, NULL);
5109 }
5110 return ret;
5111 }
5112
5113 case PERF_EVENT_IOC_SET_FILTER:
5114 return perf_event_set_filter(event, (void __user *)arg);
5115
5116 case PERF_EVENT_IOC_SET_BPF:
5117 return perf_event_set_bpf_prog(event, arg);
5118
5119 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5120 struct ring_buffer *rb;
5121
5122 rcu_read_lock();
5123 rb = rcu_dereference(event->rb);
5124 if (!rb || !rb->nr_pages) {
5125 rcu_read_unlock();
5126 return -EINVAL;
5127 }
5128 rb_toggle_paused(rb, !!arg);
5129 rcu_read_unlock();
5130 return 0;
5131 }
5132
5133 case PERF_EVENT_IOC_QUERY_BPF:
5134 return perf_event_query_prog_array(event, (void __user *)arg);
5135
5136 case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5137 struct perf_event_attr new_attr;
5138 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5139 &new_attr);
5140
5141 if (err)
5142 return err;
5143
5144 return perf_event_modify_attr(event, &new_attr);
5145 }
5146 default:
5147 return -ENOTTY;
5148 }
5149
5150 if (flags & PERF_IOC_FLAG_GROUP)
5151 perf_event_for_each(event, func);
5152 else
5153 perf_event_for_each_child(event, func);
5154
5155 return 0;
5156}
5157
5158static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5159{
5160 struct perf_event *event = file->private_data;
5161 struct perf_event_context *ctx;
5162 long ret;
5163
5164 ctx = perf_event_ctx_lock(event);
5165 ret = _perf_ioctl(event, cmd, arg);
5166 perf_event_ctx_unlock(event, ctx);
5167
5168 return ret;
5169}
5170
5171#ifdef CONFIG_COMPAT
5172static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5173 unsigned long arg)
5174{
5175 switch (_IOC_NR(cmd)) {
5176 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5177 case _IOC_NR(PERF_EVENT_IOC_ID):
5178 case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5179 case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
  /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
5181 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5182 cmd &= ~IOCSIZE_MASK;
5183 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5184 }
5185 break;
5186 }
5187 return perf_ioctl(file, cmd, arg);
5188}
5189#else
5190# define perf_compat_ioctl NULL
5191#endif
5192
5193int perf_event_task_enable(void)
5194{
5195 struct perf_event_context *ctx;
5196 struct perf_event *event;
5197
 mutex_lock(&current->perf_event_mutex);
 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5200 ctx = perf_event_ctx_lock(event);
5201 perf_event_for_each_child(event, _perf_event_enable);
5202 perf_event_ctx_unlock(event, ctx);
5203 }
 mutex_unlock(&current->perf_event_mutex);
5205
5206 return 0;
5207}
5208
5209int perf_event_task_disable(void)
5210{
5211 struct perf_event_context *ctx;
5212 struct perf_event *event;
5213
 mutex_lock(&current->perf_event_mutex);
 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5216 ctx = perf_event_ctx_lock(event);
5217 perf_event_for_each_child(event, _perf_event_disable);
5218 perf_event_ctx_unlock(event, ctx);
5219 }
 mutex_unlock(&current->perf_event_mutex);
5221
5222 return 0;
5223}
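/*
 * perf_event_task_enable()/perf_event_task_disable() above back the
 * prctl() operations PR_TASK_PERF_EVENTS_ENABLE and
 * PR_TASK_PERF_EVENTS_DISABLE, e.g.:
 *
 *	prctl(PR_TASK_PERF_EVENTS_DISABLE);
 *	... region that should not be measured ...
 *	prctl(PR_TASK_PERF_EVENTS_ENABLE);
 */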
5224
5225static int perf_event_index(struct perf_event *event)
5226{
5227 if (event->hw.state & PERF_HES_STOPPED)
5228 return 0;
5229
5230 if (event->state != PERF_EVENT_STATE_ACTIVE)
5231 return 0;
5232
5233 return event->pmu->event_idx(event);
5234}
5235
5236static void calc_timer_values(struct perf_event *event,
5237 u64 *now,
5238 u64 *enabled,
5239 u64 *running)
5240{
5241 u64 ctx_time;
5242
5243 *now = perf_clock();
5244 ctx_time = event->shadow_ctx_time + *now;
5245 __perf_update_times(event, ctx_time, enabled, running);
5246}
5247
5248static void perf_event_init_userpage(struct perf_event *event)
5249{
5250 struct perf_event_mmap_page *userpg;
5251 struct ring_buffer *rb;
5252
5253 rcu_read_lock();
5254 rb = rcu_dereference(event->rb);
5255 if (!rb)
5256 goto unlock;
5257
5258 userpg = rb->user_page;
5259
 /* Allow new userspace to detect that bit 0 is deprecated */
5261 userpg->cap_bit0_is_deprecated = 1;
5262 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5263 userpg->data_offset = PAGE_SIZE;
5264 userpg->data_size = perf_data_size(rb);
5265
5266unlock:
5267 rcu_read_unlock();
5268}
5269
5270void __weak arch_perf_update_userpage(
5271 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5272{
5273}
5274
/*
 * Callers need to ensure there can be no nesting of this function,
 * otherwise the seqlock logic goes bad. We can not serialize this because
 * the arch code calls this from NMI context.
 */
5280void perf_event_update_userpage(struct perf_event *event)
5281{
5282 struct perf_event_mmap_page *userpg;
5283 struct ring_buffer *rb;
5284 u64 enabled, running, now;
5285
5286 rcu_read_lock();
5287 rb = rcu_dereference(event->rb);
5288 if (!rb)
5289 goto unlock;
5290
 /*
  * Compute total_time_enabled, total_time_running
  * based on snapshot values taken when the event
  * was last scheduled in.
  *
  * We cannot simply call update_context_time()
  * because of locking issues: we can be called in
  * NMI context.
  */
5300 calc_timer_values(event, &now, &enabled, &running);
5301
5302 userpg = rb->user_page;
5303
 /*
  * Disable preemption to guarantee consistent time stamps are stored to
  * the user page.
  */
5307 preempt_disable();
5308 ++userpg->lock;
5309 barrier();
5310 userpg->index = perf_event_index(event);
5311 userpg->offset = perf_event_count(event);
5312 if (userpg->index)
5313 userpg->offset -= local64_read(&event->hw.prev_count);
5314
5315 userpg->time_enabled = enabled +
5316 atomic64_read(&event->child_total_time_enabled);
5317
5318 userpg->time_running = running +
5319 atomic64_read(&event->child_total_time_running);
5320
5321 arch_perf_update_userpage(event, userpg, now);
5322
5323 barrier();
5324 ++userpg->lock;
5325 preempt_enable();
5326unlock:
5327 rcu_read_unlock();
5328}
5329EXPORT_SYMBOL_GPL(perf_event_update_userpage);
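/*
 * Userspace pairs with the lock/barrier sequence above by re-reading until
 * the sequence count is stable, along these lines (sketch):
 *
 *	struct perf_event_mmap_page *pc = mmap_base;
 *	u32 seq;
 *	u64 offset;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		offset = pc->offset;
 *		...
 *		barrier();
 *	} while (pc->lock != seq);
 */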
5330
5331static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5332{
5333 struct perf_event *event = vmf->vma->vm_file->private_data;
5334 struct ring_buffer *rb;
5335 vm_fault_t ret = VM_FAULT_SIGBUS;
5336
5337 if (vmf->flags & FAULT_FLAG_MKWRITE) {
5338 if (vmf->pgoff == 0)
5339 ret = 0;
5340 return ret;
5341 }
5342
5343 rcu_read_lock();
5344 rb = rcu_dereference(event->rb);
5345 if (!rb)
5346 goto unlock;
5347
5348 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5349 goto unlock;
5350
5351 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5352 if (!vmf->page)
5353 goto unlock;
5354
5355 get_page(vmf->page);
5356 vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5357 vmf->page->index = vmf->pgoff;
5358
5359 ret = 0;
5360unlock:
5361 rcu_read_unlock();
5362
5363 return ret;
5364}
5365
5366static void ring_buffer_attach(struct perf_event *event,
5367 struct ring_buffer *rb)
5368{
5369 struct ring_buffer *old_rb = NULL;
5370 unsigned long flags;
5371
5372 if (event->rb) {
  /*
   * Should be impossible, we set this when removing
   * event->rb_entry and wait/clear when adding event->rb_entry.
   */
5377 WARN_ON_ONCE(event->rcu_pending);
5378
5379 old_rb = event->rb;
5380 spin_lock_irqsave(&old_rb->event_lock, flags);
5381 list_del_rcu(&event->rb_entry);
5382 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5383
5384 event->rcu_batches = get_state_synchronize_rcu();
5385 event->rcu_pending = 1;
5386 }
5387
5388 if (rb) {
5389 if (event->rcu_pending) {
5390 cond_synchronize_rcu(event->rcu_batches);
5391 event->rcu_pending = 0;
5392 }
5393
5394 spin_lock_irqsave(&rb->event_lock, flags);
5395 list_add_rcu(&event->rb_entry, &rb->event_list);
5396 spin_unlock_irqrestore(&rb->event_lock, flags);
5397 }
5398
5399 /*
5400 * Avoid racing with perf_mmap_close(AUX): stop the event before
5401 * swizzling event::rb, so that an in-flight writer cannot keep
5402 * using the AUX pages of the old buffer once they may be freed.
5403 *
5404 * __perf_event_output_stop() relies on observing a stable
5405 * event::rb to decide which events to stop, so the stop must
5406 * happen before the pointer is re-assigned below.
5407 */
5408
5409 if (has_aux(event))
5410 perf_event_stop(event, 0);
5411
5412 rcu_assign_pointer(event->rb, rb);
5413
5414 if (old_rb) {
5415 ring_buffer_put(old_rb);
5416
5417 /*
5418 * Since we detached before setting the new rb, we could have
5419 * missed a wakeup. Provide it now.
5420 */
5421 wake_up_all(&event->waitq);
5422 }
5423}
5424
5425static void ring_buffer_wakeup(struct perf_event *event)
5426{
5427 struct ring_buffer *rb;
5428
5429 rcu_read_lock();
5430 rb = rcu_dereference(event->rb);
5431 if (rb) {
5432 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5433 wake_up_all(&event->waitq);
5434 }
5435 rcu_read_unlock();
5436}
5437
5438struct ring_buffer *ring_buffer_get(struct perf_event *event)
5439{
5440 struct ring_buffer *rb;
5441
5442 rcu_read_lock();
5443 rb = rcu_dereference(event->rb);
5444 if (rb) {
5445 if (!refcount_inc_not_zero(&rb->refcount))
5446 rb = NULL;
5447 }
5448 rcu_read_unlock();
5449
5450 return rb;
5451}
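
/*
 * Illustrative pairing (hypothetical caller): ring_buffer_get() only pins
 * the buffer while the reference is held, so any use that outlives the RCU
 * read-side section must be bracketed with ring_buffer_put():
 *
 *	struct ring_buffer *rb = ring_buffer_get(event);
 *	if (rb) {
 *		... use rb ...
 *		ring_buffer_put(rb);
 *	}
 */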
5452
5453void ring_buffer_put(struct ring_buffer *rb)
5454{
5455 if (!refcount_dec_and_test(&rb->refcount))
5456 return;
5457
5458 WARN_ON_ONCE(!list_empty(&rb->event_list));
5459
5460 call_rcu(&rb->rcu_head, rb_free_rcu);
5461}
5462
5463static void perf_mmap_open(struct vm_area_struct *vma)
5464{
5465 struct perf_event *event = vma->vm_file->private_data;
5466
5467 atomic_inc(&event->mmap_count);
5468 atomic_inc(&event->rb->mmap_count);
5469
5470 if (vma->vm_pgoff)
5471 atomic_inc(&event->rb->aux_mmap_count);
5472
5473 if (event->pmu->event_mapped)
5474 event->pmu->event_mapped(event, vma->vm_mm);
5475}
5476
5477static void perf_pmu_output_stop(struct perf_event *event);
5478
5479/*
5480 * A buffer can be mmap()ed multiple times; either directly through the same
5481 * event, or through other events by use of perf_event_set_output().
5482 *
5483 * In order to undo the VM accounting done by perf_mmap() we need to restore
5484 * user-space's view: the last mmap() to go away detaches the buffer and
5485 * undoes the locked/pinned memory accounting.
5486 */
5487static void perf_mmap_close(struct vm_area_struct *vma)
5488{
5489 struct perf_event *event = vma->vm_file->private_data;
5490
5491 struct ring_buffer *rb = ring_buffer_get(event);
5492 struct user_struct *mmap_user = rb->mmap_user;
5493 int mmap_locked = rb->mmap_locked;
5494 unsigned long size = perf_data_size(rb);
5495
5496 if (event->pmu->event_unmapped)
5497 event->pmu->event_unmapped(event, vma->vm_mm);
5498
5499 /*
5500 * rb->aux_mmap_count will always drop before rb->mmap_count and
5501 * event->mmap_count, so it is ok to use event->mmap_mutex to
5502 * serialize against perf_mmap here.
5503 */
5504 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5505 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5506 /*
5507 * Stop all AUX events that are writing to this buffer,
5508 * so that we can free its AUX pages and corresponding PMU
5509 * data. Note that after rb::aux_mmap_count dropped to zero,
5510 * they won't start any more (see perf_aux_output_begin()).
5511 */
5512 perf_pmu_output_stop(event);
5513
5514 /* now it's safe to free the pages */
5515 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5516 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
5517
5518 /* this has to be the last one */
5519 rb_free_aux(rb);
5520 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
5521
5522 mutex_unlock(&event->mmap_mutex);
5523 }
5524
5525 atomic_dec(&rb->mmap_count);
5526
5527 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5528 goto out_put;
5529
5530 ring_buffer_attach(event, NULL);
5531 mutex_unlock(&event->mmap_mutex);
5532
5533 /* If there's still other mmap()s of this buffer, we're done. */
5534 if (atomic_read(&rb->mmap_count))
5535 goto out_put;
5536
5537 /*
5538 * No other mmap()s, detach from all other events that might redirect
5539 * into the now unreachable buffer. Somewhat complicated by the
5540 * fact that rb::event_lock otherwise nests inside mmap_mutex.
5541 */
5542again:
5543 rcu_read_lock();
5544 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5545 if (!atomic_long_inc_not_zero(&event->refcount)) {
5546 /*
5547 * This event is en-route to free_event() which will
5548 * detach it and remove it from the list.
5549 */
5550 continue;
5551 }
5552 rcu_read_unlock();
5553
5554 mutex_lock(&event->mmap_mutex);
5555 /*
5556 * Check we didn't race with perf_event_set_output() which can
5557 * swizzle the rb from under us while we were waiting to
5558 * acquire mmap_mutex.
5559 *
5560 * If we find a different rb, ignore this event; a later
5561 * iteration will no longer find it on the list. We have to
5562 * still restart the iteration to make sure we're not now
5563 * iterating the wrong list.
5564 */
5565 if (event->rb == rb)
5566 ring_buffer_attach(event, NULL);
5567
5568 mutex_unlock(&event->mmap_mutex);
5569 put_event(event);
5570
5571 /*
5572 * Restart the iteration; either we're on the wrong list or
5573 * destroyed its integrity by doing a deletion.
5574 */
5575 goto again;
5576 }
5577 rcu_read_unlock();
5578
5579 /*
5580 * It could be there's still a few 0-ref events on the list; they'll
5581 * get cleaned up by free_event() -- they'll also still have their
5582 * ref on the rb and will free it whenever they are done with it.
5583 *
5584 * Aside from that, this buffer is 'fully' detached and unmapped,
5585 * undo the VM accounting.
5586 */
5587
5588 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5589 atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
5590 free_uid(mmap_user);
5591
5592out_put:
5593 ring_buffer_put(rb);
5594}
5595
5596static const struct vm_operations_struct perf_mmap_vmops = {
5597 .open = perf_mmap_open,
5598 .close = perf_mmap_close,
5599 .fault = perf_mmap_fault,
5600 .page_mkwrite = perf_mmap_fault,
5601};
5602
5603static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5604{
5605 struct perf_event *event = file->private_data;
5606 unsigned long user_locked, user_lock_limit;
5607 struct user_struct *user = current_user();
5608 unsigned long locked, lock_limit;
5609 struct ring_buffer *rb = NULL;
5610 unsigned long vma_size;
5611 unsigned long nr_pages;
5612 long user_extra = 0, extra = 0;
5613 int ret = 0, flags = 0;
5614
5615 /*
5616 * Don't allow mmap() of inherited per-task counters. This would
5617 * create a performance issue due to all children writing to the
5618 * same rb.
5619 */
5620 if (event->cpu == -1 && event->attr.inherit)
5621 return -EINVAL;
5622
5623 if (!(vma->vm_flags & VM_SHARED))
5624 return -EINVAL;
5625
5626 vma_size = vma->vm_end - vma->vm_start;
5627
5628 if (vma->vm_pgoff == 0) {
5629 nr_pages = (vma_size / PAGE_SIZE) - 1;
5630 } else {
5631 /*
5632 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
5633 * mapped, all subsequent mappings should have the same size
5634 * and offset. Must be above the normal perf buffer.
5635 */
5636 u64 aux_offset, aux_size;
5637
5638 if (!event->rb)
5639 return -EINVAL;
5640
5641 nr_pages = vma_size / PAGE_SIZE;
5642
5643 mutex_lock(&event->mmap_mutex);
5644 ret = -EINVAL;
5645
5646 rb = event->rb;
5647 if (!rb)
5648 goto aux_unlock;
5649
5650 aux_offset = READ_ONCE(rb->user_page->aux_offset);
5651 aux_size = READ_ONCE(rb->user_page->aux_size);
5652
5653 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5654 goto aux_unlock;
5655
5656 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5657 goto aux_unlock;
5658
5659 /* already mapped with a different offset */
5660 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5661 goto aux_unlock;
5662
5663 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5664 goto aux_unlock;
5665
5666 /* already mapped with a different size */
5667 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5668 goto aux_unlock;
5669
5670 if (!is_power_of_2(nr_pages))
5671 goto aux_unlock;
5672
5673 if (!atomic_inc_not_zero(&rb->mmap_count))
5674 goto aux_unlock;
5675
5676 if (rb_has_aux(rb)) {
5677 atomic_inc(&rb->aux_mmap_count);
5678 ret = 0;
5679 goto unlock;
5680 }
5681
5682 atomic_set(&rb->aux_mmap_count, 1);
5683 user_extra = nr_pages;
5684
5685 goto accounting;
5686 }
5687
5688 /*
5689 * If we have rb pages ensure they're a power-of-two number, so we
5690 * can do bitmasks instead of modulo.
5691 */
5692 if (nr_pages != 0 && !is_power_of_2(nr_pages))
5693 return -EINVAL;
5694
5695 if (vma_size != PAGE_SIZE * (1 + nr_pages))
5696 return -EINVAL;
5697
5698 WARN_ON_ONCE(event->ctx->parent_ctx);
5699again:
5700 mutex_lock(&event->mmap_mutex);
5701 if (event->rb) {
5702 if (event->rb->nr_pages != nr_pages) {
5703 ret = -EINVAL;
5704 goto unlock;
5705 }
5706
5707 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5708 /*
5709 * Raced against perf_mmap_close() through
5710 * perf_event_set_output(). Try again, hope for better
5711 * luck.
5712 */
5713 mutex_unlock(&event->mmap_mutex);
5714 goto again;
5715 }
5716
5717 goto unlock;
5718 }
5719
5720 user_extra = nr_pages + 1;
5721
5722accounting:
5723 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5724
5725 /*
5726 * Increase the limit linearly with more CPUs:
5727 */
5728 user_lock_limit *= num_online_cpus();
5729
5730 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5731
5732 if (user_locked > user_lock_limit)
5733 extra = user_locked - user_lock_limit;
5734
5735 lock_limit = rlimit(RLIMIT_MEMLOCK);
5736 lock_limit >>= PAGE_SHIFT;
5737 locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
5738
5739 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5740 !capable(CAP_IPC_LOCK)) {
5741 ret = -EPERM;
5742 goto unlock;
5743 }
5744
5745 WARN_ON(!rb && event->rb);
5746
5747 if (vma->vm_flags & VM_WRITE)
5748 flags |= RING_BUFFER_WRITABLE;
5749
5750 if (!rb) {
5751 rb = rb_alloc(nr_pages,
5752 event->attr.watermark ? event->attr.wakeup_watermark : 0,
5753 event->cpu, flags);
5754
5755 if (!rb) {
5756 ret = -ENOMEM;
5757 goto unlock;
5758 }
5759
5760 atomic_set(&rb->mmap_count, 1);
5761 rb->mmap_user = get_current_user();
5762 rb->mmap_locked = extra;
5763
5764 ring_buffer_attach(event, rb);
5765
5766 perf_event_init_userpage(event);
5767 perf_event_update_userpage(event);
5768 } else {
5769 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5770 event->attr.aux_watermark, flags);
5771 if (!ret)
5772 rb->aux_mmap_locked = extra;
5773 }
5774
5775unlock:
5776 if (!ret) {
5777 atomic_long_add(user_extra, &user->locked_vm);
5778 atomic64_add(extra, &vma->vm_mm->pinned_vm);
5779
5780 atomic_inc(&event->mmap_count);
5781 } else if (rb) {
5782 atomic_dec(&rb->mmap_count);
5783 }
5784aux_unlock:
5785 mutex_unlock(&event->mmap_mutex);
5786
5787 /*
5788 * Since pinned accounting is per vm we cannot allow fork() to copy our
5789 * vma.
5790 */
5791 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5792 vma->vm_ops = &perf_mmap_vmops;
5793
5794 if (event->pmu->event_mapped)
5795 event->pmu->event_mapped(event, vma->vm_mm);
5796
5797 return ret;
5798}
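
/*
 * Illustrative sketch of the user-space side (not part of this file): the
 * layout validated above is one control page followed by a power-of-two
 * number of data pages, mapped shared over the event fd, e.g.:
 *
 *	int fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
 *	void *base = mmap(NULL, (1 + 8) * page_size,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * An AUX area, if any, is mapped by a second mmap() at the offset the
 * tool previously wrote to user_page->aux_offset.
 */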
5799
5800static int perf_fasync(int fd, struct file *filp, int on)
5801{
5802 struct inode *inode = file_inode(filp);
5803 struct perf_event *event = filp->private_data;
5804 int retval;
5805
5806 inode_lock(inode);
5807 retval = fasync_helper(fd, filp, on, &event->fasync);
5808 inode_unlock(inode);
5809
5810 if (retval < 0)
5811 return retval;
5812
5813 return 0;
5814}
5815
5816static const struct file_operations perf_fops = {
5817 .llseek = no_llseek,
5818 .release = perf_release,
5819 .read = perf_read,
5820 .poll = perf_poll,
5821 .unlocked_ioctl = perf_ioctl,
5822 .compat_ioctl = perf_compat_ioctl,
5823 .mmap = perf_mmap,
5824 .fasync = perf_fasync,
5825};
5826
5827/*
5828 * Perf event wakeup
5829 *
5830 * If there's data, ensure we set the poll() state and publish everything
5831 * to user-space before waking everybody up.
5832 */
5833
5834static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5835{
5836 /* only the parent has fasync state */
5837 if (event->parent)
5838 event = event->parent;
5839 return &event->fasync;
5840}
5841
5842void perf_event_wakeup(struct perf_event *event)
5843{
5844 ring_buffer_wakeup(event);
5845
5846 if (event->pending_kill) {
5847 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5848 event->pending_kill = 0;
5849 }
5850}
5851
5852static void perf_pending_event_disable(struct perf_event *event)
5853{
5854 int cpu = READ_ONCE(event->pending_disable);
5855
5856 if (cpu < 0)
5857 return;
5858
5859 if (cpu == smp_processor_id()) {
5860 WRITE_ONCE(event->pending_disable, -1);
5861 perf_event_disable_local(event);
5862 return;
5863 }
5864
5865 /*
5866 *  CPU-A				CPU-B
5867 *
5868 *  perf_event_disable_inatomic()
5869 *    @pending_disable = CPU-A;
5870 *    irq_work_queue();
5871 *
5872 *  sched-out
5873 *    @pending_disable = -1;
5874 *
5875 *				sched-in
5876 *				perf_event_disable_inatomic()
5877 *				  @pending_disable = CPU-B;
5878 *				  irq_work_queue(); // FAILS
5879 *
5880 *  irq_work_run()
5881 *    perf_pending_event()
5882 *
5883 *  but the event runs on CPU-B, so requeue the irq_work there.
5884 */
5885 irq_work_queue_on(&event->pending, cpu);
5886}
5887
5888static void perf_pending_event(struct irq_work *entry)
5889{
5890 struct perf_event *event = container_of(entry, struct perf_event, pending);
5891 int rctx;
5892
5893 rctx = perf_swevent_get_recursion_context();
5894 /*
5895 * If we 'fail' here, that's OK, it means recursion is already
5896 * disabled and we won't recurse 'further'.
5897 */
5898
5899 perf_pending_event_disable(event);
5900
5901 if (event->pending_wakeup) {
5902 event->pending_wakeup = 0;
5903 perf_event_wakeup(event);
5904 }
5905
5906 if (rctx >= 0)
5907 perf_swevent_put_recursion_context(rctx);
5908}
5909
5910/*
5911 * We assume there is only KVM supporting the callbacks.
5912 * Later on, we might change it to a list if there is
5913 * another virtualization implementation supporting the callbacks.
5914 */
5915struct perf_guest_info_callbacks *perf_guest_cbs;
5916
5917int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5918{
5919 perf_guest_cbs = cbs;
5920 return 0;
5921}
5922EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5923
5924int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5925{
5926 perf_guest_cbs = NULL;
5927 return 0;
5928}
5929EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5930
5931static void
5932perf_output_sample_regs(struct perf_output_handle *handle,
5933 struct pt_regs *regs, u64 mask)
5934{
5935 int bit;
5936 DECLARE_BITMAP(_mask, 64);
5937
5938 bitmap_from_u64(_mask, mask);
5939 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5940 u64 val;
5941
5942 val = perf_reg_value(regs, bit);
5943 perf_output_put(handle, val);
5944 }
5945}
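
/*
 * Example (illustrative values): for mask = 0x9 (bits 0 and 3 set) the
 * loop above emits exactly two u64s, perf_reg_value(regs, 0) followed by
 * perf_reg_value(regs, 3); consumers must decode the values against the
 * same sample_regs_* mask, in ascending bit order.
 */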
5946
5947static void perf_sample_regs_user(struct perf_regs *regs_user,
5948 struct pt_regs *regs,
5949 struct pt_regs *regs_user_copy)
5950{
5951 if (user_mode(regs)) {
5952 regs_user->abi = perf_reg_abi(current);
5953 regs_user->regs = regs;
5954 } else if (!(current->flags & PF_KTHREAD)) {
5955 perf_get_regs_user(regs_user, regs, regs_user_copy);
5956 } else {
5957 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5958 regs_user->regs = NULL;
5959 }
5960}
5961
5962static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5963 struct pt_regs *regs)
5964{
5965 regs_intr->regs = regs;
5966 regs_intr->abi = perf_reg_abi(current);
5967}
5968
5969/*
5970 * Get remaining task size from user stack pointer.
5971 *
5972 * It'd be better to take the stack vma map and limit this more
5973 * precisely, but there's no way to get it safely under interrupt,
5974 * so using TASK_SIZE as limit.
5975 */
5976
5977static u64 perf_ustack_task_size(struct pt_regs *regs)
5978{
5979 unsigned long addr = perf_user_stack_pointer(regs);
5980
5981 if (!addr || addr >= TASK_SIZE)
5982 return 0;
5983
5984 return TASK_SIZE - addr;
5985}
5986
5987static u16
5988perf_sample_ustack_size(u16 stack_size, u16 header_size,
5989 struct pt_regs *regs)
5990{
5991 u64 task_size;
5992
5993 /* No regs, no stack pointer, no dump. */
5994 if (!regs)
5995 return 0;
5996
5997 /*
5998 * Check if we fit in with the maximum stack size:
5999 *
6000 * We can never dump more than the space remaining between the
6001 * user stack pointer and the end of the task's address space
6002 * (perf_ustack_task_size() above), and both that and the
6003 * requested size are clamped to USHRT_MAX because the sample
6004 * fields carrying them are u16.
6005 */
6006
6007 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6008 stack_size = min(stack_size, (u16) task_size);
6009
6010 /* Current header size plus static size and dynamic size. */
6011 header_size += 2 * sizeof(u64);
6012
6013 /* Do we fit in with the current stack dump size? */
6014 if ((u16) (header_size + stack_size) < header_size) {
6015 /*
6016 * If we overflow the maximum size for the sample,
6017 * we customize the stack dump size to fit in.
6018 */
6019 stack_size = USHRT_MAX - header_size - sizeof(u64);
6020 stack_size = round_up(stack_size, sizeof(u64));
6021 }
6022
6023 return stack_size;
6024}
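
/*
 * Worked example for the overflow clamp above (illustrative numbers):
 * with header_size = 80 and a requested stack_size = 65528, the u16 sum
 * wraps (80 + 65528 = 65608, which truncates to 72 < 80), so stack_size
 * becomes round_up(65535 - 80 - 8, 8) = 65448, and 80 + 65448 = 65528
 * again fits in a u16.
 */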
6025
6026static void
6027perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6028 struct pt_regs *regs)
6029{
6030 /* Case of a kernel thread, nothing to dump */
6031 if (!regs) {
6032 u64 size = 0;
6033 perf_output_put(handle, size);
6034 } else {
6035 unsigned long sp;
6036 unsigned int rem;
6037 u64 dyn_size;
6038 mm_segment_t fs;
6039
6040 /*
6041 * We dump:
6042 * static size
6043 *   - the size requested by user or the best one we can fit
6044 *     in to the sample max size
6045 * data
6046 *   - user stack dump data
6047 * dynamic size
6048 *   - the actual dumped size
6049 */
6050
6051 /* Static size. */
6052 perf_output_put(handle, dump_size);
6053
6054 /* Data. */
6055 sp = perf_user_stack_pointer(regs);
6056 fs = get_fs();
6057 set_fs(USER_DS);
6058 rem = __output_copy_user(handle, (void *) sp, dump_size);
6059 set_fs(fs);
6060 dyn_size = dump_size - rem;
6061
6062 perf_output_skip(handle, rem);
6063
6064 /* Dynamic size. */
6065 perf_output_put(handle, dyn_size);
6066 }
6067}
6068
6069static void __perf_event_header__init_id(struct perf_event_header *header,
6070 struct perf_sample_data *data,
6071 struct perf_event *event)
6072{
6073 u64 sample_type = event->attr.sample_type;
6074
6075 data->type = sample_type;
6076 header->size += event->id_header_size;
6077
6078 if (sample_type & PERF_SAMPLE_TID) {
6079 /* namespace issues */
6080 data->tid_entry.pid = perf_event_pid(event, current);
6081 data->tid_entry.tid = perf_event_tid(event, current);
6082 }
6083
6084 if (sample_type & PERF_SAMPLE_TIME)
6085 data->time = perf_event_clock(event);
6086
6087 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6088 data->id = primary_event_id(event);
6089
6090 if (sample_type & PERF_SAMPLE_STREAM_ID)
6091 data->stream_id = event->id;
6092
6093 if (sample_type & PERF_SAMPLE_CPU) {
6094 data->cpu_entry.cpu = raw_smp_processor_id();
6095 data->cpu_entry.reserved = 0;
6096 }
6097}
6098
6099void perf_event_header__init_id(struct perf_event_header *header,
6100 struct perf_sample_data *data,
6101 struct perf_event *event)
6102{
6103 if (event->attr.sample_id_all)
6104 __perf_event_header__init_id(header, data, event);
6105}
6106
6107static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6108 struct perf_sample_data *data)
6109{
6110 u64 sample_type = data->type;
6111
6112 if (sample_type & PERF_SAMPLE_TID)
6113 perf_output_put(handle, data->tid_entry);
6114
6115 if (sample_type & PERF_SAMPLE_TIME)
6116 perf_output_put(handle, data->time);
6117
6118 if (sample_type & PERF_SAMPLE_ID)
6119 perf_output_put(handle, data->id);
6120
6121 if (sample_type & PERF_SAMPLE_STREAM_ID)
6122 perf_output_put(handle, data->stream_id);
6123
6124 if (sample_type & PERF_SAMPLE_CPU)
6125 perf_output_put(handle, data->cpu_entry);
6126
6127 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6128 perf_output_put(handle, data->id);
6129}
6130
6131void perf_event__output_id_sample(struct perf_event *event,
6132 struct perf_output_handle *handle,
6133 struct perf_sample_data *sample)
6134{
6135 if (event->attr.sample_id_all)
6136 __perf_event__output_id_sample(handle, sample);
6137}
6138
6139static void perf_output_read_one(struct perf_output_handle *handle,
6140 struct perf_event *event,
6141 u64 enabled, u64 running)
6142{
6143 u64 read_format = event->attr.read_format;
6144 u64 values[4];
6145 int n = 0;
6146
6147 values[n++] = perf_event_count(event);
6148 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6149 values[n++] = enabled +
6150 atomic64_read(&event->child_total_time_enabled);
6151 }
6152 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6153 values[n++] = running +
6154 atomic64_read(&event->child_total_time_running);
6155 }
6156 if (read_format & PERF_FORMAT_ID)
6157 values[n++] = primary_event_id(event);
6158
6159 __output_copy(handle, values, n * sizeof(u64));
6160}
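
/*
 * The values[] written above follow the documented read_format layout for
 * a single event (PERF_FORMAT_GROUP unset):
 *
 *	{ u64 value;
 *	  { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64 id;           } && PERF_FORMAT_ID
 *	}
 */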
6161
6162static void perf_output_read_group(struct perf_output_handle *handle,
6163 struct perf_event *event,
6164 u64 enabled, u64 running)
6165{
6166 struct perf_event *leader = event->group_leader, *sub;
6167 u64 read_format = event->attr.read_format;
6168 u64 values[5];
6169 int n = 0;
6170
6171 values[n++] = 1 + leader->nr_siblings;
6172
6173 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6174 values[n++] = enabled;
6175
6176 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6177 values[n++] = running;
6178
6179 if ((leader != event) &&
6180 (leader->state == PERF_EVENT_STATE_ACTIVE))
6181 leader->pmu->read(leader);
6182
6183 values[n++] = perf_event_count(leader);
6184 if (read_format & PERF_FORMAT_ID)
6185 values[n++] = primary_event_id(leader);
6186
6187 __output_copy(handle, values, n * sizeof(u64));
6188
6189 for_each_sibling_event(sub, leader) {
6190 n = 0;
6191
6192 if ((sub != event) &&
6193 (sub->state == PERF_EVENT_STATE_ACTIVE))
6194 sub->pmu->read(sub);
6195
6196 values[n++] = perf_event_count(sub);
6197 if (read_format & PERF_FORMAT_ID)
6198 values[n++] = primary_event_id(sub);
6199
6200 __output_copy(handle, values, n * sizeof(u64));
6201 }
6202}
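
/*
 * Likewise, the group case above matches the documented PERF_FORMAT_GROUP
 * layout from include/uapi/linux/perf_event.h:
 *
 *	{ u64 nr;
 *	  { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64 value;
 *	    { u64 id; } && PERF_FORMAT_ID
 *	  } cntr[nr];
 *	}
 */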
6203
6204#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
6205 PERF_FORMAT_TOTAL_TIME_RUNNING)
6206
6207/*
6208 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
6209 *
6210 * The problem is that it's both hard and excessively expensive to iterate the
6211 * child list, not to mention that it's impossible to IPI the children running
6212 * on another CPU, from interrupt/NMI context.
6213 */
6214static void perf_output_read(struct perf_output_handle *handle,
6215 struct perf_event *event)
6216{
6217 u64 enabled = 0, running = 0, now;
6218 u64 read_format = event->attr.read_format;
6219
6220 /*
6221 * compute total_time_enabled, total_time_running
6222 * based on snapshot values taken when the event
6223 * was last scheduled in.
6224 *
6225 * we cannot simply call update_context_time()
6226 * because of locking issue as we are called in
6227 * NMI context
6228 */
6229 if (read_format & PERF_FORMAT_TOTAL_TIMES)
6230 calc_timer_values(event, &now, &enabled, &running);
6231
6232 if (event->attr.read_format & PERF_FORMAT_GROUP)
6233 perf_output_read_group(handle, event, enabled, running);
6234 else
6235 perf_output_read_one(handle, event, enabled, running);
6236}
6237
6238void perf_output_sample(struct perf_output_handle *handle,
6239 struct perf_event_header *header,
6240 struct perf_sample_data *data,
6241 struct perf_event *event)
6242{
6243 u64 sample_type = data->type;
6244
6245 perf_output_put(handle, *header);
6246
6247 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6248 perf_output_put(handle, data->id);
6249
6250 if (sample_type & PERF_SAMPLE_IP)
6251 perf_output_put(handle, data->ip);
6252
6253 if (sample_type & PERF_SAMPLE_TID)
6254 perf_output_put(handle, data->tid_entry);
6255
6256 if (sample_type & PERF_SAMPLE_TIME)
6257 perf_output_put(handle, data->time);
6258
6259 if (sample_type & PERF_SAMPLE_ADDR)
6260 perf_output_put(handle, data->addr);
6261
6262 if (sample_type & PERF_SAMPLE_ID)
6263 perf_output_put(handle, data->id);
6264
6265 if (sample_type & PERF_SAMPLE_STREAM_ID)
6266 perf_output_put(handle, data->stream_id);
6267
6268 if (sample_type & PERF_SAMPLE_CPU)
6269 perf_output_put(handle, data->cpu_entry);
6270
6271 if (sample_type & PERF_SAMPLE_PERIOD)
6272 perf_output_put(handle, data->period);
6273
6274 if (sample_type & PERF_SAMPLE_READ)
6275 perf_output_read(handle, event);
6276
6277 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6278 int size = 1;
6279
6280 size += data->callchain->nr;
6281 size *= sizeof(u64);
6282 __output_copy(handle, data->callchain, size);
6283 }
6284
6285 if (sample_type & PERF_SAMPLE_RAW) {
6286 struct perf_raw_record *raw = data->raw;
6287
6288 if (raw) {
6289 struct perf_raw_frag *frag = &raw->frag;
6290
6291 perf_output_put(handle, raw->size);
6292 do {
6293 if (frag->copy) {
6294 __output_custom(handle, frag->copy,
6295 frag->data, frag->size);
6296 } else {
6297 __output_copy(handle, frag->data,
6298 frag->size);
6299 }
6300 if (perf_raw_frag_last(frag))
6301 break;
6302 frag = frag->next;
6303 } while (1);
6304 if (frag->pad)
6305 __output_skip(handle, NULL, frag->pad);
6306 } else {
6307 struct {
6308 u32 size;
6309 u32 data;
6310 } raw = {
6311 .size = sizeof(u32),
6312 .data = 0,
6313 };
6314 perf_output_put(handle, raw);
6315 }
6316 }
6317
6318 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6319 if (data->br_stack) {
6320 size_t size;
6321
6322 size = data->br_stack->nr
6323 * sizeof(struct perf_branch_entry);
6324
6325 perf_output_put(handle, data->br_stack->nr);
6326 perf_output_copy(handle, data->br_stack->entries, size);
6327 } else {
6328 /*
6329 * we always store at least the value of nr
6330 */
6331 u64 nr = 0;
6332 perf_output_put(handle, nr);
6333 }
6334 }
6335
6336 if (sample_type & PERF_SAMPLE_REGS_USER) {
6337 u64 abi = data->regs_user.abi;
6338
6339 /*
6340 * If there are no regs to dump, notice it through
6341 * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6342 */
6343 perf_output_put(handle, abi);
6344
6345 if (abi) {
6346 u64 mask = event->attr.sample_regs_user;
6347 perf_output_sample_regs(handle,
6348 data->regs_user.regs,
6349 mask);
6350 }
6351 }
6352
6353 if (sample_type & PERF_SAMPLE_STACK_USER) {
6354 perf_output_sample_ustack(handle,
6355 data->stack_user_size,
6356 data->regs_user.regs);
6357 }
6358
6359 if (sample_type & PERF_SAMPLE_WEIGHT)
6360 perf_output_put(handle, data->weight);
6361
6362 if (sample_type & PERF_SAMPLE_DATA_SRC)
6363 perf_output_put(handle, data->data_src.val);
6364
6365 if (sample_type & PERF_SAMPLE_TRANSACTION)
6366 perf_output_put(handle, data->txn);
6367
6368 if (sample_type & PERF_SAMPLE_REGS_INTR) {
6369 u64 abi = data->regs_intr.abi;
6370 /*
6371 * If there are no regs to dump, notice it through
6372 * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6373 */
6374 perf_output_put(handle, abi);
6375
6376 if (abi) {
6377 u64 mask = event->attr.sample_regs_intr;
6378
6379 perf_output_sample_regs(handle,
6380 data->regs_intr.regs,
6381 mask);
6382 }
6383 }
6384
6385 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6386 perf_output_put(handle, data->phys_addr);
6387
6388 if (!event->attr.watermark) {
6389 int wakeup_events = event->attr.wakeup_events;
6390
6391 if (wakeup_events) {
6392 struct ring_buffer *rb = handle->rb;
6393 int events = local_inc_return(&rb->events);
6394
6395 if (events >= wakeup_events) {
6396 local_sub(wakeup_events, &rb->events);
6397 local_inc(&rb->wakeup);
6398 }
6399 }
6400 }
6401}
6402
6403static u64 perf_virt_to_phys(u64 virt)
6404{
6405 u64 phys_addr = 0;
6406 struct page *p = NULL;
6407
6408 if (!virt)
6409 return 0;
6410
6411 if (virt >= TASK_SIZE) {
6412 /* If it's vmalloc()d memory, leave phys_addr as 0 */
6413 if (virt_addr_valid((void *)(uintptr_t)virt) &&
6414 !(virt >= VMALLOC_START && virt < VMALLOC_END))
6415 phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
6416 } else {
6417 /*
6418 * Walking the page tables for a user address.
6419 * Interrupts are disabled, so it prevents any tear down
6420 * of the page tables.
6421 * Try IRQ-safe __get_user_pages_fast first.
6422 * If failed, leave phys_addr as 0.
6423 */
6424 if ((current->mm != NULL) &&
6425 (__get_user_pages_fast(virt, 1, 0, &p) == 1))
6426 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
6427
6428 if (p)
6429 put_page(p);
6430 }
6431
6432 return phys_addr;
6433}
6434
6435static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
6436
6437struct perf_callchain_entry *
6438perf_callchain(struct perf_event *event, struct pt_regs *regs)
6439{
6440 bool kernel = !event->attr.exclude_callchain_kernel;
6441 bool user = !event->attr.exclude_callchain_user;
6442 /* Disallow cross-task user callchains. */
6443 bool crosstask = event->ctx->task && event->ctx->task != current;
6444 const u32 max_stack = event->attr.sample_max_stack;
6445 struct perf_callchain_entry *callchain;
6446
6447 if (!kernel && !user)
6448 return &__empty_callchain;
6449
6450 callchain = get_perf_callchain(regs, 0, kernel, user,
6451 max_stack, crosstask, true);
6452 return callchain ?: &__empty_callchain;
6453}
6454
6455void perf_prepare_sample(struct perf_event_header *header,
6456 struct perf_sample_data *data,
6457 struct perf_event *event,
6458 struct pt_regs *regs)
6459{
6460 u64 sample_type = event->attr.sample_type;
6461
6462 header->type = PERF_RECORD_SAMPLE;
6463 header->size = sizeof(*header) + event->header_size;
6464
6465 header->misc = 0;
6466 header->misc |= perf_misc_flags(regs);
6467
6468 __perf_event_header__init_id(header, data, event);
6469
6470 if (sample_type & PERF_SAMPLE_IP)
6471 data->ip = perf_instruction_pointer(regs);
6472
6473 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6474 int size = 1;
6475
6476 if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
6477 data->callchain = perf_callchain(event, regs);
6478
6479 size += data->callchain->nr;
6480
6481 header->size += size * sizeof(u64);
6482 }
6483
6484 if (sample_type & PERF_SAMPLE_RAW) {
6485 struct perf_raw_record *raw = data->raw;
6486 int size;
6487
6488 if (raw) {
6489 struct perf_raw_frag *frag = &raw->frag;
6490 u32 sum = 0;
6491
6492 do {
6493 sum += frag->size;
6494 if (perf_raw_frag_last(frag))
6495 break;
6496 frag = frag->next;
6497 } while (1);
6498
6499 size = round_up(sum + sizeof(u32), sizeof(u64));
6500 raw->size = size - sizeof(u32);
6501 frag->pad = raw->size - sum;
6502 } else {
6503 size = sizeof(u64);
6504 }
6505
6506 header->size += size;
6507 }
6508
6509 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6510 int size = sizeof(u64);
6511 if (data->br_stack) {
6512 size += data->br_stack->nr
6513 * sizeof(struct perf_branch_entry);
6514 }
6515 header->size += size;
6516 }
6517
6518 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6519 perf_sample_regs_user(&data->regs_user, regs,
6520 &data->regs_user_copy);
6521
6522 if (sample_type & PERF_SAMPLE_REGS_USER) {
6523 /* regs dump ABI info */
6524 int size = sizeof(u64);
6525
6526 if (data->regs_user.regs) {
6527 u64 mask = event->attr.sample_regs_user;
6528 size += hweight64(mask) * sizeof(u64);
6529 }
6530
6531 header->size += size;
6532 }
6533
6534 if (sample_type & PERF_SAMPLE_STACK_USER) {
6535 /*
6536 * Either we need the PERF_SAMPLE_STACK_USER bit to be always
6537 * processed as the last one, or have an additional check added
6538 * in case a new sample type is added, because we could eat
6539 * up the rest of the sample size.
6540 */
6541 u16 stack_size = event->attr.sample_stack_user;
6542 u16 size = sizeof(u64);
6543
6544 stack_size = perf_sample_ustack_size(stack_size, header->size,
6545 data->regs_user.regs);
6546
6547 /*
6548 * If there is something to dump, add space for the dump
6549 * itself and for the field that tells the dynamic size,
6550 * which is how much has actually been dumped.
6551 */
6552 if (stack_size)
6553 size += sizeof(u64) + stack_size;
6554
6555 data->stack_user_size = stack_size;
6556 header->size += size;
6557 }
6558
6559 if (sample_type & PERF_SAMPLE_REGS_INTR) {
6560 /* regs dump ABI info */
6561 int size = sizeof(u64);
6562
6563 perf_sample_regs_intr(&data->regs_intr, regs);
6564
6565 if (data->regs_intr.regs) {
6566 u64 mask = event->attr.sample_regs_intr;
6567
6568 size += hweight64(mask) * sizeof(u64);
6569 }
6570
6571 header->size += size;
6572 }
6573
6574 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6575 data->phys_addr = perf_virt_to_phys(data->addr);
6576}
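
/*
 * Illustrative sizing (hypothetical event): for a sample_type including
 * PERF_SAMPLE_CALLCHAIN with a 5-entry callchain, the dynamic part above
 * adds (1 + 5) * sizeof(u64) = 48 bytes on top of the static
 * sizeof(*header) + event->header_size.
 */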
6577
6578static __always_inline int
6579__perf_event_output(struct perf_event *event,
6580 struct perf_sample_data *data,
6581 struct pt_regs *regs,
6582 int (*output_begin)(struct perf_output_handle *,
6583 struct perf_event *,
6584 unsigned int))
6585{
6586 struct perf_output_handle handle;
6587 struct perf_event_header header;
6588 int err;
6589
6590 /* protect the callchain buffers */
6591 rcu_read_lock();
6592
6593 perf_prepare_sample(&header, data, event, regs);
6594
6595 err = output_begin(&handle, event, header.size);
6596 if (err)
6597 goto exit;
6598
6599 perf_output_sample(&handle, &header, data, event);
6600
6601 perf_output_end(&handle);
6602
6603exit:
6604 rcu_read_unlock();
6605 return err;
6606}
6607
6608void
6609perf_event_output_forward(struct perf_event *event,
6610 struct perf_sample_data *data,
6611 struct pt_regs *regs)
6612{
6613 __perf_event_output(event, data, regs, perf_output_begin_forward);
6614}
6615
6616void
6617perf_event_output_backward(struct perf_event *event,
6618 struct perf_sample_data *data,
6619 struct pt_regs *regs)
6620{
6621 __perf_event_output(event, data, regs, perf_output_begin_backward);
6622}
6623
6624int
6625perf_event_output(struct perf_event *event,
6626 struct perf_sample_data *data,
6627 struct pt_regs *regs)
6628{
6629 return __perf_event_output(event, data, regs, perf_output_begin);
6630}
6631
6632
6633/*
6634 * read event_id
6635 */
6636struct perf_read_event {
6637 struct perf_event_header header;
6638
6639 u32 pid;
6640 u32 tid;
6641};
6642
6643static void
6644perf_event_read_event(struct perf_event *event,
6645 struct task_struct *task)
6646{
6647 struct perf_output_handle handle;
6648 struct perf_sample_data sample;
6649 struct perf_read_event read_event = {
6650 .header = {
6651 .type = PERF_RECORD_READ,
6652 .misc = 0,
6653 .size = sizeof(read_event) + event->read_size,
6654 },
6655 .pid = perf_event_pid(event, task),
6656 .tid = perf_event_tid(event, task),
6657 };
6658 int ret;
6659
6660 perf_event_header__init_id(&read_event.header, &sample, event);
6661 ret = perf_output_begin(&handle, event, read_event.header.size);
6662 if (ret)
6663 return;
6664
6665 perf_output_put(&handle, read_event);
6666 perf_output_read(&handle, event);
6667 perf_event__output_id_sample(event, &handle, &sample);
6668
6669 perf_output_end(&handle);
6670}
6671
6672typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6673
6674static void
6675perf_iterate_ctx(struct perf_event_context *ctx,
6676 perf_iterate_f output,
6677 void *data, bool all)
6678{
6679 struct perf_event *event;
6680
6681 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6682 if (!all) {
6683 if (event->state < PERF_EVENT_STATE_INACTIVE)
6684 continue;
6685 if (!event_filter_match(event))
6686 continue;
6687 }
6688
6689 output(event, data);
6690 }
6691}
6692
6693static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6694{
6695 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6696 struct perf_event *event;
6697
6698 list_for_each_entry_rcu(event, &pel->list, sb_list) {
6699 /*
6700 * Skip events that are not fully formed yet; ensure that
6701 * if we observe event->ctx, both the event and ctx will be
6702 * complete enough. See perf_install_in_context().
6703 */
6704 if (!smp_load_acquire(&event->ctx))
6705 continue;
6706
6707 if (event->state < PERF_EVENT_STATE_INACTIVE)
6708 continue;
6709 if (!event_filter_match(event))
6710 continue;
6711 output(event, data);
6712 }
6713}
6714
6715/*
6716 * Iterate all events that need to receive side-band events.
6717 *
6718 * For new callers: ensure that account_pmu_sb_event() includes
6719 * your event, otherwise it might not get delivered.
6720 */
6721static void
6722perf_iterate_sb(perf_iterate_f output, void *data,
6723 struct perf_event_context *task_ctx)
6724{
6725 struct perf_event_context *ctx;
6726 int ctxn;
6727
6728 rcu_read_lock();
6729 preempt_disable();
6730
6731 /*
6732 * If we have task_ctx != NULL we only notify the task context itself.
6733 * The task_ctx is set only for EXIT events before releasing task
6734 * context.
6735 */
6736 if (task_ctx) {
6737 perf_iterate_ctx(task_ctx, output, data, false);
6738 goto done;
6739 }
6740
6741 perf_iterate_sb_cpu(output, data);
6742
6743 for_each_task_context_nr(ctxn) {
6744 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6745 if (ctx)
6746 perf_iterate_ctx(ctx, output, data, false);
6747 }
6748done:
6749 preempt_enable();
6750 rcu_read_unlock();
6751}
6752
6753/*
6754 * Clear all file-based filters at exec, they'll have to be
6755 * re-instated when/if these objects are mmapped again.
6756 */
6757static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6758{
6759 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6760 struct perf_addr_filter *filter;
6761 unsigned int restart = 0, count = 0;
6762 unsigned long flags;
6763
6764 if (!has_addr_filter(event))
6765 return;
6766
6767 raw_spin_lock_irqsave(&ifh->lock, flags);
6768 list_for_each_entry(filter, &ifh->list, entry) {
6769 if (filter->path.dentry) {
6770 event->addr_filter_ranges[count].start = 0;
6771 event->addr_filter_ranges[count].size = 0;
6772 restart++;
6773 }
6774
6775 count++;
6776 }
6777
6778 if (restart)
6779 event->addr_filters_gen++;
6780 raw_spin_unlock_irqrestore(&ifh->lock, flags);
6781
6782 if (restart)
6783 perf_event_stop(event, 1);
6784}
6785
6786void perf_event_exec(void)
6787{
6788 struct perf_event_context *ctx;
6789 int ctxn;
6790
6791 rcu_read_lock();
6792 for_each_task_context_nr(ctxn) {
6793 ctx = current->perf_event_ctxp[ctxn];
6794 if (!ctx)
6795 continue;
6796
6797 perf_event_enable_on_exec(ctxn);
6798
6799 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6800 true);
6801 }
6802 rcu_read_unlock();
6803}
6804
6805struct remote_output {
6806 struct ring_buffer *rb;
6807 int err;
6808};
6809
6810static void __perf_event_output_stop(struct perf_event *event, void *data)
6811{
6812 struct perf_event *parent = event->parent;
6813 struct remote_output *ro = data;
6814 struct ring_buffer *rb = ro->rb;
6815 struct stop_event_data sd = {
6816 .event = event,
6817 };
6818
6819 if (!has_aux(event))
6820 return;
6821
6822 if (!parent)
6823 parent = event;
6824
6825 /*
6826 * In case of inheritance, it will be the parent that links to the
6827 * ring-buffer, but it will be the child that's actually using it.
6828 *
6829 * We are using event::rb to determine if the event should be stopped,
6830 * however this may race with ring_buffer_attach() (through set_output),
6831 * which will make us skip the event that actually needs to be stopped.
6832 * So ring_buffer_attach() has to stop an aux event before re-assigning
6833 * its rb pointer.
6834 */
6835 if (rcu_dereference(parent->rb) == rb)
6836 ro->err = __perf_event_stop(&sd);
6837}
6838
6839static int __perf_pmu_output_stop(void *info)
6840{
6841 struct perf_event *event = info;
6842 struct pmu *pmu = event->pmu;
6843 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6844 struct remote_output ro = {
6845 .rb = event->rb,
6846 };
6847
6848 rcu_read_lock();
6849 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6850 if (cpuctx->task_ctx)
6851 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6852 &ro, false);
6853 rcu_read_unlock();
6854
6855 return ro.err;
6856}
6857
6858static void perf_pmu_output_stop(struct perf_event *event)
6859{
6860 struct perf_event *iter;
6861 int err, cpu;
6862
6863restart:
6864 rcu_read_lock();
6865 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6866 /*
6867 * For per-CPU events, we need to make sure that neither they
6868 * nor their children are running; for cpu==-1 events it's
6869 * sufficient to stop the event itself if it's active, since
6870 * it can't have children.
6871 */
6872 cpu = iter->cpu;
6873 if (cpu == -1)
6874 cpu = READ_ONCE(iter->oncpu);
6875
6876 if (cpu == -1)
6877 continue;
6878
6879 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6880 if (err == -EAGAIN) {
6881 rcu_read_unlock();
6882 goto restart;
6883 }
6884 }
6885 rcu_read_unlock();
6886}
6887
6888/*
6889 * task tracking -- fork/exit
6890 *
6891 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
6892 */
6893
6894struct perf_task_event {
6895 struct task_struct *task;
6896 struct perf_event_context *task_ctx;
6897
6898 struct {
6899 struct perf_event_header header;
6900
6901 u32 pid;
6902 u32 ppid;
6903 u32 tid;
6904 u32 ptid;
6905 u64 time;
6906 } event_id;
6907};
6908
6909static int perf_event_task_match(struct perf_event *event)
6910{
6911 return event->attr.comm || event->attr.mmap ||
6912 event->attr.mmap2 || event->attr.mmap_data ||
6913 event->attr.task;
6914}
6915
6916static void perf_event_task_output(struct perf_event *event,
6917 void *data)
6918{
6919 struct perf_task_event *task_event = data;
6920 struct perf_output_handle handle;
6921 struct perf_sample_data sample;
6922 struct task_struct *task = task_event->task;
6923 int ret, size = task_event->event_id.header.size;
6924
6925 if (!perf_event_task_match(event))
6926 return;
6927
6928 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6929
6930 ret = perf_output_begin(&handle, event,
6931 task_event->event_id.header.size);
6932 if (ret)
6933 goto out;
6934
6935 task_event->event_id.pid = perf_event_pid(event, task);
6936 task_event->event_id.ppid = perf_event_pid(event, current);
6937
6938 task_event->event_id.tid = perf_event_tid(event, task);
6939 task_event->event_id.ptid = perf_event_tid(event, current);
6940
6941 task_event->event_id.time = perf_event_clock(event);
6942
6943 perf_output_put(&handle, task_event->event_id);
6944
6945 perf_event__output_id_sample(event, &handle, &sample);
6946
6947 perf_output_end(&handle);
6948out:
6949 task_event->event_id.header.size = size;
6950}
6951
6952static void perf_event_task(struct task_struct *task,
6953 struct perf_event_context *task_ctx,
6954 int new)
6955{
6956 struct perf_task_event task_event;
6957
6958 if (!atomic_read(&nr_comm_events) &&
6959 !atomic_read(&nr_mmap_events) &&
6960 !atomic_read(&nr_task_events))
6961 return;
6962
6963 task_event = (struct perf_task_event){
6964 .task = task,
6965 .task_ctx = task_ctx,
6966 .event_id = {
6967 .header = {
6968 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6969 .misc = 0,
6970 .size = sizeof(task_event.event_id),
6971 },
6972 /* .pid  */
6973 /* .ppid */
6974 /* .tid  */
6975 /* .ptid */
6976 /* .time */
6977 },
6978 };
6979
6980 perf_iterate_sb(perf_event_task_output,
6981 &task_event,
6982 task_ctx);
6983}
6984
6985void perf_event_fork(struct task_struct *task)
6986{
6987 perf_event_task(task, NULL, 1);
6988 perf_event_namespaces(task);
6989}
6990
6991
6992/*
6993 * comm tracking
6994 */
6995struct perf_comm_event {
6996 struct task_struct *task;
6997 char *comm;
6998 int comm_size;
6999
7000 struct {
7001 struct perf_event_header header;
7002
7003 u32 pid;
7004 u32 tid;
7005 } event_id;
7006};
7007
7008static int perf_event_comm_match(struct perf_event *event)
7009{
7010 return event->attr.comm;
7011}
7012
7013static void perf_event_comm_output(struct perf_event *event,
7014 void *data)
7015{
7016 struct perf_comm_event *comm_event = data;
7017 struct perf_output_handle handle;
7018 struct perf_sample_data sample;
7019 int size = comm_event->event_id.header.size;
7020 int ret;
7021
7022 if (!perf_event_comm_match(event))
7023 return;
7024
7025 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7026 ret = perf_output_begin(&handle, event,
7027 comm_event->event_id.header.size);
7028
7029 if (ret)
7030 goto out;
7031
7032 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
7033 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
7034
7035 perf_output_put(&handle, comm_event->event_id);
7036 __output_copy(&handle, comm_event->comm,
7037 comm_event->comm_size);
7038
7039 perf_event__output_id_sample(event, &handle, &sample);
7040
7041 perf_output_end(&handle);
7042out:
7043 comm_event->event_id.header.size = size;
7044}
7045
7046static void perf_event_comm_event(struct perf_comm_event *comm_event)
7047{
7048 char comm[TASK_COMM_LEN];
7049 unsigned int size;
7050
7051 memset(comm, 0, sizeof(comm));
7052 strlcpy(comm, comm_event->task->comm, sizeof(comm));
7053 size = ALIGN(strlen(comm)+1, sizeof(u64));
7054
7055 comm_event->comm = comm;
7056 comm_event->comm_size = size;
7057
7058 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
7059
7060 perf_iterate_sb(perf_event_comm_output,
7061 comm_event,
7062 NULL);
7063}
7064
7065void perf_event_comm(struct task_struct *task, bool exec)
7066{
7067 struct perf_comm_event comm_event;
7068
7069 if (!atomic_read(&nr_comm_events))
7070 return;
7071
7072 comm_event = (struct perf_comm_event){
7073 .task = task,
7074 /* .comm      */
7075 /* .comm_size */
7076 .event_id = {
7077 .header = {
7078 .type = PERF_RECORD_COMM,
7079 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
7080 /* .size */
7081 },
7082 /* .pid */
7083 /* .tid */
7084 },
7085 };
7086
7087 perf_event_comm_event(&comm_event);
7088}
7089
7090
7091/*
7092 * namespaces tracking
7093 */
7094struct perf_namespaces_event {
7095 struct task_struct *task;
7096
7097 struct {
7098 struct perf_event_header header;
7099
7100 u32 pid;
7101 u32 tid;
7102 u64 nr_namespaces;
7103 struct perf_ns_link_info link_info[NR_NAMESPACES];
7104 } event_id;
7105};
7106
7107static int perf_event_namespaces_match(struct perf_event *event)
7108{
7109 return event->attr.namespaces;
7110}
7111
7112static void perf_event_namespaces_output(struct perf_event *event,
7113 void *data)
7114{
7115 struct perf_namespaces_event *namespaces_event = data;
7116 struct perf_output_handle handle;
7117 struct perf_sample_data sample;
7118 u16 header_size = namespaces_event->event_id.header.size;
7119 int ret;
7120
7121 if (!perf_event_namespaces_match(event))
7122 return;
7123
7124 perf_event_header__init_id(&namespaces_event->event_id.header,
7125 &sample, event);
7126 ret = perf_output_begin(&handle, event,
7127 namespaces_event->event_id.header.size);
7128 if (ret)
7129 goto out;
7130
7131 namespaces_event->event_id.pid = perf_event_pid(event,
7132 namespaces_event->task);
7133 namespaces_event->event_id.tid = perf_event_tid(event,
7134 namespaces_event->task);
7135
7136 perf_output_put(&handle, namespaces_event->event_id);
7137
7138 perf_event__output_id_sample(event, &handle, &sample);
7139
7140 perf_output_end(&handle);
7141out:
7142 namespaces_event->event_id.header.size = header_size;
7143}
7144
7145static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
7146 struct task_struct *task,
7147 const struct proc_ns_operations *ns_ops)
7148{
7149 struct path ns_path;
7150 struct inode *ns_inode;
7151 void *error;
7152
7153 error = ns_get_path(&ns_path, task, ns_ops);
7154 if (!error) {
7155 ns_inode = ns_path.dentry->d_inode;
7156 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
7157 ns_link_info->ino = ns_inode->i_ino;
7158 path_put(&ns_path);
7159 }
7160}
7161
7162void perf_event_namespaces(struct task_struct *task)
7163{
7164 struct perf_namespaces_event namespaces_event;
7165 struct perf_ns_link_info *ns_link_info;
7166
7167 if (!atomic_read(&nr_namespaces_events))
7168 return;
7169
7170 namespaces_event = (struct perf_namespaces_event){
7171 .task = task,
7172 .event_id = {
7173 .header = {
7174 .type = PERF_RECORD_NAMESPACES,
7175 .misc = 0,
7176 .size = sizeof(namespaces_event.event_id),
7177 },
7178 /* .pid */
7179 /* .tid */
7180 .nr_namespaces = NR_NAMESPACES,
7181 /* .link_info[NR_NAMESPACES] */
7182 },
7183 };
7184
7185 ns_link_info = namespaces_event.event_id.link_info;
7186
7187 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
7188 task, &mntns_operations);
7189
7190#ifdef CONFIG_USER_NS
7191 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
7192 task, &userns_operations);
7193#endif
7194#ifdef CONFIG_NET_NS
7195 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
7196 task, &netns_operations);
7197#endif
7198#ifdef CONFIG_UTS_NS
7199 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
7200 task, &utsns_operations);
7201#endif
7202#ifdef CONFIG_IPC_NS
7203 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
7204 task, &ipcns_operations);
7205#endif
7206#ifdef CONFIG_PID_NS
7207 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
7208 task, &pidns_operations);
7209#endif
7210#ifdef CONFIG_CGROUPS
7211 perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
7212 task, &cgroupns_operations);
7213#endif
7214
7215 perf_iterate_sb(perf_event_namespaces_output,
7216 &namespaces_event,
7217 NULL);
7218}
7219
7220
7221/*
7222 * mmap tracking
7223 */
7224struct perf_mmap_event {
7225 struct vm_area_struct *vma;
7226
7227 const char *file_name;
7228 int file_size;
7229 int maj, min;
7230 u64 ino;
7231 u64 ino_generation;
7232 u32 prot, flags;
7233
7234 struct {
7235 struct perf_event_header header;
7236
7237 u32 pid;
7238 u32 tid;
7239 u64 start;
7240 u64 len;
7241 u64 pgoff;
7242 } event_id;
7243};
7244
7245static int perf_event_mmap_match(struct perf_event *event,
7246 void *data)
7247{
7248 struct perf_mmap_event *mmap_event = data;
7249 struct vm_area_struct *vma = mmap_event->vma;
7250 int executable = vma->vm_flags & VM_EXEC;
7251
7252 return (!executable && event->attr.mmap_data) ||
7253 (executable && (event->attr.mmap || event->attr.mmap2));
7254}
7255
7256static void perf_event_mmap_output(struct perf_event *event,
7257 void *data)
7258{
7259 struct perf_mmap_event *mmap_event = data;
7260 struct perf_output_handle handle;
7261 struct perf_sample_data sample;
7262 int size = mmap_event->event_id.header.size;
7263 u32 type = mmap_event->event_id.header.type;
7264 int ret;
7265
7266 if (!perf_event_mmap_match(event, data))
7267 return;
7268
7269 if (event->attr.mmap2) {
7270 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
7271 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
7272 mmap_event->event_id.header.size += sizeof(mmap_event->min);
7273 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
7274 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
7275 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
7276 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
7277 }
7278
7279 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
7280 ret = perf_output_begin(&handle, event,
7281 mmap_event->event_id.header.size);
7282 if (ret)
7283 goto out;
7284
7285 mmap_event->event_id.pid = perf_event_pid(event, current);
7286 mmap_event->event_id.tid = perf_event_tid(event, current);
7287
7288 perf_output_put(&handle, mmap_event->event_id);
7289
7290 if (event->attr.mmap2) {
7291 perf_output_put(&handle, mmap_event->maj);
7292 perf_output_put(&handle, mmap_event->min);
7293 perf_output_put(&handle, mmap_event->ino);
7294 perf_output_put(&handle, mmap_event->ino_generation);
7295 perf_output_put(&handle, mmap_event->prot);
7296 perf_output_put(&handle, mmap_event->flags);
7297 }
7298
7299 __output_copy(&handle, mmap_event->file_name,
7300 mmap_event->file_size);
7301
7302 perf_event__output_id_sample(event, &handle, &sample);
7303
7304 perf_output_end(&handle);
7305out:
7306 mmap_event->event_id.header.size = size;
7307 mmap_event->event_id.header.type = type;
7308}
7309
7310static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
7311{
7312 struct vm_area_struct *vma = mmap_event->vma;
7313 struct file *file = vma->vm_file;
7314 int maj = 0, min = 0;
7315 u64 ino = 0, gen = 0;
7316 u32 prot = 0, flags = 0;
7317 unsigned int size;
7318 char tmp[16];
7319 char *buf = NULL;
7320 char *name;
7321
7322 if (vma->vm_flags & VM_READ)
7323 prot |= PROT_READ;
7324 if (vma->vm_flags & VM_WRITE)
7325 prot |= PROT_WRITE;
7326 if (vma->vm_flags & VM_EXEC)
7327 prot |= PROT_EXEC;
7328
7329 if (vma->vm_flags & VM_MAYSHARE)
7330 flags = MAP_SHARED;
7331 else
7332 flags = MAP_PRIVATE;
7333
7334 if (vma->vm_flags & VM_DENYWRITE)
7335 flags |= MAP_DENYWRITE;
7336 if (vma->vm_flags & VM_MAYEXEC)
7337 flags |= MAP_EXECUTABLE;
7338 if (vma->vm_flags & VM_LOCKED)
7339 flags |= MAP_LOCKED;
7340 if (vma->vm_flags & VM_HUGETLB)
7341 flags |= MAP_HUGETLB;
7342
7343 if (file) {
7344 struct inode *inode;
7345 dev_t dev;
7346
7347 buf = kmalloc(PATH_MAX, GFP_KERNEL);
7348 if (!buf) {
7349 name = "//enomem";
7350 goto cpy_name;
7351 }
7352 /*
7353 * d_path() works from the end of the rb backwards, so we
7354 * need to add enough zero bytes after the string to handle
7355 * the 64bit alignment we do later.
7356 */
7357 name = file_path(file, buf, PATH_MAX - sizeof(u64));
7358 if (IS_ERR(name)) {
7359 name = "//toolong";
7360 goto cpy_name;
7361 }
7362 inode = file_inode(vma->vm_file);
7363 dev = inode->i_sb->s_dev;
7364 ino = inode->i_ino;
7365 gen = inode->i_generation;
7366 maj = MAJOR(dev);
7367 min = MINOR(dev);
7368
7369 goto got_name;
7370 } else {
7371 if (vma->vm_ops && vma->vm_ops->name) {
7372 name = (char *) vma->vm_ops->name(vma);
7373 if (name)
7374 goto cpy_name;
7375 }
7376
7377 name = (char *)arch_vma_name(vma);
7378 if (name)
7379 goto cpy_name;
7380
7381 if (vma->vm_start <= vma->vm_mm->start_brk &&
7382 vma->vm_end >= vma->vm_mm->brk) {
7383 name = "[heap]";
7384 goto cpy_name;
7385 }
7386 if (vma->vm_start <= vma->vm_mm->start_stack &&
7387 vma->vm_end >= vma->vm_mm->start_stack) {
7388 name = "[stack]";
7389 goto cpy_name;
7390 }
7391
7392 name = "//anon";
7393 goto cpy_name;
7394 }
7395
7396cpy_name:
7397 strlcpy(tmp, name, sizeof(tmp));
7398 name = tmp;
7399got_name:
7400 /*
7401 * Since our buffer works in 8 byte units we need to align our string
7402 * size to a multiple of 8. However, we must guarantee the tail end is
7403 * zero'd out to avoid leaking random bits to userspace.
7404 */
7405 size = strlen(name)+1;
7406 while (!IS_ALIGNED(size, sizeof(u64)))
7407 name[size++] = '\0';
7408
7409 mmap_event->file_name = name;
7410 mmap_event->file_size = size;
7411 mmap_event->maj = maj;
7412 mmap_event->min = min;
7413 mmap_event->ino = ino;
7414 mmap_event->ino_generation = gen;
7415 mmap_event->prot = prot;
7416 mmap_event->flags = flags;
7417
7418 if (!(vma->vm_flags & VM_EXEC))
7419 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
7420
7421 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
7422
7423 perf_iterate_sb(perf_event_mmap_output,
7424 mmap_event,
7425 NULL);
7426
7427 kfree(buf);
7428}
7429
7430/*
7431 * Check whether inode and address range match filter criteria.
7432 */
7433static bool perf_addr_filter_match(struct perf_addr_filter *filter,
7434 struct file *file, unsigned long offset,
7435 unsigned long size)
7436{
7437 /* d_inode(NULL) won't be equal to any mapped user file */
7438 if (!filter->path.dentry)
7439 return false;
7440
7441 if (d_inode(filter->path.dentry) != file_inode(file))
7442 return false;
7443
7444 if (filter->offset > offset + size)
7445 return false;
7446
7447 if (filter->offset + filter->size < offset)
7448 return false;
7449
7450 return true;
7451}
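
/*
 * The two range checks above are an interval-overlap test between the
 * filter's file range [filter->offset, filter->offset + filter->size] and
 * the mapped file range [offset, offset + size]. Illustrative numbers: a
 * filter at offset 0x1000 with size 0x100 matches a mapping of file range
 * [0x0, 0x2000) but not one starting at file offset 0x2000.
 */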
7452
7453static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
7454 struct vm_area_struct *vma,
7455 struct perf_addr_filter_range *fr)
7456{
7457 unsigned long vma_size = vma->vm_end - vma->vm_start;
7458 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
7459 struct file *file = vma->vm_file;
7460
7461 if (!perf_addr_filter_match(filter, file, off, vma_size))
7462 return false;
7463
7464 if (filter->offset < off) {
7465 fr->start = vma->vm_start;
7466 fr->size = min(vma_size, filter->size - (off - filter->offset));
7467 } else {
7468 fr->start = vma->vm_start + filter->offset - off;
7469 fr->size = min(vma->vm_end - fr->start, filter->size);
7470 }
7471
7472 return true;
7473}
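
/*
 * Worked example (illustrative numbers): a filter at file offset 0x3000
 * with size 0x1000, applied to a vma that maps file offset 0x2000 at
 * vm_start 0x400000, takes the second branch above:
 *
 *	fr->start = 0x400000 + 0x3000 - 0x2000 = 0x401000;
 *	fr->size  = min(vm_end - 0x401000, 0x1000);
 */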
7474
7475static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
7476{
7477 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7478 struct vm_area_struct *vma = data;
7479 struct perf_addr_filter *filter;
7480 unsigned int restart = 0, count = 0;
7481 unsigned long flags;
7482
7483 if (!has_addr_filter(event))
7484 return;
7485
7486 if (!vma->vm_file)
7487 return;
7488
7489 raw_spin_lock_irqsave(&ifh->lock, flags);
7490 list_for_each_entry(filter, &ifh->list, entry) {
7491 if (perf_addr_filter_vma_adjust(filter, vma,
7492 &event->addr_filter_ranges[count]))
7493 restart++;
7494
7495 count++;
7496 }
7497
7498 if (restart)
7499 event->addr_filters_gen++;
7500 raw_spin_unlock_irqrestore(&ifh->lock, flags);
7501
7502 if (restart)
7503 perf_event_stop(event, 1);
7504}
7505
7506/*
7507 * Adjust all task's events' filters to the new vma
7508 */
7509static void perf_addr_filters_adjust(struct vm_area_struct *vma)
7510{
7511 struct perf_event_context *ctx;
7512 int ctxn;
7513
7514 /*
7515 * Data tracing isn't supported yet and as such there is no need
7516 * to keep track of anything that isn't related to executable code:
7517 */
7518 if (!(vma->vm_flags & VM_EXEC))
7519 return;
7520
7521 rcu_read_lock();
7522 for_each_task_context_nr(ctxn) {
7523 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7524 if (!ctx)
7525 continue;
7526
7527 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
7528 }
7529 rcu_read_unlock();
7530}
7531
7532void perf_event_mmap(struct vm_area_struct *vma)
7533{
7534 struct perf_mmap_event mmap_event;
7535
7536 if (!atomic_read(&nr_mmap_events))
7537 return;
7538
7539 mmap_event = (struct perf_mmap_event){
7540 .vma = vma,
7541 /* .file_name */
7542 /* .file_size */
7543 .event_id = {
7544 .header = {
7545 .type = PERF_RECORD_MMAP,
7546 .misc = PERF_RECORD_MISC_USER,
7547 /* .size */
7548 },
7549 /* .pid */
7550 /* .tid */
7551 .start = vma->vm_start,
7552 .len = vma->vm_end - vma->vm_start,
7553 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
7554 },
7555
7556 /* .maj            (attr.mmap2 only) */
7557 /* .min            (attr.mmap2 only) */
7558 /* .ino            (attr.mmap2 only) */
7559 /* .ino_generation (attr.mmap2 only) */
7560 /* .prot, .flags   (attr.mmap2 only) */
7561 };
7562
7563 perf_addr_filters_adjust(vma);
7564 perf_event_mmap_event(&mmap_event);
7565}
7566
7567void perf_event_aux_event(struct perf_event *event, unsigned long head,
7568 unsigned long size, u64 flags)
7569{
7570 struct perf_output_handle handle;
7571 struct perf_sample_data sample;
7572 struct perf_aux_event {
7573 struct perf_event_header header;
7574 u64 offset;
7575 u64 size;
7576 u64 flags;
7577 } rec = {
7578 .header = {
7579 .type = PERF_RECORD_AUX,
7580 .misc = 0,
7581 .size = sizeof(rec),
7582 },
7583 .offset = head,
7584 .size = size,
7585 .flags = flags,
7586 };
7587 int ret;
7588
7589 perf_event_header__init_id(&rec.header, &sample, event);
7590 ret = perf_output_begin(&handle, event, rec.header.size);
7591
7592 if (ret)
7593 return;
7594
7595 perf_output_put(&handle, rec);
7596 perf_event__output_id_sample(event, &handle, &sample);
7597
7598 perf_output_end(&handle);
7599}
7600
7601/*
7602 * Lost/dropped samples logging
7603 */
7604void perf_log_lost_samples(struct perf_event *event, u64 lost)
7605{
7606 struct perf_output_handle handle;
7607 struct perf_sample_data sample;
7608 int ret;
7609
7610 struct {
7611 struct perf_event_header header;
7612 u64 lost;
7613 } lost_samples_event = {
7614 .header = {
7615 .type = PERF_RECORD_LOST_SAMPLES,
7616 .misc = 0,
7617 .size = sizeof(lost_samples_event),
7618 },
7619 .lost = lost,
7620 };
7621
7622 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
7623
7624 ret = perf_output_begin(&handle, event,
7625 lost_samples_event.header.size);
7626 if (ret)
7627 return;
7628
7629 perf_output_put(&handle, lost_samples_event);
7630 perf_event__output_id_sample(event, &handle, &sample);
7631 perf_output_end(&handle);
7632}
7633
7634
7635/*
7636 * context_switch tracking
7637 */
7638struct perf_switch_event {
7639 struct task_struct *task;
7640 struct task_struct *next_prev;
7641
7642 struct {
7643 struct perf_event_header header;
7644 u32 next_prev_pid;
7645 u32 next_prev_tid;
7646 } event_id;
7647};
7648
7649static int perf_event_switch_match(struct perf_event *event)
7650{
7651 return event->attr.context_switch;
7652}
7653
7654static void perf_event_switch_output(struct perf_event *event, void *data)
7655{
7656 struct perf_switch_event *se = data;
7657 struct perf_output_handle handle;
7658 struct perf_sample_data sample;
7659 int ret;
7660
7661 if (!perf_event_switch_match(event))
7662 return;
7663
7664 /* Only CPU-wide events are allowed to see next/prev pid/tid */
7665 if (event->ctx->task) {
7666 se->event_id.header.type = PERF_RECORD_SWITCH;
7667 se->event_id.header.size = sizeof(se->event_id.header);
7668 } else {
7669 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
7670 se->event_id.header.size = sizeof(se->event_id);
7671 se->event_id.next_prev_pid =
7672 perf_event_pid(event, se->next_prev);
7673 se->event_id.next_prev_tid =
7674 perf_event_tid(event, se->next_prev);
7675 }
7676
7677 perf_event_header__init_id(&se->event_id.header, &sample, event);
7678
7679 ret = perf_output_begin(&handle, event, se->event_id.header.size);
7680 if (ret)
7681 return;
7682
7683 if (event->ctx->task)
7684 perf_output_put(&handle, se->event_id.header);
7685 else
7686 perf_output_put(&handle, se->event_id);
7687
7688 perf_event__output_id_sample(event, &handle, &sample);
7689
7690 perf_output_end(&handle);
7691}
7692
7693static void perf_event_switch(struct task_struct *task,
7694 struct task_struct *next_prev, bool sched_in)
7695{
7696 struct perf_switch_event switch_event;
7697
7698 /* N.B. caller checks nr_switch_events != 0 */
7699
7700 switch_event = (struct perf_switch_event){
7701 .task = task,
7702 .next_prev = next_prev,
7703 .event_id = {
7704 .header = {
7705 /* .type */
7706 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
7707 /* .size */
7708 },
7709 /* .next_prev_pid */
7710 /* .next_prev_tid */
7711 },
7712 };
7713
7714 if (!sched_in && task->state == TASK_RUNNING)
7715 switch_event.event_id.header.misc |=
7716 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
7717
7718 perf_iterate_sb(perf_event_switch_output,
7719 &switch_event,
7720 NULL);
7721}
7722
7723
7724/*
7725 * IRQ throttle logging
7726 */
7727static void perf_log_throttle(struct perf_event *event, int enable)
7728{
7729 struct perf_output_handle handle;
7730 struct perf_sample_data sample;
7731 int ret;
7732
7733 struct {
7734 struct perf_event_header header;
7735 u64 time;
7736 u64 id;
7737 u64 stream_id;
7738 } throttle_event = {
7739 .header = {
7740 .type = PERF_RECORD_THROTTLE,
7741 .misc = 0,
7742 .size = sizeof(throttle_event),
7743 },
7744 .time = perf_event_clock(event),
7745 .id = primary_event_id(event),
7746 .stream_id = event->id,
7747 };
7748
7749 if (enable)
7750 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
7751
7752 perf_event_header__init_id(&throttle_event.header, &sample, event);
7753
7754 ret = perf_output_begin(&handle, event,
7755 throttle_event.header.size);
7756 if (ret)
7757 return;
7758
7759 perf_output_put(&handle, throttle_event);
7760 perf_event__output_id_sample(event, &handle, &sample);
7761 perf_output_end(&handle);
7762}
7763
7764
7765/*
7766 * ksymbol register/unregister tracking
7767 */
7768struct perf_ksymbol_event {
7769 const char *name;
7770 int name_len;
7771 struct {
7772 struct perf_event_header header;
7773 u64 addr;
7774 u32 len;
7775 u16 ksym_type;
7776 u16 flags;
7777 } event_id;
7778};
7779
7780static int perf_event_ksymbol_match(struct perf_event *event)
7781{
7782 return event->attr.ksymbol;
7783}
7784
7785static void perf_event_ksymbol_output(struct perf_event *event, void *data)
7786{
7787 struct perf_ksymbol_event *ksymbol_event = data;
7788 struct perf_output_handle handle;
7789 struct perf_sample_data sample;
7790 int ret;
7791
7792 if (!perf_event_ksymbol_match(event))
7793 return;
7794
7795 perf_event_header__init_id(&ksymbol_event->event_id.header,
7796 &sample, event);
7797 ret = perf_output_begin(&handle, event,
7798 ksymbol_event->event_id.header.size);
7799 if (ret)
7800 return;
7801
7802 perf_output_put(&handle, ksymbol_event->event_id);
7803 __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
7804 perf_event__output_id_sample(event, &handle, &sample);
7805
7806 perf_output_end(&handle);
7807}
7808
7809void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
7810 const char *sym)
7811{
7812 struct perf_ksymbol_event ksymbol_event;
7813 char name[KSYM_NAME_LEN];
7814 u16 flags = 0;
7815 int name_len;
7816
7817 if (!atomic_read(&nr_ksymbol_events))
7818 return;
7819
7820 if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
7821 ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
7822 goto err;
7823
7824 strlcpy(name, sym, KSYM_NAME_LEN);
7825 name_len = strlen(name) + 1;
7826 while (!IS_ALIGNED(name_len, sizeof(u64)))
7827 name[name_len++] = '\0';
7828 BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
7829
7830 if (unregister)
7831 flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
7832
7833 ksymbol_event = (struct perf_ksymbol_event){
7834 .name = name,
7835 .name_len = name_len,
7836 .event_id = {
7837 .header = {
7838 .type = PERF_RECORD_KSYMBOL,
7839 .size = sizeof(ksymbol_event.event_id) +
7840 name_len,
7841 },
7842 .addr = addr,
7843 .len = len,
7844 .ksym_type = ksym_type,
7845 .flags = flags,
7846 },
7847 };
7848
7849 perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
7850 return;
7851err:
7852 WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
7853}
7854
/*
 * bpf program load/unload tracking
 */

7859struct perf_bpf_event {
7860 struct bpf_prog *prog;
7861 struct {
7862 struct perf_event_header header;
7863 u16 type;
7864 u16 flags;
7865 u32 id;
7866 u8 tag[BPF_TAG_SIZE];
7867 } event_id;
7868};
7869
7870static int perf_event_bpf_match(struct perf_event *event)
7871{
7872 return event->attr.bpf_event;
7873}
7874
7875static void perf_event_bpf_output(struct perf_event *event, void *data)
7876{
7877 struct perf_bpf_event *bpf_event = data;
7878 struct perf_output_handle handle;
7879 struct perf_sample_data sample;
7880 int ret;
7881
7882 if (!perf_event_bpf_match(event))
7883 return;
7884
7885 perf_event_header__init_id(&bpf_event->event_id.header,
7886 &sample, event);
7887 ret = perf_output_begin(&handle, event,
7888 bpf_event->event_id.header.size);
7889 if (ret)
7890 return;
7891
7892 perf_output_put(&handle, bpf_event->event_id);
7893 perf_event__output_id_sample(event, &handle, &sample);
7894
7895 perf_output_end(&handle);
7896}
7897
7898static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
7899 enum perf_bpf_event_type type)
7900{
7901 bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
7902 char sym[KSYM_NAME_LEN];
7903 int i;
7904
7905 if (prog->aux->func_cnt == 0) {
7906 bpf_get_prog_name(prog, sym);
7907 perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
7908 (u64)(unsigned long)prog->bpf_func,
7909 prog->jited_len, unregister, sym);
7910 } else {
7911 for (i = 0; i < prog->aux->func_cnt; i++) {
7912 struct bpf_prog *subprog = prog->aux->func[i];
7913
7914 bpf_get_prog_name(subprog, sym);
7915 perf_event_ksymbol(
7916 PERF_RECORD_KSYMBOL_TYPE_BPF,
7917 (u64)(unsigned long)subprog->bpf_func,
7918 subprog->jited_len, unregister, sym);
7919 }
7920 }
7921}
7922
7923void perf_event_bpf_event(struct bpf_prog *prog,
7924 enum perf_bpf_event_type type,
7925 u16 flags)
7926{
7927 struct perf_bpf_event bpf_event;
7928
7929 if (type <= PERF_BPF_EVENT_UNKNOWN ||
7930 type >= PERF_BPF_EVENT_MAX)
7931 return;
7932
7933 switch (type) {
7934 case PERF_BPF_EVENT_PROG_LOAD:
7935 case PERF_BPF_EVENT_PROG_UNLOAD:
7936 if (atomic_read(&nr_ksymbol_events))
7937 perf_event_bpf_emit_ksymbols(prog, type);
7938 break;
7939 default:
7940 break;
7941 }
7942
7943 if (!atomic_read(&nr_bpf_events))
7944 return;
7945
7946 bpf_event = (struct perf_bpf_event){
7947 .prog = prog,
7948 .event_id = {
7949 .header = {
7950 .type = PERF_RECORD_BPF_EVENT,
7951 .size = sizeof(bpf_event.event_id),
7952 },
7953 .type = type,
7954 .flags = flags,
7955 .id = prog->aux->id,
7956 },
7957 };
7958
7959 BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
7960
7961 memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
7962 perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
7963}
7964
7965void perf_event_itrace_started(struct perf_event *event)
7966{
7967 event->attach_state |= PERF_ATTACH_ITRACE;
7968}
7969
7970static void perf_log_itrace_start(struct perf_event *event)
7971{
7972 struct perf_output_handle handle;
7973 struct perf_sample_data sample;
7974 struct perf_aux_event {
7975 struct perf_event_header header;
7976 u32 pid;
7977 u32 tid;
7978 } rec;
7979 int ret;
7980
7981 if (event->parent)
7982 event = event->parent;
7983
7984 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7985 event->attach_state & PERF_ATTACH_ITRACE)
7986 return;
7987
7988 rec.header.type = PERF_RECORD_ITRACE_START;
7989 rec.header.misc = 0;
7990 rec.header.size = sizeof(rec);
7991 rec.pid = perf_event_pid(event, current);
7992 rec.tid = perf_event_tid(event, current);
7993
7994 perf_event_header__init_id(&rec.header, &sample, event);
7995 ret = perf_output_begin(&handle, event, rec.header.size);
7996
7997 if (ret)
7998 return;
7999
8000 perf_output_put(&handle, rec);
8001 perf_event__output_id_sample(event, &handle, &sample);
8002
8003 perf_output_end(&handle);
8004}
8005
8006static int
8007__perf_event_account_interrupt(struct perf_event *event, int throttle)
8008{
8009 struct hw_perf_event *hwc = &event->hw;
8010 int ret = 0;
8011 u64 seq;
8012
8013 seq = __this_cpu_read(perf_throttled_seq);
8014 if (seq != hwc->interrupts_seq) {
8015 hwc->interrupts_seq = seq;
8016 hwc->interrupts = 1;
8017 } else {
8018 hwc->interrupts++;
8019 if (unlikely(throttle
8020 && hwc->interrupts >= max_samples_per_tick)) {
8021 __this_cpu_inc(perf_throttled_count);
8022 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
8023 hwc->interrupts = MAX_INTERRUPTS;
8024 perf_log_throttle(event, 0);
8025 ret = 1;
8026 }
8027 }
8028
8029 if (event->attr.freq) {
8030 u64 now = perf_clock();
8031 s64 delta = now - hwc->freq_time_stamp;
8032
8033 hwc->freq_time_stamp = now;
8034
8035 if (delta > 0 && delta < 2*TICK_NSEC)
8036 perf_adjust_period(event, delta, hwc->last_period, true);
8037 }
8038
8039 return ret;
8040}
8041
8042int perf_event_account_interrupt(struct perf_event *event)
8043{
8044 return __perf_event_account_interrupt(event, 1);
8045}
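/*
 * Worked example (illustrative): with the default
 * kernel.perf_event_max_sample_rate of 100000 and HZ=1000,
 * max_samples_per_tick comes out around 100000 / 1000 = 100. An event
 * that fires more often than that within one throttle sequence gets
 * hwc->interrupts = MAX_INTERRUPTS, a PERF_RECORD_THROTTLE record, and
 * stays silenced until the tick unthrottles it.
 */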
8046
/*
 * Generic event overflow handling, sampling.
 */

8051static int __perf_event_overflow(struct perf_event *event,
8052 int throttle, struct perf_sample_data *data,
8053 struct pt_regs *regs)
8054{
8055 int events = atomic_read(&event->event_limit);
8056 int ret = 0;
8057
	/*
	 * Non-sampling counters might still use the PMI to fold short
	 * hardware counters, ignore those and generate their own
	 * interrupts.
	 */
8062 if (unlikely(!is_sampling_event(event)))
8063 return 0;
8064
8065 ret = __perf_event_account_interrupt(event, throttle);
8066
	/*
	 * XXX event_limit might not quite work as expected on inherited
	 * events
	 */

8072 event->pending_kill = POLL_IN;
8073 if (events && atomic_dec_and_test(&event->event_limit)) {
8074 ret = 1;
8075 event->pending_kill = POLL_HUP;
8076
8077 perf_event_disable_inatomic(event);
8078 }
8079
8080 READ_ONCE(event->overflow_handler)(event, data, regs);
8081
8082 if (*perf_event_fasync(event) && event->pending_kill) {
8083 event->pending_wakeup = 1;
8084 irq_work_queue(&event->pending);
8085 }
8086
8087 return ret;
8088}
8089
8090int perf_event_overflow(struct perf_event *event,
8091 struct perf_sample_data *data,
8092 struct pt_regs *regs)
8093{
8094 return __perf_event_overflow(event, 1, data, regs);
8095}
8096
/*
 * Generic software event infrastructure
 */

8101struct swevent_htable {
8102 struct swevent_hlist *swevent_hlist;
8103 struct mutex hlist_mutex;
8104 int hlist_refcount;
8105
	/* Recursion avoidance in each context */
8107 int recursion[PERF_NR_CONTEXTS];
8108};
8109
8110static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
8111
/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */

8119u64 perf_swevent_set_period(struct perf_event *event)
8120{
8121 struct hw_perf_event *hwc = &event->hw;
8122 u64 period = hwc->last_period;
8123 u64 nr, offset;
8124 s64 old, val;
8125
8126 hwc->last_period = hwc->sample_period;
8127
8128again:
8129 old = val = local64_read(&hwc->period_left);
8130 if (val < 0)
8131 return 0;
8132
8133 nr = div64_u64(period + val, period);
8134 offset = nr * period;
8135 val -= offset;
8136 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
8137 goto again;
8138
8139 return nr;
8140}
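/*
 * Worked example (illustrative) for the arithmetic above: with
 * sample_period = 100, period_left climbs from -100 towards zero as
 * events accumulate. Once it reaches, say, +30, perf_swevent_set_period()
 * computes nr = (100 + 30) / 100 = 1 overflow and rewinds period_left to
 * 30 - 100 = -70, back inside the [-sample_period, 0] window.
 */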
8141
8142static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
8143 struct perf_sample_data *data,
8144 struct pt_regs *regs)
8145{
8146 struct hw_perf_event *hwc = &event->hw;
8147 int throttle = 0;
8148
8149 if (!overflow)
8150 overflow = perf_swevent_set_period(event);
8151
8152 if (hwc->interrupts == MAX_INTERRUPTS)
8153 return;
8154
8155 for (; overflow; overflow--) {
8156 if (__perf_event_overflow(event, throttle,
8157 data, regs)) {
			/*
			 * We inhibit the overflow from happening when
			 * hwc->interrupts == MAX_INTERRUPTS.
			 */
8162 break;
8163 }
8164 throttle = 1;
8165 }
8166}
8167
8168static void perf_swevent_event(struct perf_event *event, u64 nr,
8169 struct perf_sample_data *data,
8170 struct pt_regs *regs)
8171{
8172 struct hw_perf_event *hwc = &event->hw;
8173
8174 local64_add(nr, &event->count);
8175
8176 if (!regs)
8177 return;
8178
8179 if (!is_sampling_event(event))
8180 return;
8181
8182 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
8183 data->period = nr;
8184 return perf_swevent_overflow(event, 1, data, regs);
8185 } else
8186 data->period = event->hw.last_period;
8187
8188 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
8189 return perf_swevent_overflow(event, 1, data, regs);
8190
8191 if (local64_add_negative(nr, &hwc->period_left))
8192 return;
8193
8194 perf_swevent_overflow(event, 0, data, regs);
8195}
8196
8197static int perf_exclude_event(struct perf_event *event,
8198 struct pt_regs *regs)
8199{
8200 if (event->hw.state & PERF_HES_STOPPED)
8201 return 1;
8202
8203 if (regs) {
8204 if (event->attr.exclude_user && user_mode(regs))
8205 return 1;
8206
8207 if (event->attr.exclude_kernel && !user_mode(regs))
8208 return 1;
8209 }
8210
8211 return 0;
8212}
8213
8214static int perf_swevent_match(struct perf_event *event,
8215 enum perf_type_id type,
8216 u32 event_id,
8217 struct perf_sample_data *data,
8218 struct pt_regs *regs)
8219{
8220 if (event->attr.type != type)
8221 return 0;
8222
8223 if (event->attr.config != event_id)
8224 return 0;
8225
8226 if (perf_exclude_event(event, regs))
8227 return 0;
8228
8229 return 1;
8230}
8231
8232static inline u64 swevent_hash(u64 type, u32 event_id)
8233{
8234 u64 val = event_id | (type << 32);
8235
8236 return hash_64(val, SWEVENT_HLIST_BITS);
8237}
8238
8239static inline struct hlist_head *
8240__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
8241{
8242 u64 hash = swevent_hash(type, event_id);
8243
8244 return &hlist->heads[hash];
8245}
8246
/* For the read side: events when they trigger */
8248static inline struct hlist_head *
8249find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
8250{
8251 struct swevent_hlist *hlist;
8252
8253 hlist = rcu_dereference(swhash->swevent_hlist);
8254 if (!hlist)
8255 return NULL;
8256
8257 return __find_swevent_head(hlist, type, event_id);
8258}
8259
/* For the event head insertion and removal in the hlist */
8261static inline struct hlist_head *
8262find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
8263{
8264 struct swevent_hlist *hlist;
8265 u32 event_id = event->attr.config;
8266 u64 type = event->attr.type;
8267
	/*
	 * Event scheduling is always serialized against hlist allocation
	 * and release. Which makes the protected version suitable here.
	 * The context lock guarantees that.
	 */
8273 hlist = rcu_dereference_protected(swhash->swevent_hlist,
8274 lockdep_is_held(&event->ctx->lock));
8275 if (!hlist)
8276 return NULL;
8277
8278 return __find_swevent_head(hlist, type, event_id);
8279}
8280
8281static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
8282 u64 nr,
8283 struct perf_sample_data *data,
8284 struct pt_regs *regs)
8285{
8286 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8287 struct perf_event *event;
8288 struct hlist_head *head;
8289
8290 rcu_read_lock();
8291 head = find_swevent_head_rcu(swhash, type, event_id);
8292 if (!head)
8293 goto end;
8294
8295 hlist_for_each_entry_rcu(event, head, hlist_entry) {
8296 if (perf_swevent_match(event, type, event_id, data, regs))
8297 perf_swevent_event(event, nr, data, regs);
8298 }
8299end:
8300 rcu_read_unlock();
8301}
8302
8303DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
8304
8305int perf_swevent_get_recursion_context(void)
8306{
8307 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8308
8309 return get_recursion_context(swhash->recursion);
8310}
8311EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
8312
8313void perf_swevent_put_recursion_context(int rctx)
8314{
8315 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8316
8317 put_recursion_context(swhash->recursion, rctx);
8318}
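/*
 * Typical usage of the two recursion helpers above (sketch; see the
 * tracepoint glue in kernel/trace/trace_event_perf.c for a real caller):
 *
 *	rctx = perf_swevent_get_recursion_context();
 *	if (rctx < 0)
 *		return;		// already inside a swevent on this context
 *	... emit the event ...
 *	perf_swevent_put_recursion_context(rctx);
 */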
8319
8320void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
8321{
8322 struct perf_sample_data data;
8323
8324 if (WARN_ON_ONCE(!regs))
8325 return;
8326
8327 perf_sample_data_init(&data, addr, 0);
8328 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
8329}
8330
8331void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
8332{
8333 int rctx;
8334
8335 preempt_disable_notrace();
8336 rctx = perf_swevent_get_recursion_context();
8337 if (unlikely(rctx < 0))
8338 goto fail;
8339
8340 ___perf_sw_event(event_id, nr, regs, addr);
8341
8342 perf_swevent_put_recursion_context(rctx);
8343fail:
8344 preempt_enable_notrace();
8345}
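/*
 * Example caller (illustrative): architecture fault handlers account page
 * faults through the perf_sw_event() wrapper around __perf_sw_event(),
 * e.g. arch/x86/mm/fault.c does:
 *
 *	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 */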
8346
8347static void perf_swevent_read(struct perf_event *event)
8348{
8349}
8350
8351static int perf_swevent_add(struct perf_event *event, int flags)
8352{
8353 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8354 struct hw_perf_event *hwc = &event->hw;
8355 struct hlist_head *head;
8356
8357 if (is_sampling_event(event)) {
8358 hwc->last_period = hwc->sample_period;
8359 perf_swevent_set_period(event);
8360 }
8361
8362 hwc->state = !(flags & PERF_EF_START);
8363
8364 head = find_swevent_head(swhash, event);
8365 if (WARN_ON_ONCE(!head))
8366 return -EINVAL;
8367
8368 hlist_add_head_rcu(&event->hlist_entry, head);
8369 perf_event_update_userpage(event);
8370
8371 return 0;
8372}
8373
8374static void perf_swevent_del(struct perf_event *event, int flags)
8375{
8376 hlist_del_rcu(&event->hlist_entry);
8377}
8378
8379static void perf_swevent_start(struct perf_event *event, int flags)
8380{
8381 event->hw.state = 0;
8382}
8383
8384static void perf_swevent_stop(struct perf_event *event, int flags)
8385{
8386 event->hw.state = PERF_HES_STOPPED;
8387}
8388
/* Deref the hlist from the update side */
8390static inline struct swevent_hlist *
8391swevent_hlist_deref(struct swevent_htable *swhash)
8392{
8393 return rcu_dereference_protected(swhash->swevent_hlist,
8394 lockdep_is_held(&swhash->hlist_mutex));
8395}
8396
8397static void swevent_hlist_release(struct swevent_htable *swhash)
8398{
8399 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
8400
8401 if (!hlist)
8402 return;
8403
8404 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
8405 kfree_rcu(hlist, rcu_head);
8406}
8407
8408static void swevent_hlist_put_cpu(int cpu)
8409{
8410 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8411
8412 mutex_lock(&swhash->hlist_mutex);
8413
8414 if (!--swhash->hlist_refcount)
8415 swevent_hlist_release(swhash);
8416
8417 mutex_unlock(&swhash->hlist_mutex);
8418}
8419
8420static void swevent_hlist_put(void)
8421{
8422 int cpu;
8423
8424 for_each_possible_cpu(cpu)
8425 swevent_hlist_put_cpu(cpu);
8426}
8427
8428static int swevent_hlist_get_cpu(int cpu)
8429{
8430 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8431 int err = 0;
8432
8433 mutex_lock(&swhash->hlist_mutex);
8434 if (!swevent_hlist_deref(swhash) &&
8435 cpumask_test_cpu(cpu, perf_online_mask)) {
8436 struct swevent_hlist *hlist;
8437
8438 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
8439 if (!hlist) {
8440 err = -ENOMEM;
8441 goto exit;
8442 }
8443 rcu_assign_pointer(swhash->swevent_hlist, hlist);
8444 }
8445 swhash->hlist_refcount++;
8446exit:
8447 mutex_unlock(&swhash->hlist_mutex);
8448
8449 return err;
8450}
8451
8452static int swevent_hlist_get(void)
8453{
8454 int err, cpu, failed_cpu;
8455
8456 mutex_lock(&pmus_lock);
8457 for_each_possible_cpu(cpu) {
8458 err = swevent_hlist_get_cpu(cpu);
8459 if (err) {
8460 failed_cpu = cpu;
8461 goto fail;
8462 }
8463 }
8464 mutex_unlock(&pmus_lock);
8465 return 0;
8466fail:
8467 for_each_possible_cpu(cpu) {
8468 if (cpu == failed_cpu)
8469 break;
8470 swevent_hlist_put_cpu(cpu);
8471 }
8472 mutex_unlock(&pmus_lock);
8473 return err;
8474}
8475
8476struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
8477
8478static void sw_perf_event_destroy(struct perf_event *event)
8479{
8480 u64 event_id = event->attr.config;
8481
8482 WARN_ON(event->parent);
8483
8484 static_key_slow_dec(&perf_swevent_enabled[event_id]);
8485 swevent_hlist_put();
8486}
8487
8488static int perf_swevent_init(struct perf_event *event)
8489{
8490 u64 event_id = event->attr.config;
8491
8492 if (event->attr.type != PERF_TYPE_SOFTWARE)
8493 return -ENOENT;
8494
	/*
	 * no branch sampling for software events
	 */
8498 if (has_branch_stack(event))
8499 return -EOPNOTSUPP;
8500
8501 switch (event_id) {
8502 case PERF_COUNT_SW_CPU_CLOCK:
8503 case PERF_COUNT_SW_TASK_CLOCK:
8504 return -ENOENT;
8505
8506 default:
8507 break;
8508 }
8509
8510 if (event_id >= PERF_COUNT_SW_MAX)
8511 return -ENOENT;
8512
8513 if (!event->parent) {
8514 int err;
8515
8516 err = swevent_hlist_get();
8517 if (err)
8518 return err;
8519
8520 static_key_slow_inc(&perf_swevent_enabled[event_id]);
8521 event->destroy = sw_perf_event_destroy;
8522 }
8523
8524 return 0;
8525}
8526
8527static struct pmu perf_swevent = {
8528 .task_ctx_nr = perf_sw_context,
8529
8530 .capabilities = PERF_PMU_CAP_NO_NMI,
8531
8532 .event_init = perf_swevent_init,
8533 .add = perf_swevent_add,
8534 .del = perf_swevent_del,
8535 .start = perf_swevent_start,
8536 .stop = perf_swevent_stop,
8537 .read = perf_swevent_read,
8538};
8539
8540#ifdef CONFIG_EVENT_TRACING
8541
8542static int perf_tp_filter_match(struct perf_event *event,
8543 struct perf_sample_data *data)
8544{
8545 void *record = data->raw->frag.data;
8546
	/* only top level events have filters set */
8548 if (event->parent)
8549 event = event->parent;
8550
8551 if (likely(!event->filter) || filter_match_preds(event->filter, record))
8552 return 1;
8553 return 0;
8554}
8555
8556static int perf_tp_event_match(struct perf_event *event,
8557 struct perf_sample_data *data,
8558 struct pt_regs *regs)
8559{
8560 if (event->hw.state & PERF_HES_STOPPED)
8561 return 0;
8562
	/*
	 * If exclude_kernel, only trace user-space tracepoints (uprobes)
	 */
8565 if (event->attr.exclude_kernel && !user_mode(regs))
8566 return 0;
8567
8568 if (!perf_tp_filter_match(event, data))
8569 return 0;
8570
8571 return 1;
8572}
8573
8574void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
8575 struct trace_event_call *call, u64 count,
8576 struct pt_regs *regs, struct hlist_head *head,
8577 struct task_struct *task)
8578{
8579 if (bpf_prog_array_valid(call)) {
8580 *(struct pt_regs **)raw_data = regs;
8581 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
8582 perf_swevent_put_recursion_context(rctx);
8583 return;
8584 }
8585 }
8586 perf_tp_event(call->event.type, count, raw_data, size, regs, head,
8587 rctx, task);
8588}
8589EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
8590
8591void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
8592 struct pt_regs *regs, struct hlist_head *head, int rctx,
8593 struct task_struct *task)
8594{
8595 struct perf_sample_data data;
8596 struct perf_event *event;
8597
8598 struct perf_raw_record raw = {
8599 .frag = {
8600 .size = entry_size,
8601 .data = record,
8602 },
8603 };
8604
8605 perf_sample_data_init(&data, 0, 0);
8606 data.raw = &raw;
8607
8608 perf_trace_buf_update(record, event_type);
8609
8610 hlist_for_each_entry_rcu(event, head, hlist_entry) {
8611 if (perf_tp_event_match(event, &data, regs))
8612 perf_swevent_event(event, count, &data, regs);
8613 }
8614
	/*
	 * If we got specified a target task, also iterate its context and
	 * deliver this event there too.
	 */
8619 if (task && task != current) {
8620 struct perf_event_context *ctx;
8621 struct trace_entry *entry = record;
8622
8623 rcu_read_lock();
8624 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
8625 if (!ctx)
8626 goto unlock;
8627
8628 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
8629 if (event->cpu != smp_processor_id())
8630 continue;
8631 if (event->attr.type != PERF_TYPE_TRACEPOINT)
8632 continue;
8633 if (event->attr.config != entry->type)
8634 continue;
8635 if (perf_tp_event_match(event, &data, regs))
8636 perf_swevent_event(event, count, &data, regs);
8637 }
8638unlock:
8639 rcu_read_unlock();
8640 }
8641
8642 perf_swevent_put_recursion_context(rctx);
8643}
8644EXPORT_SYMBOL_GPL(perf_tp_event);
8645
8646static void tp_perf_event_destroy(struct perf_event *event)
8647{
8648 perf_trace_destroy(event);
8649}
8650
8651static int perf_tp_event_init(struct perf_event *event)
8652{
8653 int err;
8654
8655 if (event->attr.type != PERF_TYPE_TRACEPOINT)
8656 return -ENOENT;
8657
	/*
	 * no branch sampling for tracepoint events
	 */
8661 if (has_branch_stack(event))
8662 return -EOPNOTSUPP;
8663
8664 err = perf_trace_init(event);
8665 if (err)
8666 return err;
8667
8668 event->destroy = tp_perf_event_destroy;
8669
8670 return 0;
8671}
8672
8673static struct pmu perf_tracepoint = {
8674 .task_ctx_nr = perf_sw_context,
8675
8676 .event_init = perf_tp_event_init,
8677 .add = perf_trace_add,
8678 .del = perf_trace_del,
8679 .start = perf_swevent_start,
8680 .stop = perf_swevent_stop,
8681 .read = perf_swevent_read,
8682};
8683
8684#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
8685
/*
 * Flags in config, used by dynamic PMU kprobe and uprobe
 * The flags should match following PMU_FORMAT_ATTR().
 *
 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 *                               if not set, create kprobe/uprobe
 *
 * The following values specify a reference counter (or semaphore in the
 * terminology of tools like dtrace, systemtap, etc.) for Userspace
 * Statically Defined Tracepoints (USDT). Currently, we use 32 bits for
 * the offset.
 *
 * PERF_UPROBE_REF_CTR_OFFSET_BITS	# of bits in config for the offset
 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT	# of bits to shift left
 */
8699enum perf_probe_config {
8700 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,
8701 PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
8702 PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
8703};
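/*
 * Example attr.config encodings under the scheme above (illustrative):
 *
 *	kretprobe:		.config = PERF_PROBE_CONFIG_IS_RETPROBE
 *	uprobe + USDT counter:	.config = (__u64)ref_ctr_offset <<
 *					  PERF_UPROBE_REF_CTR_OFFSET_SHIFT
 *
 * matching the "retprobe" (config:0) and "ref_ctr_offset" (config:32-63)
 * format attributes exported below.
 */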
8704
8705PMU_FORMAT_ATTR(retprobe, "config:0");
8706#endif
8707
8708#ifdef CONFIG_KPROBE_EVENTS
8709static struct attribute *kprobe_attrs[] = {
8710 &format_attr_retprobe.attr,
8711 NULL,
8712};
8713
8714static struct attribute_group kprobe_format_group = {
8715 .name = "format",
8716 .attrs = kprobe_attrs,
8717};
8718
8719static const struct attribute_group *kprobe_attr_groups[] = {
8720 &kprobe_format_group,
8721 NULL,
8722};
8723
8724static int perf_kprobe_event_init(struct perf_event *event);
8725static struct pmu perf_kprobe = {
8726 .task_ctx_nr = perf_sw_context,
8727 .event_init = perf_kprobe_event_init,
8728 .add = perf_trace_add,
8729 .del = perf_trace_del,
8730 .start = perf_swevent_start,
8731 .stop = perf_swevent_stop,
8732 .read = perf_swevent_read,
8733 .attr_groups = kprobe_attr_groups,
8734};
8735
8736static int perf_kprobe_event_init(struct perf_event *event)
8737{
8738 int err;
8739 bool is_retprobe;
8740
8741 if (event->attr.type != perf_kprobe.type)
8742 return -ENOENT;
8743
8744 if (!capable(CAP_SYS_ADMIN))
8745 return -EACCES;
8746
	/*
	 * no branch sampling for probing events
	 */
8750 if (has_branch_stack(event))
8751 return -EOPNOTSUPP;
8752
8753 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8754 err = perf_kprobe_init(event, is_retprobe);
8755 if (err)
8756 return err;
8757
8758 event->destroy = perf_kprobe_destroy;
8759
8760 return 0;
8761}
8762#endif
8763
8764#ifdef CONFIG_UPROBE_EVENTS
8765PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
8766
8767static struct attribute *uprobe_attrs[] = {
8768 &format_attr_retprobe.attr,
8769 &format_attr_ref_ctr_offset.attr,
8770 NULL,
8771};
8772
8773static struct attribute_group uprobe_format_group = {
8774 .name = "format",
8775 .attrs = uprobe_attrs,
8776};
8777
8778static const struct attribute_group *uprobe_attr_groups[] = {
8779 &uprobe_format_group,
8780 NULL,
8781};
8782
8783static int perf_uprobe_event_init(struct perf_event *event);
8784static struct pmu perf_uprobe = {
8785 .task_ctx_nr = perf_sw_context,
8786 .event_init = perf_uprobe_event_init,
8787 .add = perf_trace_add,
8788 .del = perf_trace_del,
8789 .start = perf_swevent_start,
8790 .stop = perf_swevent_stop,
8791 .read = perf_swevent_read,
8792 .attr_groups = uprobe_attr_groups,
8793};
8794
8795static int perf_uprobe_event_init(struct perf_event *event)
8796{
8797 int err;
8798 unsigned long ref_ctr_offset;
8799 bool is_retprobe;
8800
8801 if (event->attr.type != perf_uprobe.type)
8802 return -ENOENT;
8803
8804 if (!capable(CAP_SYS_ADMIN))
8805 return -EACCES;
8806
	/*
	 * no branch sampling for probing events
	 */
8810 if (has_branch_stack(event))
8811 return -EOPNOTSUPP;
8812
8813 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8814 ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
8815 err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
8816 if (err)
8817 return err;
8818
8819 event->destroy = perf_uprobe_destroy;
8820
8821 return 0;
8822}
8823#endif
8824
8825static inline void perf_tp_register(void)
8826{
8827 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
8828#ifdef CONFIG_KPROBE_EVENTS
8829 perf_pmu_register(&perf_kprobe, "kprobe", -1);
8830#endif
8831#ifdef CONFIG_UPROBE_EVENTS
8832 perf_pmu_register(&perf_uprobe, "uprobe", -1);
8833#endif
8834}
8835
8836static void perf_event_free_filter(struct perf_event *event)
8837{
8838 ftrace_profile_free_filter(event);
8839}
8840
8841#ifdef CONFIG_BPF_SYSCALL
8842static void bpf_overflow_handler(struct perf_event *event,
8843 struct perf_sample_data *data,
8844 struct pt_regs *regs)
8845{
8846 struct bpf_perf_event_data_kern ctx = {
8847 .data = data,
8848 .event = event,
8849 };
8850 int ret = 0;
8851
8852 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
8853 preempt_disable();
8854 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
8855 goto out;
8856 rcu_read_lock();
8857 ret = BPF_PROG_RUN(event->prog, &ctx);
8858 rcu_read_unlock();
8859out:
8860 __this_cpu_dec(bpf_prog_active);
8861 preempt_enable();
8862 if (!ret)
8863 return;
8864
8865 event->orig_overflow_handler(event, data, regs);
8866}
8867
8868static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8869{
8870 struct bpf_prog *prog;
8871
8872 if (event->overflow_handler_context)
		/* hw breakpoint or kernel counter */
8874 return -EINVAL;
8875
8876 if (event->prog)
8877 return -EEXIST;
8878
8879 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
8880 if (IS_ERR(prog))
8881 return PTR_ERR(prog);
8882
8883 event->prog = prog;
8884 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
8885 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
8886 return 0;
8887}
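/*
 * Userspace reaches perf_event_set_bpf_handler() via the
 * PERF_EVENT_IOC_SET_BPF ioctl (sketch, error handling omitted):
 *
 *	int bpf_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 *					// BPF_PROG_TYPE_PERF_EVENT program
 *	ioctl(perf_event_fd, PERF_EVENT_IOC_SET_BPF, bpf_fd);
 */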
8888
8889static void perf_event_free_bpf_handler(struct perf_event *event)
8890{
8891 struct bpf_prog *prog = event->prog;
8892
8893 if (!prog)
8894 return;
8895
8896 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
8897 event->prog = NULL;
8898 bpf_prog_put(prog);
8899}
8900#else
8901static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8902{
8903 return -EOPNOTSUPP;
8904}
8905static void perf_event_free_bpf_handler(struct perf_event *event)
8906{
8907}
8908#endif
8909
/*
 * Returns true if the event is a tracepoint, or a kprobe/uprobe created
 * with perf_event_open().
 */
8914static inline bool perf_event_is_tracing(struct perf_event *event)
8915{
8916 if (event->pmu == &perf_tracepoint)
8917 return true;
8918#ifdef CONFIG_KPROBE_EVENTS
8919 if (event->pmu == &perf_kprobe)
8920 return true;
8921#endif
8922#ifdef CONFIG_UPROBE_EVENTS
8923 if (event->pmu == &perf_uprobe)
8924 return true;
8925#endif
8926 return false;
8927}
8928
8929static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8930{
8931 bool is_kprobe, is_tracepoint, is_syscall_tp;
8932 struct bpf_prog *prog;
8933 int ret;
8934
8935 if (!perf_event_is_tracing(event))
8936 return perf_event_set_bpf_handler(event, prog_fd);
8937
8938 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
8939 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
8940 is_syscall_tp = is_syscall_trace_event(event->tp_event);
8941 if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
		/* bpf programs can only be attached to u/kprobe or tracepoint */
8943 return -EINVAL;
8944
8945 prog = bpf_prog_get(prog_fd);
8946 if (IS_ERR(prog))
8947 return PTR_ERR(prog);
8948
8949 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
8950 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
8951 (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
		/* valid fd, but invalid bpf program type */
8953 bpf_prog_put(prog);
8954 return -EINVAL;
8955 }
8956
	/* Kprobe override only works for kprobes, not uprobes. */
8958 if (prog->kprobe_override &&
8959 !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
8960 bpf_prog_put(prog);
8961 return -EINVAL;
8962 }
8963
8964 if (is_tracepoint || is_syscall_tp) {
8965 int off = trace_event_get_offsets(event->tp_event);
8966
8967 if (prog->aux->max_ctx_offset > off) {
8968 bpf_prog_put(prog);
8969 return -EACCES;
8970 }
8971 }
8972
8973 ret = perf_event_attach_bpf_prog(event, prog);
8974 if (ret)
8975 bpf_prog_put(prog);
8976 return ret;
8977}
8978
8979static void perf_event_free_bpf_prog(struct perf_event *event)
8980{
8981 if (!perf_event_is_tracing(event)) {
8982 perf_event_free_bpf_handler(event);
8983 return;
8984 }
8985 perf_event_detach_bpf_prog(event);
8986}
8987
8988#else
8989
8990static inline void perf_tp_register(void)
8991{
8992}
8993
8994static void perf_event_free_filter(struct perf_event *event)
8995{
8996}
8997
8998static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8999{
9000 return -ENOENT;
9001}
9002
9003static void perf_event_free_bpf_prog(struct perf_event *event)
9004{
9005}
9006#endif
9007
9008#ifdef CONFIG_HAVE_HW_BREAKPOINT
9009void perf_bp_event(struct perf_event *bp, void *data)
9010{
9011 struct perf_sample_data sample;
9012 struct pt_regs *regs = data;
9013
9014 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
9015
9016 if (!bp->hw.state && !perf_exclude_event(bp, regs))
9017 perf_swevent_event(bp, 1, &sample, regs);
9018}
9019#endif
9020
/*
 * Allocate a new address filter
 */
9024static struct perf_addr_filter *
9025perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
9026{
9027 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
9028 struct perf_addr_filter *filter;
9029
9030 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
9031 if (!filter)
9032 return NULL;
9033
9034 INIT_LIST_HEAD(&filter->entry);
9035 list_add_tail(&filter->entry, filters);
9036
9037 return filter;
9038}
9039
9040static void free_filters_list(struct list_head *filters)
9041{
9042 struct perf_addr_filter *filter, *iter;
9043
9044 list_for_each_entry_safe(filter, iter, filters, entry) {
9045 path_put(&filter->path);
9046 list_del(&filter->entry);
9047 kfree(filter);
9048 }
9049}
9050
/*
 * Free existing address filters and optionally install new ones
 */
9054static void perf_addr_filters_splice(struct perf_event *event,
9055 struct list_head *head)
9056{
9057 unsigned long flags;
9058 LIST_HEAD(list);
9059
9060 if (!has_addr_filter(event))
9061 return;
9062
	/* don't bother with children, they don't have their own filters */
9064 if (event->parent)
9065 return;
9066
9067 raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
9068
9069 list_splice_init(&event->addr_filters.list, &list);
9070 if (head)
9071 list_splice(head, &event->addr_filters.list);
9072
9073 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
9074
9075 free_filters_list(&list);
9076}
9077
/*
 * Scan through mm's vmas and see if one of them matches the
 * @filter; if so, adjust filter's address range.
 * Called with mm::mmap_sem down for reading.
 */
9083static void perf_addr_filter_apply(struct perf_addr_filter *filter,
9084 struct mm_struct *mm,
9085 struct perf_addr_filter_range *fr)
9086{
9087 struct vm_area_struct *vma;
9088
9089 for (vma = mm->mmap; vma; vma = vma->vm_next) {
9090 if (!vma->vm_file)
9091 continue;
9092
9093 if (perf_addr_filter_vma_adjust(filter, vma, fr))
9094 return;
9095 }
9096}
9097
/*
 * Update event's address range filters based on the
 * task's existing mappings, if any.
 */
9102static void perf_event_addr_filters_apply(struct perf_event *event)
9103{
9104 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
9105 struct task_struct *task = READ_ONCE(event->ctx->task);
9106 struct perf_addr_filter *filter;
9107 struct mm_struct *mm = NULL;
9108 unsigned int count = 0;
9109 unsigned long flags;
9110
	/*
	 * We may observe TASK_TOMBSTONE, which means that the event tear-down
	 * is already in progress; bail out rather than chase a dying task.
	 */
9115 if (task == TASK_TOMBSTONE)
9116 return;
9117
9118 if (ifh->nr_file_filters) {
9119 mm = get_task_mm(event->ctx->task);
9120 if (!mm)
9121 goto restart;
9122
9123 down_read(&mm->mmap_sem);
9124 }
9125
9126 raw_spin_lock_irqsave(&ifh->lock, flags);
9127 list_for_each_entry(filter, &ifh->list, entry) {
9128 if (filter->path.dentry) {
			/*
			 * Adjust base offset if the filter is associated to a
			 * binary that needs to be mapped:
			 */
9133 event->addr_filter_ranges[count].start = 0;
9134 event->addr_filter_ranges[count].size = 0;
9135
9136 perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
9137 } else {
9138 event->addr_filter_ranges[count].start = filter->offset;
9139 event->addr_filter_ranges[count].size = filter->size;
9140 }
9141
9142 count++;
9143 }
9144
9145 event->addr_filters_gen++;
9146 raw_spin_unlock_irqrestore(&ifh->lock, flags);
9147
9148 if (ifh->nr_file_filters) {
9149 up_read(&mm->mmap_sem);
9150
9151 mmput(mm);
9152 }
9153
9154restart:
9155 perf_event_stop(event, 1);
9156}
9157
/*
 * Address range filtering: limiting the data to certain
 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ascii strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of the
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region;
 * RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * if <size> is not specified or is zero, the range is treated as a single
 * address; not valid for ACTION=="filter".
 */
9177enum {
9178 IF_ACT_NONE = -1,
9179 IF_ACT_FILTER,
9180 IF_ACT_START,
9181 IF_ACT_STOP,
9182 IF_SRC_FILE,
9183 IF_SRC_KERNEL,
9184 IF_SRC_FILEADDR,
9185 IF_SRC_KERNELADDR,
9186};
9187
9188enum {
9189 IF_STATE_ACTION = 0,
9190 IF_STATE_SOURCE,
9191 IF_STATE_END,
9192};
9193
9194static const match_table_t if_tokens = {
9195 { IF_ACT_FILTER, "filter" },
9196 { IF_ACT_START, "start" },
9197 { IF_ACT_STOP, "stop" },
9198 { IF_SRC_FILE, "%u/%u@%s" },
9199 { IF_SRC_KERNEL, "%u/%u" },
9200 { IF_SRC_FILEADDR, "%u@%s" },
9201 { IF_SRC_KERNELADDR, "%u" },
9202 { IF_ACT_NONE, NULL },
9203};
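/*
 * Example filter strings accepted by the parser below, as handed in
 * through the PERF_EVENT_IOC_SET_FILTER ioctl (illustrative):
 *
 *	"filter 0x1000/0x2000"			 - kernel address range
 *	"filter 0x400000/0x1000@/usr/bin/app"	 - range within an object file
 *	"start 0x8000,stop 0x9000"		 - two filters in one string
 */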
9204
/*
 * Address filter string parser
 */
9208static int
9209perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
9210 struct list_head *filters)
9211{
9212 struct perf_addr_filter *filter = NULL;
9213 char *start, *orig, *filename = NULL;
9214 substring_t args[MAX_OPT_ARGS];
9215 int state = IF_STATE_ACTION, token;
9216 unsigned int kernel = 0;
9217 int ret = -EINVAL;
9218
9219 orig = fstr = kstrdup(fstr, GFP_KERNEL);
9220 if (!fstr)
9221 return -ENOMEM;
9222
9223 while ((start = strsep(&fstr, " ,\n")) != NULL) {
9224 static const enum perf_addr_filter_action_t actions[] = {
9225 [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
9226 [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
9227 [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
9228 };
9229 ret = -EINVAL;
9230
9231 if (!*start)
9232 continue;
9233
		/* filter definition begins */
9235 if (state == IF_STATE_ACTION) {
9236 filter = perf_addr_filter_new(event, filters);
9237 if (!filter)
9238 goto fail;
9239 }
9240
9241 token = match_token(start, if_tokens, args);
9242 switch (token) {
9243 case IF_ACT_FILTER:
9244 case IF_ACT_START:
9245 case IF_ACT_STOP:
9246 if (state != IF_STATE_ACTION)
9247 goto fail;
9248
9249 filter->action = actions[token];
9250 state = IF_STATE_SOURCE;
9251 break;
9252
9253 case IF_SRC_KERNELADDR:
9254 case IF_SRC_KERNEL:
9255 kernel = 1;
			/* fall through */

9258 case IF_SRC_FILEADDR:
9259 case IF_SRC_FILE:
9260 if (state != IF_STATE_SOURCE)
9261 goto fail;
9262
9263 *args[0].to = 0;
9264 ret = kstrtoul(args[0].from, 0, &filter->offset);
9265 if (ret)
9266 goto fail;
9267
9268 if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
9269 *args[1].to = 0;
9270 ret = kstrtoul(args[1].from, 0, &filter->size);
9271 if (ret)
9272 goto fail;
9273 }
9274
9275 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
9276 int fpos = token == IF_SRC_FILE ? 2 : 1;
9277
9278 filename = match_strdup(&args[fpos]);
9279 if (!filename) {
9280 ret = -ENOMEM;
9281 goto fail;
9282 }
9283 }
9284
9285 state = IF_STATE_END;
9286 break;
9287
9288 default:
9289 goto fail;
9290 }
9291
		/*
		 * Filter definition is fully parsed, validate and install it.
		 * Make sure that it doesn't contradict itself or the event's
		 * attribute.
		 */
9297 if (state == IF_STATE_END) {
9298 ret = -EINVAL;
9299 if (kernel && event->attr.exclude_kernel)
9300 goto fail;
9301
			/*
			 * ACTION "filter" must have a non-zero address and
			 * size.
			 */
9306 if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
9307 !filter->size)
9308 goto fail;
9309
9310 if (!kernel) {
9311 if (!filename)
9312 goto fail;
9313
				/*
				 * For now, we only support file-based filters
				 * in per-task events; doing so for CPU-wide
				 * events requires additional context switching
				 * trickery, since same object code will be
				 * mapped at different virtual addresses in
				 * different processes.
				 */
9322 ret = -EOPNOTSUPP;
9323 if (!event->ctx->task)
9324 goto fail_free_name;
9325
				/* look up the path and grab its inode */
9327 ret = kern_path(filename, LOOKUP_FOLLOW,
9328 &filter->path);
9329 if (ret)
9330 goto fail_free_name;
9331
9332 kfree(filename);
9333 filename = NULL;
9334
9335 ret = -EINVAL;
9336 if (!filter->path.dentry ||
9337 !S_ISREG(d_inode(filter->path.dentry)
9338 ->i_mode))
9339 goto fail;
9340
9341 event->addr_filters.nr_file_filters++;
9342 }
9343
			/* ready to consume more filters */
9345 state = IF_STATE_ACTION;
9346 filter = NULL;
9347 }
9348 }
9349
9350 if (state != IF_STATE_ACTION)
9351 goto fail;
9352
9353 kfree(orig);
9354
9355 return 0;
9356
9357fail_free_name:
9358 kfree(filename);
9359fail:
9360 free_filters_list(filters);
9361 kfree(orig);
9362
9363 return ret;
9364}
9365
9366static int
9367perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
9368{
9369 LIST_HEAD(filters);
9370 int ret;
9371
	/*
	 * Since this is called in perf_ioctl() path, we're already holding
	 * ctx::mutex.
	 */
9376 lockdep_assert_held(&event->ctx->mutex);
9377
9378 if (WARN_ON_ONCE(event->parent))
9379 return -EINVAL;
9380
9381 ret = perf_event_parse_addr_filter(event, filter_str, &filters);
9382 if (ret)
9383 goto fail_clear_files;
9384
9385 ret = event->pmu->addr_filters_validate(&filters);
9386 if (ret)
9387 goto fail_free_filters;
9388
	/* remove existing filters, if any */
9390 perf_addr_filters_splice(event, &filters);
9391
	/* install new filters */
9393 perf_event_for_each_child(event, perf_event_addr_filters_apply);
9394
9395 return ret;
9396
9397fail_free_filters:
9398 free_filters_list(&filters);
9399
9400fail_clear_files:
9401 event->addr_filters.nr_file_filters = 0;
9402
9403 return ret;
9404}
9405
9406static int perf_event_set_filter(struct perf_event *event, void __user *arg)
9407{
9408 int ret = -EINVAL;
9409 char *filter_str;
9410
9411 filter_str = strndup_user(arg, PAGE_SIZE);
9412 if (IS_ERR(filter_str))
9413 return PTR_ERR(filter_str);
9414
9415#ifdef CONFIG_EVENT_TRACING
9416 if (perf_event_is_tracing(event)) {
9417 struct perf_event_context *ctx = event->ctx;
9418
		/*
		 * Beware, here be dragons!!!
		 *
		 * The tracepoint muck will deadlock against ctx->mutex, but
		 * the tracepoint stuff does not actually need it. So
		 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
		 * already have a reference on ctx.
		 *
		 * This can result in event getting moved to a different ctx,
		 * but that does not affect the tracepoint state.
		 */
9430 mutex_unlock(&ctx->mutex);
9431 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
9432 mutex_lock(&ctx->mutex);
9433 } else
9434#endif
9435 if (has_addr_filter(event))
9436 ret = perf_event_set_addr_filter(event, filter_str);
9437
9438 kfree(filter_str);
9439 return ret;
9440}
9441
/*
 * hrtimer based swevent callback
 */

9446static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
9447{
9448 enum hrtimer_restart ret = HRTIMER_RESTART;
9449 struct perf_sample_data data;
9450 struct pt_regs *regs;
9451 struct perf_event *event;
9452 u64 period;
9453
9454 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
9455
9456 if (event->state != PERF_EVENT_STATE_ACTIVE)
9457 return HRTIMER_NORESTART;
9458
9459 event->pmu->read(event);
9460
9461 perf_sample_data_init(&data, 0, event->hw.last_period);
9462 regs = get_irq_regs();
9463
9464 if (regs && !perf_exclude_event(event, regs)) {
9465 if (!(event->attr.exclude_idle && is_idle_task(current)))
9466 if (__perf_event_overflow(event, 1, &data, regs))
9467 ret = HRTIMER_NORESTART;
9468 }
9469
9470 period = max_t(u64, 10000, event->hw.sample_period);
9471 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
9472
9473 return ret;
9474}
9475
9476static void perf_swevent_start_hrtimer(struct perf_event *event)
9477{
9478 struct hw_perf_event *hwc = &event->hw;
9479 s64 period;
9480
9481 if (!is_sampling_event(event))
9482 return;
9483
9484 period = local64_read(&hwc->period_left);
9485 if (period) {
9486 if (period < 0)
9487 period = 10000;
9488
9489 local64_set(&hwc->period_left, 0);
9490 } else {
9491 period = max_t(u64, 10000, hwc->sample_period);
9492 }
9493 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
9494 HRTIMER_MODE_REL_PINNED);
9495}
9496
9497static void perf_swevent_cancel_hrtimer(struct perf_event *event)
9498{
9499 struct hw_perf_event *hwc = &event->hw;
9500
9501 if (is_sampling_event(event)) {
9502 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
9503 local64_set(&hwc->period_left, ktime_to_ns(remaining));
9504
9505 hrtimer_cancel(&hwc->hrtimer);
9506 }
9507}
9508
9509static void perf_swevent_init_hrtimer(struct perf_event *event)
9510{
9511 struct hw_perf_event *hwc = &event->hw;
9512
9513 if (!is_sampling_event(event))
9514 return;
9515
9516 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
9517 hwc->hrtimer.function = perf_swevent_hrtimer;
9518
	/*
	 * Since hrtimers have a fixed rate, we can do a static freq->period
	 * mapping and avoid the whole period adjust feedback stuff.
	 */
9523 if (event->attr.freq) {
9524 long freq = event->attr.sample_freq;
9525
9526 event->attr.sample_period = NSEC_PER_SEC / freq;
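		/*
		 * e.g. sample_freq = 4000 gives sample_period =
		 * 1000000000 / 4000 = 250000ns between hrtimer expirations.
		 */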
9527 hwc->sample_period = event->attr.sample_period;
9528 local64_set(&hwc->period_left, hwc->sample_period);
9529 hwc->last_period = hwc->sample_period;
9530 event->attr.freq = 0;
9531 }
9532}
9533
/*
 * Software event: cpu wall time clock
 */

9538static void cpu_clock_event_update(struct perf_event *event)
9539{
9540 s64 prev;
9541 u64 now;
9542
9543 now = local_clock();
9544 prev = local64_xchg(&event->hw.prev_count, now);
9545 local64_add(now - prev, &event->count);
9546}
9547
9548static void cpu_clock_event_start(struct perf_event *event, int flags)
9549{
9550 local64_set(&event->hw.prev_count, local_clock());
9551 perf_swevent_start_hrtimer(event);
9552}
9553
9554static void cpu_clock_event_stop(struct perf_event *event, int flags)
9555{
9556 perf_swevent_cancel_hrtimer(event);
9557 cpu_clock_event_update(event);
9558}
9559
9560static int cpu_clock_event_add(struct perf_event *event, int flags)
9561{
9562 if (flags & PERF_EF_START)
9563 cpu_clock_event_start(event, flags);
9564 perf_event_update_userpage(event);
9565
9566 return 0;
9567}
9568
9569static void cpu_clock_event_del(struct perf_event *event, int flags)
9570{
9571 cpu_clock_event_stop(event, flags);
9572}
9573
9574static void cpu_clock_event_read(struct perf_event *event)
9575{
9576 cpu_clock_event_update(event);
9577}
9578
9579static int cpu_clock_event_init(struct perf_event *event)
9580{
9581 if (event->attr.type != PERF_TYPE_SOFTWARE)
9582 return -ENOENT;
9583
9584 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
9585 return -ENOENT;
9586
	/*
	 * no branch sampling for software events
	 */
9590 if (has_branch_stack(event))
9591 return -EOPNOTSUPP;
9592
9593 perf_swevent_init_hrtimer(event);
9594
9595 return 0;
9596}
9597
9598static struct pmu perf_cpu_clock = {
9599 .task_ctx_nr = perf_sw_context,
9600
9601 .capabilities = PERF_PMU_CAP_NO_NMI,
9602
9603 .event_init = cpu_clock_event_init,
9604 .add = cpu_clock_event_add,
9605 .del = cpu_clock_event_del,
9606 .start = cpu_clock_event_start,
9607 .stop = cpu_clock_event_stop,
9608 .read = cpu_clock_event_read,
9609};
9610
/*
 * Software event: task time clock
 */

9615static void task_clock_event_update(struct perf_event *event, u64 now)
9616{
9617 u64 prev;
9618 s64 delta;
9619
9620 prev = local64_xchg(&event->hw.prev_count, now);
9621 delta = now - prev;
9622 local64_add(delta, &event->count);
9623}
9624
9625static void task_clock_event_start(struct perf_event *event, int flags)
9626{
9627 local64_set(&event->hw.prev_count, event->ctx->time);
9628 perf_swevent_start_hrtimer(event);
9629}
9630
9631static void task_clock_event_stop(struct perf_event *event, int flags)
9632{
9633 perf_swevent_cancel_hrtimer(event);
9634 task_clock_event_update(event, event->ctx->time);
9635}
9636
9637static int task_clock_event_add(struct perf_event *event, int flags)
9638{
9639 if (flags & PERF_EF_START)
9640 task_clock_event_start(event, flags);
9641 perf_event_update_userpage(event);
9642
9643 return 0;
9644}
9645
9646static void task_clock_event_del(struct perf_event *event, int flags)
9647{
9648 task_clock_event_stop(event, PERF_EF_UPDATE);
9649}
9650
9651static void task_clock_event_read(struct perf_event *event)
9652{
9653 u64 now = perf_clock();
9654 u64 delta = now - event->ctx->timestamp;
9655 u64 time = event->ctx->time + delta;
9656
9657 task_clock_event_update(event, time);
9658}
9659
9660static int task_clock_event_init(struct perf_event *event)
9661{
9662 if (event->attr.type != PERF_TYPE_SOFTWARE)
9663 return -ENOENT;
9664
9665 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
9666 return -ENOENT;
9667
	/*
	 * no branch sampling for software events
	 */
9671 if (has_branch_stack(event))
9672 return -EOPNOTSUPP;
9673
9674 perf_swevent_init_hrtimer(event);
9675
9676 return 0;
9677}
9678
9679static struct pmu perf_task_clock = {
9680 .task_ctx_nr = perf_sw_context,
9681
9682 .capabilities = PERF_PMU_CAP_NO_NMI,
9683
9684 .event_init = task_clock_event_init,
9685 .add = task_clock_event_add,
9686 .del = task_clock_event_del,
9687 .start = task_clock_event_start,
9688 .stop = task_clock_event_stop,
9689 .read = task_clock_event_read,
9690};
9691
9692static void perf_pmu_nop_void(struct pmu *pmu)
9693{
9694}
9695
9696static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
9697{
9698}
9699
9700static int perf_pmu_nop_int(struct pmu *pmu)
9701{
9702 return 0;
9703}
9704
9705static int perf_event_nop_int(struct perf_event *event, u64 value)
9706{
9707 return 0;
9708}
9709
9710static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
9711
9712static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
9713{
9714 __this_cpu_write(nop_txn_flags, flags);
9715
9716 if (flags & ~PERF_PMU_TXN_ADD)
9717 return;
9718
9719 perf_pmu_disable(pmu);
9720}
9721
9722static int perf_pmu_commit_txn(struct pmu *pmu)
9723{
9724 unsigned int flags = __this_cpu_read(nop_txn_flags);
9725
9726 __this_cpu_write(nop_txn_flags, 0);
9727
9728 if (flags & ~PERF_PMU_TXN_ADD)
9729 return 0;
9730
9731 perf_pmu_enable(pmu);
9732 return 0;
9733}
9734
9735static void perf_pmu_cancel_txn(struct pmu *pmu)
9736{
9737 unsigned int flags = __this_cpu_read(nop_txn_flags);
9738
9739 __this_cpu_write(nop_txn_flags, 0);
9740
9741 if (flags & ~PERF_PMU_TXN_ADD)
9742 return;
9743
9744 perf_pmu_enable(pmu);
9745}
9746
9747static int perf_event_idx_default(struct perf_event *event)
9748{
9749 return 0;
9750}
9751
/*
 * Ensures all contexts with the same task_ctx_nr have the same
 * pmu_cpu_context too.
 */
9756static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
9757{
9758 struct pmu *pmu;
9759
9760 if (ctxn < 0)
9761 return NULL;
9762
9763 list_for_each_entry(pmu, &pmus, entry) {
9764 if (pmu->task_ctx_nr == ctxn)
9765 return pmu->pmu_cpu_context;
9766 }
9767
9768 return NULL;
9769}
9770
9771static void free_pmu_context(struct pmu *pmu)
9772{
	/*
	 * Static contexts such as perf_sw_context have a global lifetime
	 * and are shared between all PMUs with the same task_ctx_nr; only
	 * free a cpu context that this PMU allocated for itself.
	 */
9778 if (pmu->task_ctx_nr > perf_invalid_context)
9779 return;
9780
9781 free_percpu(pmu->pmu_cpu_context);
9782}
9783
/*
 * Let userspace know that this PMU supports address range filtering:
 */
9787static ssize_t nr_addr_filters_show(struct device *dev,
9788 struct device_attribute *attr,
9789 char *page)
9790{
9791 struct pmu *pmu = dev_get_drvdata(dev);
9792
9793 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
9794}
9795DEVICE_ATTR_RO(nr_addr_filters);
9796
9797static struct idr pmu_idr;
9798
9799static ssize_t
9800type_show(struct device *dev, struct device_attribute *attr, char *page)
9801{
9802 struct pmu *pmu = dev_get_drvdata(dev);
9803
9804 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
9805}
9806static DEVICE_ATTR_RO(type);
9807
9808static ssize_t
9809perf_event_mux_interval_ms_show(struct device *dev,
9810 struct device_attribute *attr,
9811 char *page)
9812{
9813 struct pmu *pmu = dev_get_drvdata(dev);
9814
9815 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
9816}
9817
9818static DEFINE_MUTEX(mux_interval_mutex);
9819
9820static ssize_t
9821perf_event_mux_interval_ms_store(struct device *dev,
9822 struct device_attribute *attr,
9823 const char *buf, size_t count)
9824{
9825 struct pmu *pmu = dev_get_drvdata(dev);
9826 int timer, cpu, ret;
9827
9828 ret = kstrtoint(buf, 0, &timer);
9829 if (ret)
9830 return ret;
9831
9832 if (timer < 1)
9833 return -EINVAL;
9834
	/* same value, nothing to do */
9836 if (timer == pmu->hrtimer_interval_ms)
9837 return count;
9838
9839 mutex_lock(&mux_interval_mutex);
9840 pmu->hrtimer_interval_ms = timer;
9841
	/* update all cpuctx for this PMU */
9843 cpus_read_lock();
9844 for_each_online_cpu(cpu) {
9845 struct perf_cpu_context *cpuctx;
9846 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9847 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
9848
9849 cpu_function_call(cpu,
9850 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
9851 }
9852 cpus_read_unlock();
9853 mutex_unlock(&mux_interval_mutex);
9854
9855 return count;
9856}
9857static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
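/*
 * The attribute above is exposed through the event_source bus in sysfs,
 * so the multiplexing interval is tunable per PMU at runtime, e.g.
 * (illustrative path, PMU names vary):
 *
 *	echo 2 > /sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms
 */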
9858
9859static struct attribute *pmu_dev_attrs[] = {
9860 &dev_attr_type.attr,
9861 &dev_attr_perf_event_mux_interval_ms.attr,
9862 NULL,
9863};
9864ATTRIBUTE_GROUPS(pmu_dev);
9865
9866static int pmu_bus_running;
9867static struct bus_type pmu_bus = {
9868 .name = "event_source",
9869 .dev_groups = pmu_dev_groups,
9870};
9871
9872static void pmu_dev_release(struct device *dev)
9873{
9874 kfree(dev);
9875}
9876
9877static int pmu_dev_alloc(struct pmu *pmu)
9878{
9879 int ret = -ENOMEM;
9880
9881 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
9882 if (!pmu->dev)
9883 goto out;
9884
9885 pmu->dev->groups = pmu->attr_groups;
9886 device_initialize(pmu->dev);
9887 ret = dev_set_name(pmu->dev, "%s", pmu->name);
9888 if (ret)
9889 goto free_dev;
9890
9891 dev_set_drvdata(pmu->dev, pmu);
9892 pmu->dev->bus = &pmu_bus;
9893 pmu->dev->release = pmu_dev_release;
9894 ret = device_add(pmu->dev);
9895 if (ret)
9896 goto free_dev;
9897
	/* For PMUs with address filters, throw in an extra attribute: */
9899 if (pmu->nr_addr_filters)
9900 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
9901
9902 if (ret)
9903 goto del_dev;
9904
9905 if (pmu->attr_update)
9906 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
9907
9908 if (ret)
9909 goto del_dev;
9910
9911out:
9912 return ret;
9913
9914del_dev:
9915 device_del(pmu->dev);
9916
9917free_dev:
9918 put_device(pmu->dev);
9919 goto out;
9920}
9921
9922static struct lock_class_key cpuctx_mutex;
9923static struct lock_class_key cpuctx_lock;
9924
9925int perf_pmu_register(struct pmu *pmu, const char *name, int type)
9926{
9927 int cpu, ret;
9928
9929 mutex_lock(&pmus_lock);
9930 ret = -ENOMEM;
9931 pmu->pmu_disable_count = alloc_percpu(int);
9932 if (!pmu->pmu_disable_count)
9933 goto unlock;
9934
9935 pmu->type = -1;
9936 if (!name)
9937 goto skip_type;
9938 pmu->name = name;
9939
9940 if (type < 0) {
9941 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9942 if (type < 0) {
9943 ret = type;
9944 goto free_pdc;
9945 }
9946 }
9947 pmu->type = type;
9948
9949 if (pmu_bus_running) {
9950 ret = pmu_dev_alloc(pmu);
9951 if (ret)
9952 goto free_idr;
9953 }
9954
9955skip_type:
9956 if (pmu->task_ctx_nr == perf_hw_context) {
9957 static int hw_context_taken = 0;
9958
		/*
		 * Other than systems with heterogeneous CPUs, it never makes
		 * sense for two PMUs to share perf_hw_context. PMUs which are
		 * uncore must use perf_invalid_context.
		 */
9964 if (WARN_ON_ONCE(hw_context_taken &&
9965 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
9966 pmu->task_ctx_nr = perf_invalid_context;
9967
9968 hw_context_taken = 1;
9969 }
9970
9971 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
9972 if (pmu->pmu_cpu_context)
9973 goto got_cpu_context;
9974
9975 ret = -ENOMEM;
9976 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
9977 if (!pmu->pmu_cpu_context)
9978 goto free_dev;
9979
9980 for_each_possible_cpu(cpu) {
9981 struct perf_cpu_context *cpuctx;
9982
9983 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9984 __perf_event_init_context(&cpuctx->ctx);
9985 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
9986 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
9987 cpuctx->ctx.pmu = pmu;
9988 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
9989
9990 __perf_mux_hrtimer_init(cpuctx, cpu);
9991 }
9992
9993got_cpu_context:
9994 if (!pmu->start_txn) {
9995 if (pmu->pmu_enable) {
			/*
			 * If we have pmu_enable/pmu_disable calls, install
			 * transaction stubs that use that to try and batch
			 * hardware accesses.
			 */
10001 pmu->start_txn = perf_pmu_start_txn;
10002 pmu->commit_txn = perf_pmu_commit_txn;
10003 pmu->cancel_txn = perf_pmu_cancel_txn;
10004 } else {
10005 pmu->start_txn = perf_pmu_nop_txn;
10006 pmu->commit_txn = perf_pmu_nop_int;
10007 pmu->cancel_txn = perf_pmu_nop_void;
10008 }
10009 }
10010
10011 if (!pmu->pmu_enable) {
10012 pmu->pmu_enable = perf_pmu_nop_void;
10013 pmu->pmu_disable = perf_pmu_nop_void;
10014 }
10015
10016 if (!pmu->check_period)
10017 pmu->check_period = perf_event_nop_int;
10018
10019 if (!pmu->event_idx)
10020 pmu->event_idx = perf_event_idx_default;
10021
10022 list_add_rcu(&pmu->entry, &pmus);
10023 atomic_set(&pmu->exclusive_cnt, 0);
10024 ret = 0;
10025unlock:
10026 mutex_unlock(&pmus_lock);
10027
10028 return ret;
10029
10030free_dev:
10031 device_del(pmu->dev);
10032 put_device(pmu->dev);
10033
10034free_idr:
10035 if (pmu->type >= PERF_TYPE_MAX)
10036 idr_remove(&pmu_idr, pmu->type);
10037
10038free_pdc:
10039 free_percpu(pmu->pmu_disable_count);
10040 goto unlock;
10041}
10042EXPORT_SYMBOL_GPL(perf_pmu_register);
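/*
 * Minimal registration sketch for a module-provided PMU (illustrative;
 * "my_pmu" and its callbacks are hypothetical and need real
 * implementations):
 *
 *	static struct pmu my_pmu = {
 *		.task_ctx_nr	= perf_invalid_context,	// uncore-style PMU
 *		.event_init	= my_event_init,
 *		.add		= my_add,
 *		.del		= my_del,
 *		.start		= my_start,
 *		.stop		= my_stop,
 *		.read		= my_read,
 *	};
 *
 *	ret = perf_pmu_register(&my_pmu, "my_pmu", -1); // -1: allocate a type
 *
 * paired with perf_pmu_unregister(&my_pmu) on the error/exit path.
 */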
10043
10044void perf_pmu_unregister(struct pmu *pmu)
10045{
10046 mutex_lock(&pmus_lock);
10047 list_del_rcu(&pmu->entry);
10048
	/*
	 * We dereference the pmu list under both SRCU and regular RCU, so
	 * synchronize against both of those.
	 */
10053 synchronize_srcu(&pmus_srcu);
10054 synchronize_rcu();
10055
10056 free_percpu(pmu->pmu_disable_count);
10057 if (pmu->type >= PERF_TYPE_MAX)
10058 idr_remove(&pmu_idr, pmu->type);
10059 if (pmu_bus_running) {
10060 if (pmu->nr_addr_filters)
10061 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
10062 device_del(pmu->dev);
10063 put_device(pmu->dev);
10064 }
10065 free_pmu_context(pmu);
10066 mutex_unlock(&pmus_lock);
10067}
10068EXPORT_SYMBOL_GPL(perf_pmu_unregister);
10069
10070static inline bool has_extended_regs(struct perf_event *event)
10071{
10072 return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
10073 (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
10074}
10075
10076static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
10077{
10078 struct perf_event_context *ctx = NULL;
10079 int ret;
10080
10081 if (!try_module_get(pmu->module))
10082 return -ENODEV;
10083
	/*
	 * A number of pmu->event_init() implementations iterate the
	 * group_leader's sibling_list to, for example, validate that the
	 * whole group fits on the PMU. Therefore, if this is a sibling
	 * event, acquire the ctx->mutex to protect the sibling_list.
	 */
10090 if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
		/*
		 * This ctx->mutex can nest when we're called through
		 * inheritance. See the perf_event_ctx_lock_nested() comment.
		 */
10095 ctx = perf_event_ctx_lock_nested(event->group_leader,
10096 SINGLE_DEPTH_NESTING);
10097 BUG_ON(!ctx);
10098 }
10099
10100 event->pmu = pmu;
10101 ret = pmu->event_init(event);
10102
10103 if (ctx)
10104 perf_event_ctx_unlock(event->group_leader, ctx);
10105
10106 if (!ret) {
10107 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
10108 has_extended_regs(event))
10109 ret = -EOPNOTSUPP;
10110
10111 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
10112 event_has_any_exclude_flag(event))
10113 ret = -EINVAL;
10114
10115 if (ret && event->destroy)
10116 event->destroy(event);
10117 }
10118
10119 if (ret)
10120 module_put(pmu->module);
10121
10122 return ret;
10123}
10124
10125static struct pmu *perf_init_event(struct perf_event *event)
10126{
10127 struct pmu *pmu;
10128 int idx;
10129 int ret;
10130
10131 idx = srcu_read_lock(&pmus_srcu);
10132
	/* Try parent's PMU first: */
10134 if (event->parent && event->parent->pmu) {
10135 pmu = event->parent->pmu;
10136 ret = perf_try_init_event(pmu, event);
10137 if (!ret)
10138 goto unlock;
10139 }
10140
10141 rcu_read_lock();
10142 pmu = idr_find(&pmu_idr, event->attr.type);
10143 rcu_read_unlock();
10144 if (pmu) {
10145 ret = perf_try_init_event(pmu, event);
10146 if (ret)
10147 pmu = ERR_PTR(ret);
10148 goto unlock;
10149 }
10150
10151 list_for_each_entry_rcu(pmu, &pmus, entry) {
10152 ret = perf_try_init_event(pmu, event);
10153 if (!ret)
10154 goto unlock;
10155
10156 if (ret != -ENOENT) {
10157 pmu = ERR_PTR(ret);
10158 goto unlock;
10159 }
10160 }
10161 pmu = ERR_PTR(-ENOENT);
10162unlock:
10163 srcu_read_unlock(&pmus_srcu, idx);
10164
10165 return pmu;
10166}
10167
10168static void attach_sb_event(struct perf_event *event)
10169{
10170 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
10171
10172 raw_spin_lock(&pel->lock);
10173 list_add_rcu(&event->sb_list, &pel->list);
10174 raw_spin_unlock(&pel->lock);
10175}
10176
/*
 * Keep a list of all !task (and therefore per-cpu) events that need
 * side-band records (mmap, comm, task, context-switch, ...), so that
 * perf_iterate_sb() can deliver those records to CPU-wide events on
 * other contexts without having to walk every context on every CPU.
 * Events land here via attach_sb_event() when is_sb_event() says they
 * qualify.
 */
10184static void account_pmu_sb_event(struct perf_event *event)
10185{
10186 if (is_sb_event(event))
10187 attach_sb_event(event);
10188}
10189
10190static void account_event_cpu(struct perf_event *event, int cpu)
10191{
10192 if (event->parent)
10193 return;
10194
10195 if (is_cgroup_event(event))
10196 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
10197}
10198
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
10200static void account_freq_event_nohz(void)
10201{
10202#ifdef CONFIG_NO_HZ_FULL
	/* Lock so we don't race with concurrent enabling */
10204 spin_lock(&nr_freq_lock);
10205 if (atomic_inc_return(&nr_freq_events) == 1)
10206 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
10207 spin_unlock(&nr_freq_lock);
10208#endif
10209}
10210
10211static void account_freq_event(void)
10212{
10213 if (tick_nohz_full_enabled())
10214 account_freq_event_nohz();
10215 else
10216 atomic_inc(&nr_freq_events);
10217}
10218
10219
10220static void account_event(struct perf_event *event)
10221{
10222 bool inc = false;
10223
10224 if (event->parent)
10225 return;
10226
10227 if (event->attach_state & PERF_ATTACH_TASK)
10228 inc = true;
10229 if (event->attr.mmap || event->attr.mmap_data)
10230 atomic_inc(&nr_mmap_events);
10231 if (event->attr.comm)
10232 atomic_inc(&nr_comm_events);
10233 if (event->attr.namespaces)
10234 atomic_inc(&nr_namespaces_events);
10235 if (event->attr.task)
10236 atomic_inc(&nr_task_events);
10237 if (event->attr.freq)
10238 account_freq_event();
10239 if (event->attr.context_switch) {
10240 atomic_inc(&nr_switch_events);
10241 inc = true;
10242 }
10243 if (has_branch_stack(event))
10244 inc = true;
10245 if (is_cgroup_event(event))
10246 inc = true;
10247 if (event->attr.ksymbol)
10248 atomic_inc(&nr_ksymbol_events);
10249 if (event->attr.bpf_event)
10250 atomic_inc(&nr_bpf_events);
10251
10252 if (inc) {
		/*
		 * We need the mutex here because static_branch_enable()
		 * must complete *before* the perf_sched_count increment
		 * becomes visible.
		 */
10258 if (atomic_inc_not_zero(&perf_sched_count))
10259 goto enabled;
10260
10261 mutex_lock(&perf_sched_mutex);
10262 if (!atomic_read(&perf_sched_count)) {
10263 static_branch_enable(&perf_sched_events);
			/*
			 * Guarantee that all CPUs observe the key change and
			 * call the perf scheduling hooks before proceeding to
			 * install events that need them.
			 */
10269 synchronize_rcu();
10270 }
10271
		/*
		 * Now that we've waited for the static branch to flip,
		 * allow further increments to by-pass the mutex.
		 */
10275 atomic_inc(&perf_sched_count);
10276 mutex_unlock(&perf_sched_mutex);
10277 }
10278enabled:
10279
10280 account_event_cpu(event, event->cpu);
10281
10282 account_pmu_sb_event(event);
10283}
10284
/*
 * Allocate and initialize an event structure
 */
10288static struct perf_event *
10289perf_event_alloc(struct perf_event_attr *attr, int cpu,
10290 struct task_struct *task,
10291 struct perf_event *group_leader,
10292 struct perf_event *parent_event,
10293 perf_overflow_handler_t overflow_handler,
10294 void *context, int cgroup_fd)
10295{
10296 struct pmu *pmu;
10297 struct perf_event *event;
10298 struct hw_perf_event *hwc;
10299 long err = -EINVAL;
10300
10301 if ((unsigned)cpu >= nr_cpu_ids) {
10302 if (!task || cpu != -1)
10303 return ERR_PTR(-EINVAL);
10304 }
10305
10306 event = kzalloc(sizeof(*event), GFP_KERNEL);
10307 if (!event)
10308 return ERR_PTR(-ENOMEM);
10309
	/*
	 * Single events are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = event;
10316
10317 mutex_init(&event->child_mutex);
10318 INIT_LIST_HEAD(&event->child_list);
10319
10320 INIT_LIST_HEAD(&event->event_entry);
10321 INIT_LIST_HEAD(&event->sibling_list);
10322 INIT_LIST_HEAD(&event->active_list);
10323 init_event_group(event);
10324 INIT_LIST_HEAD(&event->rb_entry);
10325 INIT_LIST_HEAD(&event->active_entry);
10326 INIT_LIST_HEAD(&event->addr_filters.list);
10327 INIT_HLIST_NODE(&event->hlist_entry);
10328
10329
10330 init_waitqueue_head(&event->waitq);
10331 event->pending_disable = -1;
10332 init_irq_work(&event->pending, perf_pending_event);
10333
10334 mutex_init(&event->mmap_mutex);
10335 raw_spin_lock_init(&event->addr_filters.lock);
10336
10337 atomic_long_set(&event->refcount, 1);
10338 event->cpu = cpu;
10339 event->attr = *attr;
10340 event->group_leader = group_leader;
10341 event->pmu = NULL;
10342 event->oncpu = -1;
10343
10344 event->parent = parent_event;
10345
10346 event->ns = get_pid_ns(task_active_pid_ns(current));
10347 event->id = atomic64_inc_return(&perf_event_id);
10348
10349 event->state = PERF_EVENT_STATE_INACTIVE;
10350
	if (task) {
		event->attach_state = PERF_ATTACH_TASK;
		/*
		 * XXX pmu::event_init needs to know what task to account to
		 * and we cannot use the ctx information because we need the
		 * pmu before we get a ctx.
		 */
		get_task_struct(task);
		event->hw.target = task;
	}
10361
10362 event->clock = &local_clock;
10363 if (parent_event)
10364 event->clock = parent_event->clock;
10365
10366 if (!overflow_handler && parent_event) {
10367 overflow_handler = parent_event->overflow_handler;
10368 context = parent_event->overflow_handler_context;
10369#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
10370 if (overflow_handler == bpf_overflow_handler) {
10371 struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
10372
10373 if (IS_ERR(prog)) {
10374 err = PTR_ERR(prog);
10375 goto err_ns;
10376 }
10377 event->prog = prog;
10378 event->orig_overflow_handler =
10379 parent_event->orig_overflow_handler;
10380 }
10381#endif
10382 }
10383
	if (overflow_handler) {
		event->overflow_handler = overflow_handler;
		event->overflow_handler_context = context;
	} else if (is_write_backward(event)) {
		event->overflow_handler = perf_event_output_backward;
		event->overflow_handler_context = NULL;
	} else {
		event->overflow_handler = perf_event_output_forward;
		event->overflow_handler_context = NULL;
	}
10394
10395 perf_event__state_init(event);
10396
10397 pmu = NULL;
10398
10399 hwc = &event->hw;
10400 hwc->sample_period = attr->sample_period;
10401 if (attr->freq && attr->sample_freq)
10402 hwc->sample_period = 1;
10403 hwc->last_period = hwc->sample_period;
10404
10405 local64_set(&hwc->period_left, hwc->sample_period);

	/*
	 * We currently do not support PERF_SAMPLE_READ on inherited events.
	 * See perf_output_read().
	 */
	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
		goto err_ns;
10413
10414 if (!has_branch_stack(event))
10415 event->attr.branch_sample_type = 0;
10416
10417 if (cgroup_fd != -1) {
10418 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
10419 if (err)
10420 goto err_ns;
10421 }
10422
10423 pmu = perf_init_event(event);
10424 if (IS_ERR(pmu)) {
10425 err = PTR_ERR(pmu);
10426 goto err_ns;
10427 }
10428
10429 err = exclusive_event_init(event);
10430 if (err)
10431 goto err_pmu;
10432
10433 if (has_addr_filter(event)) {
10434 event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
10435 sizeof(struct perf_addr_filter_range),
10436 GFP_KERNEL);
10437 if (!event->addr_filter_ranges) {
10438 err = -ENOMEM;
10439 goto err_per_task;
10440 }

		/*
		 * Clone the parent's vma offsets: they are valid until exec()
		 * even if the mm is not shared with the parent.
		 */
		if (event->parent) {
			struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);

			raw_spin_lock_irq(&ifh->lock);
			memcpy(event->addr_filter_ranges,
			       event->parent->addr_filter_ranges,
			       pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
			raw_spin_unlock_irq(&ifh->lock);
		}

		/* force hw sync on the address filters */
		event->addr_filters_gen = 1;
10458 }
10459
10460 if (!event->parent) {
10461 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
10462 err = get_callchain_buffers(attr->sample_max_stack);
10463 if (err)
10464 goto err_addr_filters;
10465 }
10466 }

	/* symmetric to unaccount_event() in _free_event() */
	account_event(event);
10470
10471 return event;
10472
10473err_addr_filters:
10474 kfree(event->addr_filter_ranges);
10475
10476err_per_task:
10477 exclusive_event_destroy(event);
10478
10479err_pmu:
10480 if (event->destroy)
10481 event->destroy(event);
10482 module_put(pmu->module);
10483err_ns:
10484 if (is_cgroup_event(event))
10485 perf_detach_cgroup(event);
10486 if (event->ns)
10487 put_pid_ns(event->ns);
10488 if (event->hw.target)
10489 put_task_struct(event->hw.target);
10490 kfree(event);
10491
10492 return ERR_PTR(err);
10493}
10494
10495static int perf_copy_attr(struct perf_event_attr __user *uattr,
10496 struct perf_event_attr *attr)
10497{
10498 u32 size;
10499 int ret;
10500
10501 if (!access_ok(uattr, PERF_ATTR_SIZE_VER0))
10502 return -EFAULT;
10503
	/*
	 * Zero the full structure, so that a short copy will be nice.
	 */
	memset(attr, 0, sizeof(*attr));
10508
10509 ret = get_user(size, &uattr->size);
10510 if (ret)
10511 return ret;
10512
10513 if (size > PAGE_SIZE)
10514 goto err_size;
10515
10516 if (!size)
10517 size = PERF_ATTR_SIZE_VER0;
10518
10519 if (size < PERF_ATTR_SIZE_VER0)
10520 goto err_size;
10521
	/*
	 * If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(*attr)) {
10529 unsigned char __user *addr;
10530 unsigned char __user *end;
10531 unsigned char val;
10532
10533 addr = (void __user *)uattr + sizeof(*attr);
10534 end = (void __user *)uattr + size;
10535
10536 for (; addr < end; addr++) {
10537 ret = get_user(val, addr);
10538 if (ret)
10539 return ret;
10540 if (val)
10541 goto err_size;
10542 }
10543 size = sizeof(*attr);
10544 }
10545
10546 ret = copy_from_user(attr, uattr, size);
10547 if (ret)
10548 return -EFAULT;
10549
10550 attr->size = size;
10551
10552 if (attr->__reserved_1)
10553 return -EINVAL;
10554
10555 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
10556 return -EINVAL;
10557
10558 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
10559 return -EINVAL;
10560
	if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
		u64 mask = attr->branch_sample_type;

		/* only using defined bits */
		if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
			return -EINVAL;

		/* at least one branch bit must be set */
		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
			return -EINVAL;

		/* propagate priv level, when not set for branch */
		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {

			/* exclude_kernel checked on syscall entry */
			if (!attr->exclude_kernel)
				mask |= PERF_SAMPLE_BRANCH_KERNEL;

			if (!attr->exclude_user)
				mask |= PERF_SAMPLE_BRANCH_USER;

			if (!attr->exclude_hv)
				mask |= PERF_SAMPLE_BRANCH_HV;
			/*
			 * adjust user setting (for HW filter setup)
			 */
			attr->branch_sample_type = mask;
		}

		/* privileged levels capture (kernel, hv): check permissions */
		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}
10594
10595 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
10596 ret = perf_reg_validate(attr->sample_regs_user);
10597 if (ret)
10598 return ret;
10599 }
10600
	if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
		if (!arch_perf_have_user_stack_dump())
			return -ENOSYS;

		/*
		 * The attribute has a __u32 size field, but anything at or
		 * above USHRT_MAX is rejected, and the size must be a
		 * multiple of u64 so the dump stays word-aligned.
		 */
		if (attr->sample_stack_user >= USHRT_MAX)
			return -EINVAL;
		else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
			return -EINVAL;
	}
10615
10616 if (!attr->sample_max_stack)
10617 attr->sample_max_stack = sysctl_perf_event_max_stack;
10618
10619 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
10620 ret = perf_reg_validate(attr->sample_regs_intr);
10621out:
10622 return ret;
10623
10624err_size:
10625 put_user(sizeof(*attr), &uattr->size);
10626 ret = -E2BIG;
10627 goto out;
10628}
10629
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
	struct ring_buffer *rb = NULL;
	int ret = -EINVAL;

	if (!output_event)
		goto set;

	/* don't allow circular references */
	if (event == output_event)
		goto out;

	/*
	 * Don't allow cross-cpu buffers.
	 */
	if (output_event->cpu != event->cpu)
		goto out;

	/*
	 * If it's not a per-cpu rb, it must be the same task.
	 */
	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
		goto out;

	/*
	 * Mixing clocks in the same buffer is trouble you don't need.
	 */
	if (output_event->clock != event->clock)
		goto out;

	/*
	 * Either writing ring buffer from beginning or from end.
	 * Mixing is not allowed.
	 */
	if (is_write_backward(output_event) != is_write_backward(event))
		goto out;

	/*
	 * If both events generate aux data, they must be on the same PMU.
	 */
	if (has_aux(event) && has_aux(output_event) &&
	    event->pmu != output_event->pmu)
		goto out;

set:
	mutex_lock(&event->mmap_mutex);
	/* Can't redirect output if we've got an active mmap() */
	if (atomic_read(&event->mmap_count))
		goto unlock;

	if (output_event) {
		/* get the rb we want to redirect to */
		rb = ring_buffer_get(output_event);
		if (!rb)
			goto unlock;
	}

	ring_buffer_attach(event, rb);

	ret = 0;
unlock:
	mutex_unlock(&event->mmap_mutex);

out:
	return ret;
}
10697
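/*
 * Acquire two ctx::mutex locks in a stable (address) order so that two
 * concurrent callers locking the same pair can never deadlock.
 */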
static void mutex_lock_double(struct mutex *a, struct mutex *b)
{
	if (b < a)
		swap(a, b);

	mutex_lock(a);
	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}
10706
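/*
 * Map the user-requested clockid onto a kernel clock accessor. Clocks that
 * are not NMI-safe are only allowed on PMUs that never sample from NMI
 * context (PERF_PMU_CAP_NO_NMI).
 */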
10707static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
10708{
10709 bool nmi_safe = false;
10710
10711 switch (clk_id) {
10712 case CLOCK_MONOTONIC:
10713 event->clock = &ktime_get_mono_fast_ns;
10714 nmi_safe = true;
10715 break;
10716
10717 case CLOCK_MONOTONIC_RAW:
10718 event->clock = &ktime_get_raw_fast_ns;
10719 nmi_safe = true;
10720 break;
10721
10722 case CLOCK_REALTIME:
10723 event->clock = &ktime_get_real_ns;
10724 break;
10725
10726 case CLOCK_BOOTTIME:
10727 event->clock = &ktime_get_boottime_ns;
10728 break;
10729
10730 case CLOCK_TAI:
10731 event->clock = &ktime_get_clocktai_ns;
10732 break;
10733
10734 default:
10735 return -EINVAL;
10736 }
10737
10738 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
10739 return -EINVAL;
10740
10741 return 0;
10742}
10743
/*
 * Variation on perf_event_ctx_lock_nested(), except we take two context
 * mutexes.
 */
10748static struct perf_event_context *
10749__perf_event_ctx_lock_double(struct perf_event *group_leader,
10750 struct perf_event_context *ctx)
10751{
10752 struct perf_event_context *gctx;
10753
10754again:
10755 rcu_read_lock();
10756 gctx = READ_ONCE(group_leader->ctx);
10757 if (!refcount_inc_not_zero(&gctx->refcount)) {
10758 rcu_read_unlock();
10759 goto again;
10760 }
10761 rcu_read_unlock();
10762
10763 mutex_lock_double(&gctx->mutex, &ctx->mutex);
10764
10765 if (group_leader->ctx != gctx) {
10766 mutex_unlock(&ctx->mutex);
10767 mutex_unlock(&gctx->mutex);
10768 put_ctx(gctx);
10769 goto again;
10770 }
10771
10772 return gctx;
10773}
10774
/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader event fd
 * @flags:		perf event open flags
 */
10783SYSCALL_DEFINE5(perf_event_open,
10784 struct perf_event_attr __user *, attr_uptr,
10785 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
10786{
10787 struct perf_event *group_leader = NULL, *output_event = NULL;
10788 struct perf_event *event, *sibling;
10789 struct perf_event_attr attr;
10790 struct perf_event_context *ctx, *uninitialized_var(gctx);
10791 struct file *event_file = NULL;
10792 struct fd group = {NULL, 0};
10793 struct task_struct *task = NULL;
10794 struct pmu *pmu;
10795 int event_fd;
10796 int move_group = 0;
10797 int err;
10798 int f_flags = O_RDWR;
10799 int cgroup_fd = -1;
10800
	/* for future expandability... */
	if (flags & ~PERF_FLAG_ALL)
		return -EINVAL;
10804
10805 err = perf_copy_attr(attr_uptr, &attr);
10806 if (err)
10807 return err;
10808
10809 if (!attr.exclude_kernel) {
10810 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10811 return -EACCES;
10812 }
10813
10814 if (attr.namespaces) {
10815 if (!capable(CAP_SYS_ADMIN))
10816 return -EACCES;
10817 }
10818
10819 if (attr.freq) {
10820 if (attr.sample_freq > sysctl_perf_event_sample_rate)
10821 return -EINVAL;
10822 } else {
10823 if (attr.sample_period & (1ULL << 63))
10824 return -EINVAL;
10825 }
10826
	/* Only privileged users can get physical addresses */
	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
		return -EACCES;

	/*
	 * In cgroup mode, the pid argument is used to pass the fd
	 * opened to the cgroup directory in cgroupfs. The cpu argument
	 * designates the cpu on which to monitor threads from that
	 * cgroup.
	 */
	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
		return -EINVAL;
10840
10841 if (flags & PERF_FLAG_FD_CLOEXEC)
10842 f_flags |= O_CLOEXEC;
10843
10844 event_fd = get_unused_fd_flags(f_flags);
10845 if (event_fd < 0)
10846 return event_fd;
10847
10848 if (group_fd != -1) {
10849 err = perf_fget_light(group_fd, &group);
10850 if (err)
10851 goto err_fd;
10852 group_leader = group.file->private_data;
10853 if (flags & PERF_FLAG_FD_OUTPUT)
10854 output_event = group_leader;
10855 if (flags & PERF_FLAG_FD_NO_GROUP)
10856 group_leader = NULL;
10857 }
10858
10859 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
10860 task = find_lively_task_by_vpid(pid);
10861 if (IS_ERR(task)) {
10862 err = PTR_ERR(task);
10863 goto err_group_fd;
10864 }
10865 }
10866
10867 if (task && group_leader &&
10868 group_leader->attr.inherit != attr.inherit) {
10869 err = -EINVAL;
10870 goto err_task;
10871 }
10872
10873 if (task) {
10874 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10875 if (err)
10876 goto err_task;
10877
		/*
		 * Reuse ptrace permission checks for now.
		 *
		 * We must hold cred_guard_mutex across this and any potential
		 * perf_install_in_context() call for this new event to
		 * serialize against exec() altering our credentials (and the
		 * perf_event_exit_task() that could imply).
		 */
		err = -EACCES;
		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
			goto err_cred;
	}
10890
10891 if (flags & PERF_FLAG_PID_CGROUP)
10892 cgroup_fd = pid;
10893
10894 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
10895 NULL, NULL, cgroup_fd);
10896 if (IS_ERR(event)) {
10897 err = PTR_ERR(event);
10898 goto err_cred;
10899 }
10900
10901 if (is_sampling_event(event)) {
10902 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
10903 err = -EOPNOTSUPP;
10904 goto err_alloc;
10905 }
10906 }
10907
	/*
	 * Special case software events and allow them to be part of
	 * any hardware group.
	 */
	pmu = event->pmu;
10913
10914 if (attr.use_clockid) {
10915 err = perf_event_set_clock(event, attr.clockid);
10916 if (err)
10917 goto err_alloc;
10918 }
10919
10920 if (pmu->task_ctx_nr == perf_sw_context)
10921 event->event_caps |= PERF_EV_CAP_SOFTWARE;
10922
	if (group_leader) {
		if (is_software_event(event) &&
		    !in_software_context(group_leader)) {
			/*
			 * If the event is a sw event, but the group_leader
			 * is on hw context.
			 *
			 * Allow the addition of software events to hw
			 * groups, this is safe because software events
			 * never fail to schedule.
			 */
			pmu = group_leader->ctx->pmu;
		} else if (!is_software_event(event) &&
			   is_software_event(group_leader) &&
			   (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
			/*
			 * In case the group is a pure software group, and we
			 * try to add a hardware event, move the whole group to
			 * the hardware context.
			 */
			move_group = 1;
		}
	}
10946
	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pmu, task, event);
10951 if (IS_ERR(ctx)) {
10952 err = PTR_ERR(ctx);
10953 goto err_alloc;
10954 }
10955
	/*
	 * Look up the group leader (we will attach this event to it):
	 */
	if (group_leader) {
		err = -EINVAL;

		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_context;

		/* All events in a group should have the same clock */
		if (group_leader->clock != event->clock)
			goto err_context;

		/*
		 * Make sure we're both events for the same CPU;
		 * grouping events for different CPUs is broken; since
		 * you can never concurrently schedule them anyhow.
		 */
		if (group_leader->cpu != event->cpu)
			goto err_context;

		/*
		 * Make sure we're both on the same task, or both
		 * per-CPU events.
		 */
		if (group_leader->ctx->task != ctx->task)
			goto err_context;

		/*
		 * Do not allow to attach to a group in a different task
		 * or CPU context. If we're moving SW events, we'll fix
		 * this up later, so allow that.
		 */
		if (!move_group && group_leader->ctx != ctx)
			goto err_context;

		/*
		 * Only a group leader can be exclusive or pinned.
		 */
		if (attr.exclusive || attr.pinned)
			goto err_context;
	}
11002
11003 if (output_event) {
11004 err = perf_event_set_output(event, output_event);
11005 if (err)
11006 goto err_context;
11007 }
11008
11009 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
11010 f_flags);
11011 if (IS_ERR(event_file)) {
11012 err = PTR_ERR(event_file);
11013 event_file = NULL;
11014 goto err_context;
11015 }
11016
11017 if (move_group) {
11018 gctx = __perf_event_ctx_lock_double(group_leader, ctx);

		if (gctx->task == TASK_TOMBSTONE) {
			err = -ESRCH;
			goto err_locked;
		}

		/*
		 * Check if we raced against another sys_perf_event_open() call
		 * moving the software group underneath us.
		 */
		if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
			/*
			 * If someone moved the group out from under us, check
			 * if this new event wound up on the same ctx; if so
			 * it's the regular !move_group case, otherwise fail.
			 */
			if (gctx != ctx) {
				err = -EINVAL;
				goto err_locked;
			} else {
				perf_event_ctx_unlock(group_leader, gctx);
				move_group = 0;
			}
		}

		/*
		 * Failure to create exclusive events returns -EBUSY.
		 */
		err = -EBUSY;
		if (!exclusive_event_installable(group_leader, ctx))
			goto err_locked;

		for_each_sibling_event(sibling, group_leader) {
			if (!exclusive_event_installable(sibling, ctx))
				goto err_locked;
		}
11055 } else {
11056 mutex_lock(&ctx->mutex);
11057 }
11058
11059 if (ctx->task == TASK_TOMBSTONE) {
11060 err = -ESRCH;
11061 goto err_locked;
11062 }
11063
11064 if (!perf_event_validate_size(event)) {
11065 err = -E2BIG;
11066 goto err_locked;
11067 }
11068
	if (!task) {
		/*
		 * Check if the @cpu we're creating an event for is online.
		 *
		 * We use the perf_cpu_context::ctx::mutex to serialize against
		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
		 */
		struct perf_cpu_context *cpuctx =
			container_of(ctx, struct perf_cpu_context, ctx);

		if (!cpuctx->online) {
			err = -ENODEV;
			goto err_locked;
		}
	}

	/*
	 * Must be under the same ctx::mutex as perf_install_in_context(),
	 * because we need to serialize with concurrent event creation.
	 */
	if (!exclusive_event_installable(event, ctx)) {
		err = -EBUSY;
		goto err_locked;
	}
11094
	WARN_ON_ONCE(ctx->parent_ctx);

	/*
	 * This is the point of no return; we cannot fail hereafter. This is
	 * where we start modifying current state.
	 */

	if (move_group) {
		/*
		 * See perf_event_ctx_lock() for comments on the details
		 * of swizzling perf_event::ctx.
		 */
		perf_remove_from_context(group_leader, 0);
		put_ctx(gctx);

		for_each_sibling_event(sibling, group_leader) {
			perf_remove_from_context(sibling, 0);
			put_ctx(gctx);
		}

		/*
		 * Wait for everybody to stop referencing the events through
		 * the old context before we continue with the swizzle.
		 */
		synchronize_rcu();

		/*
		 * Install the group siblings before the group leader.
		 *
		 * Because a group leader will try and install the entire group
		 * (through the sibling list, which is still intact), we can
		 * end up with siblings installed in the wrong context.
		 *
		 * By installing siblings first we no longer need to worry
		 * about the group leader dragging not-yet-moved siblings
		 * into the wrong context behind our back.
		 */
		for_each_sibling_event(sibling, group_leader) {
			perf_event__state_init(sibling);
			perf_install_in_context(ctx, sibling, sibling->cpu);
			get_ctx(ctx);
		}

		/*
		 * Removing from the context ends up with a disabled
		 * event. What we want here is an event in the initial
		 * startup state, ready to be installed in the new context.
		 */
		perf_event__state_init(group_leader);
		perf_install_in_context(ctx, group_leader, group_leader->cpu);
		get_ctx(ctx);
	}
11146
	/*
	 * Precalculate sample_data sizes; do while holding ctx::mutex such
	 * that we're serialized against further additions and before
	 * perf_install_in_context() which is the point the event is active and
	 * can use these values.
	 */
	perf_event__header_size(event);
	perf_event__id_header_size(event);

	event->owner = current;

	perf_install_in_context(ctx, event, event->cpu);
	perf_unpin_context(ctx);
11160
11161 if (move_group)
11162 perf_event_ctx_unlock(group_leader, gctx);
11163 mutex_unlock(&ctx->mutex);
11164
11165 if (task) {
11166 mutex_unlock(&task->signal->cred_guard_mutex);
11167 put_task_struct(task);
11168 }
11169
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

	/*
	 * Drop the reference on the group_event after placing the
	 * new event on the sibling_list. This ensures destruction
	 * of the group leader will find the pointer to itself in
	 * perf_group_detach().
	 */
	fdput(group);
	fd_install(event_fd, event_file);
11182 return event_fd;
11183
11184err_locked:
11185 if (move_group)
11186 perf_event_ctx_unlock(group_leader, gctx);
11187 mutex_unlock(&ctx->mutex);
11188
11189 fput(event_file);
11190err_context:
11191 perf_unpin_context(ctx);
11192 put_ctx(ctx);
11193err_alloc:
	/*
	 * If event_file is set, the fput() above will have called ->release()
	 * and that will take care of freeing the event.
	 */
	if (!event_file)
		free_event(event);
11200err_cred:
11201 if (task)
11202 mutex_unlock(&task->signal->cred_guard_mutex);
11203err_task:
11204 if (task)
11205 put_task_struct(task);
11206err_group_fd:
11207 fdput(group);
11208err_fd:
11209 put_unused_fd(event_fd);
11210 return err;
11211}
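
/*
 * Illustrative only -- a minimal userspace sketch of driving this syscall,
 * not part of the kernel. It counts retired instructions for the calling
 * thread (pid == 0, cpu == -1); names other than the perf ABI itself are
 * hypothetical.
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.size		= sizeof(attr),
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.disabled	= 1,
 *		.exclude_kernel	= 1,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1,
 *			 PERF_FLAG_FD_CLOEXEC);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	// ... run the code being measured ...
 *	long long count;
 *	read(fd, &count, sizeof(count));
 */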
11212
/**
 * perf_event_create_kernel_counter
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu in which the counter is bound
 * @task: task to profile (NULL for percpu)
 */
11220struct perf_event *
11221perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
11222 struct task_struct *task,
11223 perf_overflow_handler_t overflow_handler,
11224 void *context)
11225{
11226 struct perf_event_context *ctx;
11227 struct perf_event *event;
11228 int err;
11229
	/*
	 * Grouping is not supported for kernel events, neither is 'AUX',
	 * make sure the caller's intentions are preserved.
	 */
	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
				 overflow_handler, context, -1);
	if (IS_ERR(event)) {
		err = PTR_ERR(event);
		goto err;
	}

	/* Mark owner so we could distinguish it from user events. */
	event->owner = TASK_TOMBSTONE;
11243
11244 ctx = find_get_context(event->pmu, task, event);
11245 if (IS_ERR(ctx)) {
11246 err = PTR_ERR(ctx);
11247 goto err_free;
11248 }
11249
11250 WARN_ON_ONCE(ctx->parent_ctx);
11251 mutex_lock(&ctx->mutex);
11252 if (ctx->task == TASK_TOMBSTONE) {
11253 err = -ESRCH;
11254 goto err_unlock;
11255 }
11256
11257 if (!task) {
11258
11259
11260
11261
11262
11263
11264 struct perf_cpu_context *cpuctx =
11265 container_of(ctx, struct perf_cpu_context, ctx);
11266 if (!cpuctx->online) {
11267 err = -ENODEV;
11268 goto err_unlock;
11269 }
11270 }
11271
11272 if (!exclusive_event_installable(event, ctx)) {
11273 err = -EBUSY;
11274 goto err_unlock;
11275 }
11276
11277 perf_install_in_context(ctx, event, event->cpu);
11278 perf_unpin_context(ctx);
11279 mutex_unlock(&ctx->mutex);
11280
11281 return event;
11282
11283err_unlock:
11284 mutex_unlock(&ctx->mutex);
11285 perf_unpin_context(ctx);
11286 put_ctx(ctx);
11287err_free:
11288 free_event(event);
11289err:
11290 return ERR_PTR(err);
11291}
11292EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
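
/*
 * Illustrative only -- a hedged sketch of in-kernel usage, not code from
 * this file. A driver could count cycles on CPU 1 with something like:
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_HARDWARE,
 *		.size	= sizeof(attr),
 *		.config	= PERF_COUNT_HW_CPU_CYCLES,
 *	};
 *	struct perf_event *ev =
 *		perf_event_create_kernel_counter(&attr, 1, NULL, NULL, NULL);
 *
 * and later read it with perf_event_read_value() and release it with
 * perf_event_release_kernel().
 */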
11293
11294void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
11295{
11296 struct perf_event_context *src_ctx;
11297 struct perf_event_context *dst_ctx;
11298 struct perf_event *event, *tmp;
11299 LIST_HEAD(events);
11300
11301 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
11302 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
	/*
	 * See perf_event_ctx_lock() for comments on the details
	 * of swizzling perf_event::ctx.
	 */
	mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
11309 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
11310 event_entry) {
11311 perf_remove_from_context(event, 0);
11312 unaccount_event_cpu(event, src_cpu);
11313 put_ctx(src_ctx);
11314 list_add(&event->migrate_entry, &events);
11315 }
11316
	/*
	 * Wait for the events to quiesce before re-instating them.
	 */
	synchronize_rcu();

	/*
	 * Re-instate events in 2 passes.
	 *
	 * Skip over group leaders and only install siblings on this first
	 * pass, siblings will not get enabled without a leader, however a
	 * leader will enable its siblings, even if those are still on the old
	 * context.
	 */
11330 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
11331 if (event->group_leader == event)
11332 continue;
11333
11334 list_del(&event->migrate_entry);
11335 if (event->state >= PERF_EVENT_STATE_OFF)
11336 event->state = PERF_EVENT_STATE_INACTIVE;
11337 account_event_cpu(event, dst_cpu);
11338 perf_install_in_context(dst_ctx, event, dst_cpu);
11339 get_ctx(dst_ctx);
11340 }
11341
	/*
	 * Once all the siblings are setup properly, install the group leaders
	 * to make it go.
	 */
11346 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
11347 list_del(&event->migrate_entry);
11348 if (event->state >= PERF_EVENT_STATE_OFF)
11349 event->state = PERF_EVENT_STATE_INACTIVE;
11350 account_event_cpu(event, dst_cpu);
11351 perf_install_in_context(dst_ctx, event, dst_cpu);
11352 get_ctx(dst_ctx);
11353 }
11354 mutex_unlock(&dst_ctx->mutex);
11355 mutex_unlock(&src_ctx->mutex);
11356}
11357EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
11358
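/*
 * At child exit, fold the child's counts and times back into the parent
 * event so the parent's totals keep covering the whole inheritance tree.
 */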
11359static void sync_child_event(struct perf_event *child_event,
11360 struct task_struct *child)
11361{
11362 struct perf_event *parent_event = child_event->parent;
11363 u64 child_val;
11364
11365 if (child_event->attr.inherit_stat)
11366 perf_event_read_event(child_event, child);
11367
11368 child_val = perf_event_count(child_event);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_event->child_count);
11374 atomic64_add(child_event->total_time_enabled,
11375 &parent_event->child_total_time_enabled);
11376 atomic64_add(child_event->total_time_running,
11377 &parent_event->child_total_time_running);
11378}
11379
11380static void
11381perf_event_exit_event(struct perf_event *child_event,
11382 struct perf_event_context *child_ctx,
11383 struct task_struct *child)
11384{
11385 struct perf_event *parent_event = child_event->parent;
11386
	/*
	 * Do not destroy the 'original' grouping; because of the context
	 * switch optimization the original events could've ended up in a
	 * random child task.
	 *
	 * If we were to destroy the original group, all group related
	 * operations would cease to function properly after this random
	 * child dies.
	 *
	 * Do destroy all inherited groups, we don't care about those
	 * and being thorough is better.
	 */
	raw_spin_lock_irq(&child_ctx->lock);
11400 WARN_ON_ONCE(child_ctx->is_active);
11401
11402 if (parent_event)
11403 perf_group_detach(child_event);
11404 list_del_event(child_event, child_ctx);
11405 perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT);
11406 raw_spin_unlock_irq(&child_ctx->lock);
11407
	/*
	 * Parent events are governed by their filedesc, retain them.
	 */
	if (!parent_event) {
		perf_event_wakeup(child_event);
		return;
	}

	/*
	 * Child events can be cleaned up.
	 */
	sync_child_event(child_event, child);

	/*
	 * Remove this event from the parent's list.
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_del_init(&child_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	/*
	 * Kick perf_poll() for is_event_hup().
	 */
	perf_event_wakeup(parent_event);
	free_event(child_event);
	put_event(parent_event);
11435}
11436
11437static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
11438{
11439 struct perf_event_context *child_ctx, *clone_ctx = NULL;
11440 struct perf_event *child_event, *next;
11441
11442 WARN_ON_ONCE(child != current);
11443
11444 child_ctx = perf_pin_task_context(child, ctxn);
11445 if (!child_ctx)
11446 return;
11447
	/*
	 * In order to reduce the amount of trickiness in ctx tear-down, we
	 * hold ctx::mutex over the entire thing. This serializes against
	 * almost everything that wants to access the ctx.
	 *
	 * The exception is sys_perf_event_open() /
	 * perf_event_create_kernel_counter() which does find_get_context()
	 * without ctx::mutex (it cannot because of the move_group double
	 * mutex lock thing). See the comments in perf_install_in_context().
	 */
	mutex_lock(&child_ctx->mutex);
11459
	/*
	 * In a single ctx::lock section, de-schedule the events and detach the
	 * context from the task such that we cannot ever get it scheduled back
	 * in.
	 */
	raw_spin_lock_irq(&child_ctx->lock);
	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);

	/*
	 * Now that the context is inactive, destroy the task <-> ctx relation
	 * and mark the context dead.
	 */
	RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
11473 put_ctx(child_ctx);
11474 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
11475 put_task_struct(current);
11476
11477 clone_ctx = unclone_ctx(child_ctx);
11478 raw_spin_unlock_irq(&child_ctx->lock);
11479
11480 if (clone_ctx)
11481 put_ctx(clone_ctx);
11482
	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few samples racing with remove_event_from_context().
	 */
	perf_event_task(child, child_ctx, 0);
11489
11490 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
11491 perf_event_exit_event(child_event, child_ctx, child);
11492
11493 mutex_unlock(&child_ctx->mutex);
11494
11495 put_ctx(child_ctx);
11496}
11497
/*
 * When a child task exits, feed back event values to parent events.
 *
 * Can be called with cred_guard_mutex held when called from
 * install_exec_creds().
 */
11504void perf_event_exit_task(struct task_struct *child)
11505{
11506 struct perf_event *event, *tmp;
11507 int ctxn;
11508
11509 mutex_lock(&child->perf_event_mutex);
11510 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
11511 owner_entry) {
11512 list_del_init(&event->owner_entry);
		/*
		 * Ensure the list deletion is visible before we clear
		 * the owner, closes a race against perf_release() where
		 * we need to serialize on the owner->perf_event_mutex.
		 */
		smp_store_release(&event->owner, NULL);
11520 }
11521 mutex_unlock(&child->perf_event_mutex);
11522
11523 for_each_task_context_nr(ctxn)
11524 perf_event_exit_task_context(child, ctxn);
11525
	/*
	 * The perf_event_exit_task_context calls perf_event_task
	 * with child's task_ctx, which generates EXIT events for
	 * child contexts and sets child->perf_event_ctxp[] to NULL.
	 * At this point we need to send EXIT events to cpu contexts.
	 */
	perf_event_task(child, NULL, 0);
11533}
11534
11535static void perf_free_event(struct perf_event *event,
11536 struct perf_event_context *ctx)
11537{
11538 struct perf_event *parent = event->parent;
11539
11540 if (WARN_ON_ONCE(!parent))
11541 return;
11542
11543 mutex_lock(&parent->child_mutex);
11544 list_del_init(&event->child_list);
11545 mutex_unlock(&parent->child_mutex);
11546
11547 put_event(parent);
11548
11549 raw_spin_lock_irq(&ctx->lock);
11550 perf_group_detach(event);
11551 list_del_event(event, ctx);
11552 raw_spin_unlock_irq(&ctx->lock);
11553 free_event(event);
11554}
11555
/*
 * Free a context as created by inheritance by perf_event_init_task() below,
 * used by fork() in case of fail.
 *
 * Even though the task has never lived, the context and events have been
 * exposed through the child_list, so we must take care tearing it all down.
 */
11563void perf_event_free_task(struct task_struct *task)
11564{
11565 struct perf_event_context *ctx;
11566 struct perf_event *event, *tmp;
11567 int ctxn;
11568
11569 for_each_task_context_nr(ctxn) {
11570 ctx = task->perf_event_ctxp[ctxn];
11571 if (!ctx)
11572 continue;
11573
11574 mutex_lock(&ctx->mutex);
11575 raw_spin_lock_irq(&ctx->lock);
		/*
		 * Destroy the task <-> ctx relation and mark the context dead.
		 *
		 * This is important because even though the task hasn't been
		 * exposed yet the context has been (through child_list).
		 */
		RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
11583 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
11584 put_task_struct(task);
11585 raw_spin_unlock_irq(&ctx->lock);
11586
11587 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
11588 perf_free_event(event, ctx);
11589
11590 mutex_unlock(&ctx->mutex);
11591
		/*
		 * perf_event_release_kernel() could've stolen some of our
		 * child events and still have them on its free_list. In that
		 * case we must wait for these events to have been freed (in
		 * particular all their references to this task must've been
		 * dropped).
		 *
		 * Without this copy_process() will unconditionally free this
		 * task (irrespective of its reference count) and
		 * _free_event()'s put_task_struct(event->hw.target) will be a
		 * use-after-free.
		 *
		 * Wait for all events to drop their context reference.
		 */
		wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
11607 put_ctx(ctx);
11608 }
11609}
11610
11611void perf_event_delayed_put(struct task_struct *task)
11612{
11613 int ctxn;
11614
11615 for_each_task_context_nr(ctxn)
11616 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
11617}
11618
11619struct file *perf_event_get(unsigned int fd)
11620{
11621 struct file *file = fget(fd);
11622 if (!file)
11623 return ERR_PTR(-EBADF);
11624
11625 if (file->f_op != &perf_fops) {
11626 fput(file);
11627 return ERR_PTR(-EBADF);
11628 }
11629
11630 return file;
11631}
11632
11633const struct perf_event *perf_get_event(struct file *file)
11634{
11635 if (file->f_op != &perf_fops)
11636 return ERR_PTR(-EINVAL);
11637
11638 return file->private_data;
11639}
11640
11641const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
11642{
11643 if (!event)
11644 return ERR_PTR(-EINVAL);
11645
11646 return &event->attr;
11647}
11648
/*
 * Inherit an event from parent task to child task.
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL for orphaned events
 *  - IS_ERR() on error
 */
11657static struct perf_event *
11658inherit_event(struct perf_event *parent_event,
11659 struct task_struct *parent,
11660 struct perf_event_context *parent_ctx,
11661 struct task_struct *child,
11662 struct perf_event *group_leader,
11663 struct perf_event_context *child_ctx)
11664{
11665 enum perf_event_state parent_state = parent_event->state;
11666 struct perf_event *child_event;
11667 unsigned long flags;
11668
	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_event->parent)
		parent_event = parent_event->parent;
11677
11678 child_event = perf_event_alloc(&parent_event->attr,
11679 parent_event->cpu,
11680 child,
11681 group_leader, parent_event,
11682 NULL, NULL, -1);
11683 if (IS_ERR(child_event))
11684 return child_event;
11685
11686
11687 if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
11688 !child_ctx->task_ctx_data) {
11689 struct pmu *pmu = child_event->pmu;
11690
11691 child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
11692 GFP_KERNEL);
11693 if (!child_ctx->task_ctx_data) {
11694 free_event(child_event);
11695 return NULL;
11696 }
11697 }
11698
	/*
	 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
	 * must be under the same lock in order to serialize against
	 * perf_event_release_kernel(), such that either we must observe
	 * is_orphaned_event() or they will observe us on the child_list.
	 */
	mutex_lock(&parent_event->child_mutex);
11706 if (is_orphaned_event(parent_event) ||
11707 !atomic_long_inc_not_zero(&parent_event->refcount)) {
11708 mutex_unlock(&parent_event->child_mutex);
11709
11710 free_event(child_event);
11711 return NULL;
11712 }
11713
11714 get_ctx(child_ctx);
11715
	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
	if (parent_state >= PERF_EVENT_STATE_INACTIVE)
11722 child_event->state = PERF_EVENT_STATE_INACTIVE;
11723 else
11724 child_event->state = PERF_EVENT_STATE_OFF;
11725
11726 if (parent_event->attr.freq) {
11727 u64 sample_period = parent_event->hw.sample_period;
11728 struct hw_perf_event *hwc = &child_event->hw;
11729
11730 hwc->sample_period = sample_period;
11731 hwc->last_period = sample_period;
11732
11733 local64_set(&hwc->period_left, sample_period);
11734 }
11735
11736 child_event->ctx = child_ctx;
11737 child_event->overflow_handler = parent_event->overflow_handler;
11738 child_event->overflow_handler_context
11739 = parent_event->overflow_handler_context;
11740
	/*
	 * Precalculate sample_data sizes:
	 */
	perf_event__header_size(child_event);
	perf_event__id_header_size(child_event);

	/*
	 * Link it up in the child's context:
	 */
	raw_spin_lock_irqsave(&child_ctx->lock, flags);
	add_event_to_ctx(child_event, child_ctx);
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Link this into the parent event's child list:
	 */
	list_add_tail(&child_event->child_list, &parent_event->child_list);
11758 mutex_unlock(&parent_event->child_mutex);
11759
11760 return child_event;
11761}
11762
/*
 * Inherits an event group.
 *
 * This will quietly suppress orphaned events; !inherit_event() is not an
 * error. This matches with perf_event_release_kernel() removing all child
 * events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
11773static int inherit_group(struct perf_event *parent_event,
11774 struct task_struct *parent,
11775 struct perf_event_context *parent_ctx,
11776 struct task_struct *child,
11777 struct perf_event_context *child_ctx)
11778{
11779 struct perf_event *leader;
11780 struct perf_event *sub;
11781 struct perf_event *child_ctr;
11782
11783 leader = inherit_event(parent_event, parent, parent_ctx,
11784 child, NULL, child_ctx);
11785 if (IS_ERR(leader))
11786 return PTR_ERR(leader);
11787
	/*
	 * Now inherit the siblings, attaching them to the new leader.
	 */
11792 for_each_sibling_event(sub, parent_event) {
11793 child_ctr = inherit_event(sub, parent, parent_ctx,
11794 child, leader, child_ctx);
11795 if (IS_ERR(child_ctr))
11796 return PTR_ERR(child_ctr);
11797 }
11798 return 0;
11799}
11800
/*
 * Creates the child task context (if needed) and tries to inherit the
 * event-group.
 *
 * Clears @inherited_all on !attr.inherited or error.
 */
11812static int
11813inherit_task_group(struct perf_event *event, struct task_struct *parent,
11814 struct perf_event_context *parent_ctx,
11815 struct task_struct *child, int ctxn,
11816 int *inherited_all)
11817{
11818 int ret;
11819 struct perf_event_context *child_ctx;
11820
11821 if (!event->attr.inherit) {
11822 *inherited_all = 0;
11823 return 0;
11824 }
11825
11826 child_ctx = child->perf_event_ctxp[ctxn];
11827 if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
11835 if (!child_ctx)
11836 return -ENOMEM;
11837
11838 child->perf_event_ctxp[ctxn] = child_ctx;
11839 }
11840
11841 ret = inherit_group(event, parent, parent_ctx,
11842 child, child_ctx);
11843
11844 if (ret)
11845 *inherited_all = 0;
11846
11847 return ret;
11848}
11849
/*
 * Initialize the perf_event context for one context number of a new task.
 */
11853static int perf_event_init_context(struct task_struct *child, int ctxn)
11854{
11855 struct perf_event_context *child_ctx, *parent_ctx;
11856 struct perf_event_context *cloned_ctx;
11857 struct perf_event *event;
11858 struct task_struct *parent = current;
11859 int inherited_all = 1;
11860 unsigned long flags;
11861 int ret = 0;
11862
11863 if (likely(!parent->perf_event_ctxp[ctxn]))
11864 return 0;
11865
	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent, ctxn);
11871 if (!parent_ctx)
11872 return 0;
11873
	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);
11886
	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
11892 ret = inherit_task_group(event, parent, parent_ctx,
11893 child, ctxn, &inherited_all);
11894 if (ret)
11895 goto out_unlock;
11896 }
11897
	/*
	 * We can't hold ctx->lock when iterating the ->flexible_groups list
	 * due to allocations, but we need to prevent rotation because
	 * rotate_ctx() will change the list from interrupt context.
	 */
	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11904 parent_ctx->rotate_disable = 1;
11905 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11906
11907 perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
11908 ret = inherit_task_group(event, parent, parent_ctx,
11909 child, ctxn, &inherited_all);
11910 if (ret)
11911 goto out_unlock;
11912 }
11913
11914 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11915 parent_ctx->rotate_disable = 0;
11916
11917 child_ctx = child->perf_event_ctxp[ctxn];
11918
11919 if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 *
		 * Note that if the parent is a clone, the holding of
		 * parent_ctx->lock avoids it from being uncloned.
		 */
		cloned_ctx = parent_ctx->parent_ctx;
11928 if (cloned_ctx) {
11929 child_ctx->parent_ctx = cloned_ctx;
11930 child_ctx->parent_gen = parent_ctx->parent_gen;
11931 } else {
11932 child_ctx->parent_ctx = parent_ctx;
11933 child_ctx->parent_gen = parent_ctx->generation;
11934 }
11935 get_ctx(child_ctx->parent_ctx);
11936 }
11937
11938 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11939out_unlock:
11940 mutex_unlock(&parent_ctx->mutex);
11941
11942 perf_unpin_context(parent_ctx);
11943 put_ctx(parent_ctx);
11944
11945 return ret;
11946}
11947
/*
 * Initialize the perf_event context in task_struct.
 */
11951int perf_event_init_task(struct task_struct *child)
11952{
11953 int ctxn, ret;
11954
11955 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
11956 mutex_init(&child->perf_event_mutex);
11957 INIT_LIST_HEAD(&child->perf_event_list);
11958
11959 for_each_task_context_nr(ctxn) {
11960 ret = perf_event_init_context(child, ctxn);
11961 if (ret) {
11962 perf_event_free_task(child);
11963 return ret;
11964 }
11965 }
11966
11967 return 0;
11968}
11969
11970static void __init perf_event_init_all_cpus(void)
11971{
11972 struct swevent_htable *swhash;
11973 int cpu;
11974
11975 zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
11976
11977 for_each_possible_cpu(cpu) {
11978 swhash = &per_cpu(swevent_htable, cpu);
11979 mutex_init(&swhash->hlist_mutex);
11980 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
11981
11982 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
11983 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
11984
11985#ifdef CONFIG_CGROUP_PERF
11986 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
11987#endif
11988 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
11989 }
11990}
11991
11992static void perf_swevent_init_cpu(unsigned int cpu)
11993{
11994 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
11995
11996 mutex_lock(&swhash->hlist_mutex);
11997 if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
11998 struct swevent_hlist *hlist;
11999
12000 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
12001 WARN_ON(!hlist);
12002 rcu_assign_pointer(swhash->swevent_hlist, hlist);
12003 }
12004 mutex_unlock(&swhash->hlist_mutex);
12005}
12006
12007#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
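/*
 * Runs on the CPU that is going away (via smp_call_function_single()):
 * stop timekeeping for the context and detach every event from it.
 */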
12008static void __perf_event_exit_context(void *__info)
12009{
12010 struct perf_event_context *ctx = __info;
12011 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
12012 struct perf_event *event;
12013
12014 raw_spin_lock(&ctx->lock);
12015 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
12016 list_for_each_entry(event, &ctx->event_list, event_entry)
12017 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
12018 raw_spin_unlock(&ctx->lock);
12019}
12020
12021static void perf_event_exit_cpu_context(int cpu)
12022{
12023 struct perf_cpu_context *cpuctx;
12024 struct perf_event_context *ctx;
12025 struct pmu *pmu;
12026
12027 mutex_lock(&pmus_lock);
12028 list_for_each_entry(pmu, &pmus, entry) {
12029 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
12030 ctx = &cpuctx->ctx;
12031
12032 mutex_lock(&ctx->mutex);
12033 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
12034 cpuctx->online = 0;
12035 mutex_unlock(&ctx->mutex);
12036 }
12037 cpumask_clear_cpu(cpu, perf_online_mask);
12038 mutex_unlock(&pmus_lock);
12039}
12040#else
12041
12042static void perf_event_exit_cpu_context(int cpu) { }
12043
12044#endif
12045
12046int perf_event_init_cpu(unsigned int cpu)
12047{
12048 struct perf_cpu_context *cpuctx;
12049 struct perf_event_context *ctx;
12050 struct pmu *pmu;
12051
12052 perf_swevent_init_cpu(cpu);
12053
12054 mutex_lock(&pmus_lock);
12055 cpumask_set_cpu(cpu, perf_online_mask);
12056 list_for_each_entry(pmu, &pmus, entry) {
12057 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
12058 ctx = &cpuctx->ctx;
12059
12060 mutex_lock(&ctx->mutex);
12061 cpuctx->online = 1;
12062 mutex_unlock(&ctx->mutex);
12063 }
12064 mutex_unlock(&pmus_lock);
12065
12066 return 0;
12067}
12068
12069int perf_event_exit_cpu(unsigned int cpu)
12070{
12071 perf_event_exit_cpu_context(cpu);
12072 return 0;
12073}
12074
12075static int
12076perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
12077{
12078 int cpu;
12079
12080 for_each_online_cpu(cpu)
12081 perf_event_exit_cpu(cpu);
12082
12083 return NOTIFY_OK;
12084}
12085
/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
12091 .notifier_call = perf_reboot,
12092 .priority = INT_MIN,
12093};
12094
12095void __init perf_event_init(void)
12096{
12097 int ret;
12098
12099 idr_init(&pmu_idr);
12100
12101 perf_event_init_all_cpus();
12102 init_srcu_struct(&pmus_srcu);
12103 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
12104 perf_pmu_register(&perf_cpu_clock, NULL, -1);
12105 perf_pmu_register(&perf_task_clock, NULL, -1);
12106 perf_tp_register();
12107 perf_event_init_cpu(smp_processor_id());
12108 register_reboot_notifier(&perf_reboot_notifier);
12109
12110 ret = init_hw_breakpoint();
12111 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
12112
	/*
	 * Build time assertion that we keep the data_head at the intended
	 * location.  IOW, validation we got the __reserved[] size right.
	 */
	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
		     != 1024);
12119}
12120
12121ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
12122 char *page)
12123{
12124 struct perf_pmu_events_attr *pmu_attr =
12125 container_of(attr, struct perf_pmu_events_attr, attr);
12126
12127 if (pmu_attr->event_str)
12128 return sprintf(page, "%s\n", pmu_attr->event_str);
12129
12130 return 0;
12131}
12132EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
12133
12134static int __init perf_event_sysfs_init(void)
12135{
12136 struct pmu *pmu;
12137 int ret;
12138
12139 mutex_lock(&pmus_lock);
12140
12141 ret = bus_register(&pmu_bus);
12142 if (ret)
12143 goto unlock;
12144
12145 list_for_each_entry(pmu, &pmus, entry) {
12146 if (!pmu->name || pmu->type < 0)
12147 continue;
12148
12149 ret = pmu_dev_alloc(pmu);
12150 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
12151 }
12152 pmu_bus_running = 1;
12153 ret = 0;
12154
12155unlock:
12156 mutex_unlock(&pmus_lock);
12157
12158 return ret;
12159}
12160device_initcall(perf_event_sysfs_init);
12161
12162#ifdef CONFIG_CGROUP_PERF
12163static struct cgroup_subsys_state *
12164perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
12165{
12166 struct perf_cgroup *jc;
12167
12168 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
12169 if (!jc)
12170 return ERR_PTR(-ENOMEM);
12171
12172 jc->info = alloc_percpu(struct perf_cgroup_info);
12173 if (!jc->info) {
12174 kfree(jc);
12175 return ERR_PTR(-ENOMEM);
12176 }
12177
12178 return &jc->css;
12179}
12180
12181static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
12182{
12183 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
12184
12185 free_percpu(jc->info);
12186 kfree(jc);
12187}
12188
12189static int __perf_cgroup_move(void *info)
12190{
12191 struct task_struct *task = info;
12192 rcu_read_lock();
12193 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
12194 rcu_read_unlock();
12195 return 0;
12196}
12197
12198static void perf_cgroup_attach(struct cgroup_taskset *tset)
12199{
12200 struct task_struct *task;
12201 struct cgroup_subsys_state *css;
12202
12203 cgroup_taskset_for_each(task, css, tset)
12204 task_function_call(task, __perf_cgroup_move, task);
12205}
12206
12207struct cgroup_subsys perf_event_cgrp_subsys = {
12208 .css_alloc = perf_cgroup_css_alloc,
12209 .css_free = perf_cgroup_css_free,
12210 .attach = perf_cgroup_attach,
	/*
	 * Implicitly enable on dfl hierarchy so that perf events can
	 * always be filtered by cgroup2 path as long as perf_event
	 * controller is not mounted on a legacy hierarchy.
	 */
	.implicit_on_dfl = true,
12217 .threaded = true,
12218};
12219#endif
12220