// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
#include <linux/buildid.h>
#include <linux/highmem.h>

#include "internal.h"

#include <asm/irq_regs.h>

#include <linux/rh_flags.h>

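/*
 * Cross-CPU function call machinery: remote_function() is the IPI callback
 * used by task_function_call() and cpu_function_call() below.
 */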
62typedef int (*remote_function_f)(void *);
63
64struct remote_function_call {
65 struct task_struct *p;
66 remote_function_f func;
67 void *info;
68 int ret;
69};
70
71static void remote_function(void *data)
72{
73 struct remote_function_call *tfc = data;
74 struct task_struct *p = tfc->p;
75
76 if (p) {
		/* -EAGAIN */
		if (task_cpu(p) != smp_processor_id())
			return;

		/*
		 * Now that we're on right CPU with IRQs disabled, we can test
		 * if we hit the right task without races.
		 */

		tfc->ret = -ESRCH; /* No such (running) process */
87 if (p != current)
88 return;
89 }
90
91 tfc->ret = tfc->func(tfc->info);
92}
93
/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.  This will
 * retry due to any failures in smp_call_function_single(), such as if the
 * task_cpu() goes offline concurrently.
 *
 * returns @func return value or -ESRCH when the process isn't running
 */
107static int
108task_function_call(struct task_struct *p, remote_function_f func, void *info)
109{
110 struct remote_function_call data = {
111 .p = p,
112 .func = func,
113 .info = info,
114 .ret = -EAGAIN,
115 };
116 int ret;
117
118 for (;;) {
119 ret = smp_call_function_single(task_cpu(p), remote_function,
120 &data, 1);
121 if (!ret)
122 ret = data.ret;
123
124 if (ret != -EAGAIN)
125 break;
126
127 cond_resched();
128 }
129
130 return ret;
131}
132
/**
 * cpu_function_call - call a function on the cpu
 * @cpu:	target cpu to queue this function
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
143static int cpu_function_call(int cpu, remote_function_f func, void *info)
144{
145 struct remote_function_call data = {
146 .p = NULL,
147 .func = func,
148 .info = info,
149 .ret = -ENXIO,
150 };
151
152 smp_call_function_single(cpu, remote_function, &data, 1);
153
154 return data.ret;
155}
156
157static inline struct perf_cpu_context *
158__get_cpu_context(struct perf_event_context *ctx)
159{
160 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
161}
162
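/*
 * ctx->lock nests inside cpuctx->ctx.lock; callers of these helpers must
 * already have IRQs disabled.
 */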
163static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
164 struct perf_event_context *ctx)
165{
166 raw_spin_lock(&cpuctx->ctx.lock);
167 if (ctx)
168 raw_spin_lock(&ctx->lock);
169}
170
171static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
172 struct perf_event_context *ctx)
173{
174 if (ctx)
175 raw_spin_unlock(&ctx->lock);
176 raw_spin_unlock(&cpuctx->ctx.lock);
177}
178
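/*
 * TASK_TOMBSTONE marks a context whose task has gone away; it is also used
 * as the ->owner of in-kernel API events, which is what is_kernel_event()
 * tests for.
 */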
179#define TASK_TOMBSTONE ((void *)-1L)
180
181static bool is_kernel_event(struct perf_event *event)
182{
183 return READ_ONCE(event->owner) == TASK_TOMBSTONE;
184}
185
/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */
205typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
206 struct perf_event_context *, void *);
207
208struct event_function_struct {
209 struct perf_event *event;
210 event_f func;
211 void *data;
212};
213
214static int event_function(void *info)
215{
216 struct event_function_struct *efs = info;
217 struct perf_event *event = efs->event;
218 struct perf_event_context *ctx = event->ctx;
219 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
220 struct perf_event_context *task_ctx = cpuctx->task_ctx;
221 int ret = 0;
222
223 lockdep_assert_irqs_disabled();
224
225 perf_ctx_lock(cpuctx, task_ctx);

	/*
	 * Since we do the IPI call without holding ctx->lock things can have
	 * changed, double check we hit the task we set out to hit.
	 */
230 if (ctx->task) {
231 if (ctx->task != current) {
232 ret = -ESRCH;
233 goto unlock;
234 }
		/*
		 * We only use event_function_call() on established contexts,
		 * and event_function() is only ever called when active (or
		 * rather, we'll have bailed in task_function_call() or the
		 * above ctx->task != current test), therefore we must have
		 * ctx->is_active here.
		 */
		WARN_ON_ONCE(!ctx->is_active);
		/*
		 * And since we have ctx->is_active, cpuctx->task_ctx must
		 * match.
		 */
		WARN_ON_ONCE(task_ctx != ctx);
249 } else {
250 WARN_ON_ONCE(&cpuctx->ctx != ctx);
251 }
252
253 efs->func(event, cpuctx, ctx, efs->data);
254unlock:
255 perf_ctx_unlock(cpuctx, task_ctx);
256
257 return ret;
258}
259
260static void event_function_call(struct perf_event *event, event_f func, void *data)
261{
262 struct perf_event_context *ctx = event->ctx;
263 struct task_struct *task = READ_ONCE(ctx->task);
264 struct event_function_struct efs = {
265 .event = event,
266 .func = func,
267 .data = data,
268 };
269
270 if (!event->parent) {
		/*
		 * If this is a !child event, we must hold ctx::mutex to
		 * stabilize the event->ctx relation. See
		 * perf_event_ctx_lock().
		 */
276 lockdep_assert_held(&ctx->mutex);
277 }
278
279 if (!task) {
280 cpu_function_call(event->cpu, event_function, &efs);
281 return;
282 }
283
284 if (task == TASK_TOMBSTONE)
285 return;
286
287again:
288 if (!task_function_call(task, event_function, &efs))
289 return;
290
291 raw_spin_lock_irq(&ctx->lock);
	/*
	 * Reload the task pointer, it might have been changed by
	 * a concurrent perf_event_context_sched_out().
	 */
296 task = ctx->task;
297 if (task == TASK_TOMBSTONE) {
298 raw_spin_unlock_irq(&ctx->lock);
299 return;
300 }
301 if (ctx->is_active) {
302 raw_spin_unlock_irq(&ctx->lock);
303 goto again;
304 }
305 func(event, NULL, ctx, data);
306 raw_spin_unlock_irq(&ctx->lock);
307}
308
/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 */
313static void event_function_local(struct perf_event *event, event_f func, void *data)
314{
315 struct perf_event_context *ctx = event->ctx;
316 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
317 struct task_struct *task = READ_ONCE(ctx->task);
318 struct perf_event_context *task_ctx = NULL;
319
320 lockdep_assert_irqs_disabled();
321
322 if (task) {
323 if (task == TASK_TOMBSTONE)
324 return;
325
326 task_ctx = ctx;
327 }
328
329 perf_ctx_lock(cpuctx, task_ctx);
330
331 task = ctx->task;
332 if (task == TASK_TOMBSTONE)
333 goto unlock;
334
335 if (task) {
		/*
		 * We must be either inactive or active and the right task,
		 * otherwise we're screwed, since we cannot IPI to somewhere
		 * else.
		 */
341 if (ctx->is_active) {
342 if (WARN_ON_ONCE(task != current))
343 goto unlock;
344
345 if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
346 goto unlock;
347 }
348 } else {
349 WARN_ON_ONCE(&cpuctx->ctx != ctx);
350 }
351
352 func(event, cpuctx, ctx, data);
353unlock:
354 perf_ctx_unlock(cpuctx, task_ctx);
355}
356
357#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
358 PERF_FLAG_FD_OUTPUT |\
359 PERF_FLAG_PID_CGROUP |\
360 PERF_FLAG_FD_CLOEXEC)
361
/*
 * branch priv levels that need permission checks
 */
365#define PERF_SAMPLE_BRANCH_PERM_PLM \
366 (PERF_SAMPLE_BRANCH_KERNEL |\
367 PERF_SAMPLE_BRANCH_HV)
368
369enum event_type_t {
370 EVENT_FLEXIBLE = 0x1,
371 EVENT_PINNED = 0x2,
372 EVENT_TIME = 0x4,
373
374 EVENT_CPU = 0x8,
375 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
376};
377
/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */
383static void perf_sched_delayed(struct work_struct *work);
384DEFINE_STATIC_KEY_FALSE(perf_sched_events);
385static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
386static DEFINE_MUTEX(perf_sched_mutex);
387static atomic_t perf_sched_count;
388
389static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
390static DEFINE_PER_CPU(int, perf_sched_cb_usages);
391static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
392
393static atomic_t nr_mmap_events __read_mostly;
394static atomic_t nr_comm_events __read_mostly;
395static atomic_t nr_namespaces_events __read_mostly;
396static atomic_t nr_task_events __read_mostly;
397static atomic_t nr_freq_events __read_mostly;
398static atomic_t nr_switch_events __read_mostly;
399static atomic_t nr_ksymbol_events __read_mostly;
400static atomic_t nr_bpf_events __read_mostly;
401static atomic_t nr_cgroup_events __read_mostly;
402static atomic_t nr_text_poke_events __read_mostly;
403static atomic_t nr_build_id_events __read_mostly;
404
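/*
 * Global list of registered PMUs: writers serialize on pmus_lock, lockless
 * readers are protected by pmus_srcu.
 */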
405static LIST_HEAD(pmus);
406static DEFINE_MUTEX(pmus_lock);
407static struct srcu_struct pmus_srcu;
408static cpumask_var_t perf_online_mask;
409static struct kmem_cache *perf_event_cache;
410
/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
418int sysctl_perf_event_paranoid __read_mostly = 2;
419
/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
422
/*
 * max perf event sample rate
 */
426#define DEFAULT_MAX_SAMPLE_RATE 100000
427#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
428#define DEFAULT_CPU_TIME_MAX_PERCENT 25
429
430int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
431
432static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
433static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
434
435static int perf_sample_allowed_ns __read_mostly =
436 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
437
438static void update_perf_cpu_limits(void)
439{
440 u64 tmp = perf_sample_period_ns;
441
442 tmp *= sysctl_perf_cpu_time_max_percent;
443 tmp = div_u64(tmp, 100);
444 if (!tmp)
445 tmp = 1;
446
447 WRITE_ONCE(perf_sample_allowed_ns, tmp);
448}
449
450static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
451
452int perf_proc_update_handler(struct ctl_table *table, int write,
453 void __user *buffer, size_t *lenp,
454 loff_t *ppos)
455{
456 int ret;
457 int perf_cpu = sysctl_perf_cpu_time_max_percent;
458
	/*
	 * If throttling is disabled don't allow the write:
	 */
461 if (write && (perf_cpu == 100 || perf_cpu == 0))
462 return -EINVAL;
463
464 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
465 if (ret || !write)
466 return ret;
467
468 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
469 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
470 update_perf_cpu_limits();
471
472 return 0;
473}
474
475int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
476
477int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
478 void __user *buffer, size_t *lenp,
479 loff_t *ppos)
480{
481 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
482
483 if (ret || !write)
484 return ret;
485
486 if (sysctl_perf_cpu_time_max_percent == 100 ||
487 sysctl_perf_cpu_time_max_percent == 0) {
488 printk(KERN_WARNING
489 "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
490 WRITE_ONCE(perf_sample_allowed_ns, 0);
491 } else {
492 update_perf_cpu_limits();
493 }
494
495 return 0;
496}
497
/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
504#define NR_ACCUMULATED_SAMPLES 128
505static DEFINE_PER_CPU(u64, running_sample_length);
506
507static u64 __report_avg;
508static u64 __report_allowed;
509
510static void perf_duration_warn(struct irq_work *w)
511{
512 printk_ratelimited(KERN_INFO
513 "perf: interrupt took too long (%lld > %lld), lowering "
514 "kernel.perf_event_max_sample_rate to %d\n",
515 __report_avg, __report_allowed,
516 sysctl_perf_event_sample_rate);
517}
518
519static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
520
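/*
 * Called from the sampling (NMI/IRQ) path: keep a decaying per-cpu average
 * of handler time and, once it exceeds the allowed budget, lower
 * kernel.perf_event_max_sample_rate and report it via irq_work.
 */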
521void perf_sample_event_took(u64 sample_len_ns)
522{
523 u64 max_len = READ_ONCE(perf_sample_allowed_ns);
524 u64 running_len;
525 u64 avg_len;
526 u32 max;
527
528 if (max_len == 0)
529 return;
530
531
532 running_len = __this_cpu_read(running_sample_length);
533 running_len -= running_len/NR_ACCUMULATED_SAMPLES;
534 running_len += sample_len_ns;
535 __this_cpu_write(running_sample_length, running_len);
536
537
538
539
540
541
542 avg_len = running_len/NR_ACCUMULATED_SAMPLES;
543 if (avg_len <= max_len)
544 return;
545
546 __report_avg = avg_len;
547 __report_allowed = max_len;
548
549
550
551
552 avg_len += avg_len / 4;
553 max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
554 if (avg_len < max)
555 max /= (u32)avg_len;
556 else
557 max = 1;
558
559 WRITE_ONCE(perf_sample_allowed_ns, avg_len);
560 WRITE_ONCE(max_samples_per_tick, max);
561
562 sysctl_perf_event_sample_rate = max * HZ;
563 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
564
565 if (!irq_work_queue(&perf_duration_work)) {
566 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
567 "kernel.perf_event_max_sample_rate to %d\n",
568 __report_avg, __report_allowed,
569 sysctl_perf_event_sample_rate);
570 }
571}
572
573static atomic64_t perf_event_id;
574
575static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
576 enum event_type_t event_type);
577
578static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
579 enum event_type_t event_type,
580 struct task_struct *task);
581
582static void update_context_time(struct perf_event_context *ctx);
583static u64 perf_event_time(struct perf_event *event);
584
585void __weak perf_event_print_debug(void) { }
586
587extern __weak const char *perf_pmu_name(void)
588{
589 return "pmu";
590}
591
592static inline u64 perf_clock(void)
593{
594 return local_clock();
595}
596
597static inline u64 perf_event_clock(struct perf_event *event)
598{
599 return event->clock();
600}
601
/*
 * State based event timekeeping...
 *
 * The essence is that an event's enabled and running times only advance
 * while its (group effective) state allows it:
 *
 *   state >= INACTIVE: total_time_enabled advances
 *   state >= ACTIVE:   total_time_running advances as well
 *
 * The event keeps a single timestamp (event->tstamp) of the last state
 * change; __perf_update_times() folds the delta since then into the totals
 * according to the current state, and perf_event_set_state() updates the
 * times before every state change.
 *
 * Because a group leader being OFF/ERROR also stops its siblings,
 * __perf_effective_state() clamps a sibling's state to its leader's.
 */
624static __always_inline enum perf_event_state
625__perf_effective_state(struct perf_event *event)
626{
627 struct perf_event *leader = event->group_leader;
628
629 if (leader->state <= PERF_EVENT_STATE_OFF)
630 return leader->state;
631
632 return event->state;
633}
634
635static __always_inline void
636__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
637{
638 enum perf_event_state state = __perf_effective_state(event);
639 u64 delta = now - event->tstamp;
640
641 *enabled = event->total_time_enabled;
642 if (state >= PERF_EVENT_STATE_INACTIVE)
643 *enabled += delta;
644
645 *running = event->total_time_running;
646 if (state >= PERF_EVENT_STATE_ACTIVE)
647 *running += delta;
648}
649
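/*
 * Fold the time since the last state change into the event's
 * enabled/running totals and advance event->tstamp.
 */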
650static void perf_event_update_time(struct perf_event *event)
651{
652 u64 now = perf_event_time(event);
653
654 __perf_update_times(event, now, &event->total_time_enabled,
655 &event->total_time_running);
656 event->tstamp = now;
657}
658
659static void perf_event_update_sibling_time(struct perf_event *leader)
660{
661 struct perf_event *sibling;
662
663 for_each_sibling_event(sibling, leader)
664 perf_event_update_time(sibling);
665}
666
667static void
668perf_event_set_state(struct perf_event *event, enum perf_event_state state)
669{
670 if (event->state == state)
671 return;
672
673 perf_event_update_time(event);
674
675
676
677
678 if ((event->state < 0) ^ (state < 0))
679 perf_event_update_sibling_time(event);
680
681 WRITE_ONCE(event->state, state);
682}
683
684#ifdef CONFIG_CGROUP_PERF
685
686static inline bool
687perf_cgroup_match(struct perf_event *event)
688{
689 struct perf_event_context *ctx = event->ctx;
690 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
691
692
693 if (!event->cgrp)
694 return true;
695
696
697 if (!cpuctx->cgrp)
698 return false;
699
	/*
	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
	 * also enabled for all its descendant cgroups.  If @cpuctx's
	 * cgroup is a descendant of @event's (the test covers identity
	 * case), it's a match.
	 */
	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
				    event->cgrp->css.cgroup);
708}
709
710static inline void perf_detach_cgroup(struct perf_event *event)
711{
712 css_put(&event->cgrp->css);
713 event->cgrp = NULL;
714}
715
716static inline int is_cgroup_event(struct perf_event *event)
717{
718 return event->cgrp != NULL;
719}
720
721static inline u64 perf_cgroup_event_time(struct perf_event *event)
722{
723 struct perf_cgroup_info *t;
724
725 t = per_cpu_ptr(event->cgrp->info, event->cpu);
726 return t->time;
727}
728
729static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
730{
731 struct perf_cgroup_info *info;
732 u64 now;
733
734 now = perf_clock();
735
736 info = this_cpu_ptr(cgrp->info);
737
738 info->time += now - info->timestamp;
739 info->timestamp = now;
740}
741
742static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
743{
744 struct perf_cgroup *cgrp = cpuctx->cgrp;
745 struct cgroup_subsys_state *css;
746
747 if (cgrp) {
748 for (css = &cgrp->css; css; css = css->parent) {
749 cgrp = container_of(css, struct perf_cgroup, css);
750 __update_cgrp_time(cgrp);
751 }
752 }
753}
754
755static inline void update_cgrp_time_from_event(struct perf_event *event)
756{
757 struct perf_cgroup *cgrp;
758
759
760
761
762
763 if (!is_cgroup_event(event))
764 return;
765
766 cgrp = perf_cgroup_from_task(current, event->ctx);
767
768
769
770 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
771 __update_cgrp_time(event->cgrp);
772}
773
774static inline void
775perf_cgroup_set_timestamp(struct task_struct *task,
776 struct perf_event_context *ctx)
777{
778 struct perf_cgroup *cgrp;
779 struct perf_cgroup_info *info;
780 struct cgroup_subsys_state *css;
781
782
783
784
785
786
787 if (!task || !ctx->nr_cgroups)
788 return;
789
790 cgrp = perf_cgroup_from_task(task, ctx);
791
792 for (css = &cgrp->css; css; css = css->parent) {
793 cgrp = container_of(css, struct perf_cgroup, css);
794 info = this_cpu_ptr(cgrp->info);
795 info->timestamp = ctx->timestamp;
796 }
797}
798
799static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
800
801#define PERF_CGROUP_SWOUT 0x1
802#define PERF_CGROUP_SWIN 0x2
803
/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN : schedule in based on cgroup for next
 */
810static void perf_cgroup_switch(struct task_struct *task, int mode)
811{
812 struct perf_cpu_context *cpuctx;
813 struct list_head *list;
814 unsigned long flags;
815
816
817
818
819
820 local_irq_save(flags);
821
822 list = this_cpu_ptr(&cgrp_cpuctx_list);
823 list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
824 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
825
826 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
827 perf_pmu_disable(cpuctx->ctx.pmu);
828
829 if (mode & PERF_CGROUP_SWOUT) {
830 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
831
832
833
834
835 cpuctx->cgrp = NULL;
836 }
837
838 if (mode & PERF_CGROUP_SWIN) {
839 WARN_ON_ONCE(cpuctx->cgrp);
840
841
842
843
844
845
846
847 cpuctx->cgrp = perf_cgroup_from_task(task,
848 &cpuctx->ctx);
849 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
850 }
851 perf_pmu_enable(cpuctx->ctx.pmu);
852 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
853 }
854
855 local_irq_restore(flags);
856}
857
858static inline void perf_cgroup_sched_out(struct task_struct *task,
859 struct task_struct *next)
860{
861 struct perf_cgroup *cgrp1;
862 struct perf_cgroup *cgrp2 = NULL;
863
864 rcu_read_lock();
865
866
867
868
869
870 cgrp1 = perf_cgroup_from_task(task, NULL);
871 cgrp2 = perf_cgroup_from_task(next, NULL);
872
873
874
875
876
877
878 if (cgrp1 != cgrp2)
879 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
880
881 rcu_read_unlock();
882}
883
884static inline void perf_cgroup_sched_in(struct task_struct *prev,
885 struct task_struct *task)
886{
887 struct perf_cgroup *cgrp1;
888 struct perf_cgroup *cgrp2 = NULL;
889
890 rcu_read_lock();
891
892
893
894
895
896 cgrp1 = perf_cgroup_from_task(task, NULL);
897 cgrp2 = perf_cgroup_from_task(prev, NULL);
898
899
900
901
902
903
904 if (cgrp1 != cgrp2)
905 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
906
907 rcu_read_unlock();
908}
909
910static int perf_cgroup_ensure_storage(struct perf_event *event,
911 struct cgroup_subsys_state *css)
912{
913 struct perf_cpu_context *cpuctx;
914 struct perf_event **storage;
915 int cpu, heap_size, ret = 0;
916
917
918
919
920
921 for (heap_size = 1; css; css = css->parent)
922 heap_size++;
923
924 for_each_possible_cpu(cpu) {
925 cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
926 if (heap_size <= cpuctx->heap_size)
927 continue;
928
929 storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
930 GFP_KERNEL, cpu_to_node(cpu));
931 if (!storage) {
932 ret = -ENOMEM;
933 break;
934 }
935
936 raw_spin_lock_irq(&cpuctx->ctx.lock);
937 if (cpuctx->heap_size < heap_size) {
938 swap(cpuctx->heap, storage);
939 if (storage == cpuctx->heap_default)
940 storage = NULL;
941 cpuctx->heap_size = heap_size;
942 }
943 raw_spin_unlock_irq(&cpuctx->ctx.lock);
944
945 kfree(storage);
946 }
947
948 return ret;
949}
950
951static inline int perf_cgroup_connect(int fd, struct perf_event *event,
952 struct perf_event_attr *attr,
953 struct perf_event *group_leader)
954{
955 struct perf_cgroup *cgrp;
956 struct cgroup_subsys_state *css;
957 struct fd f = fdget(fd);
958 int ret = 0;
959
960 if (!f.file)
961 return -EBADF;
962
963 css = css_tryget_online_from_dir(f.file->f_path.dentry,
964 &perf_event_cgrp_subsys);
965 if (IS_ERR(css)) {
966 ret = PTR_ERR(css);
967 goto out;
968 }
969
970 ret = perf_cgroup_ensure_storage(event, css);
971 if (ret)
972 goto out;
973
974 cgrp = container_of(css, struct perf_cgroup, css);
975 event->cgrp = cgrp;
976
977
978
979
980
981
982 if (group_leader && group_leader->cgrp != cgrp) {
983 perf_detach_cgroup(event);
984 ret = -EINVAL;
985 }
986out:
987 fdput(f);
988 return ret;
989}
990
991static inline void
992perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
993{
994 struct perf_cgroup_info *t;
995 t = per_cpu_ptr(event->cgrp->info, event->cpu);
996 event->shadow_ctx_time = now - t->timestamp;
997}
998
999static inline void
1000perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1001{
1002 struct perf_cpu_context *cpuctx;
1003
1004 if (!is_cgroup_event(event))
1005 return;
1006
1007
1008
1009
1010
1011 cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1012
1013
1014
1015
1016
1017
1018
1019 if (ctx->is_active && !cpuctx->cgrp) {
1020 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
1021
1022 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
1023 cpuctx->cgrp = cgrp;
1024 }
1025
1026 if (ctx->nr_cgroups++)
1027 return;
1028
1029 list_add(&cpuctx->cgrp_cpuctx_entry,
1030 per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
1031}
1032
1033static inline void
1034perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1035{
1036 struct perf_cpu_context *cpuctx;
1037
1038 if (!is_cgroup_event(event))
1039 return;
1040
1041
1042
1043
1044
1045 cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1046
1047 if (--ctx->nr_cgroups)
1048 return;
1049
1050 if (ctx->is_active && cpuctx->cgrp)
1051 cpuctx->cgrp = NULL;
1052
1053 list_del(&cpuctx->cgrp_cpuctx_entry);
1054}
1055
1056#else
1057
1058static inline bool
1059perf_cgroup_match(struct perf_event *event)
1060{
1061 return true;
1062}
1063
1064static inline void perf_detach_cgroup(struct perf_event *event)
1065{}
1066
1067static inline int is_cgroup_event(struct perf_event *event)
1068{
1069 return 0;
1070}
1071
1072static inline void update_cgrp_time_from_event(struct perf_event *event)
1073{
1074}
1075
1076static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
1077{
1078}
1079
1080static inline void perf_cgroup_sched_out(struct task_struct *task,
1081 struct task_struct *next)
1082{
1083}
1084
1085static inline void perf_cgroup_sched_in(struct task_struct *prev,
1086 struct task_struct *task)
1087{
1088}
1089
1090static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1091 struct perf_event_attr *attr,
1092 struct perf_event *group_leader)
1093{
1094 return -EINVAL;
1095}
1096
1097static inline void
1098perf_cgroup_set_timestamp(struct task_struct *task,
1099 struct perf_event_context *ctx)
1100{
1101}
1102
1103static inline void
1104perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
1105{
1106}
1107
1108static inline void
1109perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
1110{
1111}
1112
1113static inline u64 perf_cgroup_event_time(struct perf_event *event)
1114{
1115 return 0;
1116}
1117
1118static inline void
1119perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1120{
1121}
1122
1123static inline void
1124perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1125{
1126}
1127#endif
1128
/*
 * Multiplexing interval defaults to being dependent on the timer tick,
 * just like the original code.
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * function must be called with interrupts disabled
 */
1137static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1138{
1139 struct perf_cpu_context *cpuctx;
1140 bool rotations;
1141
1142 lockdep_assert_irqs_disabled();
1143
1144 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1145 rotations = perf_rotate_context(cpuctx);
1146
1147 raw_spin_lock(&cpuctx->hrtimer_lock);
1148 if (rotations)
1149 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1150 else
1151 cpuctx->hrtimer_active = 0;
1152 raw_spin_unlock(&cpuctx->hrtimer_lock);
1153
1154 return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1155}
1156
1157static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1158{
1159 struct hrtimer *timer = &cpuctx->hrtimer;
1160 struct pmu *pmu = cpuctx->ctx.pmu;
1161 u64 interval;
1162
1163
1164 if (pmu->task_ctx_nr == perf_sw_context)
1165 return;
1166
1167
1168
1169
1170
1171 interval = pmu->hrtimer_interval_ms;
1172 if (interval < 1)
1173 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1174
1175 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1176
1177 raw_spin_lock_init(&cpuctx->hrtimer_lock);
1178 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1179 timer->function = perf_mux_hrtimer_handler;
1180}
1181
1182static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1183{
1184 struct hrtimer *timer = &cpuctx->hrtimer;
1185 struct pmu *pmu = cpuctx->ctx.pmu;
1186 unsigned long flags;
1187
1188
1189 if (pmu->task_ctx_nr == perf_sw_context)
1190 return 0;
1191
1192 raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1193 if (!cpuctx->hrtimer_active) {
1194 cpuctx->hrtimer_active = 1;
1195 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1196 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1197 }
1198 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1199
1200 return 0;
1201}
1202
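/*
 * perf_pmu_disable()/perf_pmu_enable() nest through a per-cpu count; only
 * the outermost pair invokes the PMU callbacks.
 */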
1203void perf_pmu_disable(struct pmu *pmu)
1204{
1205 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1206 if (!(*count)++)
1207 pmu->pmu_disable(pmu);
1208}
1209
1210void perf_pmu_enable(struct pmu *pmu)
1211{
1212 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1213 if (!--(*count))
1214 pmu->pmu_enable(pmu);
1215}
1216
1217static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1218
/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 * disabled, while perf_event_task_tick is called from IRQ context.
 */
1225static void perf_event_ctx_activate(struct perf_event_context *ctx)
1226{
1227 struct list_head *head = this_cpu_ptr(&active_ctx_list);
1228
1229 lockdep_assert_irqs_disabled();
1230
1231 WARN_ON(!list_empty(&ctx->active_ctx_list));
1232
1233 list_add(&ctx->active_ctx_list, head);
1234}
1235
1236static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1237{
1238 lockdep_assert_irqs_disabled();
1239
1240 WARN_ON(list_empty(&ctx->active_ctx_list));
1241
1242 list_del_init(&ctx->active_ctx_list);
1243}
1244
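/*
 * Context life time is managed by refcount; the final put_ctx() releases the
 * parent context and the task reference and frees the context after an RCU
 * grace period (see free_ctx()).
 */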
1245static void get_ctx(struct perf_event_context *ctx)
1246{
1247 refcount_inc(&ctx->refcount);
1248}
1249
1250static void *alloc_task_ctx_data(struct pmu *pmu)
1251{
1252 if (pmu->task_ctx_cache)
1253 return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1254
1255 return NULL;
1256}
1257
1258static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1259{
1260 if (pmu->task_ctx_cache && task_ctx_data)
1261 kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
1262}
1263
1264static void free_ctx(struct rcu_head *head)
1265{
1266 struct perf_event_context *ctx;
1267
1268 ctx = container_of(head, struct perf_event_context, rcu_head);
1269 free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
1270 kfree(ctx);
1271}
1272
1273static void put_ctx(struct perf_event_context *ctx)
1274{
1275 if (refcount_dec_and_test(&ctx->refcount)) {
1276 if (ctx->parent_ctx)
1277 put_ctx(ctx->parent_ctx);
1278 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1279 put_task_struct(ctx->task);
1280 call_rcu(&ctx->rcu_head, free_ctx);
1281 }
1282}
1283
/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_ctx::mutex of the 'old' and 'new' ctx value, with lock ordering
 * between the two by mutex address.
 *
 * perf_event_context::mutex also nests in two other places:
 *
 *  - perf_event_exit_task_context()	[ child , 0 ]
 *      perf_event_exit_event()
 *        put_event()			[ parent, 1 ]
 *
 *  - perf_event_init_context()		[ parent, 0 ]
 *      inherit_task_group()
 *        inherit_group()
 *          inherit_event()
 *            perf_event_alloc()
 *              perf_try_init_event()	[ child , 1 ]
 *
 * While the parent and child nesting levels appear inverted between the two,
 * this is in fact safe because life-time rules separate them: an exiting
 * task cannot fork, and a spawning task cannot (yet) exit.
 *
 * The overall lock ordering is:
 *
 *	task_struct::perf_event_mutex
 *	  perf_event_context::mutex
 *	    perf_event::child_mutex;
 *	      perf_event_context::lock
 *	    perf_event::mmap_mutex
 *	    mmap_lock
 *
 *	cpu_hotplug_lock
 *	  pmus_lock
 *	    cpuctx->mutex / perf_event_context::mutex
 */
1350static struct perf_event_context *
1351perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1352{
1353 struct perf_event_context *ctx;
1354
1355again:
1356 rcu_read_lock();
1357 ctx = READ_ONCE(event->ctx);
1358 if (!refcount_inc_not_zero(&ctx->refcount)) {
1359 rcu_read_unlock();
1360 goto again;
1361 }
1362 rcu_read_unlock();
1363
1364 mutex_lock_nested(&ctx->mutex, nesting);
1365 if (event->ctx != ctx) {
1366 mutex_unlock(&ctx->mutex);
1367 put_ctx(ctx);
1368 goto again;
1369 }
1370
1371 return ctx;
1372}
1373
1374static inline struct perf_event_context *
1375perf_event_ctx_lock(struct perf_event *event)
1376{
1377 return perf_event_ctx_lock_nested(event, 0);
1378}
1379
1380static void perf_event_ctx_unlock(struct perf_event *event,
1381 struct perf_event_context *ctx)
1382{
1383 mutex_unlock(&ctx->mutex);
1384 put_ctx(ctx);
1385}
1386
1387
1388
1389
1390
1391
1392static __must_check struct perf_event_context *
1393unclone_ctx(struct perf_event_context *ctx)
1394{
1395 struct perf_event_context *parent_ctx = ctx->parent_ctx;
1396
1397 lockdep_assert_held(&ctx->lock);
1398
1399 if (parent_ctx)
1400 ctx->parent_ctx = NULL;
1401 ctx->generation++;
1402
1403 return parent_ctx;
1404}
1405
1406static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1407 enum pid_type type)
1408{
1409 u32 nr;
1410
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	nr = __task_pid_nr_ns(p, type, event->ns);
	/* avoid -1 if it is idle thread or runs in another ns */
	if (!nr && !pid_alive(p))
		nr = -1;
1420 return nr;
1421}
1422
1423static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1424{
1425 return perf_event_pid_type(event, p, PIDTYPE_TGID);
1426}
1427
1428static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1429{
1430 return perf_event_pid_type(event, p, PIDTYPE_PID);
1431}
1432
/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
1437static u64 primary_event_id(struct perf_event *event)
1438{
1439 u64 id = event->id;
1440
1441 if (event->parent)
1442 id = event->parent->id;
1443
1444 return id;
1445}
1446
/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
1453static struct perf_event_context *
1454perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1455{
1456 struct perf_event_context *ctx;
1457
1458retry:
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468 local_irq_save(*flags);
1469 rcu_read_lock();
1470 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1471 if (ctx) {
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482 raw_spin_lock(&ctx->lock);
1483 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1484 raw_spin_unlock(&ctx->lock);
1485 rcu_read_unlock();
1486 local_irq_restore(*flags);
1487 goto retry;
1488 }
1489
1490 if (ctx->task == TASK_TOMBSTONE ||
1491 !refcount_inc_not_zero(&ctx->refcount)) {
1492 raw_spin_unlock(&ctx->lock);
1493 ctx = NULL;
1494 } else {
1495 WARN_ON_ONCE(ctx->task != task);
1496 }
1497 }
1498 rcu_read_unlock();
1499 if (!ctx)
1500 local_irq_restore(*flags);
1501 return ctx;
1502}
1503
1504
1505
1506
1507
1508
1509static struct perf_event_context *
1510perf_pin_task_context(struct task_struct *task, int ctxn)
1511{
1512 struct perf_event_context *ctx;
1513 unsigned long flags;
1514
1515 ctx = perf_lock_task_context(task, ctxn, &flags);
1516 if (ctx) {
1517 ++ctx->pin_count;
1518 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1519 }
1520 return ctx;
1521}
1522
1523static void perf_unpin_context(struct perf_event_context *ctx)
1524{
1525 unsigned long flags;
1526
1527 raw_spin_lock_irqsave(&ctx->lock, flags);
1528 --ctx->pin_count;
1529 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1530}
1531
1532
1533
1534
1535static void update_context_time(struct perf_event_context *ctx)
1536{
1537 u64 now = perf_clock();
1538
1539 ctx->time += now - ctx->timestamp;
1540 ctx->timestamp = now;
1541}
1542
1543static u64 perf_event_time(struct perf_event *event)
1544{
1545 struct perf_event_context *ctx = event->ctx;
1546
1547 if (is_cgroup_event(event))
1548 return perf_cgroup_event_time(event);
1549
1550 return ctx ? ctx->time : 0;
1551}
1552
1553static enum event_type_t get_event_type(struct perf_event *event)
1554{
1555 struct perf_event_context *ctx = event->ctx;
1556 enum event_type_t event_type;
1557
1558 lockdep_assert_held(&ctx->lock);
1559
1560
1561
1562
1563
1564 if (event->group_leader != event)
1565 event = event->group_leader;
1566
1567 event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1568 if (!ctx->task)
1569 event_type |= EVENT_CPU;
1570
1571 return event_type;
1572}
1573
1574
1575
1576
1577static void init_event_group(struct perf_event *event)
1578{
1579 RB_CLEAR_NODE(&event->group_node);
1580 event->group_index = 0;
1581}
1582
1583
1584
1585
1586
1587static struct perf_event_groups *
1588get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1589{
1590 if (event->attr.pinned)
1591 return &ctx->pinned_groups;
1592 else
1593 return &ctx->flexible_groups;
1594}
1595
1596
1597
1598
1599static void perf_event_groups_init(struct perf_event_groups *groups)
1600{
1601 groups->tree = RB_ROOT;
1602 groups->index = 0;
1603}
1604
1605
1606
1607
1608
1609
1610
1611static bool
1612perf_event_groups_less(struct perf_event *left, struct perf_event *right)
1613{
1614 if (left->cpu < right->cpu)
1615 return true;
1616 if (left->cpu > right->cpu)
1617 return false;
1618
1619#ifdef CONFIG_CGROUP_PERF
1620 if (left->cgrp != right->cgrp) {
1621 if (!left->cgrp || !left->cgrp->css.cgroup) {
1622
1623
1624
1625
1626 return true;
1627 }
1628 if (!right->cgrp || !right->cgrp->css.cgroup) {
1629
1630
1631
1632
1633 return false;
1634 }
1635
1636 if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
1637 return true;
1638
1639 return false;
1640 }
1641#endif
1642
1643 if (left->group_index < right->group_index)
1644 return true;
1645 if (left->group_index > right->group_index)
1646 return false;
1647
1648 return false;
1649}
1650
1651
1652
1653
1654
1655
1656static void
1657perf_event_groups_insert(struct perf_event_groups *groups,
1658 struct perf_event *event)
1659{
1660 struct perf_event *node_event;
1661 struct rb_node *parent;
1662 struct rb_node **node;
1663
1664 event->group_index = ++groups->index;
1665
1666 node = &groups->tree.rb_node;
1667 parent = *node;
1668
1669 while (*node) {
1670 parent = *node;
1671 node_event = container_of(*node, struct perf_event, group_node);
1672
1673 if (perf_event_groups_less(event, node_event))
1674 node = &parent->rb_left;
1675 else
1676 node = &parent->rb_right;
1677 }
1678
1679 rb_link_node(&event->group_node, parent, node);
1680 rb_insert_color(&event->group_node, &groups->tree);
1681}
1682
1683
1684
1685
1686static void
1687add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1688{
1689 struct perf_event_groups *groups;
1690
1691 groups = get_event_groups(event, ctx);
1692 perf_event_groups_insert(groups, event);
1693}
1694
1695
1696
1697
1698static void
1699perf_event_groups_delete(struct perf_event_groups *groups,
1700 struct perf_event *event)
1701{
1702 WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1703 RB_EMPTY_ROOT(&groups->tree));
1704
1705 rb_erase(&event->group_node, &groups->tree);
1706 init_event_group(event);
1707}
1708
1709
1710
1711
1712static void
1713del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1714{
1715 struct perf_event_groups *groups;
1716
1717 groups = get_event_groups(event, ctx);
1718 perf_event_groups_delete(groups, event);
1719}
1720
1721
1722
1723
1724static struct perf_event *
1725perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1726 struct cgroup *cgrp)
1727{
1728 struct perf_event *node_event = NULL, *match = NULL;
1729 struct rb_node *node = groups->tree.rb_node;
1730#ifdef CONFIG_CGROUP_PERF
1731 u64 node_cgrp_id, cgrp_id = 0;
1732
1733 if (cgrp)
1734 cgrp_id = cgrp->kn->id;
1735#endif
1736
1737 while (node) {
1738 node_event = container_of(node, struct perf_event, group_node);
1739
1740 if (cpu < node_event->cpu) {
1741 node = node->rb_left;
1742 continue;
1743 }
1744 if (cpu > node_event->cpu) {
1745 node = node->rb_right;
1746 continue;
1747 }
1748#ifdef CONFIG_CGROUP_PERF
1749 node_cgrp_id = 0;
1750 if (node_event->cgrp && node_event->cgrp->css.cgroup)
1751 node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
1752
1753 if (cgrp_id < node_cgrp_id) {
1754 node = node->rb_left;
1755 continue;
1756 }
1757 if (cgrp_id > node_cgrp_id) {
1758 node = node->rb_right;
1759 continue;
1760 }
1761#endif
1762 match = node_event;
1763 node = node->rb_left;
1764 }
1765
1766 return match;
1767}
1768
1769
1770
1771
1772static struct perf_event *
1773perf_event_groups_next(struct perf_event *event)
1774{
1775 struct perf_event *next;
1776#ifdef CONFIG_CGROUP_PERF
1777 u64 curr_cgrp_id = 0;
1778 u64 next_cgrp_id = 0;
1779#endif
1780
1781 next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1782 if (next == NULL || next->cpu != event->cpu)
1783 return NULL;
1784
1785#ifdef CONFIG_CGROUP_PERF
1786 if (event->cgrp && event->cgrp->css.cgroup)
1787 curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
1788
1789 if (next->cgrp && next->cgrp->css.cgroup)
1790 next_cgrp_id = next->cgrp->css.cgroup->kn->id;
1791
1792 if (curr_cgrp_id != next_cgrp_id)
1793 return NULL;
1794#endif
1795 return next;
1796}
1797
1798
1799
1800
1801#define perf_event_groups_for_each(event, groups) \
1802 for (event = rb_entry_safe(rb_first(&((groups)->tree)), \
1803 typeof(*event), group_node); event; \
1804 event = rb_entry_safe(rb_next(&event->group_node), \
1805 typeof(*event), group_node))
1806
1807
1808
1809
1810
1811static void
1812list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1813{
1814 lockdep_assert_held(&ctx->lock);
1815
1816 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1817 event->attach_state |= PERF_ATTACH_CONTEXT;
1818
1819 event->tstamp = perf_event_time(event);
1820
1821
1822
1823
1824
1825
1826 if (event->group_leader == event) {
1827 event->group_caps = event->event_caps;
1828 add_event_to_groups(event, ctx);
1829 }
1830
1831 list_add_rcu(&event->event_entry, &ctx->event_list);
1832 ctx->nr_events++;
1833 if (event->attr.inherit_stat)
1834 ctx->nr_stat++;
1835
1836 if (event->state > PERF_EVENT_STATE_OFF)
1837 perf_cgroup_event_enable(event, ctx);
1838
1839 ctx->generation++;
1840}
1841
1842
1843
1844
1845static inline void perf_event__state_init(struct perf_event *event)
1846{
1847 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1848 PERF_EVENT_STATE_INACTIVE;
1849}
1850
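/*
 * Compute the size of the read_format record for this event, accounting for
 * TOTAL_TIME_ENABLED/RUNNING, ID and GROUP (one entry per sibling).
 */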
1851static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1852{
1853 int entry = sizeof(u64);
1854 int size = 0;
1855 int nr = 1;
1856
1857 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1858 size += sizeof(u64);
1859
1860 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1861 size += sizeof(u64);
1862
1863 if (event->attr.read_format & PERF_FORMAT_ID)
1864 entry += sizeof(u64);
1865
1866 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1867 nr += nr_siblings;
1868 size += sizeof(u64);
1869 }
1870
1871 size += entry * nr;
1872 event->read_size = size;
1873}
1874
1875static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1876{
1877 struct perf_sample_data *data;
1878 u16 size = 0;
1879
1880 if (sample_type & PERF_SAMPLE_IP)
1881 size += sizeof(data->ip);
1882
1883 if (sample_type & PERF_SAMPLE_ADDR)
1884 size += sizeof(data->addr);
1885
1886 if (sample_type & PERF_SAMPLE_PERIOD)
1887 size += sizeof(data->period);
1888
1889 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
1890 size += sizeof(data->weight.full);
1891
1892 if (sample_type & PERF_SAMPLE_READ)
1893 size += event->read_size;
1894
1895 if (sample_type & PERF_SAMPLE_DATA_SRC)
1896 size += sizeof(data->data_src.val);
1897
1898 if (sample_type & PERF_SAMPLE_TRANSACTION)
1899 size += sizeof(data->txn);
1900
1901 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1902 size += sizeof(data->phys_addr);
1903
1904 if (sample_type & PERF_SAMPLE_CGROUP)
1905 size += sizeof(data->cgroup);
1906
1907 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
1908 size += sizeof(data->data_page_size);
1909
1910 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
1911 size += sizeof(data->code_page_size);
1912
1913 event->header_size = size;
1914}
1915
1916
1917
1918
1919
1920static void perf_event__header_size(struct perf_event *event)
1921{
1922 __perf_event_read_size(event,
1923 event->group_leader->nr_siblings);
1924 __perf_event_header_size(event, event->attr.sample_type);
1925}
1926
1927static void perf_event__id_header_size(struct perf_event *event)
1928{
1929 struct perf_sample_data *data;
1930 u64 sample_type = event->attr.sample_type;
1931 u16 size = 0;
1932
1933 if (sample_type & PERF_SAMPLE_TID)
1934 size += sizeof(data->tid_entry);
1935
1936 if (sample_type & PERF_SAMPLE_TIME)
1937 size += sizeof(data->time);
1938
1939 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1940 size += sizeof(data->id);
1941
1942 if (sample_type & PERF_SAMPLE_ID)
1943 size += sizeof(data->id);
1944
1945 if (sample_type & PERF_SAMPLE_STREAM_ID)
1946 size += sizeof(data->stream_id);
1947
1948 if (sample_type & PERF_SAMPLE_CPU)
1949 size += sizeof(data->cpu_entry);
1950
1951 event->id_header_size = size;
1952}
1953
1954static bool perf_event_validate_size(struct perf_event *event)
1955{
1956
1957
1958
1959
1960 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1961 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1962 perf_event__id_header_size(event);
1963
1964
1965
1966
1967
1968 if (event->read_size + event->header_size +
1969 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1970 return false;
1971
1972 return true;
1973}
1974
1975static void perf_group_attach(struct perf_event *event)
1976{
1977 struct perf_event *group_leader = event->group_leader, *pos;
1978
1979 lockdep_assert_held(&event->ctx->lock);
1980
1981
1982
1983
1984 if (event->attach_state & PERF_ATTACH_GROUP)
1985 return;
1986
1987 event->attach_state |= PERF_ATTACH_GROUP;
1988
1989 if (group_leader == event)
1990 return;
1991
1992 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1993
1994 group_leader->group_caps &= event->event_caps;
1995
1996 list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1997 group_leader->nr_siblings++;
1998
1999 perf_event__header_size(group_leader);
2000
2001 for_each_sibling_event(pos, group_leader)
2002 perf_event__header_size(pos);
2003}
2004
2005
2006
2007
2008
2009static void
2010list_del_event(struct perf_event *event, struct perf_event_context *ctx)
2011{
2012 WARN_ON_ONCE(event->ctx != ctx);
2013 lockdep_assert_held(&ctx->lock);
2014
2015
2016
2017
2018 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
2019 return;
2020
2021 event->attach_state &= ~PERF_ATTACH_CONTEXT;
2022
2023 ctx->nr_events--;
2024 if (event->attr.inherit_stat)
2025 ctx->nr_stat--;
2026
2027 list_del_rcu(&event->event_entry);
2028
2029 if (event->group_leader == event)
2030 del_event_from_groups(event, ctx);
2031
2032
2033
2034
2035
2036
2037
2038
2039 if (event->state > PERF_EVENT_STATE_OFF) {
2040 perf_cgroup_event_disable(event, ctx);
2041 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2042 }
2043
2044 ctx->generation++;
2045}
2046
2047static int
2048perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2049{
2050 if (!has_aux(aux_event))
2051 return 0;
2052
2053 if (!event->pmu->aux_output_match)
2054 return 0;
2055
2056 return event->pmu->aux_output_match(aux_event);
2057}
2058
2059static void put_event(struct perf_event *event);
2060static void event_sched_out(struct perf_event *event,
2061 struct perf_cpu_context *cpuctx,
2062 struct perf_event_context *ctx);
2063
2064static void perf_put_aux_event(struct perf_event *event)
2065{
2066 struct perf_event_context *ctx = event->ctx;
2067 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2068 struct perf_event *iter;
2069
2070
2071
2072
2073 if (event->aux_event) {
2074 iter = event->aux_event;
2075 event->aux_event = NULL;
2076 put_event(iter);
2077 return;
2078 }
2079
2080
2081
2082
2083
2084 for_each_sibling_event(iter, event->group_leader) {
2085 if (iter->aux_event != event)
2086 continue;
2087
2088 iter->aux_event = NULL;
2089 put_event(event);
2090
2091
2092
2093
2094
2095
2096 event_sched_out(iter, cpuctx, ctx);
2097 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2098 }
2099}
2100
2101static bool perf_need_aux_event(struct perf_event *event)
2102{
2103 return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2104}
2105
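/*
 * Link an aux_output/aux_sample_size event to its group leader's AUX event,
 * taking a reference on the leader. Returns 1 on success, 0 otherwise.
 */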
2106static int perf_get_aux_event(struct perf_event *event,
2107 struct perf_event *group_leader)
2108{
2109
2110
2111
2112
2113
2114
2115 if (!group_leader)
2116 return 0;
2117
2118
2119
2120
2121 if (event->attr.aux_output && event->attr.aux_sample_size)
2122 return 0;
2123
2124 if (event->attr.aux_output &&
2125 !perf_aux_output_match(event, group_leader))
2126 return 0;
2127
2128 if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2129 return 0;
2130
2131 if (!atomic_long_inc_not_zero(&group_leader->refcount))
2132 return 0;
2133
2134
2135
2136
2137
2138
2139
2140 event->aux_event = group_leader;
2141
2142 return 1;
2143}
2144
2145static inline struct list_head *get_event_list(struct perf_event *event)
2146{
2147 struct perf_event_context *ctx = event->ctx;
2148 return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2149}
2150
/*
 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
 * cannot exist on their own, schedule them out and move them into the ERROR
 * state. Also see _perf_event_enable(), it will not be able to recover
 * this ERROR state.
 */
2157static inline void perf_remove_sibling_event(struct perf_event *event)
2158{
2159 struct perf_event_context *ctx = event->ctx;
2160 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2161
2162 event_sched_out(event, cpuctx, ctx);
2163 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2164}
2165
2166static void perf_group_detach(struct perf_event *event)
2167{
2168 struct perf_event *leader = event->group_leader;
2169 struct perf_event *sibling, *tmp;
2170 struct perf_event_context *ctx = event->ctx;
2171
2172 lockdep_assert_held(&ctx->lock);
2173
2174
2175
2176
2177 if (!(event->attach_state & PERF_ATTACH_GROUP))
2178 return;
2179
2180 event->attach_state &= ~PERF_ATTACH_GROUP;
2181
2182 perf_put_aux_event(event);
2183
2184
2185
2186
2187 if (leader != event) {
2188 list_del_init(&event->sibling_list);
2189 event->group_leader->nr_siblings--;
2190 goto out;
2191 }
2192
2193
2194
2195
2196
2197
2198 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2199
2200 if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2201 perf_remove_sibling_event(sibling);
2202
2203 sibling->group_leader = sibling;
2204 list_del_init(&sibling->sibling_list);
2205
2206
2207 sibling->group_caps = event->group_caps;
2208
2209 if (!RB_EMPTY_NODE(&event->group_node)) {
2210 add_event_to_groups(sibling, event->ctx);
2211
2212 if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2213 list_add_tail(&sibling->active_list, get_event_list(sibling));
2214 }
2215
2216 WARN_ON_ONCE(sibling->ctx != event->ctx);
2217 }
2218
2219out:
2220 for_each_sibling_event(tmp, leader)
2221 perf_event__header_size(tmp);
2222
2223 perf_event__header_size(leader);
2224}
2225
2226static void sync_child_event(struct perf_event *child_event);
2227
2228static void perf_child_detach(struct perf_event *event)
2229{
2230 struct perf_event *parent_event = event->parent;
2231
2232 if (!(event->attach_state & PERF_ATTACH_CHILD))
2233 return;
2234
2235 event->attach_state &= ~PERF_ATTACH_CHILD;
2236
2237 if (WARN_ON_ONCE(!parent_event))
2238 return;
2239
2240 lockdep_assert_held(&parent_event->child_mutex);
2241
2242 sync_child_event(event);
2243 list_del_init(&event->child_list);
2244}
2245
2246static bool is_orphaned_event(struct perf_event *event)
2247{
2248 return event->state == PERF_EVENT_STATE_DEAD;
2249}
2250
2251static inline int __pmu_filter_match(struct perf_event *event)
2252{
2253 struct pmu *pmu = event->pmu;
2254 return pmu->filter_match ? pmu->filter_match(event) : 1;
2255}
2256
2257
2258
2259
2260
2261
2262
2263static inline int pmu_filter_match(struct perf_event *event)
2264{
2265 struct perf_event *sibling;
2266
2267 if (!__pmu_filter_match(event))
2268 return 0;
2269
2270 for_each_sibling_event(sibling, event) {
2271 if (!__pmu_filter_match(sibling))
2272 return 0;
2273 }
2274
2275 return 1;
2276}
2277
2278static inline int
2279event_filter_match(struct perf_event *event)
2280{
2281 return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2282 perf_cgroup_match(event) && pmu_filter_match(event);
2283}
2284
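/*
 * Stop an ACTIVE event: pull it off the PMU, update its time/state and drop
 * the context's active/freq/exclusive accounting.
 */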
2285static void
2286event_sched_out(struct perf_event *event,
2287 struct perf_cpu_context *cpuctx,
2288 struct perf_event_context *ctx)
2289{
2290 enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2291
2292 WARN_ON_ONCE(event->ctx != ctx);
2293 lockdep_assert_held(&ctx->lock);
2294
2295 if (event->state != PERF_EVENT_STATE_ACTIVE)
2296 return;
2297
2298
2299
2300
2301
2302
2303 list_del_init(&event->active_list);
2304
2305 perf_pmu_disable(event->pmu);
2306
2307 event->pmu->del(event, 0);
2308 event->oncpu = -1;
2309
2310 if (READ_ONCE(event->pending_disable) >= 0) {
2311 WRITE_ONCE(event->pending_disable, -1);
2312 perf_cgroup_event_disable(event, ctx);
2313 state = PERF_EVENT_STATE_OFF;
2314 }
2315 perf_event_set_state(event, state);
2316
2317 if (!is_software_event(event))
2318 cpuctx->active_oncpu--;
2319 if (!--ctx->nr_active)
2320 perf_event_ctx_deactivate(ctx);
2321 if (event->attr.freq && event->attr.sample_freq)
2322 ctx->nr_freq--;
2323 if (event->attr.exclusive || !cpuctx->active_oncpu)
2324 cpuctx->exclusive = 0;
2325
2326 perf_pmu_enable(event->pmu);
2327}
2328
2329static void
2330group_sched_out(struct perf_event *group_event,
2331 struct perf_cpu_context *cpuctx,
2332 struct perf_event_context *ctx)
2333{
2334 struct perf_event *event;
2335
2336 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2337 return;
2338
2339 perf_pmu_disable(ctx->pmu);
2340
2341 event_sched_out(group_event, cpuctx, ctx);
2342
2343
2344
2345
2346 for_each_sibling_event(event, group_event)
2347 event_sched_out(event, cpuctx, ctx);
2348
2349 perf_pmu_enable(ctx->pmu);
2350}
2351
2352#define DETACH_GROUP 0x01UL
2353#define DETACH_CHILD 0x02UL
2354
/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
2361static void
2362__perf_remove_from_context(struct perf_event *event,
2363 struct perf_cpu_context *cpuctx,
2364 struct perf_event_context *ctx,
2365 void *info)
2366{
2367 unsigned long flags = (unsigned long)info;
2368
2369 if (ctx->is_active & EVENT_TIME) {
2370 update_context_time(ctx);
2371 update_cgrp_time_from_cpuctx(cpuctx);
2372 }
2373
2374 event_sched_out(event, cpuctx, ctx);
2375 if (flags & DETACH_GROUP)
2376 perf_group_detach(event);
2377 if (flags & DETACH_CHILD)
2378 perf_child_detach(event);
2379 list_del_event(event, ctx);
2380
2381 if (!ctx->nr_events && ctx->is_active) {
2382 ctx->is_active = 0;
2383 ctx->rotate_necessary = 0;
2384 if (ctx->task) {
2385 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2386 cpuctx->task_ctx = NULL;
2387 }
2388 }
2389}
2390
/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
2401static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2402{
2403 struct perf_event_context *ctx = event->ctx;
2404
2405 lockdep_assert_held(&ctx->mutex);
2406
2407
2408
2409
2410
2411
2412 raw_spin_lock_irq(&ctx->lock);
2413 if (!ctx->is_active) {
2414 __perf_remove_from_context(event, __get_cpu_context(ctx),
2415 ctx, (void *)flags);
2416 raw_spin_unlock_irq(&ctx->lock);
2417 return;
2418 }
2419 raw_spin_unlock_irq(&ctx->lock);
2420
2421 event_function_call(event, __perf_remove_from_context, (void *)flags);
2422}
2423
2424
2425
2426
2427static void __perf_event_disable(struct perf_event *event,
2428 struct perf_cpu_context *cpuctx,
2429 struct perf_event_context *ctx,
2430 void *info)
2431{
2432 if (event->state < PERF_EVENT_STATE_INACTIVE)
2433 return;
2434
2435 if (ctx->is_active & EVENT_TIME) {
2436 update_context_time(ctx);
2437 update_cgrp_time_from_event(event);
2438 }
2439
2440 if (event == event->group_leader)
2441 group_sched_out(event, cpuctx, ctx);
2442 else
2443 event_sched_out(event, cpuctx, ctx);
2444
2445 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2446 perf_cgroup_event_disable(event, ctx);
2447}
2448
/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in perf_event_exit_event().
 *
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_enable_on_exec().
 */
2463static void _perf_event_disable(struct perf_event *event)
2464{
2465 struct perf_event_context *ctx = event->ctx;
2466
2467 raw_spin_lock_irq(&ctx->lock);
2468 if (event->state <= PERF_EVENT_STATE_OFF) {
2469 raw_spin_unlock_irq(&ctx->lock);
2470 return;
2471 }
2472 raw_spin_unlock_irq(&ctx->lock);
2473
2474 event_function_call(event, __perf_event_disable, NULL);
2475}
2476
2477void perf_event_disable_local(struct perf_event *event)
2478{
2479 event_function_local(event, __perf_event_disable, NULL);
2480}
2481
2482
2483
2484
2485
2486void perf_event_disable(struct perf_event *event)
2487{
2488 struct perf_event_context *ctx;
2489
2490 ctx = perf_event_ctx_lock(event);
2491 _perf_event_disable(event);
2492 perf_event_ctx_unlock(event, ctx);
2493}
2494EXPORT_SYMBOL_GPL(perf_event_disable);
2495
2496void perf_event_disable_inatomic(struct perf_event *event)
2497{
2498 WRITE_ONCE(event->pending_disable, smp_processor_id());
2499
2500 irq_work_queue(&event->pending);
2501}
2502
2503static void perf_set_shadow_time(struct perf_event *event,
2504 struct perf_event_context *ctx)
2505{
	/*
	 * use the correct time source for the time snapshot
	 *
	 * We could get by without this by leveraging the fact that to get
	 * to this function, the caller has most likely already called
	 * update_context_time() and update_cgrp_time_xx() and thus both
	 * timestamps are identical (or very close). Given that tstamp is
	 * already adjusted for cgroup, we could say that:
	 *    tstamp - ctx->timestamp
	 * is equivalent to
	 *    tstamp - cgrp->timestamp.
	 *
	 * But this is a bit hairy, so instead we make an explicit cgroup
	 * call here to stay within one time source all along.
	 */
2531 if (is_cgroup_event(event))
2532 perf_cgroup_set_shadow_time(event, event->tstamp);
2533 else
2534 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2535}
2536
2537#define MAX_INTERRUPTS (~0ULL)
2538
2539static void perf_log_throttle(struct perf_event *event, int enable);
2540static void perf_log_itrace_start(struct perf_event *event);
2541
2542static int
2543event_sched_in(struct perf_event *event,
2544 struct perf_cpu_context *cpuctx,
2545 struct perf_event_context *ctx)
2546{
2547 int ret = 0;
2548
2549 WARN_ON_ONCE(event->ctx != ctx);
2550
2551 lockdep_assert_held(&ctx->lock);
2552
2553 if (event->state <= PERF_EVENT_STATE_OFF)
2554 return 0;
2555
2556 WRITE_ONCE(event->oncpu, smp_processor_id());
2557
	/*
	 * Order event::oncpu write to happen before the ACTIVE state is
	 * visible. This allows perf_event_{stop,read}() to observe the
	 * correct ->oncpu if it sees ACTIVE.
	 */
2562 smp_wmb();
2563 perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2564
2565
2566
2567
2568
2569
2570 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2571 perf_log_throttle(event, 1);
2572 event->hw.interrupts = 0;
2573 }
2574
2575 perf_pmu_disable(event->pmu);
2576
2577 perf_set_shadow_time(event, ctx);
2578
2579 perf_log_itrace_start(event);
2580
2581 if (event->pmu->add(event, PERF_EF_START)) {
2582 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2583 event->oncpu = -1;
2584 ret = -EAGAIN;
2585 goto out;
2586 }
2587
2588 if (!is_software_event(event))
2589 cpuctx->active_oncpu++;
2590 if (!ctx->nr_active++)
2591 perf_event_ctx_activate(ctx);
2592 if (event->attr.freq && event->attr.sample_freq)
2593 ctx->nr_freq++;
2594
2595 if (event->attr.exclusive)
2596 cpuctx->exclusive = 1;
2597
2598out:
2599 perf_pmu_enable(event->pmu);
2600
2601 return ret;
2602}
2603
2604static int
2605group_sched_in(struct perf_event *group_event,
2606 struct perf_cpu_context *cpuctx,
2607 struct perf_event_context *ctx)
2608{
2609 struct perf_event *event, *partial_group = NULL;
2610 struct pmu *pmu = ctx->pmu;
2611
2612 if (group_event->state == PERF_EVENT_STATE_OFF)
2613 return 0;
2614
2615 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2616
2617 if (event_sched_in(group_event, cpuctx, ctx))
2618 goto error;
2619
2620
2621
2622
2623 for_each_sibling_event(event, group_event) {
2624 if (event_sched_in(event, cpuctx, ctx)) {
2625 partial_group = event;
2626 goto group_error;
2627 }
2628 }
2629
2630 if (!pmu->commit_txn(pmu))
2631 return 0;
2632
2633group_error:
2634
2635
2636
2637
2638
2639 for_each_sibling_event(event, group_event) {
2640 if (event == partial_group)
2641 break;
2642
2643 event_sched_out(event, cpuctx, ctx);
2644 }
2645 event_sched_out(group_event, cpuctx, ctx);
2646
2647error:
2648 pmu->cancel_txn(pmu);
2649 return -EAGAIN;
2650}
2651
2652
2653
2654
2655static int group_can_go_on(struct perf_event *event,
2656 struct perf_cpu_context *cpuctx,
2657 int can_add_hw)
2658{
	/*
	 * Groups consisting entirely of software events can always go on.
	 */
	if (event->group_caps & PERF_EV_CAP_SOFTWARE)
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * events can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * events on the CPU, it can't go on.
	 */
	if (event->attr.exclusive && !list_empty(get_event_list(event)))
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
2681}
2682
2683static void add_event_to_ctx(struct perf_event *event,
2684 struct perf_event_context *ctx)
2685{
2686 list_add_event(event, ctx);
2687 perf_group_attach(event);
2688}
2689
2690static void ctx_sched_out(struct perf_event_context *ctx,
2691 struct perf_cpu_context *cpuctx,
2692 enum event_type_t event_type);
2693static void
2694ctx_sched_in(struct perf_event_context *ctx,
2695 struct perf_cpu_context *cpuctx,
2696 enum event_type_t event_type,
2697 struct task_struct *task);
2698
2699static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2700 struct perf_event_context *ctx,
2701 enum event_type_t event_type)
2702{
2703 if (!cpuctx->task_ctx)
2704 return;
2705
2706 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2707 return;
2708
2709 ctx_sched_out(ctx, cpuctx, event_type);
2710}
2711
2712static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2713 struct perf_event_context *ctx,
2714 struct task_struct *task)
2715{
2716 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2717 if (ctx)
2718 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2719 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2720 if (ctx)
2721 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2722}
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
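/*
 * (Re)schedule the given event types.  Scheduling out pinned events
 * forces the flexible ones out as well, since pinned groups must get
 * first pick of the hardware.  Events are then put back in the fixed
 * order: CPU pinned, task pinned, CPU flexible, task flexible.
 */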
2739static void ctx_resched(struct perf_cpu_context *cpuctx,
2740 struct perf_event_context *task_ctx,
2741 enum event_type_t event_type)
2742{
2743 enum event_type_t ctx_event_type;
2744 bool cpu_event = !!(event_type & EVENT_CPU);
2745
2746
2747
2748
2749
2750 if (event_type & EVENT_PINNED)
2751 event_type |= EVENT_FLEXIBLE;
2752
2753 ctx_event_type = event_type & EVENT_ALL;
2754
2755 perf_pmu_disable(cpuctx->ctx.pmu);
2756 if (task_ctx)
2757 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2758
2759
2760
2761
2762
2763
2764
2765
2766 if (cpu_event)
2767 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2768 else if (ctx_event_type & EVENT_PINNED)
2769 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2770
2771 perf_event_sched_in(cpuctx, task_ctx, current);
2772 perf_pmu_enable(cpuctx->ctx.pmu);
2773}
2774
2775void perf_pmu_resched(struct pmu *pmu)
2776{
2777 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2778 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2779
2780 perf_ctx_lock(cpuctx, task_ctx);
2781 ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2782 perf_ctx_unlock(cpuctx, task_ctx);
2783}
2784
2785
2786
2787
2788
2789
2790
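/*
 * Cross-call used by perf_install_in_context(): runs on the target CPU
 * with the relevant ctx->lock held and adds the event to its context,
 * rescheduling the context when the target task (or cgroup) is current.
 */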
2791static int __perf_install_in_context(void *info)
2792{
2793 struct perf_event *event = info;
2794 struct perf_event_context *ctx = event->ctx;
2795 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2796 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2797 bool reprogram = true;
2798 int ret = 0;
2799
2800 raw_spin_lock(&cpuctx->ctx.lock);
2801 if (ctx->task) {
2802 raw_spin_lock(&ctx->lock);
2803 task_ctx = ctx;
2804
2805 reprogram = (ctx->task == current);
2806
2807
2808
2809
2810
2811
2812
2813
2814 if (task_curr(ctx->task) && !reprogram) {
2815 ret = -ESRCH;
2816 goto unlock;
2817 }
2818
2819 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2820 } else if (task_ctx) {
2821 raw_spin_lock(&task_ctx->lock);
2822 }
2823
2824#ifdef CONFIG_CGROUP_PERF
2825 if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2826
2827
2828
2829
2830 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2831 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2832 event->cgrp->css.cgroup);
2833 }
2834#endif
2835
2836 if (reprogram) {
2837 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2838 add_event_to_ctx(event, ctx);
2839 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2840 } else {
2841 add_event_to_ctx(event, ctx);
2842 }
2843
2844unlock:
2845 perf_ctx_unlock(cpuctx, task_ctx);
2846
2847 return ret;
2848}
2849
2850static bool exclusive_event_installable(struct perf_event *event,
2851 struct perf_event_context *ctx);
2852
2853
2854
2855
2856
2857
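/*
 * Attach a performance event to a context.  Called with ctx->mutex
 * held; the event is installed via an IPI to the target CPU (or the
 * CPU the target task runs on), retrying if the task migrates or is
 * scheduled in underneath us.
 */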
2858static void
2859perf_install_in_context(struct perf_event_context *ctx,
2860 struct perf_event *event,
2861 int cpu)
2862{
2863 struct task_struct *task = READ_ONCE(ctx->task);
2864
2865 lockdep_assert_held(&ctx->mutex);
2866
2867 WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2868
2869 if (event->cpu != -1)
2870 event->cpu = cpu;
2871
2872
2873
2874
2875
2876 smp_store_release(&event->ctx, ctx);
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886 if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2887 raw_spin_lock_irq(&ctx->lock);
2888 if (ctx->task == TASK_TOMBSTONE) {
2889 raw_spin_unlock_irq(&ctx->lock);
2890 return;
2891 }
2892 add_event_to_ctx(event, ctx);
2893 raw_spin_unlock_irq(&ctx->lock);
2894 return;
2895 }
2896
2897 if (!task) {
2898 cpu_function_call(cpu, __perf_install_in_context, event);
2899 return;
2900 }
2901
2902
2903
2904
2905 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2906 return;
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938 smp_mb();
2939again:
2940 if (!task_function_call(task, __perf_install_in_context, event))
2941 return;
2942
2943 raw_spin_lock_irq(&ctx->lock);
2944 task = ctx->task;
2945 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2946
2947
2948
2949
2950
2951 raw_spin_unlock_irq(&ctx->lock);
2952 return;
2953 }
2954
2955
2956
2957
2958 if (task_curr(task)) {
2959 raw_spin_unlock_irq(&ctx->lock);
2960 goto again;
2961 }
2962 add_event_to_ctx(event, ctx);
2963 raw_spin_unlock_irq(&ctx->lock);
2964}
2965
2966
2967
2968
2969static void __perf_event_enable(struct perf_event *event,
2970 struct perf_cpu_context *cpuctx,
2971 struct perf_event_context *ctx,
2972 void *info)
2973{
2974 struct perf_event *leader = event->group_leader;
2975 struct perf_event_context *task_ctx;
2976
2977 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2978 event->state <= PERF_EVENT_STATE_ERROR)
2979 return;
2980
2981 if (ctx->is_active)
2982 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2983
2984 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2985 perf_cgroup_event_enable(event, ctx);
2986
2987 if (!ctx->is_active)
2988 return;
2989
2990 if (!event_filter_match(event)) {
2991 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2992 return;
2993 }
2994
2995
2996
2997
2998
2999 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
3000 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
3001 return;
3002 }
3003
3004 task_ctx = cpuctx->task_ctx;
3005 if (ctx->task)
3006 WARN_ON_ONCE(task_ctx != ctx);
3007
3008 ctx_resched(cpuctx, task_ctx, get_event_type(event));
3009}
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
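/*
 * Enable an event.  With ctx->lock held we bail out unless the event is
 * OFF or in ERROR; ERROR events are reset to OFF first (except detached
 * PERF_EV_CAP_SIBLING events, which may not leave ERROR), and the real
 * work is then done by __perf_event_enable() via event_function_call().
 */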
3020static void _perf_event_enable(struct perf_event *event)
3021{
3022 struct perf_event_context *ctx = event->ctx;
3023
3024 raw_spin_lock_irq(&ctx->lock);
3025 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
3026 event->state < PERF_EVENT_STATE_ERROR) {
3027out:
3028 raw_spin_unlock_irq(&ctx->lock);
3029 return;
3030 }
3031
3032
3033
3034
3035
3036
3037
3038
3039 if (event->state == PERF_EVENT_STATE_ERROR) {
3040
3041
3042
3043 if (event->event_caps & PERF_EV_CAP_SIBLING &&
3044 event->group_leader == event)
3045 goto out;
3046
3047 event->state = PERF_EVENT_STATE_OFF;
3048 }
3049 raw_spin_unlock_irq(&ctx->lock);
3050
3051 event_function_call(event, __perf_event_enable, NULL);
3052}
3053
3054
3055
3056
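/*
 * perf_event_enable - enable an event from kernel code.
 *
 * Illustrative kernel-side use (sketch only; the event would typically
 * come from perf_event_create_kernel_counter()):
 *
 *	perf_event_disable(event);
 *	... reconfigure or quiesce ...
 *	perf_event_enable(event);
 */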
3057void perf_event_enable(struct perf_event *event)
3058{
3059 struct perf_event_context *ctx;
3060
3061 ctx = perf_event_ctx_lock(event);
3062 _perf_event_enable(event);
3063 perf_event_ctx_unlock(event, ctx);
3064}
3065EXPORT_SYMBOL_GPL(perf_event_enable);
3066
3067struct stop_event_data {
3068 struct perf_event *event;
3069 unsigned int restart;
3070};
3071
3072static int __perf_event_stop(void *info)
3073{
3074 struct stop_event_data *sd = info;
3075 struct perf_event *event = sd->event;
3076
3077
3078 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3079 return 0;
3080
3081
3082 smp_rmb();
3083
3084
3085
3086
3087
3088 if (READ_ONCE(event->oncpu) != smp_processor_id())
3089 return -EAGAIN;
3090
3091 event->pmu->stop(event, PERF_EF_UPDATE);
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102 if (sd->restart)
3103 event->pmu->start(event, 0);
3104
3105 return 0;
3106}
3107
3108static int perf_event_stop(struct perf_event *event, int restart)
3109{
3110 struct stop_event_data sd = {
3111 .event = event,
3112 .restart = restart,
3113 };
3114 int ret = 0;
3115
3116 do {
3117 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3118 return 0;
3119
3120
3121 smp_rmb();
3122
3123
3124
3125
3126
3127
3128 ret = cpu_function_call(READ_ONCE(event->oncpu),
3129 __perf_event_stop, &sd);
3130 } while (ret == -EAGAIN);
3131
3132 return ret;
3133}
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
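/*
 * perf_event_addr_filters_sync - propagate address filter changes.
 *
 * If the filter generation has moved on since the PMU last saw it, ask
 * the PMU to re-program its address filters.  Intended to be called by
 * PMU drivers from a context where the event cannot go away.
 */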
3157void perf_event_addr_filters_sync(struct perf_event *event)
3158{
3159 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3160
3161 if (!has_addr_filter(event))
3162 return;
3163
3164 raw_spin_lock(&ifh->lock);
3165 if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3166 event->pmu->addr_filters_sync(event);
3167 event->hw.addr_filters_gen = event->addr_filters_gen;
3168 }
3169 raw_spin_unlock(&ifh->lock);
3170}
3171EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
3172
3173static int _perf_event_refresh(struct perf_event *event, int refresh)
3174{
3175
3176
3177
3178 if (event->attr.inherit || !is_sampling_event(event))
3179 return -EINVAL;
3180
3181 atomic_add(refresh, &event->event_limit);
3182 _perf_event_enable(event);
3183
3184 return 0;
3185}
3186
3187
3188
3189
3190int perf_event_refresh(struct perf_event *event, int refresh)
3191{
3192 struct perf_event_context *ctx;
3193 int ret;
3194
3195 ctx = perf_event_ctx_lock(event);
3196 ret = _perf_event_refresh(event, refresh);
3197 perf_event_ctx_unlock(event, ctx);
3198
3199 return ret;
3200}
3201EXPORT_SYMBOL_GPL(perf_event_refresh);
3202
3203static int perf_event_modify_breakpoint(struct perf_event *bp,
3204 struct perf_event_attr *attr)
3205{
3206 int err;
3207
3208 _perf_event_disable(bp);
3209
3210 err = modify_user_hw_breakpoint_check(bp, attr, true);
3211
3212 if (!bp->attr.disabled)
3213 _perf_event_enable(bp);
3214
3215 return err;
3216}
3217
3218static int perf_event_modify_attr(struct perf_event *event,
3219 struct perf_event_attr *attr)
3220{
3221 int (*func)(struct perf_event *, struct perf_event_attr *);
3222 struct perf_event *child;
3223 int err;
3224
3225 if (event->attr.type != attr->type)
3226 return -EINVAL;
3227
3228 switch (event->attr.type) {
3229 case PERF_TYPE_BREAKPOINT:
3230 func = perf_event_modify_breakpoint;
3231 break;
3232 default:
3233
3234 return -EOPNOTSUPP;
3235 }
3236
3237 WARN_ON_ONCE(event->ctx->parent_ctx);
3238
3239 mutex_lock(&event->child_mutex);
3240 err = func(event, attr);
3241 if (err)
3242 goto out;
3243 list_for_each_entry(child, &event->child_list, child_list) {
3244 err = func(child, attr);
3245 if (err)
3246 goto out;
3247 }
3248out:
3249 mutex_unlock(&event->child_mutex);
3250 return err;
3251}
3252
3253static void ctx_sched_out(struct perf_event_context *ctx,
3254 struct perf_cpu_context *cpuctx,
3255 enum event_type_t event_type)
3256{
3257 struct perf_event *event, *tmp;
3258 int is_active = ctx->is_active;
3259
3260 lockdep_assert_held(&ctx->lock);
3261
3262 if (likely(!ctx->nr_events)) {
3263
3264
3265
3266 WARN_ON_ONCE(ctx->is_active);
3267 if (ctx->task)
3268 WARN_ON_ONCE(cpuctx->task_ctx);
3269 return;
3270 }
3271
3272 ctx->is_active &= ~event_type;
3273 if (!(ctx->is_active & EVENT_ALL))
3274 ctx->is_active = 0;
3275
3276 if (ctx->task) {
3277 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3278 if (!ctx->is_active)
3279 cpuctx->task_ctx = NULL;
3280 }
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292 if (is_active & EVENT_TIME) {
3293
3294 update_context_time(ctx);
3295 update_cgrp_time_from_cpuctx(cpuctx);
3296 }
3297
3298 is_active ^= ctx->is_active;
3299
3300 if (!ctx->nr_active || !(is_active & EVENT_ALL))
3301 return;
3302
3303 perf_pmu_disable(ctx->pmu);
3304 if (is_active & EVENT_PINNED) {
3305 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3306 group_sched_out(event, cpuctx, ctx);
3307 }
3308
3309 if (is_active & EVENT_FLEXIBLE) {
3310 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3311 group_sched_out(event, cpuctx, ctx);
3312
3313
3314
3315
3316
3317
3318 ctx->rotate_necessary = 0;
3319 }
3320 perf_pmu_enable(ctx->pmu);
3321}
3322
3323
3324
3325
3326
3327
3328
3329
3330
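/*
 * Two contexts are "equivalent" when they are clones of one another
 * (or clones of a common parent) and neither has been pinned or
 * modified since cloning; such contexts can simply be swapped between
 * tasks at context-switch time instead of being rescheduled.
 */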
3331static int context_equiv(struct perf_event_context *ctx1,
3332 struct perf_event_context *ctx2)
3333{
3334 lockdep_assert_held(&ctx1->lock);
3335 lockdep_assert_held(&ctx2->lock);
3336
3337
3338 if (ctx1->pin_count || ctx2->pin_count)
3339 return 0;
3340
3341
3342 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3343 return 1;
3344
3345
3346 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3347 return 1;
3348
3349
3350
3351
3352
3353 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3354 ctx1->parent_gen == ctx2->parent_gen)
3355 return 1;
3356
3357
3358 return 0;
3359}
3360
3361static void __perf_event_sync_stat(struct perf_event *event,
3362 struct perf_event *next_event)
3363{
3364 u64 value;
3365
3366 if (!event->attr.inherit_stat)
3367 return;
3368
3369
3370
3371
3372
3373
3374
3375
3376 if (event->state == PERF_EVENT_STATE_ACTIVE)
3377 event->pmu->read(event);
3378
3379 perf_event_update_time(event);
3380
3381
3382
3383
3384
3385 value = local64_read(&next_event->count);
3386 value = local64_xchg(&event->count, value);
3387 local64_set(&next_event->count, value);
3388
3389 swap(event->total_time_enabled, next_event->total_time_enabled);
3390 swap(event->total_time_running, next_event->total_time_running);
3391
3392
3393
3394
3395 perf_event_update_userpage(event);
3396 perf_event_update_userpage(next_event);
3397}
3398
3399static void perf_event_sync_stat(struct perf_event_context *ctx,
3400 struct perf_event_context *next_ctx)
3401{
3402 struct perf_event *event, *next_event;
3403
3404 if (!ctx->nr_stat)
3405 return;
3406
3407 update_context_time(ctx);
3408
3409 event = list_first_entry(&ctx->event_list,
3410 struct perf_event, event_entry);
3411
3412 next_event = list_first_entry(&next_ctx->event_list,
3413 struct perf_event, event_entry);
3414
3415 while (&event->event_entry != &ctx->event_list &&
3416 &next_event->event_entry != &next_ctx->event_list) {
3417
3418 __perf_event_sync_stat(event, next_event);
3419
3420 event = list_next_entry(event, event_entry);
3421 next_event = list_next_entry(next_event, event_entry);
3422 }
3423}
3424
3425static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3426 struct task_struct *next)
3427{
3428 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3429 struct perf_event_context *next_ctx;
3430 struct perf_event_context *parent, *next_parent;
3431 struct perf_cpu_context *cpuctx;
3432 int do_switch = 1;
3433 struct pmu *pmu;
3434
3435 if (likely(!ctx))
3436 return;
3437
3438 pmu = ctx->pmu;
3439 cpuctx = __get_cpu_context(ctx);
3440 if (!cpuctx->task_ctx)
3441 return;
3442
3443 rcu_read_lock();
3444 next_ctx = next->perf_event_ctxp[ctxn];
3445 if (!next_ctx)
3446 goto unlock;
3447
3448 parent = rcu_dereference(ctx->parent_ctx);
3449 next_parent = rcu_dereference(next_ctx->parent_ctx);
3450
3451
3452 if (!parent && !next_parent)
3453 goto unlock;
3454
3455 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465 raw_spin_lock(&ctx->lock);
3466 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3467 if (context_equiv(ctx, next_ctx)) {
3468
3469 WRITE_ONCE(ctx->task, next);
3470 WRITE_ONCE(next_ctx->task, task);
3471
3472 perf_pmu_disable(pmu);
3473
3474 if (cpuctx->sched_cb_usage && pmu->sched_task)
3475 pmu->sched_task(ctx, false);
3476
3477
3478
3479
3480
3481
3482
3483 if (pmu->swap_task_ctx)
3484 pmu->swap_task_ctx(ctx, next_ctx);
3485 else
3486 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3487
3488 perf_pmu_enable(pmu);
3489
3490
3491
3492
3493
3494
3495
3496
3497 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3498 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3499
3500 do_switch = 0;
3501
3502 perf_event_sync_stat(ctx, next_ctx);
3503 }
3504 raw_spin_unlock(&next_ctx->lock);
3505 raw_spin_unlock(&ctx->lock);
3506 }
3507unlock:
3508 rcu_read_unlock();
3509
3510 if (do_switch) {
3511 raw_spin_lock(&ctx->lock);
3512 perf_pmu_disable(pmu);
3513
3514 if (cpuctx->sched_cb_usage && pmu->sched_task)
3515 pmu->sched_task(ctx, false);
3516 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3517
3518 perf_pmu_enable(pmu);
3519 raw_spin_unlock(&ctx->lock);
3520 }
3521}
3522
3523static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3524
3525void perf_sched_cb_dec(struct pmu *pmu)
3526{
3527 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3528
3529 this_cpu_dec(perf_sched_cb_usages);
3530
3531 if (!--cpuctx->sched_cb_usage)
3532 list_del(&cpuctx->sched_cb_entry);
3533}
3534
3535
3536void perf_sched_cb_inc(struct pmu *pmu)
3537{
3538 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3539
3540 if (!cpuctx->sched_cb_usage++)
3541 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3542
3543 this_cpu_inc(perf_sched_cb_usages);
3544}
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3555{
3556 struct pmu *pmu;
3557
3558 pmu = cpuctx->ctx.pmu;
3559
3560 if (WARN_ON_ONCE(!pmu->sched_task))
3561 return;
3562
3563 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3564 perf_pmu_disable(pmu);
3565
3566 pmu->sched_task(cpuctx->task_ctx, sched_in);
3567
3568 perf_pmu_enable(pmu);
3569 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3570}
3571
3572static void perf_pmu_sched_task(struct task_struct *prev,
3573 struct task_struct *next,
3574 bool sched_in)
3575{
3576 struct perf_cpu_context *cpuctx;
3577
3578 if (prev == next)
3579 return;
3580
3581 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3582
3583 if (cpuctx->task_ctx)
3584 continue;
3585
3586 __perf_pmu_sched_task(cpuctx, sched_in);
3587 }
3588}
3589
3590static void perf_event_switch(struct task_struct *task,
3591 struct task_struct *next_prev, bool sched_in);
3592
3593#define for_each_task_context_nr(ctxn) \
3594 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
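/*
 * Called from the scheduler when @task is being switched out: run any
 * PMU sched_task() callbacks, emit context-switch side-band events,
 * and remove (or lazily swap) the task's event contexts.
 */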
3607void __perf_event_task_sched_out(struct task_struct *task,
3608 struct task_struct *next)
3609{
3610 int ctxn;
3611
3612 if (__this_cpu_read(perf_sched_cb_usages))
3613 perf_pmu_sched_task(task, next, false);
3614
3615 if (atomic_read(&nr_switch_events))
3616 perf_event_switch(task, next, false);
3617
3618 for_each_task_context_nr(ctxn)
3619 perf_event_context_sched_out(task, ctxn, next);
3620
3621
3622
3623
3624
3625
3626 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3627 perf_cgroup_sched_out(task, next);
3628}
3629
3630
3631
3632
3633static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3634 enum event_type_t event_type)
3635{
3636 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3637}
3638
3639static bool perf_less_group_idx(const void *l, const void *r)
3640{
3641 const struct perf_event *le = *(const struct perf_event **)l;
3642 const struct perf_event *re = *(const struct perf_event **)r;
3643
3644 return le->group_index < re->group_index;
3645}
3646
3647static void swap_ptr(void *l, void *r)
3648{
3649 void **lp = l, **rp = r;
3650
3651 swap(*lp, *rp);
3652}
3653
3654static const struct min_heap_callbacks perf_min_heap = {
3655 .elem_size = sizeof(struct perf_event *),
3656 .less = perf_less_group_idx,
3657 .swp = swap_ptr,
3658};
3659
3660static void __heap_add(struct min_heap *heap, struct perf_event *event)
3661{
3662 struct perf_event **itrs = heap->data;
3663
3664 if (event) {
3665 itrs[heap->nr] = event;
3666 heap->nr++;
3667 }
3668}
3669
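/*
 * Visit the events of @groups in group_index order, calling @func on
 * each.  A min-heap merges the relevant sub-trees: the @cpu tree, the
 * any-CPU (-1) tree for task contexts, and the per-cgroup trees for
 * CPU contexts when CONFIG_CGROUP_PERF is enabled.
 */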
3670static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3671 struct perf_event_groups *groups, int cpu,
3672 int (*func)(struct perf_event *, void *),
3673 void *data)
3674{
3675#ifdef CONFIG_CGROUP_PERF
3676 struct cgroup_subsys_state *css = NULL;
3677#endif
3678
3679 struct perf_event *itrs[2];
3680 struct min_heap event_heap;
3681 struct perf_event **evt;
3682 int ret;
3683
3684 if (cpuctx) {
3685 event_heap = (struct min_heap){
3686 .data = cpuctx->heap,
3687 .nr = 0,
3688 .size = cpuctx->heap_size,
3689 };
3690
3691 lockdep_assert_held(&cpuctx->ctx.lock);
3692
3693#ifdef CONFIG_CGROUP_PERF
3694 if (cpuctx->cgrp)
3695 css = &cpuctx->cgrp->css;
3696#endif
3697 } else {
3698 event_heap = (struct min_heap){
3699 .data = itrs,
3700 .nr = 0,
3701 .size = ARRAY_SIZE(itrs),
3702 };
3703
3704 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3705 }
3706 evt = event_heap.data;
3707
3708 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3709
3710#ifdef CONFIG_CGROUP_PERF
3711 for (; css; css = css->parent)
3712 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3713#endif
3714
3715 min_heapify_all(&event_heap, &perf_min_heap);
3716
3717 while (event_heap.nr) {
3718 ret = func(*evt, data);
3719 if (ret)
3720 return ret;
3721
3722 *evt = perf_event_groups_next(*evt);
3723 if (*evt)
3724 min_heapify(&event_heap, 0, &perf_min_heap);
3725 else
3726 min_heap_pop(&event_heap, &perf_min_heap);
3727 }
3728
3729 return 0;
3730}
3731
3732static int merge_sched_in(struct perf_event *event, void *data)
3733{
3734 struct perf_event_context *ctx = event->ctx;
3735 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3736 int *can_add_hw = data;
3737
3738 if (event->state <= PERF_EVENT_STATE_OFF)
3739 return 0;
3740
3741 if (!event_filter_match(event))
3742 return 0;
3743
3744 if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3745 if (!group_sched_in(event, cpuctx, ctx))
3746 list_add_tail(&event->active_list, get_event_list(event));
3747 }
3748
3749 if (event->state == PERF_EVENT_STATE_INACTIVE) {
3750 if (event->attr.pinned) {
3751 perf_cgroup_event_disable(event, ctx);
3752 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3753 }
3754
3755 *can_add_hw = 0;
3756 ctx->rotate_necessary = 1;
3757 perf_mux_hrtimer_restart(cpuctx);
3758 }
3759
3760 return 0;
3761}
3762
3763static void
3764ctx_pinned_sched_in(struct perf_event_context *ctx,
3765 struct perf_cpu_context *cpuctx)
3766{
3767 int can_add_hw = 1;
3768
3769 if (ctx != &cpuctx->ctx)
3770 cpuctx = NULL;
3771
3772 visit_groups_merge(cpuctx, &ctx->pinned_groups,
3773 smp_processor_id(),
3774 merge_sched_in, &can_add_hw);
3775}
3776
3777static void
3778ctx_flexible_sched_in(struct perf_event_context *ctx,
3779 struct perf_cpu_context *cpuctx)
3780{
3781 int can_add_hw = 1;
3782
3783 if (ctx != &cpuctx->ctx)
3784 cpuctx = NULL;
3785
3786 visit_groups_merge(cpuctx, &ctx->flexible_groups,
3787 smp_processor_id(),
3788 merge_sched_in, &can_add_hw);
3789}
3790
3791static void
3792ctx_sched_in(struct perf_event_context *ctx,
3793 struct perf_cpu_context *cpuctx,
3794 enum event_type_t event_type,
3795 struct task_struct *task)
3796{
3797 int is_active = ctx->is_active;
3798 u64 now;
3799
3800 lockdep_assert_held(&ctx->lock);
3801
3802 if (likely(!ctx->nr_events))
3803 return;
3804
3805 ctx->is_active |= (event_type | EVENT_TIME);
3806 if (ctx->task) {
3807 if (!is_active)
3808 cpuctx->task_ctx = ctx;
3809 else
3810 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3811 }
3812
3813 is_active ^= ctx->is_active;
3814
3815 if (is_active & EVENT_TIME) {
3816
3817 now = perf_clock();
3818 ctx->timestamp = now;
3819 perf_cgroup_set_timestamp(task, ctx);
3820 }
3821
3822
3823
3824
3825
3826 if (is_active & EVENT_PINNED)
3827 ctx_pinned_sched_in(ctx, cpuctx);
3828
3829
3830 if (is_active & EVENT_FLEXIBLE)
3831 ctx_flexible_sched_in(ctx, cpuctx);
3832}
3833
3834static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3835 enum event_type_t event_type,
3836 struct task_struct *task)
3837{
3838 struct perf_event_context *ctx = &cpuctx->ctx;
3839
3840 ctx_sched_in(ctx, cpuctx, event_type, task);
3841}
3842
3843static void perf_event_context_sched_in(struct perf_event_context *ctx,
3844 struct task_struct *task)
3845{
3846 struct perf_cpu_context *cpuctx;
3847 struct pmu *pmu;
3848
3849 cpuctx = __get_cpu_context(ctx);
3850
3851
3852
3853
3854
3855 pmu = ctx->pmu = cpuctx->ctx.pmu;
3856
3857 if (cpuctx->task_ctx == ctx) {
3858 if (cpuctx->sched_cb_usage)
3859 __perf_pmu_sched_task(cpuctx, true);
3860 return;
3861 }
3862
3863 perf_ctx_lock(cpuctx, ctx);
3864
3865
3866
3867
3868 if (!ctx->nr_events)
3869 goto unlock;
3870
3871 perf_pmu_disable(pmu);
3872
3873
3874
3875
3876
3877
3878
3879
3880 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3881 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3882 perf_event_sched_in(cpuctx, ctx, task);
3883
3884 if (cpuctx->sched_cb_usage && pmu->sched_task)
3885 pmu->sched_task(cpuctx->task_ctx, true);
3886
3887 perf_pmu_enable(pmu);
3888
3889unlock:
3890 perf_ctx_unlock(cpuctx, ctx);
3891}
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
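/*
 * Called from the scheduler when @task is switched in: schedule its
 * event contexts back onto this CPU, emit context-switch side-band
 * events and run any PMU sched_task() callbacks.
 */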
3904void __perf_event_task_sched_in(struct task_struct *prev,
3905 struct task_struct *task)
3906{
3907 struct perf_event_context *ctx;
3908 int ctxn;
3909
3910
3911
3912
3913
3914
3915
3916
3917 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3918 perf_cgroup_sched_in(prev, task);
3919
3920 for_each_task_context_nr(ctxn) {
3921 ctx = task->perf_event_ctxp[ctxn];
3922 if (likely(!ctx))
3923 continue;
3924
3925 perf_event_context_sched_in(ctx, task);
3926 }
3927
3928 if (atomic_read(&nr_switch_events))
3929 perf_event_switch(task, prev, true);
3930
3931 if (__this_cpu_read(perf_sched_cb_usages))
3932 perf_pmu_sched_task(prev, task, true);
3933}
3934
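/*
 * For freq-based events, compute the sample period that should yield
 * attr.sample_freq samples per second, i.e. roughly
 *
 *	period = (count * NSEC_PER_SEC) / (nsec * sample_freq)
 *
 * The REDUCE_FLS() dance below only drops precision when the
 * intermediate products would overflow 64 bits.
 */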
3935static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3936{
3937 u64 frequency = event->attr.sample_freq;
3938 u64 sec = NSEC_PER_SEC;
3939 u64 divisor, dividend;
3940
3941 int count_fls, nsec_fls, frequency_fls, sec_fls;
3942
3943 count_fls = fls64(count);
3944 nsec_fls = fls64(nsec);
3945 frequency_fls = fls64(frequency);
3946 sec_fls = 30;
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962#define REDUCE_FLS(a, b) \
3963do { \
3964 if (a##_fls > b##_fls) { \
3965 a >>= 1; \
3966 a##_fls--; \
3967 } else { \
3968 b >>= 1; \
3969 b##_fls--; \
3970 } \
3971} while (0)
3972
3973
3974
3975
3976
3977 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3978 REDUCE_FLS(nsec, frequency);
3979 REDUCE_FLS(sec, count);
3980 }
3981
3982 if (count_fls + sec_fls > 64) {
3983 divisor = nsec * frequency;
3984
3985 while (count_fls + sec_fls > 64) {
3986 REDUCE_FLS(count, sec);
3987 divisor >>= 1;
3988 }
3989
3990 dividend = count * sec;
3991 } else {
3992 dividend = count * sec;
3993
3994 while (nsec_fls + frequency_fls > 64) {
3995 REDUCE_FLS(nsec, frequency);
3996 dividend >>= 1;
3997 }
3998
3999 divisor = nsec * frequency;
4000 }
4001
4002 if (!divisor)
4003 return dividend;
4004
4005 return div64_u64(dividend, divisor);
4006}
4007
4008static DEFINE_PER_CPU(int, perf_throttled_count);
4009static DEFINE_PER_CPU(u64, perf_throttled_seq);
4010
4011static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
4012{
4013 struct hw_perf_event *hwc = &event->hw;
4014 s64 period, sample_period;
4015 s64 delta;
4016
4017 period = perf_calculate_period(event, nsec, count);
4018
4019 delta = (s64)(period - hwc->sample_period);
4020 delta = (delta + 7) / 8;
4021
4022 sample_period = hwc->sample_period + delta;
4023
4024 if (!sample_period)
4025 sample_period = 1;
4026
4027 hwc->sample_period = sample_period;
4028
4029 if (local64_read(&hwc->period_left) > 8*sample_period) {
4030 if (disable)
4031 event->pmu->stop(event, PERF_EF_UPDATE);
4032
4033 local64_set(&hwc->period_left, 0);
4034
4035 if (disable)
4036 event->pmu->start(event, PERF_EF_RELOAD);
4037 }
4038}
4039
4040
4041
4042
4043
4044
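/*
 * Once per tick, in a single pass over the event list: unthrottle
 * events that hit MAX_INTERRUPTS and, for freq-based events,
 * re-estimate the sample period from the count accumulated over the
 * last tick.
 */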
4045static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
4046 int needs_unthr)
4047{
4048 struct perf_event *event;
4049 struct hw_perf_event *hwc;
4050 u64 now, period = TICK_NSEC;
4051 s64 delta;
4052
4053
4054
4055
4056
4057
4058 if (!(ctx->nr_freq || needs_unthr))
4059 return;
4060
4061 raw_spin_lock(&ctx->lock);
4062 perf_pmu_disable(ctx->pmu);
4063
4064 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4065 if (event->state != PERF_EVENT_STATE_ACTIVE)
4066 continue;
4067
4068 if (!event_filter_match(event))
4069 continue;
4070
4071 perf_pmu_disable(event->pmu);
4072
4073 hwc = &event->hw;
4074
4075 if (hwc->interrupts == MAX_INTERRUPTS) {
4076 hwc->interrupts = 0;
4077 perf_log_throttle(event, 1);
4078 event->pmu->start(event, 0);
4079 }
4080
4081 if (!event->attr.freq || !event->attr.sample_freq)
4082 goto next;
4083
4084
4085
4086
4087 event->pmu->stop(event, PERF_EF_UPDATE);
4088
4089 now = local64_read(&event->count);
4090 delta = now - hwc->freq_count_stamp;
4091 hwc->freq_count_stamp = now;
4092
4093
4094
4095
4096
4097
4098
4099
4100 if (delta > 0)
4101 perf_adjust_period(event, period, delta, false);
4102
4103 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4104 next:
4105 perf_pmu_enable(event->pmu);
4106 }
4107
4108 perf_pmu_enable(ctx->pmu);
4109 raw_spin_unlock(&ctx->lock);
4110}
4111
4112
4113
4114
4115static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4116{
4117
4118
4119
4120
4121 if (ctx->rotate_disable)
4122 return;
4123
4124 perf_event_groups_delete(&ctx->flexible_groups, event);
4125 perf_event_groups_insert(&ctx->flexible_groups, event);
4126}
4127
4128
4129static inline struct perf_event *
4130ctx_event_to_rotate(struct perf_event_context *ctx)
4131{
4132 struct perf_event *event;
4133
4134
4135 event = list_first_entry_or_null(&ctx->flexible_active,
4136 struct perf_event, active_list);
4137
4138
4139 if (!event) {
4140 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4141 typeof(*event), group_node);
4142 }
4143
4144
4145
4146
4147
4148 ctx->rotate_necessary = 0;
4149
4150 return event;
4151}
4152
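/*
 * Multiplexing: when not all flexible groups fit on the PMU, rotate
 * one event to the end of its flexible tree and reschedule, so that
 * over time every group gets its turn on the hardware.
 */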
4153static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
4154{
4155 struct perf_event *cpu_event = NULL, *task_event = NULL;
4156 struct perf_event_context *task_ctx = NULL;
4157 int cpu_rotate, task_rotate;
4158
4159
4160
4161
4162
4163
4164 cpu_rotate = cpuctx->ctx.rotate_necessary;
4165 task_ctx = cpuctx->task_ctx;
4166 task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
4167
4168 if (!(cpu_rotate || task_rotate))
4169 return false;
4170
4171 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4172 perf_pmu_disable(cpuctx->ctx.pmu);
4173
4174 if (task_rotate)
4175 task_event = ctx_event_to_rotate(task_ctx);
4176 if (cpu_rotate)
4177 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
4178
4179
4180
4181
4182
4183 if (task_event || (task_ctx && cpu_event))
4184 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
4185 if (cpu_event)
4186 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
4187
4188 if (task_event)
4189 rotate_ctx(task_ctx, task_event);
4190 if (cpu_event)
4191 rotate_ctx(&cpuctx->ctx, cpu_event);
4192
4193 perf_event_sched_in(cpuctx, task_ctx, current);
4194
4195 perf_pmu_enable(cpuctx->ctx.pmu);
4196 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4197
4198 return true;
4199}
4200
4201void perf_event_task_tick(void)
4202{
4203 struct list_head *head = this_cpu_ptr(&active_ctx_list);
4204 struct perf_event_context *ctx, *tmp;
4205 int throttled;
4206
4207 lockdep_assert_irqs_disabled();
4208
4209 __this_cpu_inc(perf_throttled_seq);
4210 throttled = __this_cpu_xchg(perf_throttled_count, 0);
4211 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4212
4213 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
4214 perf_adjust_freq_unthr_context(ctx, throttled);
4215}
4216
4217static int event_enable_on_exec(struct perf_event *event,
4218 struct perf_event_context *ctx)
4219{
4220 if (!event->attr.enable_on_exec)
4221 return 0;
4222
4223 event->attr.enable_on_exec = 0;
4224 if (event->state >= PERF_EVENT_STATE_INACTIVE)
4225 return 0;
4226
4227 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4228
4229 return 1;
4230}
4231
4232
4233
4234
4235
4236static void perf_event_enable_on_exec(int ctxn)
4237{
4238 struct perf_event_context *ctx, *clone_ctx = NULL;
4239 enum event_type_t event_type = 0;
4240 struct perf_cpu_context *cpuctx;
4241 struct perf_event *event;
4242 unsigned long flags;
4243 int enabled = 0;
4244
4245 local_irq_save(flags);
4246 ctx = current->perf_event_ctxp[ctxn];
4247 if (!ctx || !ctx->nr_events)
4248 goto out;
4249
4250 cpuctx = __get_cpu_context(ctx);
4251 perf_ctx_lock(cpuctx, ctx);
4252 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
4253 list_for_each_entry(event, &ctx->event_list, event_entry) {
4254 enabled |= event_enable_on_exec(event, ctx);
4255 event_type |= get_event_type(event);
4256 }
4257
4258
4259
4260
4261 if (enabled) {
4262 clone_ctx = unclone_ctx(ctx);
4263 ctx_resched(cpuctx, ctx, event_type);
4264 } else {
4265 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
4266 }
4267 perf_ctx_unlock(cpuctx, ctx);
4268
4269out:
4270 local_irq_restore(flags);
4271
4272 if (clone_ctx)
4273 put_ctx(clone_ctx);
4274}
4275
4276static void perf_remove_from_owner(struct perf_event *event);
4277static void perf_event_exit_event(struct perf_event *event,
4278 struct perf_event_context *ctx);
4279
4280
4281
4282
4283
4284static void perf_event_remove_on_exec(int ctxn)
4285{
4286 struct perf_event_context *ctx, *clone_ctx = NULL;
4287 struct perf_event *event, *next;
4288 LIST_HEAD(free_list);
4289 unsigned long flags;
4290 bool modified = false;
4291
4292 ctx = perf_pin_task_context(current, ctxn);
4293 if (!ctx)
4294 return;
4295
4296 mutex_lock(&ctx->mutex);
4297
4298 if (WARN_ON_ONCE(ctx->task != current))
4299 goto unlock;
4300
4301 list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
4302 if (!event->attr.remove_on_exec)
4303 continue;
4304
4305 if (!is_kernel_event(event))
4306 perf_remove_from_owner(event);
4307
4308 modified = true;
4309
4310 perf_event_exit_event(event, ctx);
4311 }
4312
4313 raw_spin_lock_irqsave(&ctx->lock, flags);
4314 if (modified)
4315 clone_ctx = unclone_ctx(ctx);
4316 --ctx->pin_count;
4317 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4318
4319unlock:
4320 mutex_unlock(&ctx->mutex);
4321
4322 put_ctx(ctx);
4323 if (clone_ctx)
4324 put_ctx(clone_ctx);
4325}
4326
4327struct perf_read_data {
4328 struct perf_event *event;
4329 bool group;
4330 int ret;
4331};
4332
4333static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4334{
4335 u16 local_pkg, event_pkg;
4336
4337 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4338 int local_cpu = smp_processor_id();
4339
4340 event_pkg = topology_physical_package_id(event_cpu);
4341 local_pkg = topology_physical_package_id(local_cpu);
4342
4343 if (event_pkg == local_pkg)
4344 return local_cpu;
4345 }
4346
4347 return event_cpu;
4348}
4349
4350
4351
4352
4353static void __perf_event_read(void *info)
4354{
4355 struct perf_read_data *data = info;
4356 struct perf_event *sub, *event = data->event;
4357 struct perf_event_context *ctx = event->ctx;
4358 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4359 struct pmu *pmu = event->pmu;
4360
4361
4362
4363
4364
4365
4366
4367
4368 if (ctx->task && cpuctx->task_ctx != ctx)
4369 return;
4370
4371 raw_spin_lock(&ctx->lock);
4372 if (ctx->is_active & EVENT_TIME) {
4373 update_context_time(ctx);
4374 update_cgrp_time_from_event(event);
4375 }
4376
4377 perf_event_update_time(event);
4378 if (data->group)
4379 perf_event_update_sibling_time(event);
4380
4381 if (event->state != PERF_EVENT_STATE_ACTIVE)
4382 goto unlock;
4383
4384 if (!data->group) {
4385 pmu->read(event);
4386 data->ret = 0;
4387 goto unlock;
4388 }
4389
4390 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4391
4392 pmu->read(event);
4393
4394 for_each_sibling_event(sub, event) {
4395 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4396
4397
4398
4399
4400 sub->pmu->read(sub);
4401 }
4402 }
4403
4404 data->ret = pmu->commit_txn(pmu);
4405
4406unlock:
4407 raw_spin_unlock(&ctx->lock);
4408}
4409
4410static inline u64 perf_event_count(struct perf_event *event)
4411{
4412 return local64_read(&event->count) + atomic64_read(&event->child_count);
4413}
4414
4415
4416
4417
4418
4419
4420
4421
4422
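/*
 * NMI-safe read of a local event, i.e. one that counts the current
 * task or this CPU and has no inherited children.  Used, for example,
 * by the BPF perf_event_read helpers.  Returns 0 and fills @value (and
 * optionally @enabled/@running), or a negative error code.
 */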
4423int perf_event_read_local(struct perf_event *event, u64 *value,
4424 u64 *enabled, u64 *running)
4425{
4426 unsigned long flags;
4427 int ret = 0;
4428
4429
4430
4431
4432
4433 local_irq_save(flags);
4434
4435
4436
4437
4438
4439 if (event->attr.inherit) {
4440 ret = -EOPNOTSUPP;
4441 goto out;
4442 }
4443
4444
4445 if ((event->attach_state & PERF_ATTACH_TASK) &&
4446 event->hw.target != current) {
4447 ret = -EINVAL;
4448 goto out;
4449 }
4450
4451
4452 if (!(event->attach_state & PERF_ATTACH_TASK) &&
4453 event->cpu != smp_processor_id()) {
4454 ret = -EINVAL;
4455 goto out;
4456 }
4457
4458
4459 if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4460 ret = -EBUSY;
4461 goto out;
4462 }
4463
4469
4470
4471
4472
4473
4474
4475 if (event->oncpu == smp_processor_id())
4476 event->pmu->read(event);
4477
4478 *value = local64_read(&event->count);
4479 if (enabled || running) {
4480 u64 now = event->shadow_ctx_time + perf_clock();
4481 u64 __enabled, __running;
4482
4483 __perf_update_times(event, now, &__enabled, &__running);
4484 if (enabled)
4485 *enabled = __enabled;
4486 if (running)
4487 *running = __running;
4488 }
4489out:
4490 local_irq_restore(flags);
4491
4492 return ret;
4493}
4494
4495static int perf_event_read(struct perf_event *event, bool group)
4496{
4497 enum perf_event_state state = READ_ONCE(event->state);
4498 int event_cpu, ret = 0;
4499
4500
4501
4502
4503
4504again:
4505 if (state == PERF_EVENT_STATE_ACTIVE) {
4506 struct perf_read_data data;
4507
4508
4509
4510
4511
4512
4513
4514 smp_rmb();
4515
4516 event_cpu = READ_ONCE(event->oncpu);
4517 if ((unsigned)event_cpu >= nr_cpu_ids)
4518 return 0;
4519
4520 data = (struct perf_read_data){
4521 .event = event,
4522 .group = group,
4523 .ret = 0,
4524 };
4525
4526 preempt_disable();
4527 event_cpu = __perf_event_read_cpu(event, event_cpu);
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4540 preempt_enable();
4541 ret = data.ret;
4542
4543 } else if (state == PERF_EVENT_STATE_INACTIVE) {
4544 struct perf_event_context *ctx = event->ctx;
4545 unsigned long flags;
4546
4547 raw_spin_lock_irqsave(&ctx->lock, flags);
4548 state = event->state;
4549 if (state != PERF_EVENT_STATE_INACTIVE) {
4550 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4551 goto again;
4552 }
4553
4554
4555
4556
4557
4558 if (ctx->is_active & EVENT_TIME) {
4559 update_context_time(ctx);
4560 update_cgrp_time_from_event(event);
4561 }
4562
4563 perf_event_update_time(event);
4564 if (group)
4565 perf_event_update_sibling_time(event);
4566 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4567 }
4568
4569 return ret;
4570}
4571
4572
4573
4574
4575static void __perf_event_init_context(struct perf_event_context *ctx)
4576{
4577 raw_spin_lock_init(&ctx->lock);
4578 mutex_init(&ctx->mutex);
4579 INIT_LIST_HEAD(&ctx->active_ctx_list);
4580 perf_event_groups_init(&ctx->pinned_groups);
4581 perf_event_groups_init(&ctx->flexible_groups);
4582 INIT_LIST_HEAD(&ctx->event_list);
4583 INIT_LIST_HEAD(&ctx->pinned_active);
4584 INIT_LIST_HEAD(&ctx->flexible_active);
4585 refcount_set(&ctx->refcount, 1);
4586}
4587
4588static struct perf_event_context *
4589alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4590{
4591 struct perf_event_context *ctx;
4592
4593 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4594 if (!ctx)
4595 return NULL;
4596
4597 __perf_event_init_context(ctx);
4598 if (task)
4599 ctx->task = get_task_struct(task);
4600 ctx->pmu = pmu;
4601
4602 return ctx;
4603}
4604
4605static struct task_struct *
4606find_lively_task_by_vpid(pid_t vpid)
4607{
4608 struct task_struct *task;
4609
4610 rcu_read_lock();
4611 if (!vpid)
4612 task = current;
4613 else
4614 task = find_task_by_vpid(vpid);
4615 if (task)
4616 get_task_struct(task);
4617 rcu_read_unlock();
4618
4619 if (!task)
4620 return ERR_PTR(-ESRCH);
4621
4622 return task;
4623}
4624
4625
4626
4627
4628static struct perf_event_context *
4629find_get_context(struct pmu *pmu, struct task_struct *task,
4630 struct perf_event *event)
4631{
4632 struct perf_event_context *ctx, *clone_ctx = NULL;
4633 struct perf_cpu_context *cpuctx;
4634 void *task_ctx_data = NULL;
4635 unsigned long flags;
4636 int ctxn, err;
4637 int cpu = event->cpu;
4638
4639 if (!task) {
4640
4641 err = perf_allow_cpu(&event->attr);
4642 if (err)
4643 return ERR_PTR(err);
4644
4645 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4646 ctx = &cpuctx->ctx;
4647 get_ctx(ctx);
4648 raw_spin_lock_irqsave(&ctx->lock, flags);
4649 ++ctx->pin_count;
4650 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4651
4652 return ctx;
4653 }
4654
4655 err = -EINVAL;
4656 ctxn = pmu->task_ctx_nr;
4657 if (ctxn < 0)
4658 goto errout;
4659
4660 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4661 task_ctx_data = alloc_task_ctx_data(pmu);
4662 if (!task_ctx_data) {
4663 err = -ENOMEM;
4664 goto errout;
4665 }
4666 }
4667
4668retry:
4669 ctx = perf_lock_task_context(task, ctxn, &flags);
4670 if (ctx) {
4671 clone_ctx = unclone_ctx(ctx);
4672 ++ctx->pin_count;
4673
4674 if (task_ctx_data && !ctx->task_ctx_data) {
4675 ctx->task_ctx_data = task_ctx_data;
4676 task_ctx_data = NULL;
4677 }
4678 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4679
4680 if (clone_ctx)
4681 put_ctx(clone_ctx);
4682 } else {
4683 ctx = alloc_perf_context(pmu, task);
4684 err = -ENOMEM;
4685 if (!ctx)
4686 goto errout;
4687
4688 if (task_ctx_data) {
4689 ctx->task_ctx_data = task_ctx_data;
4690 task_ctx_data = NULL;
4691 }
4692
4693 err = 0;
4694 mutex_lock(&task->perf_event_mutex);
4695
4696
4697
4698
4699 if (task->flags & PF_EXITING)
4700 err = -ESRCH;
4701 else if (task->perf_event_ctxp[ctxn])
4702 err = -EAGAIN;
4703 else {
4704 get_ctx(ctx);
4705 ++ctx->pin_count;
4706 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4707 }
4708 mutex_unlock(&task->perf_event_mutex);
4709
4710 if (unlikely(err)) {
4711 put_ctx(ctx);
4712
4713 if (err == -EAGAIN)
4714 goto retry;
4715 goto errout;
4716 }
4717 }
4718
4719 free_task_ctx_data(pmu, task_ctx_data);
4720 return ctx;
4721
4722errout:
4723 free_task_ctx_data(pmu, task_ctx_data);
4724 return ERR_PTR(err);
4725}
4726
4727static void perf_event_free_filter(struct perf_event *event);
4728static void perf_event_free_bpf_prog(struct perf_event *event);
4729
4730static void free_event_rcu(struct rcu_head *head)
4731{
4732 struct perf_event *event;
4733
4734 event = container_of(head, struct perf_event, rcu_head);
4735 if (event->ns)
4736 put_pid_ns(event->ns);
4737 perf_event_free_filter(event);
4738 kmem_cache_free(perf_event_cache, event);
4739}
4740
4741static void ring_buffer_attach(struct perf_event *event,
4742 struct perf_buffer *rb);
4743
4744static void detach_sb_event(struct perf_event *event)
4745{
4746 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4747
4748 raw_spin_lock(&pel->lock);
4749 list_del_rcu(&event->sb_list);
4750 raw_spin_unlock(&pel->lock);
4751}
4752
4753static bool is_sb_event(struct perf_event *event)
4754{
4755 struct perf_event_attr *attr = &event->attr;
4756
4757 if (event->parent)
4758 return false;
4759
4760 if (event->attach_state & PERF_ATTACH_TASK)
4761 return false;
4762
4763 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4764 attr->comm || attr->comm_exec ||
4765 attr->task || attr->ksymbol ||
4766 attr->context_switch || attr->text_poke ||
4767 attr->bpf_event)
4768 return true;
4769 return false;
4770}
4771
4772static void unaccount_pmu_sb_event(struct perf_event *event)
4773{
4774 if (is_sb_event(event))
4775 detach_sb_event(event);
4776}
4777
4778static void unaccount_event_cpu(struct perf_event *event, int cpu)
4779{
4780 if (event->parent)
4781 return;
4782
4783 if (is_cgroup_event(event))
4784 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4785}
4786
4787#ifdef CONFIG_NO_HZ_FULL
4788static DEFINE_SPINLOCK(nr_freq_lock);
4789#endif
4790
4791static void unaccount_freq_event_nohz(void)
4792{
4793#ifdef CONFIG_NO_HZ_FULL
4794 spin_lock(&nr_freq_lock);
4795 if (atomic_dec_and_test(&nr_freq_events))
4796 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4797 spin_unlock(&nr_freq_lock);
4798#endif
4799}
4800
4801static void unaccount_freq_event(void)
4802{
4803 if (tick_nohz_full_enabled())
4804 unaccount_freq_event_nohz();
4805 else
4806 atomic_dec(&nr_freq_events);
4807}
4808
4809static void unaccount_event(struct perf_event *event)
4810{
4811 bool dec = false;
4812
4813 if (event->parent)
4814 return;
4815
4816 if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
4817 dec = true;
4818 if (event->attr.mmap || event->attr.mmap_data)
4819 atomic_dec(&nr_mmap_events);
4820 if (event->attr.build_id)
4821 atomic_dec(&nr_build_id_events);
4822 if (event->attr.comm)
4823 atomic_dec(&nr_comm_events);
4824 if (event->attr.namespaces)
4825 atomic_dec(&nr_namespaces_events);
4826 if (event->attr.cgroup)
4827 atomic_dec(&nr_cgroup_events);
4828 if (event->attr.task)
4829 atomic_dec(&nr_task_events);
4830 if (event->attr.freq)
4831 unaccount_freq_event();
4832 if (event->attr.context_switch) {
4833 dec = true;
4834 atomic_dec(&nr_switch_events);
4835 }
4836 if (is_cgroup_event(event))
4837 dec = true;
4838 if (has_branch_stack(event))
4839 dec = true;
4840 if (event->attr.ksymbol)
4841 atomic_dec(&nr_ksymbol_events);
4842 if (event->attr.bpf_event)
4843 atomic_dec(&nr_bpf_events);
4844 if (event->attr.text_poke)
4845 atomic_dec(&nr_text_poke_events);
4846
4847 if (dec) {
4848 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4849 schedule_delayed_work(&perf_sched_work, HZ);
4850 }
4851
4852 unaccount_event_cpu(event, event->cpu);
4853
4854 unaccount_pmu_sb_event(event);
4855}
4856
4857static void perf_sched_delayed(struct work_struct *work)
4858{
4859 mutex_lock(&perf_sched_mutex);
4860 if (atomic_dec_and_test(&perf_sched_count))
4861 static_branch_disable(&perf_sched_events);
4862 mutex_unlock(&perf_sched_mutex);
4863}
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
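/*
 * Exclusive PMUs (e.g. AUX-area PMUs) cannot mix per-task and per-CPU
 * events: pmu->exclusive_cnt is used as a signed count, driven positive
 * by task events and negative by CPU events, and crossing zero from the
 * "wrong" side fails with -EBUSY.
 */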
4877static int exclusive_event_init(struct perf_event *event)
4878{
4879 struct pmu *pmu = event->pmu;
4880
4881 if (!is_exclusive_pmu(pmu))
4882 return 0;
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897 if (event->attach_state & PERF_ATTACH_TASK) {
4898 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4899 return -EBUSY;
4900 } else {
4901 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4902 return -EBUSY;
4903 }
4904
4905 return 0;
4906}
4907
4908static void exclusive_event_destroy(struct perf_event *event)
4909{
4910 struct pmu *pmu = event->pmu;
4911
4912 if (!is_exclusive_pmu(pmu))
4913 return;
4914
4915
4916 if (event->attach_state & PERF_ATTACH_TASK)
4917 atomic_dec(&pmu->exclusive_cnt);
4918 else
4919 atomic_inc(&pmu->exclusive_cnt);
4920}
4921
4922static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4923{
4924 if ((e1->pmu == e2->pmu) &&
4925 (e1->cpu == e2->cpu ||
4926 e1->cpu == -1 ||
4927 e2->cpu == -1))
4928 return true;
4929 return false;
4930}
4931
4932static bool exclusive_event_installable(struct perf_event *event,
4933 struct perf_event_context *ctx)
4934{
4935 struct perf_event *iter_event;
4936 struct pmu *pmu = event->pmu;
4937
4938 lockdep_assert_held(&ctx->mutex);
4939
4940 if (!is_exclusive_pmu(pmu))
4941 return true;
4942
4943 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4944 if (exclusive_event_match(iter_event, event))
4945 return false;
4946 }
4947
4948 return true;
4949}
4950
4951static void perf_addr_filters_splice(struct perf_event *event,
4952 struct list_head *head);
4953
4954static void _free_event(struct perf_event *event)
4955{
4956 irq_work_sync(&event->pending);
4957
4958 unaccount_event(event);
4959
4960 security_perf_event_free(event);
4961
4962 if (event->rb) {
4963
4964
4965
4966
4967
4968
4969 mutex_lock(&event->mmap_mutex);
4970 ring_buffer_attach(event, NULL);
4971 mutex_unlock(&event->mmap_mutex);
4972 }
4973
4974 if (is_cgroup_event(event))
4975 perf_detach_cgroup(event);
4976
4977 if (!event->parent) {
4978 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4979 put_callchain_buffers();
4980 }
4981
4982 perf_event_free_bpf_prog(event);
4983 perf_addr_filters_splice(event, NULL);
4984 kfree(event->addr_filter_ranges);
4985
4986 if (event->destroy)
4987 event->destroy(event);
4988
4989
4990
4991
4992
4993 if (event->hw.target)
4994 put_task_struct(event->hw.target);
4995
4996
4997
4998
4999
5000 if (event->ctx)
5001 put_ctx(event->ctx);
5002
5003 exclusive_event_destroy(event);
5004 module_put(event->pmu->module);
5005
5006 call_rcu(&event->rcu_head, free_event_rcu);
5007}
5008
5009
5010
5011
5012
5013static void free_event(struct perf_event *event)
5014{
5015 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
5016 "unexpected event refcount: %ld; ptr=%p\n",
5017 atomic_long_read(&event->refcount), event)) {
5018
5019 return;
5020 }
5021
5022 _free_event(event);
5023}
5024
5025
5026
5027
5028static void perf_remove_from_owner(struct perf_event *event)
5029{
5030 struct task_struct *owner;
5031
5032 rcu_read_lock();
5033
5034
5035
5036
5037
5038
5039 owner = READ_ONCE(event->owner);
5040 if (owner) {
5041
5042
5043
5044
5045
5046 get_task_struct(owner);
5047 }
5048 rcu_read_unlock();
5049
5050 if (owner) {
5051
5052
5053
5054
5055
5056
5057
5058
5059 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
5060
5061
5062
5063
5064
5065
5066
5067 if (event->owner) {
5068 list_del_init(&event->owner_entry);
5069 smp_store_release(&event->owner, NULL);
5070 }
5071 mutex_unlock(&owner->perf_event_mutex);
5072 put_task_struct(owner);
5073 }
5074}
5075
5076static void put_event(struct perf_event *event)
5077{
5078 if (!atomic_long_dec_and_test(&event->refcount))
5079 return;
5080
5081 _free_event(event);
5082}
5083
5084
5085
5086
5087
5088
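/*
 * Kill an event dead: mark it DEAD so no new children can be spawned
 * from it, detach it from its owner and context, tear down all
 * inherited child events, and drop the final reference.
 */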
5089int perf_event_release_kernel(struct perf_event *event)
5090{
5091 struct perf_event_context *ctx = event->ctx;
5092 struct perf_event *child, *tmp;
5093 LIST_HEAD(free_list);
5094
5095
5096
5097
5098
5099 if (!ctx) {
5100 WARN_ON_ONCE(event->attach_state &
5101 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
5102 goto no_ctx;
5103 }
5104
5105 if (!is_kernel_event(event))
5106 perf_remove_from_owner(event);
5107
5108 ctx = perf_event_ctx_lock(event);
5109 WARN_ON_ONCE(ctx->parent_ctx);
5110 perf_remove_from_context(event, DETACH_GROUP);
5111
5112 raw_spin_lock_irq(&ctx->lock);
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124 event->state = PERF_EVENT_STATE_DEAD;
5125 raw_spin_unlock_irq(&ctx->lock);
5126
5127 perf_event_ctx_unlock(event, ctx);
5128
5129again:
5130 mutex_lock(&event->child_mutex);
5131 list_for_each_entry(child, &event->child_list, child_list) {
5132
5133
5134
5135
5136
5137 ctx = READ_ONCE(child->ctx);
5138
5139
5140
5141
5142
5143
5144
5145
5146 get_ctx(ctx);
5147
5148
5149
5150
5151
5152
5153 mutex_unlock(&event->child_mutex);
5154 mutex_lock(&ctx->mutex);
5155 mutex_lock(&event->child_mutex);
5156
5157
5158
5159
5160
5161
5162 tmp = list_first_entry_or_null(&event->child_list,
5163 struct perf_event, child_list);
5164 if (tmp == child) {
5165 perf_remove_from_context(child, DETACH_GROUP);
5166 list_move(&child->child_list, &free_list);
5167
5168
5169
5170
5171 put_event(event);
5172 }
5173
5174 mutex_unlock(&event->child_mutex);
5175 mutex_unlock(&ctx->mutex);
5176 put_ctx(ctx);
5177 goto again;
5178 }
5179 mutex_unlock(&event->child_mutex);
5180
5181 list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5182 void *var = &child->ctx->refcount;
5183
5184 list_del(&child->child_list);
5185 free_event(child);
5186
5187
5188
5189
5190
5191 smp_mb();
5192 wake_up_var(var);
5193 }
5194
5195no_ctx:
5196 put_event(event);
5197 return 0;
5198}
5199EXPORT_SYMBOL_GPL(perf_event_release_kernel);
5200
5201
5202
5203
5204static int perf_release(struct inode *inode, struct file *file)
5205{
5206 perf_event_release_kernel(file->private_data);
5207 return 0;
5208}
5209
5210static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5211{
5212 struct perf_event *child;
5213 u64 total = 0;
5214
5215 *enabled = 0;
5216 *running = 0;
5217
5218 mutex_lock(&event->child_mutex);
5219
5220 (void)perf_event_read(event, false);
5221 total += perf_event_count(event);
5222
5223 *enabled += event->total_time_enabled +
5224 atomic64_read(&event->child_total_time_enabled);
5225 *running += event->total_time_running +
5226 atomic64_read(&event->child_total_time_running);
5227
5228 list_for_each_entry(child, &event->child_list, child_list) {
5229 (void)perf_event_read(child, false);
5230 total += perf_event_count(child);
5231 *enabled += child->total_time_enabled;
5232 *running += child->total_time_running;
5233 }
5234 mutex_unlock(&event->child_mutex);
5235
5236 return total;
5237}
5238
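/*
 * perf_event_read_value - read an event's total count and times,
 * summed over all of its inherited children.
 *
 * Illustrative kernel-side usage (sketch only, error handling elided;
 * @event is assumed to come from perf_event_create_kernel_counter()):
 *
 *	u64 enabled, running;
 *	u64 count = perf_event_read_value(event, &enabled, &running);
 *	pr_info("count=%llu enabled=%llu running=%llu\n",
 *		count, enabled, running);
 */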
5239u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5240{
5241 struct perf_event_context *ctx;
5242 u64 count;
5243
5244 ctx = perf_event_ctx_lock(event);
5245 count = __perf_event_read_value(event, enabled, running);
5246 perf_event_ctx_unlock(event, ctx);
5247
5248 return count;
5249}
5250EXPORT_SYMBOL_GPL(perf_event_read_value);
5251
5252static int __perf_read_group_add(struct perf_event *leader,
5253 u64 read_format, u64 *values)
5254{
5255 struct perf_event_context *ctx = leader->ctx;
5256 struct perf_event *sub;
5257 unsigned long flags;
5258 int n = 1;
5259 int ret;
5260
5261 ret = perf_event_read(leader, true);
5262 if (ret)
5263 return ret;
5264
5265 raw_spin_lock_irqsave(&ctx->lock, flags);
5266
5267
5268
5269
5270
5271
5272 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5273 values[n++] += leader->total_time_enabled +
5274 atomic64_read(&leader->child_total_time_enabled);
5275 }
5276
5277 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5278 values[n++] += leader->total_time_running +
5279 atomic64_read(&leader->child_total_time_running);
5280 }
5281
5282
5283
5284
5285 values[n++] += perf_event_count(leader);
5286 if (read_format & PERF_FORMAT_ID)
5287 values[n++] = primary_event_id(leader);
5288
5289 for_each_sibling_event(sub, leader) {
5290 values[n++] += perf_event_count(sub);
5291 if (read_format & PERF_FORMAT_ID)
5292 values[n++] = primary_event_id(sub);
5293 }
5294
5295 raw_spin_unlock_irqrestore(&ctx->lock, flags);
5296 return 0;
5297}
5298
5299static int perf_read_group(struct perf_event *event,
5300 u64 read_format, char __user *buf)
5301{
5302 struct perf_event *leader = event->group_leader, *child;
5303 struct perf_event_context *ctx = leader->ctx;
5304 int ret;
5305 u64 *values;
5306
5307 lockdep_assert_held(&ctx->mutex);
5308
5309 values = kzalloc(event->read_size, GFP_KERNEL);
5310 if (!values)
5311 return -ENOMEM;
5312
5313 values[0] = 1 + leader->nr_siblings;
5314
5315
5316
5317
5318
5319 mutex_lock(&leader->child_mutex);
5320
5321 ret = __perf_read_group_add(leader, read_format, values);
5322 if (ret)
5323 goto unlock;
5324
5325 list_for_each_entry(child, &leader->child_list, child_list) {
5326 ret = __perf_read_group_add(child, read_format, values);
5327 if (ret)
5328 goto unlock;
5329 }
5330
5331 mutex_unlock(&leader->child_mutex);
5332
5333 ret = event->read_size;
5334 if (copy_to_user(buf, values, event->read_size))
5335 ret = -EFAULT;
5336 goto out;
5337
5338unlock:
5339 mutex_unlock(&leader->child_mutex);
5340out:
5341 kfree(values);
5342 return ret;
5343}
5344
5345static int perf_read_one(struct perf_event *event,
5346 u64 read_format, char __user *buf)
5347{
5348 u64 enabled, running;
5349 u64 values[4];
5350 int n = 0;
5351
5352 values[n++] = __perf_event_read_value(event, &enabled, &running);
5353 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5354 values[n++] = enabled;
5355 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5356 values[n++] = running;
5357 if (read_format & PERF_FORMAT_ID)
5358 values[n++] = primary_event_id(event);
5359
5360 if (copy_to_user(buf, values, n * sizeof(u64)))
5361 return -EFAULT;
5362
5363 return n * sizeof(u64);
5364}
5365
5366static bool is_event_hup(struct perf_event *event)
5367{
5368 bool no_children;
5369
5370 if (event->state > PERF_EVENT_STATE_EXIT)
5371 return false;
5372
5373 mutex_lock(&event->child_mutex);
5374 no_children = list_empty(&event->child_list);
5375 mutex_unlock(&event->child_mutex);
5376 return no_children;
5377}
5378
5379
5380
5381
5382static ssize_t
5383__perf_read(struct perf_event *event, char __user *buf, size_t count)
5384{
5385 u64 read_format = event->attr.read_format;
5386 int ret;
5387
 /*
  * Return end-of-file for a read on an event that is in
  * error state (i.e. because it was pinned but it couldn't be
  * scheduled on to the CPU at some point).
  */
5393 if (event->state == PERF_EVENT_STATE_ERROR)
5394 return 0;
5395
5396 if (count < event->read_size)
5397 return -ENOSPC;
5398
5399 WARN_ON_ONCE(event->ctx->parent_ctx);
5400 if (read_format & PERF_FORMAT_GROUP)
5401 ret = perf_read_group(event, read_format, buf);
5402 else
5403 ret = perf_read_one(event, read_format, buf);
5404
5405 return ret;
5406}
5407
5408static ssize_t
5409perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5410{
5411 struct perf_event *event = file->private_data;
5412 struct perf_event_context *ctx;
5413 int ret;
5414
5415 ret = security_perf_event_read(event);
5416 if (ret)
5417 return ret;
5418
5419 ctx = perf_event_ctx_lock(event);
5420 ret = __perf_read(event, buf, count);
5421 perf_event_ctx_unlock(event, ctx);
5422
5423 return ret;
5424}
5425
5426static __poll_t perf_poll(struct file *file, poll_table *wait)
5427{
5428 struct perf_event *event = file->private_data;
5429 struct perf_buffer *rb;
5430 __poll_t events = EPOLLHUP;
5431
5432 poll_wait(file, &event->waitq, wait);
5433
5434 if (is_event_hup(event))
5435 return events;
5436
 /*
  * Pin the event->rb by taking event->mmap_mutex; otherwise
  * perf_event_set_output() can swizzle our rb and make us miss wakeups.
  */
5441 mutex_lock(&event->mmap_mutex);
5442 rb = event->rb;
5443 if (rb)
5444 events = atomic_xchg(&rb->poll, 0);
5445 mutex_unlock(&event->mmap_mutex);
5446 return events;
5447}
5448
5449static void _perf_event_reset(struct perf_event *event)
5450{
5451 (void)perf_event_read(event, false);
5452 local64_set(&event->count, 0);
5453 perf_event_update_userpage(event);
5454}
5455
5456
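/*
 * perf_event_pause - disable @event and return its current count,
 * optionally resetting the count to zero.  Not meant for inherited
 * events (hence the WARN below); the value returned is the count as it
 * was before any reset.
 */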
5457u64 perf_event_pause(struct perf_event *event, bool reset)
5458{
5459 struct perf_event_context *ctx;
5460 u64 count;
5461
5462 ctx = perf_event_ctx_lock(event);
5463 WARN_ON_ONCE(event->attr.inherit);
5464 _perf_event_disable(event);
5465 count = local64_read(&event->count);
5466 if (reset)
5467 local64_set(&event->count, 0);
5468 perf_event_ctx_unlock(event, ctx);
5469
5470 return count;
5471}
5472EXPORT_SYMBOL_GPL(perf_event_pause);
5473
/*
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in perf_event_exit_event() if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
 */
5480static void perf_event_for_each_child(struct perf_event *event,
5481 void (*func)(struct perf_event *))
5482{
5483 struct perf_event *child;
5484
5485 WARN_ON_ONCE(event->ctx->parent_ctx);
5486
5487 mutex_lock(&event->child_mutex);
5488 func(event);
5489 list_for_each_entry(child, &event->child_list, child_list)
5490 func(child);
5491 mutex_unlock(&event->child_mutex);
5492}
5493
5494static void perf_event_for_each(struct perf_event *event,
5495 void (*func)(struct perf_event *))
5496{
5497 struct perf_event_context *ctx = event->ctx;
5498 struct perf_event *sibling;
5499
5500 lockdep_assert_held(&ctx->mutex);
5501
5502 event = event->group_leader;
5503
5504 perf_event_for_each_child(event, func);
5505 for_each_sibling_event(sibling, event)
5506 perf_event_for_each_child(sibling, func);
5507}
5508
5509static void __perf_event_period(struct perf_event *event,
5510 struct perf_cpu_context *cpuctx,
5511 struct perf_event_context *ctx,
5512 void *info)
5513{
5514 u64 value = *((u64 *)info);
5515 bool active;
5516
5517 if (event->attr.freq) {
5518 event->attr.sample_freq = value;
5519 } else {
5520 event->attr.sample_period = value;
5521 event->hw.sample_period = value;
5522 }
5523
5524 active = (event->state == PERF_EVENT_STATE_ACTIVE);
5525 if (active) {
5526 perf_pmu_disable(ctx->pmu);
 /*
  * We could be throttled; unthrottle now to avoid the tick
  * trying to unthrottle while we already re-started the event.
  */
5531 if (event->hw.interrupts == MAX_INTERRUPTS) {
5532 event->hw.interrupts = 0;
5533 perf_log_throttle(event, 1);
5534 }
5535 event->pmu->stop(event, PERF_EF_UPDATE);
5536 }
5537
5538 local64_set(&event->hw.period_left, 0);
5539
5540 if (active) {
5541 event->pmu->start(event, PERF_EF_RELOAD);
5542 perf_pmu_enable(ctx->pmu);
5543 }
5544}
5545
5546static int perf_event_check_period(struct perf_event *event, u64 value)
5547{
5548 return event->pmu->check_period(event, value);
5549}
5550
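/*
 * Validate and install a new sample period/frequency:
 *  - only sampling events can have their period changed,
 *  - the value must be non-zero,
 *  - a frequency may not exceed sysctl_perf_event_sample_rate,
 *  - a period must fit in 63 bits and be accepted by the PMU's
 *    check_period() callback.
 * The update itself runs in the event's context via event_function_call().
 */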
5551static int _perf_event_period(struct perf_event *event, u64 value)
5552{
5553 if (!is_sampling_event(event))
5554 return -EINVAL;
5555
5556 if (!value)
5557 return -EINVAL;
5558
5559 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5560 return -EINVAL;
5561
5562 if (perf_event_check_period(event, value))
5563 return -EINVAL;
5564
5565 if (!event->attr.freq && (value & (1ULL << 63)))
5566 return -EINVAL;
5567
5568 event_function_call(event, __perf_event_period, &value);
5569
5570 return 0;
5571}
5572
5573int perf_event_period(struct perf_event *event, u64 value)
5574{
5575 struct perf_event_context *ctx;
5576 int ret;
5577
5578 ctx = perf_event_ctx_lock(event);
5579 ret = _perf_event_period(event, value);
5580 perf_event_ctx_unlock(event, ctx);
5581
5582 return ret;
5583}
5584EXPORT_SYMBOL_GPL(perf_event_period);
5585
5586static const struct file_operations perf_fops;
5587
5588static inline int perf_fget_light(int fd, struct fd *p)
5589{
5590 struct fd f = fdget(fd);
5591 if (!f.file)
5592 return -EBADF;
5593
5594 if (f.file->f_op != &perf_fops) {
5595 fdput(f);
5596 return -EBADF;
5597 }
5598 *p = f;
5599 return 0;
5600}
5601
5602static int perf_event_set_output(struct perf_event *event,
5603 struct perf_event *output_event);
5604static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5605static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5606static int perf_copy_attr(struct perf_event_attr __user *uattr,
5607 struct perf_event_attr *attr);
5608
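/*
 * Implementation of the perf_event ioctl()s; called by perf_ioctl() with
 * the event's context locked.  Commands that boil down to a simple
 * enable/disable/reset are applied to the whole group or just to the
 * event and its children, depending on PERF_IOC_FLAG_GROUP.
 */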
5609static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5610{
5611 void (*func)(struct perf_event *);
5612 u32 flags = arg;
5613
5614 switch (cmd) {
5615 case PERF_EVENT_IOC_ENABLE:
5616 func = _perf_event_enable;
5617 break;
5618 case PERF_EVENT_IOC_DISABLE:
5619 func = _perf_event_disable;
5620 break;
5621 case PERF_EVENT_IOC_RESET:
5622 func = _perf_event_reset;
5623 break;
5624
5625 case PERF_EVENT_IOC_REFRESH:
5626 return _perf_event_refresh(event, arg);
5627
5628 case PERF_EVENT_IOC_PERIOD:
5629 {
5630 u64 value;
5631
5632 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5633 return -EFAULT;
5634
5635 return _perf_event_period(event, value);
5636 }
5637 case PERF_EVENT_IOC_ID:
5638 {
5639 u64 id = primary_event_id(event);
5640
5641 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5642 return -EFAULT;
5643 return 0;
5644 }
5645
5646 case PERF_EVENT_IOC_SET_OUTPUT:
5647 {
5648 int ret;
5649 if (arg != -1) {
5650 struct perf_event *output_event;
5651 struct fd output;
5652 ret = perf_fget_light(arg, &output);
5653 if (ret)
5654 return ret;
5655 output_event = output.file->private_data;
5656 ret = perf_event_set_output(event, output_event);
5657 fdput(output);
5658 } else {
5659 ret = perf_event_set_output(event, NULL);
5660 }
5661 return ret;
5662 }
5663
5664 case PERF_EVENT_IOC_SET_FILTER:
5665 return perf_event_set_filter(event, (void __user *)arg);
5666
5667 case PERF_EVENT_IOC_SET_BPF:
5668 return perf_event_set_bpf_prog(event, arg);
5669
5670 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5671 struct perf_buffer *rb;
5672
5673 rcu_read_lock();
5674 rb = rcu_dereference(event->rb);
5675 if (!rb || !rb->nr_pages) {
5676 rcu_read_unlock();
5677 return -EINVAL;
5678 }
5679 rb_toggle_paused(rb, !!arg);
5680 rcu_read_unlock();
5681 return 0;
5682 }
5683
5684 case PERF_EVENT_IOC_QUERY_BPF:
5685 return perf_event_query_prog_array(event, (void __user *)arg);
5686
5687 case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5688 struct perf_event_attr new_attr;
5689 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5690 &new_attr);
5691
5692 if (err)
5693 return err;
5694
5695 return perf_event_modify_attr(event, &new_attr);
5696 }
5697 default:
5698 return -ENOTTY;
5699 }
5700
5701 if (flags & PERF_IOC_FLAG_GROUP)
5702 perf_event_for_each(event, func);
5703 else
5704 perf_event_for_each_child(event, func);
5705
5706 return 0;
5707}
5708
5709static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5710{
5711 struct perf_event *event = file->private_data;
5712 struct perf_event_context *ctx;
5713 long ret;
5714
 /* Treat the ioctl like a write, as most commands mutate the event. */
5716 ret = security_perf_event_write(event);
5717 if (ret)
5718 return ret;
5719
5720 ctx = perf_event_ctx_lock(event);
5721 ret = _perf_ioctl(event, cmd, arg);
5722 perf_event_ctx_unlock(event, ctx);
5723
5724 return ret;
5725}
5726
5727#ifdef CONFIG_COMPAT
5728static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5729 unsigned long arg)
5730{
5731 switch (_IOC_NR(cmd)) {
5732 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5733 case _IOC_NR(PERF_EVENT_IOC_ID):
5734 case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5735 case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
 /* Fix up the pointer size (usually 4 -> 8 in the 32-on-64-bit case). */
5737 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5738 cmd &= ~IOCSIZE_MASK;
5739 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5740 }
5741 break;
5742 }
5743 return perf_ioctl(file, cmd, arg);
5744}
5745#else
5746# define perf_compat_ioctl NULL
5747#endif
5748
5749int perf_event_task_enable(void)
5750{
5751 struct perf_event_context *ctx;
5752 struct perf_event *event;
5753
5754 mutex_lock(&current->perf_event_mutex);
5755 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5756 ctx = perf_event_ctx_lock(event);
5757 perf_event_for_each_child(event, _perf_event_enable);
5758 perf_event_ctx_unlock(event, ctx);
5759 }
5760 mutex_unlock(&current->perf_event_mutex);
5761
5762 return 0;
5763}
5764
5765int perf_event_task_disable(void)
5766{
5767 struct perf_event_context *ctx;
5768 struct perf_event *event;
5769
5770 mutex_lock(&current->perf_event_mutex);
5771 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5772 ctx = perf_event_ctx_lock(event);
5773 perf_event_for_each_child(event, _perf_event_disable);
5774 perf_event_ctx_unlock(event, ctx);
5775 }
5776 mutex_unlock(&current->perf_event_mutex);
5777
5778 return 0;
5779}
5780
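/*
 * Index reported in the mmap()ed user page: 0 if the event is stopped or
 * not active (user space must not use a hardware counter directly),
 * otherwise the PMU's event_idx() value.
 */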
5781static int perf_event_index(struct perf_event *event)
5782{
5783 if (event->hw.state & PERF_HES_STOPPED)
5784 return 0;
5785
5786 if (event->state != PERF_EVENT_STATE_ACTIVE)
5787 return 0;
5788
5789 return event->pmu->event_idx(event);
5790}
5791
5792static void calc_timer_values(struct perf_event *event,
5793 u64 *now,
5794 u64 *enabled,
5795 u64 *running)
5796{
5797 u64 ctx_time;
5798
5799 *now = perf_clock();
5800 ctx_time = event->shadow_ctx_time + *now;
5801 __perf_update_times(event, ctx_time, enabled, running);
5802}
5803
5804static void perf_event_init_userpage(struct perf_event *event)
5805{
5806 struct perf_event_mmap_page *userpg;
5807 struct perf_buffer *rb;
5808
5809 rcu_read_lock();
5810 rb = rcu_dereference(event->rb);
5811 if (!rb)
5812 goto unlock;
5813
5814 userpg = rb->user_page;
5815
5816
5817 userpg->cap_bit0_is_deprecated = 1;
5818 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5819 userpg->data_offset = PAGE_SIZE;
5820 userpg->data_size = perf_data_size(rb);
5821
5822unlock:
5823 rcu_read_unlock();
5824}
5825
5826void __weak arch_perf_update_userpage(
5827 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5828{
5829}
5830
/*
 * Callers need to ensure there can be no nesting of this function,
 * otherwise the seqcount-style userpg->lock protocol below goes bad.
 * This cannot be serialized with a lock because the architecture code
 * may call it from NMI context.
 */
5836void perf_event_update_userpage(struct perf_event *event)
5837{
5838 struct perf_event_mmap_page *userpg;
5839 struct perf_buffer *rb;
5840 u64 enabled, running, now;
5841
5842 rcu_read_lock();
5843 rb = rcu_dereference(event->rb);
5844 if (!rb)
5845 goto unlock;
5846
 /*
  * Compute total_time_enabled and total_time_running from the snapshot
  * values taken when the event was last scheduled in.  We cannot simply
  * call update_context_time() because of locking: this can be invoked
  * from NMI context.
  */
5856 calc_timer_values(event, &now, &enabled, &running);
5857
5858 userpg = rb->user_page;
 /*
  * Disable preemption to guarantee consistent time stamps are stored to
  * the user page.
  */
5863 preempt_disable();
5864 ++userpg->lock;
5865 barrier();
5866 userpg->index = perf_event_index(event);
5867 userpg->offset = perf_event_count(event);
5868 if (userpg->index)
5869 userpg->offset -= local64_read(&event->hw.prev_count);
5870
5871 userpg->time_enabled = enabled +
5872 atomic64_read(&event->child_total_time_enabled);
5873
5874 userpg->time_running = running +
5875 atomic64_read(&event->child_total_time_running);
5876
5877 arch_perf_update_userpage(event, userpg, now);
5878
5879 barrier();
5880 ++userpg->lock;
5881 preempt_enable();
5882unlock:
5883 rcu_read_unlock();
5884}
5885EXPORT_SYMBOL_GPL(perf_event_update_userpage);
5886
5887static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5888{
5889 struct perf_event *event = vmf->vma->vm_file->private_data;
5890 struct perf_buffer *rb;
5891 vm_fault_t ret = VM_FAULT_SIGBUS;
5892
5893 if (vmf->flags & FAULT_FLAG_MKWRITE) {
5894 if (vmf->pgoff == 0)
5895 ret = 0;
5896 return ret;
5897 }
5898
5899 rcu_read_lock();
5900 rb = rcu_dereference(event->rb);
5901 if (!rb)
5902 goto unlock;
5903
5904 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5905 goto unlock;
5906
5907 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5908 if (!vmf->page)
5909 goto unlock;
5910
5911 get_page(vmf->page);
5912 vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5913 vmf->page->index = vmf->pgoff;
5914
5915 ret = 0;
5916unlock:
5917 rcu_read_unlock();
5918
5919 return ret;
5920}
5921
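/*
 * Attach @event to ring buffer @rb (or detach it when @rb is NULL):
 * unlink the event from its old buffer's event_list, link it into the
 * new one and publish the new buffer via RCU.  AUX events are stopped
 * across the switch and waiters on the old buffer are woken up.
 */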
5922static void ring_buffer_attach(struct perf_event *event,
5923 struct perf_buffer *rb)
5924{
5925 struct perf_buffer *old_rb = NULL;
5926 unsigned long flags;
5927
5928 if (event->rb) {
5929
5930
5931
5932
5933 WARN_ON_ONCE(event->rcu_pending);
5934
5935 old_rb = event->rb;
5936 spin_lock_irqsave(&old_rb->event_lock, flags);
5937 list_del_rcu(&event->rb_entry);
5938 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5939
5940 event->rcu_batches = get_state_synchronize_rcu();
5941 event->rcu_pending = 1;
5942 }
5943
5944 if (rb) {
5945 if (event->rcu_pending) {
5946 cond_synchronize_rcu(event->rcu_batches);
5947 event->rcu_pending = 0;
5948 }
5949
5950 spin_lock_irqsave(&rb->event_lock, flags);
5951 list_add_rcu(&event->rb_entry, &rb->event_list);
5952 spin_unlock_irqrestore(&rb->event_lock, flags);
5953 }
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965 if (has_aux(event))
5966 perf_event_stop(event, 0);
5967
5968 rcu_assign_pointer(event->rb, rb);
5969
5970 if (old_rb) {
5971 ring_buffer_put(old_rb);
5972
5973
5974
5975
5976
5977 wake_up_all(&event->waitq);
5978 }
5979}
5980
5981static void ring_buffer_wakeup(struct perf_event *event)
5982{
5983 struct perf_buffer *rb;
5984
5985 rcu_read_lock();
5986 rb = rcu_dereference(event->rb);
5987 if (rb) {
5988 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5989 wake_up_all(&event->waitq);
5990 }
5991 rcu_read_unlock();
5992}
5993
5994struct perf_buffer *ring_buffer_get(struct perf_event *event)
5995{
5996 struct perf_buffer *rb;
5997
5998 rcu_read_lock();
5999 rb = rcu_dereference(event->rb);
6000 if (rb) {
6001 if (!refcount_inc_not_zero(&rb->refcount))
6002 rb = NULL;
6003 }
6004 rcu_read_unlock();
6005
6006 return rb;
6007}
6008
6009void ring_buffer_put(struct perf_buffer *rb)
6010{
6011 if (!refcount_dec_and_test(&rb->refcount))
6012 return;
6013
6014 WARN_ON_ONCE(!list_empty(&rb->event_list));
6015
6016 call_rcu(&rb->rcu_head, rb_free_rcu);
6017}
6018
6019static void perf_mmap_open(struct vm_area_struct *vma)
6020{
6021 struct perf_event *event = vma->vm_file->private_data;
6022
6023 atomic_inc(&event->mmap_count);
6024 atomic_inc(&event->rb->mmap_count);
6025
6026 if (vma->vm_pgoff)
6027 atomic_inc(&event->rb->aux_mmap_count);
6028
6029 if (event->pmu->event_mapped)
6030 event->pmu->event_mapped(event, vma->vm_mm);
6031}
6032
6033static void perf_pmu_output_stop(struct perf_event *event);
6034
/*
 * A buffer can be mmap()ed multiple times; either directly through the same
 * event, or through other events by use of perf_event_set_output().
 *
 * To undo the VM accounting done by perf_mmap() the buffer must be freed
 * when the last reference goes away.  Figuring out who holds that last
 * reference is tricky with SMP racing on mmap(), hence the dance with
 * rb->mmap_count and rb->aux_mmap_count below.
 */
6043static void perf_mmap_close(struct vm_area_struct *vma)
6044{
6045 struct perf_event *event = vma->vm_file->private_data;
6046 struct perf_buffer *rb = ring_buffer_get(event);
6047 struct user_struct *mmap_user = rb->mmap_user;
6048 int mmap_locked = rb->mmap_locked;
6049 unsigned long size = perf_data_size(rb);
6050 bool detach_rest = false;
6051
6052 if (event->pmu->event_unmapped)
6053 event->pmu->event_unmapped(event, vma->vm_mm);
6054
6055
6056
6057
6058
6059
6060 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
6061 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
6062
6063
6064
6065
6066
6067
6068 perf_pmu_output_stop(event);
6069
6070
6071 atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
6072 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
6073
6074
6075 rb_free_aux(rb);
6076 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
6077
6078 mutex_unlock(&event->mmap_mutex);
6079 }
6080
6081 if (atomic_dec_and_test(&rb->mmap_count))
6082 detach_rest = true;
6083
6084 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
6085 goto out_put;
6086
6087 ring_buffer_attach(event, NULL);
6088 mutex_unlock(&event->mmap_mutex);
6089
6090
6091 if (!detach_rest)
6092 goto out_put;
6093
6094
6095
6096
6097
6098
6099again:
6100 rcu_read_lock();
6101 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
6102 if (!atomic_long_inc_not_zero(&event->refcount)) {
6103
6104
6105
6106
6107 continue;
6108 }
6109 rcu_read_unlock();
6110
6111 mutex_lock(&event->mmap_mutex);
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122 if (event->rb == rb)
6123 ring_buffer_attach(event, NULL);
6124
6125 mutex_unlock(&event->mmap_mutex);
6126 put_event(event);
6127
6128
6129
6130
6131
6132 goto again;
6133 }
6134 rcu_read_unlock();
6135
 /*
  * There may still be a few 0-ref events on the list; they'll get
  * cleaned up by free_event() -- they also still hold a reference on
  * the rb and will free it when they are done with it.
  *
  * Aside from that, this buffer is 'fully' detached and unmapped,
  * undo the VM accounting.
  */
6145 atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6146 &mmap_user->locked_vm);
6147 atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
6148 free_uid(mmap_user);
6149
6150out_put:
6151 ring_buffer_put(rb);
6152}
6153
6154static const struct vm_operations_struct perf_mmap_vmops = {
6155 .open = perf_mmap_open,
6156 .close = perf_mmap_close,
6157 .fault = perf_mmap_fault,
6158 .page_mkwrite = perf_mmap_fault,
6159};
6160
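/*
 * mmap() handler for perf events.  A mapping at pgoff 0 covers the user
 * control page plus 2^n data pages; a mapping at the buffer's aux_offset
 * (as published in the user page) covers the AUX area.  Locked-memory
 * usage is charged against perf_event_mlock and RLIMIT_MEMLOCK.
 */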
6161static int perf_mmap(struct file *file, struct vm_area_struct *vma)
6162{
6163 struct perf_event *event = file->private_data;
6164 unsigned long user_locked, user_lock_limit;
6165 struct user_struct *user = current_user();
6166 struct perf_buffer *rb = NULL;
6167 unsigned long locked, lock_limit;
6168 unsigned long vma_size;
6169 unsigned long nr_pages;
6170 long user_extra = 0, extra = 0;
6171 int ret = 0, flags = 0;
6172
6173
6174
6175
6176
6177
6178 if (event->cpu == -1 && event->attr.inherit)
6179 return -EINVAL;
6180
6181 if (!(vma->vm_flags & VM_SHARED))
6182 return -EINVAL;
6183
6184 ret = security_perf_event_read(event);
6185 if (ret)
6186 return ret;
6187
6188 vma_size = vma->vm_end - vma->vm_start;
6189
6190 if (vma->vm_pgoff == 0) {
6191 nr_pages = (vma_size / PAGE_SIZE) - 1;
6192 } else {
6193
6194
6195
6196
6197
6198 u64 aux_offset, aux_size;
6199
6200 if (!event->rb)
6201 return -EINVAL;
6202
6203 nr_pages = vma_size / PAGE_SIZE;
6204
6205 mutex_lock(&event->mmap_mutex);
6206 ret = -EINVAL;
6207
6208 rb = event->rb;
6209 if (!rb)
6210 goto aux_unlock;
6211
6212 aux_offset = READ_ONCE(rb->user_page->aux_offset);
6213 aux_size = READ_ONCE(rb->user_page->aux_size);
6214
6215 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
6216 goto aux_unlock;
6217
6218 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
6219 goto aux_unlock;
6220
6221
6222 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
6223 goto aux_unlock;
6224
6225 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
6226 goto aux_unlock;
6227
6228
6229 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
6230 goto aux_unlock;
6231
6232 if (!is_power_of_2(nr_pages))
6233 goto aux_unlock;
6234
6235 if (!atomic_inc_not_zero(&rb->mmap_count))
6236 goto aux_unlock;
6237
6238 if (rb_has_aux(rb)) {
6239 atomic_inc(&rb->aux_mmap_count);
6240 ret = 0;
6241 goto unlock;
6242 }
6243
6244 atomic_set(&rb->aux_mmap_count, 1);
6245 user_extra = nr_pages;
6246
6247 goto accounting;
6248 }
6249
6250
6251
6252
6253
6254 if (nr_pages != 0 && !is_power_of_2(nr_pages))
6255 return -EINVAL;
6256
6257 if (vma_size != PAGE_SIZE * (1 + nr_pages))
6258 return -EINVAL;
6259
6260 WARN_ON_ONCE(event->ctx->parent_ctx);
6261again:
6262 mutex_lock(&event->mmap_mutex);
6263 if (event->rb) {
6264 if (event->rb->nr_pages != nr_pages) {
6265 ret = -EINVAL;
6266 goto unlock;
6267 }
6268
6269 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
6270
6271
6272
6273
6274
6275 mutex_unlock(&event->mmap_mutex);
6276 goto again;
6277 }
6278
6279 goto unlock;
6280 }
6281
6282 user_extra = nr_pages + 1;
6283
6284accounting:
6285 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6286
6287
6288
6289
6290 user_lock_limit *= num_online_cpus();
6291
6292 user_locked = atomic_long_read(&user->locked_vm);
6293
6294
6295
6296
6297
6298 if (user_locked > user_lock_limit)
6299 user_locked = user_lock_limit;
6300 user_locked += user_extra;
6301
6302 if (user_locked > user_lock_limit) {
6303
6304
6305
6306
6307 extra = user_locked - user_lock_limit;
6308 user_extra -= extra;
6309 }
6310
6311 lock_limit = rlimit(RLIMIT_MEMLOCK);
6312 lock_limit >>= PAGE_SHIFT;
6313 locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
6314
6315 if ((locked > lock_limit) && perf_is_paranoid() &&
6316 !capable(CAP_IPC_LOCK)) {
6317 ret = -EPERM;
6318 goto unlock;
6319 }
6320
6321 WARN_ON(!rb && event->rb);
6322
6323 if (vma->vm_flags & VM_WRITE)
6324 flags |= RING_BUFFER_WRITABLE;
6325
6326 if (!rb) {
6327 rb = rb_alloc(nr_pages,
6328 event->attr.watermark ? event->attr.wakeup_watermark : 0,
6329 event->cpu, flags);
6330
6331 if (!rb) {
6332 ret = -ENOMEM;
6333 goto unlock;
6334 }
6335
6336 atomic_set(&rb->mmap_count, 1);
6337 rb->mmap_user = get_current_user();
6338 rb->mmap_locked = extra;
6339
6340 ring_buffer_attach(event, rb);
6341
6342 perf_event_init_userpage(event);
6343 perf_event_update_userpage(event);
6344 } else {
6345 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
6346 event->attr.aux_watermark, flags);
6347 if (!ret)
6348 rb->aux_mmap_locked = extra;
6349 }
6350
6351unlock:
6352 if (!ret) {
6353 atomic_long_add(user_extra, &user->locked_vm);
6354 atomic64_add(extra, &vma->vm_mm->pinned_vm);
6355
6356 atomic_inc(&event->mmap_count);
6357 } else if (rb) {
6358 atomic_dec(&rb->mmap_count);
6359 }
6360aux_unlock:
6361 mutex_unlock(&event->mmap_mutex);
6362
6363
6364
6365
6366
6367 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
6368 vma->vm_ops = &perf_mmap_vmops;
6369
6370 if (event->pmu->event_mapped)
6371 event->pmu->event_mapped(event, vma->vm_mm);
6372
6373 return ret;
6374}
6375
6376static int perf_fasync(int fd, struct file *filp, int on)
6377{
6378 struct inode *inode = file_inode(filp);
6379 struct perf_event *event = filp->private_data;
6380 int retval;
6381
6382 inode_lock(inode);
6383 retval = fasync_helper(fd, filp, on, &event->fasync);
6384 inode_unlock(inode);
6385
6386 if (retval < 0)
6387 return retval;
6388
6389 return 0;
6390}
6391
6392static const struct file_operations perf_fops = {
6393 .llseek = no_llseek,
6394 .release = perf_release,
6395 .read = perf_read,
6396 .poll = perf_poll,
6397 .unlocked_ioctl = perf_ioctl,
6398 .compat_ioctl = perf_compat_ioctl,
6399 .mmap = perf_mmap,
6400 .fasync = perf_fasync,
6401};
6402
6403
6404
6405
6406
6407
6408
6409
6410static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
6411{
6412
6413 if (event->parent)
6414 event = event->parent;
6415 return &event->fasync;
6416}
6417
6418void perf_event_wakeup(struct perf_event *event)
6419{
6420 ring_buffer_wakeup(event);
6421
6422 if (event->pending_kill) {
6423 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
6424 event->pending_kill = 0;
6425 }
6426}
6427
6428static void perf_pending_event_disable(struct perf_event *event)
6429{
6430 int cpu = READ_ONCE(event->pending_disable);
6431
6432 if (cpu < 0)
6433 return;
6434
6435 if (cpu == smp_processor_id()) {
6436 WRITE_ONCE(event->pending_disable, -1);
6437 perf_event_disable_local(event);
6438 return;
6439 }
6440
 /*
  * The event has to be disabled on the CPU it is (or was last) active
  * on.  event->pending_disable holds that CPU number, so queue the
  * irq_work there; perf_pending_event() running on that CPU will see
  * the pending disable and call perf_event_disable_local().
  */
6461 irq_work_queue_on(&event->pending, cpu);
6462}
6463
6464static void perf_pending_event(struct irq_work *entry)
6465{
6466 struct perf_event *event = container_of(entry, struct perf_event, pending);
6467 int rctx;
6468
6469 rctx = perf_swevent_get_recursion_context();
6470
6471
6472
6473
6474
6475 perf_pending_event_disable(event);
6476
6477 if (event->pending_wakeup) {
6478 event->pending_wakeup = 0;
6479 perf_event_wakeup(event);
6480 }
6481
6482 if (rctx >= 0)
6483 perf_swevent_put_recursion_context(rctx);
6484}
6485
6486
6487
6488
6489
6490
6491struct perf_guest_info_callbacks *perf_guest_cbs;
6492
6493int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6494{
6495 perf_guest_cbs = cbs;
6496 return 0;
6497}
6498EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
6499
6500int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6501{
6502 perf_guest_cbs = NULL;
6503 return 0;
6504}
6505EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
6506
6507static void
6508perf_output_sample_regs(struct perf_output_handle *handle,
6509 struct pt_regs *regs, u64 mask)
6510{
6511 int bit;
6512 DECLARE_BITMAP(_mask, 64);
6513
6514 bitmap_from_u64(_mask, mask);
6515 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
6516 u64 val;
6517
6518 val = perf_reg_value(regs, bit);
6519 perf_output_put(handle, val);
6520 }
6521}
6522
6523static void perf_sample_regs_user(struct perf_regs *regs_user,
6524 struct pt_regs *regs)
6525{
6526 if (user_mode(regs)) {
6527 regs_user->abi = perf_reg_abi(current);
6528 regs_user->regs = regs;
6529 } else if (!(current->flags & PF_KTHREAD)) {
6530 perf_get_regs_user(regs_user, regs);
6531 } else {
6532 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
6533 regs_user->regs = NULL;
6534 }
6535}
6536
6537static void perf_sample_regs_intr(struct perf_regs *regs_intr,
6538 struct pt_regs *regs)
6539{
6540 regs_intr->regs = regs;
6541 regs_intr->abi = perf_reg_abi(current);
6542}
6543
/*
 * Get the remaining task size from the user stack pointer.
 *
 * Ideally this would be limited by the stack VMA, but that cannot be
 * looked up safely from interrupt context, so TASK_SIZE is used as the
 * bound instead.
 */
6552static u64 perf_ustack_task_size(struct pt_regs *regs)
6553{
6554 unsigned long addr = perf_user_stack_pointer(regs);
6555
6556 if (!addr || addr >= TASK_SIZE)
6557 return 0;
6558
6559 return TASK_SIZE - addr;
6560}
6561
6562static u16
6563perf_sample_ustack_size(u16 stack_size, u16 header_size,
6564 struct pt_regs *regs)
6565{
6566 u64 task_size;
6567
6568
6569 if (!regs)
6570 return 0;
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6583 stack_size = min(stack_size, (u16) task_size);
6584
6585
6586 header_size += 2 * sizeof(u64);
6587
6588
6589 if ((u16) (header_size + stack_size) < header_size) {
6590
6591
6592
6593
6594 stack_size = USHRT_MAX - header_size - sizeof(u64);
6595 stack_size = round_up(stack_size, sizeof(u64));
6596 }
6597
6598 return stack_size;
6599}
6600
6601static void
6602perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6603 struct pt_regs *regs)
6604{
6605
6606 if (!regs) {
6607 u64 size = 0;
6608 perf_output_put(handle, size);
6609 } else {
6610 unsigned long sp;
6611 unsigned int rem;
6612 u64 dyn_size;
6613 mm_segment_t fs;
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627 perf_output_put(handle, dump_size);
6628
6629
6630 sp = perf_user_stack_pointer(regs);
6631 fs = get_fs();
6632 set_fs(USER_DS);
6633 rem = __output_copy_user(handle, (void *) sp, dump_size);
6634 set_fs(fs);
6635 dyn_size = dump_size - rem;
6636
6637 perf_output_skip(handle, rem);
6638
6639
6640 perf_output_put(handle, dyn_size);
6641 }
6642}
6643
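/*
 * Work out how much AUX data can be attached to this sample: zero if the
 * sampler is not active on this CPU or if an AUX sample is already in
 * progress, otherwise the requested size clamped to the AUX buffer size
 * and aligned to 8 bytes.  The result is stored in data->aux_size.
 */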
6644static unsigned long perf_prepare_sample_aux(struct perf_event *event,
6645 struct perf_sample_data *data,
6646 size_t size)
6647{
6648 struct perf_event *sampler = event->aux_event;
6649 struct perf_buffer *rb;
6650
6651 data->aux_size = 0;
6652
6653 if (!sampler)
6654 goto out;
6655
6656 if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
6657 goto out;
6658
6659 if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
6660 goto out;
6661
6662 rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
6663 if (!rb)
6664 goto out;
6665
6666
6667
6668
6669
6670 if (READ_ONCE(rb->aux_in_sampling)) {
6671 data->aux_size = 0;
6672 } else {
6673 size = min_t(size_t, size, perf_aux_size(rb));
6674 data->aux_size = ALIGN(size, sizeof(u64));
6675 }
6676 ring_buffer_put(rb);
6677
6678out:
6679 return data->aux_size;
6680}
6681
6682static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
6683 struct perf_event *event,
6684 struct perf_output_handle *handle,
6685 unsigned long size)
6686{
6687 unsigned long flags;
6688 long ret;
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699 local_irq_save(flags);
6700
6701
6702
6703
6704 WRITE_ONCE(rb->aux_in_sampling, 1);
6705 barrier();
6706
6707 ret = event->pmu->snapshot_aux(event, handle, size);
6708
6709 barrier();
6710 WRITE_ONCE(rb->aux_in_sampling, 0);
6711 local_irq_restore(flags);
6712
6713 return ret;
6714}
6715
6716static void perf_aux_sample_output(struct perf_event *event,
6717 struct perf_output_handle *handle,
6718 struct perf_sample_data *data)
6719{
6720 struct perf_event *sampler = event->aux_event;
6721 struct perf_buffer *rb;
6722 unsigned long pad;
6723 long size;
6724
6725 if (WARN_ON_ONCE(!sampler || !data->aux_size))
6726 return;
6727
6728 rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
6729 if (!rb)
6730 return;
6731
6732 size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
6733
6734
6735
6736
6737
6738
6739
6740 if (WARN_ON_ONCE(size < 0))
6741 goto out_put;
6742
6743
6744
6745
6746
6747 pad = data->aux_size - size;
6748 if (WARN_ON_ONCE(pad >= sizeof(u64)))
6749 pad = 8;
6750
6751 if (pad) {
6752 u64 zero = 0;
6753 perf_output_copy(handle, &zero, pad);
6754 }
6755
6756out_put:
6757 ring_buffer_put(rb);
6758}
6759
6760static void __perf_event_header__init_id(struct perf_event_header *header,
6761 struct perf_sample_data *data,
6762 struct perf_event *event)
6763{
6764 u64 sample_type = event->attr.sample_type;
6765
6766 data->type = sample_type;
6767 header->size += event->id_header_size;
6768
6769 if (sample_type & PERF_SAMPLE_TID) {
6770
6771 data->tid_entry.pid = perf_event_pid(event, current);
6772 data->tid_entry.tid = perf_event_tid(event, current);
6773 }
6774
6775 if (sample_type & PERF_SAMPLE_TIME)
6776 data->time = perf_event_clock(event);
6777
6778 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6779 data->id = primary_event_id(event);
6780
6781 if (sample_type & PERF_SAMPLE_STREAM_ID)
6782 data->stream_id = event->id;
6783
6784 if (sample_type & PERF_SAMPLE_CPU) {
6785 data->cpu_entry.cpu = raw_smp_processor_id();
6786 data->cpu_entry.reserved = 0;
6787 }
6788}
6789
6790void perf_event_header__init_id(struct perf_event_header *header,
6791 struct perf_sample_data *data,
6792 struct perf_event *event)
6793{
6794 if (event->attr.sample_id_all)
6795 __perf_event_header__init_id(header, data, event);
6796}
6797
6798static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6799 struct perf_sample_data *data)
6800{
6801 u64 sample_type = data->type;
6802
6803 if (sample_type & PERF_SAMPLE_TID)
6804 perf_output_put(handle, data->tid_entry);
6805
6806 if (sample_type & PERF_SAMPLE_TIME)
6807 perf_output_put(handle, data->time);
6808
6809 if (sample_type & PERF_SAMPLE_ID)
6810 perf_output_put(handle, data->id);
6811
6812 if (sample_type & PERF_SAMPLE_STREAM_ID)
6813 perf_output_put(handle, data->stream_id);
6814
6815 if (sample_type & PERF_SAMPLE_CPU)
6816 perf_output_put(handle, data->cpu_entry);
6817
6818 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6819 perf_output_put(handle, data->id);
6820}
6821
6822void perf_event__output_id_sample(struct perf_event *event,
6823 struct perf_output_handle *handle,
6824 struct perf_sample_data *sample)
6825{
6826 if (event->attr.sample_id_all)
6827 __perf_event__output_id_sample(handle, sample);
6828}
6829
6830static void perf_output_read_one(struct perf_output_handle *handle,
6831 struct perf_event *event,
6832 u64 enabled, u64 running)
6833{
6834 u64 read_format = event->attr.read_format;
6835 u64 values[4];
6836 int n = 0;
6837
6838 values[n++] = perf_event_count(event);
6839 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6840 values[n++] = enabled +
6841 atomic64_read(&event->child_total_time_enabled);
6842 }
6843 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6844 values[n++] = running +
6845 atomic64_read(&event->child_total_time_running);
6846 }
6847 if (read_format & PERF_FORMAT_ID)
6848 values[n++] = primary_event_id(event);
6849
6850 __output_copy(handle, values, n * sizeof(u64));
6851}
6852
6853static void perf_output_read_group(struct perf_output_handle *handle,
6854 struct perf_event *event,
6855 u64 enabled, u64 running)
6856{
6857 struct perf_event *leader = event->group_leader, *sub;
6858 u64 read_format = event->attr.read_format;
6859 u64 values[5];
6860 int n = 0;
6861
6862 values[n++] = 1 + leader->nr_siblings;
6863
6864 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6865 values[n++] = enabled;
6866
6867 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6868 values[n++] = running;
6869
6870 if ((leader != event) &&
6871 (leader->state == PERF_EVENT_STATE_ACTIVE))
6872 leader->pmu->read(leader);
6873
6874 values[n++] = perf_event_count(leader);
6875 if (read_format & PERF_FORMAT_ID)
6876 values[n++] = primary_event_id(leader);
6877
6878 __output_copy(handle, values, n * sizeof(u64));
6879
6880 for_each_sibling_event(sub, leader) {
6881 n = 0;
6882
6883 if ((sub != event) &&
6884 (sub->state == PERF_EVENT_STATE_ACTIVE))
6885 sub->pmu->read(sub);
6886
6887 values[n++] = perf_event_count(sub);
6888 if (read_format & PERF_FORMAT_ID)
6889 values[n++] = primary_event_id(sub);
6890
6891 __output_copy(handle, values, n * sizeof(u64));
6892 }
6893}
6894
6895#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
6896 PERF_FORMAT_TOTAL_TIME_RUNNING)
6897
6898
6899
6900
6901
6902
6903
6904
6905static void perf_output_read(struct perf_output_handle *handle,
6906 struct perf_event *event)
6907{
6908 u64 enabled = 0, running = 0, now;
6909 u64 read_format = event->attr.read_format;
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920 if (read_format & PERF_FORMAT_TOTAL_TIMES)
6921 calc_timer_values(event, &now, &enabled, &running);
6922
6923 if (event->attr.read_format & PERF_FORMAT_GROUP)
6924 perf_output_read_group(handle, event, enabled, running);
6925 else
6926 perf_output_read_one(handle, event, enabled, running);
6927}
6928
6929static inline bool perf_sample_save_hw_index(struct perf_event *event)
6930{
6931 return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
6932}
6933
6934void perf_output_sample(struct perf_output_handle *handle,
6935 struct perf_event_header *header,
6936 struct perf_sample_data *data,
6937 struct perf_event *event)
6938{
6939 u64 sample_type = data->type;
6940
6941 perf_output_put(handle, *header);
6942
6943 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6944 perf_output_put(handle, data->id);
6945
6946 if (sample_type & PERF_SAMPLE_IP)
6947 perf_output_put(handle, data->ip);
6948
6949 if (sample_type & PERF_SAMPLE_TID)
6950 perf_output_put(handle, data->tid_entry);
6951
6952 if (sample_type & PERF_SAMPLE_TIME)
6953 perf_output_put(handle, data->time);
6954
6955 if (sample_type & PERF_SAMPLE_ADDR)
6956 perf_output_put(handle, data->addr);
6957
6958 if (sample_type & PERF_SAMPLE_ID)
6959 perf_output_put(handle, data->id);
6960
6961 if (sample_type & PERF_SAMPLE_STREAM_ID)
6962 perf_output_put(handle, data->stream_id);
6963
6964 if (sample_type & PERF_SAMPLE_CPU)
6965 perf_output_put(handle, data->cpu_entry);
6966
6967 if (sample_type & PERF_SAMPLE_PERIOD)
6968 perf_output_put(handle, data->period);
6969
6970 if (sample_type & PERF_SAMPLE_READ)
6971 perf_output_read(handle, event);
6972
6973 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6974 int size = 1;
6975
6976 size += data->callchain->nr;
6977 size *= sizeof(u64);
6978 __output_copy(handle, data->callchain, size);
6979 }
6980
6981 if (sample_type & PERF_SAMPLE_RAW) {
6982 struct perf_raw_record *raw = data->raw;
6983
6984 if (raw) {
6985 struct perf_raw_frag *frag = &raw->frag;
6986
6987 perf_output_put(handle, raw->size);
6988 do {
6989 if (frag->copy) {
6990 __output_custom(handle, frag->copy,
6991 frag->data, frag->size);
6992 } else {
6993 __output_copy(handle, frag->data,
6994 frag->size);
6995 }
6996 if (perf_raw_frag_last(frag))
6997 break;
6998 frag = frag->next;
6999 } while (1);
7000 if (frag->pad)
7001 __output_skip(handle, NULL, frag->pad);
7002 } else {
7003 struct {
7004 u32 size;
7005 u32 data;
7006 } raw = {
7007 .size = sizeof(u32),
7008 .data = 0,
7009 };
7010 perf_output_put(handle, raw);
7011 }
7012 }
7013
7014 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7015 if (data->br_stack) {
7016 size_t size;
7017
7018 size = data->br_stack->nr
7019 * sizeof(struct perf_branch_entry);
7020
7021 perf_output_put(handle, data->br_stack->nr);
7022 if (perf_sample_save_hw_index(event))
7023 perf_output_put(handle, data->br_stack->hw_idx);
7024 perf_output_copy(handle, data->br_stack->entries, size);
7025 } else {
7026
7027
7028
7029 u64 nr = 0;
7030 perf_output_put(handle, nr);
7031 }
7032 }
7033
7034 if (sample_type & PERF_SAMPLE_REGS_USER) {
7035 u64 abi = data->regs_user.abi;
7036
7037
7038
7039
7040
7041 perf_output_put(handle, abi);
7042
7043 if (abi) {
7044 u64 mask = event->attr.sample_regs_user;
7045 perf_output_sample_regs(handle,
7046 data->regs_user.regs,
7047 mask);
7048 }
7049 }
7050
7051 if (sample_type & PERF_SAMPLE_STACK_USER) {
7052 perf_output_sample_ustack(handle,
7053 data->stack_user_size,
7054 data->regs_user.regs);
7055 }
7056
7057 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
7058 perf_output_put(handle, data->weight.full);
7059
7060 if (sample_type & PERF_SAMPLE_DATA_SRC)
7061 perf_output_put(handle, data->data_src.val);
7062
7063 if (sample_type & PERF_SAMPLE_TRANSACTION)
7064 perf_output_put(handle, data->txn);
7065
7066 if (sample_type & PERF_SAMPLE_REGS_INTR) {
7067 u64 abi = data->regs_intr.abi;
7068
7069
7070
7071
7072 perf_output_put(handle, abi);
7073
7074 if (abi) {
7075 u64 mask = event->attr.sample_regs_intr;
7076
7077 perf_output_sample_regs(handle,
7078 data->regs_intr.regs,
7079 mask);
7080 }
7081 }
7082
7083 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7084 perf_output_put(handle, data->phys_addr);
7085
7086 if (sample_type & PERF_SAMPLE_CGROUP)
7087 perf_output_put(handle, data->cgroup);
7088
7089 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
7090 perf_output_put(handle, data->data_page_size);
7091
7092 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
7093 perf_output_put(handle, data->code_page_size);
7094
7095 if (sample_type & PERF_SAMPLE_AUX) {
7096 perf_output_put(handle, data->aux_size);
7097
7098 if (data->aux_size)
7099 perf_aux_sample_output(event, handle, data);
7100 }
7101
7102 if (!event->attr.watermark) {
7103 int wakeup_events = event->attr.wakeup_events;
7104
7105 if (wakeup_events) {
7106 struct perf_buffer *rb = handle->rb;
7107 int events = local_inc_return(&rb->events);
7108
7109 if (events >= wakeup_events) {
7110 local_sub(wakeup_events, &rb->events);
7111 local_inc(&rb->wakeup);
7112 }
7113 }
7114 }
7115}
7116
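/*
 * Best-effort translation of a virtual address to a physical address for
 * PERF_SAMPLE_PHYS_ADDR: kernel linear-map addresses go through
 * virt_to_phys(), user addresses through a non-faulting fast GUP; 0 is
 * reported when the translation fails.
 */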
7117static u64 perf_virt_to_phys(u64 virt)
7118{
7119 u64 phys_addr = 0;
7120 struct page *p = NULL;
7121
7122 if (!virt)
7123 return 0;
7124
7125 if (virt >= TASK_SIZE) {
7126
7127 if (virt_addr_valid((void *)(uintptr_t)virt) &&
7128 !(virt >= VMALLOC_START && virt < VMALLOC_END))
7129 phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
7130 } else {
7131
7132
7133
7134
7135
7136
7137
7138 if (current->mm != NULL) {
7139 pagefault_disable();
7140 if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
7141 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
7142 pagefault_enable();
7143 }
7144
7145 if (p)
7146 put_page(p);
7147 }
7148
7149 return phys_addr;
7150}
7151
7152#ifdef CONFIG_MMU

/*
 * Return the size of the page mapping @addr in @mm, or 0 if no mapping
 * is present.  Called with interrupts disabled, see perf_get_page_size().
 */
7157static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
7158{
7159 pgd_t *pgd;
7160 p4d_t *p4d;
7161 pud_t *pud;
7162 pmd_t *pmd;
7163 pte_t *pte;
7164
7165 pgd = pgd_offset(mm, addr);
7166 if (pgd_none(*pgd))
7167 return 0;
7168
7169 p4d = p4d_offset(pgd, addr);
7170 if (!p4d_present(*p4d))
7171 return 0;
7172
7173 if (p4d_leaf(*p4d))
7174 return 1ULL << P4D_SHIFT;
7175
7176 pud = pud_offset(p4d, addr);
7177 if (!pud_present(*pud))
7178 return 0;
7179
7180 if (pud_leaf(*pud))
7181 return 1ULL << PUD_SHIFT;
7182
7183 pmd = pmd_offset(pud, addr);
7184 if (!pmd_present(*pmd))
7185 return 0;
7186
7187 if (pmd_leaf(*pmd))
7188 return 1ULL << PMD_SHIFT;
7189
7190 pte = pte_offset_map(pmd, addr);
7191 if (!pte_present(*pte)) {
7192 pte_unmap(pte);
7193 return 0;
7194 }
7195
7196 pte_unmap(pte);
7197 return PAGE_SIZE;
7198}
7199
7200#else
7201
7202static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
7203{
7204 return 0;
7205}
7206
7207#endif
7208
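/*
 * Page size backing @addr in the current task's address space (or the
 * kernel's init_mm for kernel threads), determined by a lockless
 * page-table walk with interrupts disabled so the tables cannot be torn
 * down underneath us.
 */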
7209static u64 perf_get_page_size(unsigned long addr)
7210{
7211 struct mm_struct *mm;
7212 unsigned long flags;
7213 u64 size;
7214
7215 if (!addr)
7216 return 0;
7217
7218
7219
7220
7221
7222 local_irq_save(flags);
7223
7224 mm = current->mm;
7225 if (!mm) {
7226
7227
7228
7229
7230 mm = &init_mm;
7231 }
7232
7233 size = __perf_get_page_size(mm, addr);
7234
7235 local_irq_restore(flags);
7236
7237 return size;
7238}
7239
7240static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
7241
7242struct perf_callchain_entry *
7243perf_callchain(struct perf_event *event, struct pt_regs *regs)
7244{
7245 bool kernel = !event->attr.exclude_callchain_kernel;
7246 bool user = !event->attr.exclude_callchain_user;
7247
7248 bool crosstask = event->ctx->task && event->ctx->task != current;
7249 const u32 max_stack = event->attr.sample_max_stack;
7250 struct perf_callchain_entry *callchain;
7251
7252 if (!kernel && !user)
7253 return &__empty_callchain;
7254
7255 callchain = get_perf_callchain(regs, 0, kernel, user,
7256 max_stack, crosstask, true);
7257 return callchain ?: &__empty_callchain;
7258}
7259
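/*
 * Fill in @data and compute the final record size in @header before the
 * sample is written out by perf_output_sample().  Every field accounted
 * for here must be mirrored on the output side so that header->size
 * stays exact (and 8-byte aligned, see the WARN at the end).
 */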
7260void perf_prepare_sample(struct perf_event_header *header,
7261 struct perf_sample_data *data,
7262 struct perf_event *event,
7263 struct pt_regs *regs)
7264{
7265 u64 sample_type = event->attr.sample_type;
7266
7267 header->type = PERF_RECORD_SAMPLE;
7268 header->size = sizeof(*header) + event->header_size;
7269
7270 header->misc = 0;
7271 header->misc |= perf_misc_flags(regs);
7272
7273 __perf_event_header__init_id(header, data, event);
7274
7275 if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
7276 data->ip = perf_instruction_pointer(regs);
7277
7278 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7279 int size = 1;
7280
7281 if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
7282 data->callchain = perf_callchain(event, regs);
7283
7284 size += data->callchain->nr;
7285
7286 header->size += size * sizeof(u64);
7287 }
7288
7289 if (sample_type & PERF_SAMPLE_RAW) {
7290 struct perf_raw_record *raw = data->raw;
7291 int size;
7292
7293 if (raw) {
7294 struct perf_raw_frag *frag = &raw->frag;
7295 u32 sum = 0;
7296
7297 do {
7298 sum += frag->size;
7299 if (perf_raw_frag_last(frag))
7300 break;
7301 frag = frag->next;
7302 } while (1);
7303
7304 size = round_up(sum + sizeof(u32), sizeof(u64));
7305 raw->size = size - sizeof(u32);
7306 frag->pad = raw->size - sum;
7307 } else {
7308 size = sizeof(u64);
7309 }
7310
7311 header->size += size;
7312 }
7313
7314 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7315 int size = sizeof(u64);
7316 if (data->br_stack) {
7317 if (perf_sample_save_hw_index(event))
7318 size += sizeof(u64);
7319
7320 size += data->br_stack->nr
7321 * sizeof(struct perf_branch_entry);
7322 }
7323 header->size += size;
7324 }
7325
7326 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
7327 perf_sample_regs_user(&data->regs_user, regs);
7328
7329 if (sample_type & PERF_SAMPLE_REGS_USER) {
7330
7331 int size = sizeof(u64);
7332
7333 if (data->regs_user.regs) {
7334 u64 mask = event->attr.sample_regs_user;
7335 size += hweight64(mask) * sizeof(u64);
7336 }
7337
7338 header->size += size;
7339 }
7340
7341 if (sample_type & PERF_SAMPLE_STACK_USER) {
7342
7343
7344
7345
7346
7347
7348 u16 stack_size = event->attr.sample_stack_user;
7349 u16 size = sizeof(u64);
7350
7351 stack_size = perf_sample_ustack_size(stack_size, header->size,
7352 data->regs_user.regs);
7353
7354
7355
7356
7357
7358
7359 if (stack_size)
7360 size += sizeof(u64) + stack_size;
7361
7362 data->stack_user_size = stack_size;
7363 header->size += size;
7364 }
7365
7366 if (sample_type & PERF_SAMPLE_REGS_INTR) {
7367
7368 int size = sizeof(u64);
7369
7370 perf_sample_regs_intr(&data->regs_intr, regs);
7371
7372 if (data->regs_intr.regs) {
7373 u64 mask = event->attr.sample_regs_intr;
7374
7375 size += hweight64(mask) * sizeof(u64);
7376 }
7377
7378 header->size += size;
7379 }
7380
7381 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7382 data->phys_addr = perf_virt_to_phys(data->addr);
7383
7384#ifdef CONFIG_CGROUP_PERF
7385 if (sample_type & PERF_SAMPLE_CGROUP) {
7386 struct cgroup *cgrp;
7387
7388
7389 cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7390 data->cgroup = cgrp->kn->id;
7391 }
7392#endif
7393
7394
7395
7396
7397
7398
7399 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
7400 data->data_page_size = perf_get_page_size(data->addr);
7401
7402 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
7403 data->code_page_size = perf_get_page_size(data->ip);
7404
7405 if (sample_type & PERF_SAMPLE_AUX) {
7406 u64 size;
7407
7408 header->size += sizeof(u64);
7409
7410
7411
7412
7413
7414
7415
7416 size = min_t(size_t, U16_MAX - header->size,
7417 event->attr.aux_sample_size);
7418 size = rounddown(size, 8);
7419 size = perf_prepare_sample_aux(event, data, size);
7420
7421 WARN_ON_ONCE(size + header->size > U16_MAX);
7422 header->size += size;
7423 }
7424
7425
7426
7427
7428
7429
7430
7431
7432 WARN_ON_ONCE(header->size & 7);
7433}
7434
7435static __always_inline int
7436__perf_event_output(struct perf_event *event,
7437 struct perf_sample_data *data,
7438 struct pt_regs *regs,
7439 int (*output_begin)(struct perf_output_handle *,
7440 struct perf_sample_data *,
7441 struct perf_event *,
7442 unsigned int))
7443{
7444 struct perf_output_handle handle;
7445 struct perf_event_header header;
7446 int err;
7447
7448
7449 rcu_read_lock();
7450
7451 perf_prepare_sample(&header, data, event, regs);
7452
7453 err = output_begin(&handle, data, event, header.size);
7454 if (err)
7455 goto exit;
7456
7457 perf_output_sample(&handle, &header, data, event);
7458
7459 perf_output_end(&handle);
7460
7461exit:
7462 rcu_read_unlock();
7463 return err;
7464}
7465
7466void
7467perf_event_output_forward(struct perf_event *event,
7468 struct perf_sample_data *data,
7469 struct pt_regs *regs)
7470{
7471 __perf_event_output(event, data, regs, perf_output_begin_forward);
7472}
7473
7474void
7475perf_event_output_backward(struct perf_event *event,
7476 struct perf_sample_data *data,
7477 struct pt_regs *regs)
7478{
7479 __perf_event_output(event, data, regs, perf_output_begin_backward);
7480}
7481
7482int
7483perf_event_output(struct perf_event *event,
7484 struct perf_sample_data *data,
7485 struct pt_regs *regs)
7486{
7487 return __perf_event_output(event, data, regs, perf_output_begin);
7488}
7489
7490
7491
7492
7493
7494struct perf_read_event {
7495 struct perf_event_header header;
7496
7497 u32 pid;
7498 u32 tid;
7499};
7500
7501static void
7502perf_event_read_event(struct perf_event *event,
7503 struct task_struct *task)
7504{
7505 struct perf_output_handle handle;
7506 struct perf_sample_data sample;
7507 struct perf_read_event read_event = {
7508 .header = {
7509 .type = PERF_RECORD_READ,
7510 .misc = 0,
7511 .size = sizeof(read_event) + event->read_size,
7512 },
7513 .pid = perf_event_pid(event, task),
7514 .tid = perf_event_tid(event, task),
7515 };
7516 int ret;
7517
7518 perf_event_header__init_id(&read_event.header, &sample, event);
7519 ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
7520 if (ret)
7521 return;
7522
7523 perf_output_put(&handle, read_event);
7524 perf_output_read(&handle, event);
7525 perf_event__output_id_sample(event, &handle, &sample);
7526
7527 perf_output_end(&handle);
7528}
7529
7530typedef void (perf_iterate_f)(struct perf_event *event, void *data);
7531
7532static void
7533perf_iterate_ctx(struct perf_event_context *ctx,
7534 perf_iterate_f output,
7535 void *data, bool all)
7536{
7537 struct perf_event *event;
7538
7539 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7540 if (!all) {
7541 if (event->state < PERF_EVENT_STATE_INACTIVE)
7542 continue;
7543 if (!event_filter_match(event))
7544 continue;
7545 }
7546
7547 output(event, data);
7548 }
7549}
7550
7551static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
7552{
7553 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
7554 struct perf_event *event;
7555
7556 list_for_each_entry_rcu(event, &pel->list, sb_list) {
7557
7558
7559
7560
7561
7562 if (!smp_load_acquire(&event->ctx))
7563 continue;
7564
7565 if (event->state < PERF_EVENT_STATE_INACTIVE)
7566 continue;
7567 if (!event_filter_match(event))
7568 continue;
7569 output(event, data);
7570 }
7571}
7572
7573
7574
7575
7576
7577
7578
7579static void
7580perf_iterate_sb(perf_iterate_f output, void *data,
7581 struct perf_event_context *task_ctx)
7582{
7583 struct perf_event_context *ctx;
7584 int ctxn;
7585
7586 rcu_read_lock();
7587 preempt_disable();
7588
7589
7590
7591
7592
7593
7594 if (task_ctx) {
7595 perf_iterate_ctx(task_ctx, output, data, false);
7596 goto done;
7597 }
7598
7599 perf_iterate_sb_cpu(output, data);
7600
7601 for_each_task_context_nr(ctxn) {
7602 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7603 if (ctx)
7604 perf_iterate_ctx(ctx, output, data, false);
7605 }
7606done:
7607 preempt_enable();
7608 rcu_read_unlock();
7609}
7610
7611
7612
7613
7614
7615static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
7616{
7617 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7618 struct perf_addr_filter *filter;
7619 unsigned int restart = 0, count = 0;
7620 unsigned long flags;
7621
7622 if (!has_addr_filter(event))
7623 return;
7624
7625 raw_spin_lock_irqsave(&ifh->lock, flags);
7626 list_for_each_entry(filter, &ifh->list, entry) {
7627 if (filter->path.dentry) {
7628 event->addr_filter_ranges[count].start = 0;
7629 event->addr_filter_ranges[count].size = 0;
7630 restart++;
7631 }
7632
7633 count++;
7634 }
7635
7636 if (restart)
7637 event->addr_filters_gen++;
7638 raw_spin_unlock_irqrestore(&ifh->lock, flags);
7639
7640 if (restart)
7641 perf_event_stop(event, 1);
7642}
7643
7644void perf_event_exec(void)
7645{
7646 struct perf_event_context *ctx;
7647 int ctxn;
7648
7649 for_each_task_context_nr(ctxn) {
7650 perf_event_enable_on_exec(ctxn);
7651 perf_event_remove_on_exec(ctxn);
7652
7653 rcu_read_lock();
7654 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7655 if (ctx) {
7656 perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
7657 NULL, true);
7658 }
7659 rcu_read_unlock();
7660 }
7661}
7662
7663struct remote_output {
7664 struct perf_buffer *rb;
7665 int err;
7666};
7667
7668static void __perf_event_output_stop(struct perf_event *event, void *data)
7669{
7670 struct perf_event *parent = event->parent;
7671 struct remote_output *ro = data;
7672 struct perf_buffer *rb = ro->rb;
7673 struct stop_event_data sd = {
7674 .event = event,
7675 };
7676
7677 if (!has_aux(event))
7678 return;
7679
7680 if (!parent)
7681 parent = event;
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693 if (rcu_dereference(parent->rb) == rb)
7694 ro->err = __perf_event_stop(&sd);
7695}
7696
7697static int __perf_pmu_output_stop(void *info)
7698{
7699 struct perf_event *event = info;
7700 struct pmu *pmu = event->ctx->pmu;
7701 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7702 struct remote_output ro = {
7703 .rb = event->rb,
7704 };
7705
7706 rcu_read_lock();
7707 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
7708 if (cpuctx->task_ctx)
7709 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
7710 &ro, false);
7711 rcu_read_unlock();
7712
7713 return ro.err;
7714}
7715
7716static void perf_pmu_output_stop(struct perf_event *event)
7717{
7718 struct perf_event *iter;
7719 int err, cpu;
7720
7721restart:
7722 rcu_read_lock();
7723 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
7724
7725
7726
7727
7728
7729
7730 cpu = iter->cpu;
7731 if (cpu == -1)
7732 cpu = READ_ONCE(iter->oncpu);
7733
7734 if (cpu == -1)
7735 continue;
7736
7737 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
7738 if (err == -EAGAIN) {
7739 rcu_read_unlock();
7740 goto restart;
7741 }
7742 }
7743 rcu_read_unlock();
7744}
7745
7746
7747
7748
7749
7750
7751
7752struct perf_task_event {
7753 struct task_struct *task;
7754 struct perf_event_context *task_ctx;
7755
7756 struct {
7757 struct perf_event_header header;
7758
7759 u32 pid;
7760 u32 ppid;
7761 u32 tid;
7762 u32 ptid;
7763 u64 time;
7764 } event_id;
7765};
7766
7767static int perf_event_task_match(struct perf_event *event)
7768{
7769 return event->attr.comm || event->attr.mmap ||
7770 event->attr.mmap2 || event->attr.mmap_data ||
7771 event->attr.task;
7772}
7773
7774static void perf_event_task_output(struct perf_event *event,
7775 void *data)
7776{
7777 struct perf_task_event *task_event = data;
7778 struct perf_output_handle handle;
7779 struct perf_sample_data sample;
7780 struct task_struct *task = task_event->task;
7781 int ret, size = task_event->event_id.header.size;
7782
7783 if (!perf_event_task_match(event))
7784 return;
7785
7786 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
7787
7788 ret = perf_output_begin(&handle, &sample, event,
7789 task_event->event_id.header.size);
7790 if (ret)
7791 goto out;
7792
7793 task_event->event_id.pid = perf_event_pid(event, task);
7794 task_event->event_id.tid = perf_event_tid(event, task);
7795
7796 if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
7797 task_event->event_id.ppid = perf_event_pid(event,
7798 task->real_parent);
7799 task_event->event_id.ptid = perf_event_pid(event,
7800 task->real_parent);
7801 } else {
7802 task_event->event_id.ppid = perf_event_pid(event, current);
7803 task_event->event_id.ptid = perf_event_tid(event, current);
7804 }
7805
7806 task_event->event_id.time = perf_event_clock(event);
7807
7808 perf_output_put(&handle, task_event->event_id);
7809
7810 perf_event__output_id_sample(event, &handle, &sample);
7811
7812 perf_output_end(&handle);
7813out:
7814 task_event->event_id.header.size = size;
7815}
7816
7817static void perf_event_task(struct task_struct *task,
7818 struct perf_event_context *task_ctx,
7819 int new)
7820{
7821 struct perf_task_event task_event;
7822
7823 if (!atomic_read(&nr_comm_events) &&
7824 !atomic_read(&nr_mmap_events) &&
7825 !atomic_read(&nr_task_events))
7826 return;
7827
7828 task_event = (struct perf_task_event){
7829 .task = task,
7830 .task_ctx = task_ctx,
7831 .event_id = {
7832 .header = {
7833 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
7834 .misc = 0,
7835 .size = sizeof(task_event.event_id),
7836 },
7837
7838
7839
7840
7841
7842 },
7843 };
7844
7845 perf_iterate_sb(perf_event_task_output,
7846 &task_event,
7847 task_ctx);
7848}
7849
7850void perf_event_fork(struct task_struct *task)
7851{
7852 perf_event_task(task, NULL, 1);
7853 perf_event_namespaces(task);
7854}
7855
7856
7857
7858
7859
7860struct perf_comm_event {
7861 struct task_struct *task;
7862 char *comm;
7863 int comm_size;
7864
7865 struct {
7866 struct perf_event_header header;
7867
7868 u32 pid;
7869 u32 tid;
7870 } event_id;
7871};
7872
7873static int perf_event_comm_match(struct perf_event *event)
7874{
7875 return event->attr.comm;
7876}
7877
7878static void perf_event_comm_output(struct perf_event *event,
7879 void *data)
7880{
7881 struct perf_comm_event *comm_event = data;
7882 struct perf_output_handle handle;
7883 struct perf_sample_data sample;
7884 int size = comm_event->event_id.header.size;
7885 int ret;
7886
7887 if (!perf_event_comm_match(event))
7888 return;
7889
7890 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7891 ret = perf_output_begin(&handle, &sample, event,
7892 comm_event->event_id.header.size);
7893
7894 if (ret)
7895 goto out;
7896
7897 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
7898 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
7899
7900 perf_output_put(&handle, comm_event->event_id);
7901 __output_copy(&handle, comm_event->comm,
7902 comm_event->comm_size);
7903
7904 perf_event__output_id_sample(event, &handle, &sample);
7905
7906 perf_output_end(&handle);
7907out:
7908 comm_event->event_id.header.size = size;
7909}
7910
7911static void perf_event_comm_event(struct perf_comm_event *comm_event)
7912{
7913 char comm[TASK_COMM_LEN];
7914 unsigned int size;
7915
7916 memset(comm, 0, sizeof(comm));
7917 strlcpy(comm, comm_event->task->comm, sizeof(comm));
7918 size = ALIGN(strlen(comm)+1, sizeof(u64));
7919
7920 comm_event->comm = comm;
7921 comm_event->comm_size = size;
7922
7923 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
7924
7925 perf_iterate_sb(perf_event_comm_output,
7926 comm_event,
7927 NULL);
7928}
7929
7930void perf_event_comm(struct task_struct *task, bool exec)
7931{
7932 struct perf_comm_event comm_event;
7933
7934 if (!atomic_read(&nr_comm_events))
7935 return;
7936
7937 comm_event = (struct perf_comm_event){
7938 .task = task,
7939
7940
7941 .event_id = {
7942 .header = {
7943 .type = PERF_RECORD_COMM,
7944 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
7945
7946 },
7947
7948
7949 },
7950 };
7951
7952 perf_event_comm_event(&comm_event);
7953}
7954
7955
7956
7957
7958
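/*
 * namespaces tracking
 */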
7959struct perf_namespaces_event {
7960 struct task_struct *task;
7961
7962 struct {
7963 struct perf_event_header header;
7964
7965 u32 pid;
7966 u32 tid;
7967 u64 nr_namespaces;
7968 struct perf_ns_link_info link_info[NR_NAMESPACES];
7969 } event_id;
7970};
7971
7972static int perf_event_namespaces_match(struct perf_event *event)
7973{
7974 return event->attr.namespaces;
7975}
7976
7977static void perf_event_namespaces_output(struct perf_event *event,
7978 void *data)
7979{
7980 struct perf_namespaces_event *namespaces_event = data;
7981 struct perf_output_handle handle;
7982 struct perf_sample_data sample;
7983 u16 header_size = namespaces_event->event_id.header.size;
7984 int ret;
7985
7986 if (!perf_event_namespaces_match(event))
7987 return;
7988
7989 perf_event_header__init_id(&namespaces_event->event_id.header,
7990 &sample, event);
7991 ret = perf_output_begin(&handle, &sample, event,
7992 namespaces_event->event_id.header.size);
7993 if (ret)
7994 goto out;
7995
7996 namespaces_event->event_id.pid = perf_event_pid(event,
7997 namespaces_event->task);
7998 namespaces_event->event_id.tid = perf_event_tid(event,
7999 namespaces_event->task);
8000
8001 perf_output_put(&handle, namespaces_event->event_id);
8002
8003 perf_event__output_id_sample(event, &handle, &sample);
8004
8005 perf_output_end(&handle);
8006out:
8007 namespaces_event->event_id.header.size = header_size;
8008}
8009
8010static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
8011 struct task_struct *task,
8012 const struct proc_ns_operations *ns_ops)
8013{
8014 struct path ns_path;
8015 struct inode *ns_inode;
8016 void *error;
8017
8018 error = ns_get_path(&ns_path, task, ns_ops);
8019 if (!error) {
8020 ns_inode = ns_path.dentry->d_inode;
8021 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
8022 ns_link_info->ino = ns_inode->i_ino;
8023 path_put(&ns_path);
8024 }
8025}
8026
8027void perf_event_namespaces(struct task_struct *task)
8028{
8029 struct perf_namespaces_event namespaces_event;
8030 struct perf_ns_link_info *ns_link_info;
8031
8032 if (!atomic_read(&nr_namespaces_events))
8033 return;
8034
8035 namespaces_event = (struct perf_namespaces_event){
8036 .task = task,
8037 .event_id = {
8038 .header = {
8039 .type = PERF_RECORD_NAMESPACES,
8040 .misc = 0,
8041 .size = sizeof(namespaces_event.event_id),
8042 },
8043
8044
8045 .nr_namespaces = NR_NAMESPACES,
8046
8047 },
8048 };
8049
8050 ns_link_info = namespaces_event.event_id.link_info;
8051
8052 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
8053 task, &mntns_operations);
8054
8055#ifdef CONFIG_USER_NS
8056 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
8057 task, &userns_operations);
8058#endif
8059#ifdef CONFIG_NET_NS
8060 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
8061 task, &netns_operations);
8062#endif
8063#ifdef CONFIG_UTS_NS
8064 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
8065 task, &utsns_operations);
8066#endif
8067#ifdef CONFIG_IPC_NS
8068 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
8069 task, &ipcns_operations);
8070#endif
8071#ifdef CONFIG_PID_NS
8072 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
8073 task, &pidns_operations);
8074#endif
8075#ifdef CONFIG_CGROUPS
8076 perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
8077 task, &cgroupns_operations);
8078#endif
8079
8080 perf_iterate_sb(perf_event_namespaces_output,
8081 &namespaces_event,
8082 NULL);
8083}
8084
8085
8086
8087
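/*
 * cgroup tracking
 */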
8088#ifdef CONFIG_CGROUP_PERF
8089
8090struct perf_cgroup_event {
8091 char *path;
8092 int path_size;
8093 struct {
8094 struct perf_event_header header;
8095 u64 id;
8096 char path[];
8097 } event_id;
8098};
8099
8100static int perf_event_cgroup_match(struct perf_event *event)
8101{
8102 return event->attr.cgroup;
8103}
8104
8105static void perf_event_cgroup_output(struct perf_event *event, void *data)
8106{
8107 struct perf_cgroup_event *cgroup_event = data;
8108 struct perf_output_handle handle;
8109 struct perf_sample_data sample;
8110 u16 header_size = cgroup_event->event_id.header.size;
8111 int ret;
8112
8113 if (!perf_event_cgroup_match(event))
8114 return;
8115
8116 perf_event_header__init_id(&cgroup_event->event_id.header,
8117 &sample, event);
8118 ret = perf_output_begin(&handle, &sample, event,
8119 cgroup_event->event_id.header.size);
8120 if (ret)
8121 goto out;
8122
8123 perf_output_put(&handle, cgroup_event->event_id);
8124 __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
8125
8126 perf_event__output_id_sample(event, &handle, &sample);
8127
8128 perf_output_end(&handle);
8129out:
8130 cgroup_event->event_id.header.size = header_size;
8131}
8132
8133static void perf_event_cgroup(struct cgroup *cgrp)
8134{
8135 struct perf_cgroup_event cgroup_event;
8136 char path_enomem[16] = "//enomem";
8137 char *pathname;
8138 size_t size;
8139
8140 if (!atomic_read(&nr_cgroup_events))
8141 return;
8142
8143 cgroup_event = (struct perf_cgroup_event){
8144 .event_id = {
8145 .header = {
8146 .type = PERF_RECORD_CGROUP,
8147 .misc = 0,
8148 .size = sizeof(cgroup_event.event_id),
8149 },
8150 .id = cgrp->kn->id,
8151 },
8152 };
8153
8154 pathname = kmalloc(PATH_MAX, GFP_KERNEL);
8155 if (pathname == NULL) {
8156 cgroup_event.path = path_enomem;
8157 } else {
8158
8159 cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
8160 cgroup_event.path = pathname;
8161 }
8162
8163
8164
8165
8166
8167
8168 size = strlen(cgroup_event.path) + 1;
8169 while (!IS_ALIGNED(size, sizeof(u64)))
8170 cgroup_event.path[size++] = '\0';
8171
8172 cgroup_event.event_id.header.size += size;
8173 cgroup_event.path_size = size;
8174
8175 perf_iterate_sb(perf_event_cgroup_output,
8176 &cgroup_event,
8177 NULL);
8178
8179 kfree(pathname);
8180}
8181
8182#endif
8183
8184
8185
8186
8187
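/*
 * mmap tracking
 */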
8188struct perf_mmap_event {
8189 struct vm_area_struct *vma;
8190
8191 const char *file_name;
8192 int file_size;
8193 int maj, min;
8194 u64 ino;
8195 u64 ino_generation;
8196 u32 prot, flags;
8197 u8 build_id[BUILD_ID_SIZE_MAX];
8198 u32 build_id_size;
8199
8200 struct {
8201 struct perf_event_header header;
8202
8203 u32 pid;
8204 u32 tid;
8205 u64 start;
8206 u64 len;
8207 u64 pgoff;
8208 } event_id;
8209};
8210
8211static int perf_event_mmap_match(struct perf_event *event,
8212 void *data)
8213{
8214 struct perf_mmap_event *mmap_event = data;
8215 struct vm_area_struct *vma = mmap_event->vma;
8216 int executable = vma->vm_flags & VM_EXEC;
8217
8218 return (!executable && event->attr.mmap_data) ||
8219 (executable && (event->attr.mmap || event->attr.mmap2));
8220}
8221
8222static void perf_event_mmap_output(struct perf_event *event,
8223 void *data)
8224{
8225 struct perf_mmap_event *mmap_event = data;
8226 struct perf_output_handle handle;
8227 struct perf_sample_data sample;
8228 int size = mmap_event->event_id.header.size;
8229 u32 type = mmap_event->event_id.header.type;
8230 bool use_build_id;
8231 int ret;
8232
8233 if (!perf_event_mmap_match(event, data))
8234 return;
8235
8236 if (event->attr.mmap2) {
8237 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
8238 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
8239 mmap_event->event_id.header.size += sizeof(mmap_event->min);
8240 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
8241 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
8242 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
8243 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
8244 }
8245
8246 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
8247 ret = perf_output_begin(&handle, &sample, event,
8248 mmap_event->event_id.header.size);
8249 if (ret)
8250 goto out;
8251
8252 mmap_event->event_id.pid = perf_event_pid(event, current);
8253 mmap_event->event_id.tid = perf_event_tid(event, current);
8254
8255 use_build_id = event->attr.build_id && mmap_event->build_id_size;
8256
8257 if (event->attr.mmap2 && use_build_id)
8258 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
8259
8260 perf_output_put(&handle, mmap_event->event_id);
8261
8262 if (event->attr.mmap2) {
8263 if (use_build_id) {
8264 u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
8265
8266 __output_copy(&handle, size, 4);
8267 __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
8268 } else {
8269 perf_output_put(&handle, mmap_event->maj);
8270 perf_output_put(&handle, mmap_event->min);
8271 perf_output_put(&handle, mmap_event->ino);
8272 perf_output_put(&handle, mmap_event->ino_generation);
8273 }
8274 perf_output_put(&handle, mmap_event->prot);
8275 perf_output_put(&handle, mmap_event->flags);
8276 }
8277
8278 __output_copy(&handle, mmap_event->file_name,
8279 mmap_event->file_size);
8280
8281 perf_event__output_id_sample(event, &handle, &sample);
8282
8283 perf_output_end(&handle);
8284out:
8285 mmap_event->event_id.header.size = size;
8286 mmap_event->event_id.header.type = type;
8287}
8288
8289static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
8290{
8291 struct vm_area_struct *vma = mmap_event->vma;
8292 struct file *file = vma->vm_file;
8293 int maj = 0, min = 0;
8294 u64 ino = 0, gen = 0;
8295 u32 prot = 0, flags = 0;
8296 unsigned int size;
8297 char tmp[16];
8298 char *buf = NULL;
8299 char *name;
8300
8301 if (vma->vm_flags & VM_READ)
8302 prot |= PROT_READ;
8303 if (vma->vm_flags & VM_WRITE)
8304 prot |= PROT_WRITE;
8305 if (vma->vm_flags & VM_EXEC)
8306 prot |= PROT_EXEC;
8307
8308 if (vma->vm_flags & VM_MAYSHARE)
8309 flags = MAP_SHARED;
8310 else
8311 flags = MAP_PRIVATE;
8312
8313 if (vma->vm_flags & VM_DENYWRITE)
8314 flags |= MAP_DENYWRITE;
8315 if (vma->vm_flags & VM_LOCKED)
8316 flags |= MAP_LOCKED;
8317 if (vma->vm_flags & VM_HUGETLB)
8318 flags |= MAP_HUGETLB;
8319
8320 if (file) {
8321 struct inode *inode;
8322 dev_t dev;
8323
8324 buf = kmalloc(PATH_MAX, GFP_KERNEL);
8325 if (!buf) {
8326 name = "//enomem";
8327 goto cpy_name;
8328 }
8329
8330
8331
8332
8333
8334 name = file_path(file, buf, PATH_MAX - sizeof(u64));
8335 if (IS_ERR(name)) {
8336 name = "//toolong";
8337 goto cpy_name;
8338 }
8339 inode = file_inode(vma->vm_file);
8340 dev = inode->i_sb->s_dev;
8341 ino = inode->i_ino;
8342 gen = inode->i_generation;
8343 maj = MAJOR(dev);
8344 min = MINOR(dev);
8345
8346 goto got_name;
8347 } else {
8348 if (vma->vm_ops && vma->vm_ops->name) {
8349 name = (char *) vma->vm_ops->name(vma);
8350 if (name)
8351 goto cpy_name;
8352 }
8353
8354 name = (char *)arch_vma_name(vma);
8355 if (name)
8356 goto cpy_name;
8357
8358 if (vma->vm_start <= vma->vm_mm->start_brk &&
8359 vma->vm_end >= vma->vm_mm->brk) {
8360 name = "[heap]";
8361 goto cpy_name;
8362 }
8363 if (vma->vm_start <= vma->vm_mm->start_stack &&
8364 vma->vm_end >= vma->vm_mm->start_stack) {
8365 name = "[stack]";
8366 goto cpy_name;
8367 }
8368
8369 name = "//anon";
8370 goto cpy_name;
8371 }
8372
8373cpy_name:
8374 strlcpy(tmp, name, sizeof(tmp));
8375 name = tmp;
8376got_name:
8377
8378
8379
8380
8381
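	/*
	 * Since our buffer works in 8 byte units we need to align our string
	 * size to a multiple of 8. However, we must guarantee the tail end is
	 * zero'd out to avoid leaking random bits to userspace.
	 */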
8382 size = strlen(name)+1;
8383 while (!IS_ALIGNED(size, sizeof(u64)))
8384 name[size++] = '\0';
8385
8386 mmap_event->file_name = name;
8387 mmap_event->file_size = size;
8388 mmap_event->maj = maj;
8389 mmap_event->min = min;
8390 mmap_event->ino = ino;
8391 mmap_event->ino_generation = gen;
8392 mmap_event->prot = prot;
8393 mmap_event->flags = flags;
8394
8395 if (!(vma->vm_flags & VM_EXEC))
8396 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
8397
8398 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
8399
8400 if (atomic_read(&nr_build_id_events))
8401 build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
8402
8403 perf_iterate_sb(perf_event_mmap_output,
8404 mmap_event,
8405 NULL);
8406
8407 kfree(buf);
8408}
8409
8410
8411
8412
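/*
 * Check whether inode and address range match filter criteria.
 */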
8413static bool perf_addr_filter_match(struct perf_addr_filter *filter,
8414 struct file *file, unsigned long offset,
8415 unsigned long size)
8416{
8417
8418 if (!filter->path.dentry)
8419 return false;
8420
8421 if (d_inode(filter->path.dentry) != file_inode(file))
8422 return false;
8423
8424 if (filter->offset > offset + size)
8425 return false;
8426
8427 if (filter->offset + filter->size < offset)
8428 return false;
8429
8430 return true;
8431}
8432
8433static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
8434 struct vm_area_struct *vma,
8435 struct perf_addr_filter_range *fr)
8436{
8437 unsigned long vma_size = vma->vm_end - vma->vm_start;
8438 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8439 struct file *file = vma->vm_file;
8440
8441 if (!perf_addr_filter_match(filter, file, off, vma_size))
8442 return false;
8443
8444 if (filter->offset < off) {
8445 fr->start = vma->vm_start;
8446 fr->size = min(vma_size, filter->size - (off - filter->offset));
8447 } else {
8448 fr->start = vma->vm_start + filter->offset - off;
8449 fr->size = min(vma->vm_end - fr->start, filter->size);
8450 }
8451
8452 return true;
8453}
8454
8455static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
8456{
8457 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8458 struct vm_area_struct *vma = data;
8459 struct perf_addr_filter *filter;
8460 unsigned int restart = 0, count = 0;
8461 unsigned long flags;
8462
8463 if (!has_addr_filter(event))
8464 return;
8465
8466 if (!vma->vm_file)
8467 return;
8468
8469 raw_spin_lock_irqsave(&ifh->lock, flags);
8470 list_for_each_entry(filter, &ifh->list, entry) {
8471 if (perf_addr_filter_vma_adjust(filter, vma,
8472 &event->addr_filter_ranges[count]))
8473 restart++;
8474
8475 count++;
8476 }
8477
8478 if (restart)
8479 event->addr_filters_gen++;
8480 raw_spin_unlock_irqrestore(&ifh->lock, flags);
8481
8482 if (restart)
8483 perf_event_stop(event, 1);
8484}
8485
8486
8487
8488
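/*
 * Adjust all task's events' filters to the new vma.
 */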
8489static void perf_addr_filters_adjust(struct vm_area_struct *vma)
8490{
8491 struct perf_event_context *ctx;
8492 int ctxn;
8493
8494
8495
8496
8497
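	/*
	 * Data tracing isn't supported yet and as such there is no need
	 * to keep track of anything that isn't related to executable code:
	 */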
8498 if (!(vma->vm_flags & VM_EXEC))
8499 return;
8500
8501 rcu_read_lock();
8502 for_each_task_context_nr(ctxn) {
8503 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
8504 if (!ctx)
8505 continue;
8506
8507 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
8508 }
8509 rcu_read_unlock();
8510}
8511
8512void perf_event_mmap(struct vm_area_struct *vma)
8513{
8514 struct perf_mmap_event mmap_event;
8515
8516 if (!atomic_read(&nr_mmap_events))
8517 return;
8518
8519 mmap_event = (struct perf_mmap_event){
8520 .vma = vma,
8521
8522
8523 .event_id = {
8524 .header = {
8525 .type = PERF_RECORD_MMAP,
8526 .misc = PERF_RECORD_MISC_USER,
8527
8528 },
8529
8530
8531 .start = vma->vm_start,
8532 .len = vma->vm_end - vma->vm_start,
8533 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
8534 },
8535
8536
8537
8538
8539
8540
8541 };
8542
8543 perf_addr_filters_adjust(vma);
8544 perf_event_mmap_event(&mmap_event);
8545}
8546
8547void perf_event_aux_event(struct perf_event *event, unsigned long head,
8548 unsigned long size, u64 flags)
8549{
8550 struct perf_output_handle handle;
8551 struct perf_sample_data sample;
8552 struct perf_aux_event {
8553 struct perf_event_header header;
8554 u64 offset;
8555 u64 size;
8556 u64 flags;
8557 } rec = {
8558 .header = {
8559 .type = PERF_RECORD_AUX,
8560 .misc = 0,
8561 .size = sizeof(rec),
8562 },
8563 .offset = head,
8564 .size = size,
8565 .flags = flags,
8566 };
8567 int ret;
8568
8569 perf_event_header__init_id(&rec.header, &sample, event);
8570 ret = perf_output_begin(&handle, &sample, event, rec.header.size);
8571
8572 if (ret)
8573 return;
8574
8575 perf_output_put(&handle, rec);
8576 perf_event__output_id_sample(event, &handle, &sample);
8577
8578 perf_output_end(&handle);
8579}
8580
8581
8582
8583
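/*
 * Lost/dropped samples logging
 */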
8584void perf_log_lost_samples(struct perf_event *event, u64 lost)
8585{
8586 struct perf_output_handle handle;
8587 struct perf_sample_data sample;
8588 int ret;
8589
8590 struct {
8591 struct perf_event_header header;
8592 u64 lost;
8593 } lost_samples_event = {
8594 .header = {
8595 .type = PERF_RECORD_LOST_SAMPLES,
8596 .misc = 0,
8597 .size = sizeof(lost_samples_event),
8598 },
8599 .lost = lost,
8600 };
8601
8602 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
8603
8604 ret = perf_output_begin(&handle, &sample, event,
8605 lost_samples_event.header.size);
8606 if (ret)
8607 return;
8608
8609 perf_output_put(&handle, lost_samples_event);
8610 perf_event__output_id_sample(event, &handle, &sample);
8611 perf_output_end(&handle);
8612}
8613
8614
8615
8616
8617
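/*
 * context_switch tracking
 */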
8618struct perf_switch_event {
8619 struct task_struct *task;
8620 struct task_struct *next_prev;
8621
8622 struct {
8623 struct perf_event_header header;
8624 u32 next_prev_pid;
8625 u32 next_prev_tid;
8626 } event_id;
8627};
8628
8629static int perf_event_switch_match(struct perf_event *event)
8630{
8631 return event->attr.context_switch;
8632}
8633
8634static void perf_event_switch_output(struct perf_event *event, void *data)
8635{
8636 struct perf_switch_event *se = data;
8637 struct perf_output_handle handle;
8638 struct perf_sample_data sample;
8639 int ret;
8640
8641 if (!perf_event_switch_match(event))
8642 return;
8643
8644
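	/* Only CPU-wide events are allowed to see next/prev pid/tid */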
8645 if (event->ctx->task) {
8646 se->event_id.header.type = PERF_RECORD_SWITCH;
8647 se->event_id.header.size = sizeof(se->event_id.header);
8648 } else {
8649 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
8650 se->event_id.header.size = sizeof(se->event_id);
8651 se->event_id.next_prev_pid =
8652 perf_event_pid(event, se->next_prev);
8653 se->event_id.next_prev_tid =
8654 perf_event_tid(event, se->next_prev);
8655 }
8656
8657 perf_event_header__init_id(&se->event_id.header, &sample, event);
8658
8659 ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
8660 if (ret)
8661 return;
8662
8663 if (event->ctx->task)
8664 perf_output_put(&handle, se->event_id.header);
8665 else
8666 perf_output_put(&handle, se->event_id);
8667
8668 perf_event__output_id_sample(event, &handle, &sample);
8669
8670 perf_output_end(&handle);
8671}
8672
8673static void perf_event_switch(struct task_struct *task,
8674 struct task_struct *next_prev, bool sched_in)
8675{
8676 struct perf_switch_event switch_event;
8677
8678
8679
8680 switch_event = (struct perf_switch_event){
8681 .task = task,
8682 .next_prev = next_prev,
8683 .event_id = {
8684 .header = {
8685
8686 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
8687
8688 },
8689
8690
8691 },
8692 };
8693
8694 if (!sched_in && task->on_rq) {
8695 switch_event.event_id.header.misc |=
8696 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
8697 }
8698
8699 perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
8700}
8701
8702
8703
8704
8705
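/*
 * IRQ throttle logging
 */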
8706static void perf_log_throttle(struct perf_event *event, int enable)
8707{
8708 struct perf_output_handle handle;
8709 struct perf_sample_data sample;
8710 int ret;
8711
8712 struct {
8713 struct perf_event_header header;
8714 u64 time;
8715 u64 id;
8716 u64 stream_id;
8717 } throttle_event = {
8718 .header = {
8719 .type = PERF_RECORD_THROTTLE,
8720 .misc = 0,
8721 .size = sizeof(throttle_event),
8722 },
8723 .time = perf_event_clock(event),
8724 .id = primary_event_id(event),
8725 .stream_id = event->id,
8726 };
8727
8728 if (enable)
8729 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
8730
8731 perf_event_header__init_id(&throttle_event.header, &sample, event);
8732
8733 ret = perf_output_begin(&handle, &sample, event,
8734 throttle_event.header.size);
8735 if (ret)
8736 return;
8737
8738 perf_output_put(&handle, throttle_event);
8739 perf_event__output_id_sample(event, &handle, &sample);
8740 perf_output_end(&handle);
8741}
8742
8743
8744
8745
8746
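/*
 * ksymbol register/unregister tracking
 */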
8747struct perf_ksymbol_event {
8748 const char *name;
8749 int name_len;
8750 struct {
8751 struct perf_event_header header;
8752 u64 addr;
8753 u32 len;
8754 u16 ksym_type;
8755 u16 flags;
8756 } event_id;
8757};
8758
8759static int perf_event_ksymbol_match(struct perf_event *event)
8760{
8761 return event->attr.ksymbol;
8762}
8763
8764static void perf_event_ksymbol_output(struct perf_event *event, void *data)
8765{
8766 struct perf_ksymbol_event *ksymbol_event = data;
8767 struct perf_output_handle handle;
8768 struct perf_sample_data sample;
8769 int ret;
8770
8771 if (!perf_event_ksymbol_match(event))
8772 return;
8773
8774 perf_event_header__init_id(&ksymbol_event->event_id.header,
8775 &sample, event);
8776 ret = perf_output_begin(&handle, &sample, event,
8777 ksymbol_event->event_id.header.size);
8778 if (ret)
8779 return;
8780
8781 perf_output_put(&handle, ksymbol_event->event_id);
8782 __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
8783 perf_event__output_id_sample(event, &handle, &sample);
8784
8785 perf_output_end(&handle);
8786}
8787
8788void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
8789 const char *sym)
8790{
8791 struct perf_ksymbol_event ksymbol_event;
8792 char name[KSYM_NAME_LEN];
8793 u16 flags = 0;
8794 int name_len;
8795
8796 if (!atomic_read(&nr_ksymbol_events))
8797 return;
8798
8799 if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
8800 ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
8801 goto err;
8802
8803 strlcpy(name, sym, KSYM_NAME_LEN);
8804 name_len = strlen(name) + 1;
8805 while (!IS_ALIGNED(name_len, sizeof(u64)))
8806 name[name_len++] = '\0';
8807 BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
8808
8809 if (unregister)
8810 flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
8811
8812 ksymbol_event = (struct perf_ksymbol_event){
8813 .name = name,
8814 .name_len = name_len,
8815 .event_id = {
8816 .header = {
8817 .type = PERF_RECORD_KSYMBOL,
8818 .size = sizeof(ksymbol_event.event_id) +
8819 name_len,
8820 },
8821 .addr = addr,
8822 .len = len,
8823 .ksym_type = ksym_type,
8824 .flags = flags,
8825 },
8826 };
8827
8828 perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
8829 return;
8830err:
8831 WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
8832}
8833
8834
8835
8836
8837
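/*
 * bpf program load/unload tracking
 */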
8838struct perf_bpf_event {
8839 struct bpf_prog *prog;
8840 struct {
8841 struct perf_event_header header;
8842 u16 type;
8843 u16 flags;
8844 u32 id;
8845 u8 tag[BPF_TAG_SIZE];
8846 } event_id;
8847};
8848
8849static int perf_event_bpf_match(struct perf_event *event)
8850{
8851 return event->attr.bpf_event;
8852}
8853
8854static void perf_event_bpf_output(struct perf_event *event, void *data)
8855{
8856 struct perf_bpf_event *bpf_event = data;
8857 struct perf_output_handle handle;
8858 struct perf_sample_data sample;
8859 int ret;
8860
8861 if (!perf_event_bpf_match(event))
8862 return;
8863
8864 perf_event_header__init_id(&bpf_event->event_id.header,
8865 &sample, event);
8866	ret = perf_output_begin(&handle, &sample, event,
8867				bpf_event->event_id.header.size);
8868 if (ret)
8869 return;
8870
8871 perf_output_put(&handle, bpf_event->event_id);
8872 perf_event__output_id_sample(event, &handle, &sample);
8873
8874 perf_output_end(&handle);
8875}
8876
8877static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
8878 enum perf_bpf_event_type type)
8879{
8880 bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
8881 int i;
8882
8883 if (prog->aux->func_cnt == 0) {
8884 perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
8885 (u64)(unsigned long)prog->bpf_func,
8886 prog->jited_len, unregister,
8887 prog->aux->ksym.name);
8888 } else {
8889 for (i = 0; i < prog->aux->func_cnt; i++) {
8890 struct bpf_prog *subprog = prog->aux->func[i];
8891
8892 perf_event_ksymbol(
8893 PERF_RECORD_KSYMBOL_TYPE_BPF,
8894 (u64)(unsigned long)subprog->bpf_func,
8895 subprog->jited_len, unregister,
8896 prog->aux->ksym.name);
8897 }
8898 }
8899}
8900
8901void perf_event_bpf_event(struct bpf_prog *prog,
8902 enum perf_bpf_event_type type,
8903 u16 flags)
8904{
8905 struct perf_bpf_event bpf_event;
8906
8907 if (type <= PERF_BPF_EVENT_UNKNOWN ||
8908 type >= PERF_BPF_EVENT_MAX)
8909 return;
8910
8911 switch (type) {
8912 case PERF_BPF_EVENT_PROG_LOAD:
8913 case PERF_BPF_EVENT_PROG_UNLOAD:
8914 if (atomic_read(&nr_ksymbol_events))
8915 perf_event_bpf_emit_ksymbols(prog, type);
8916 break;
8917 default:
8918 break;
8919 }
8920
8921 if (!atomic_read(&nr_bpf_events))
8922 return;
8923
8924 bpf_event = (struct perf_bpf_event){
8925 .prog = prog,
8926 .event_id = {
8927 .header = {
8928 .type = PERF_RECORD_BPF_EVENT,
8929 .size = sizeof(bpf_event.event_id),
8930 },
8931 .type = type,
8932 .flags = flags,
8933 .id = prog->aux->id,
8934 },
8935 };
8936
8937 BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
8938
8939 memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
8940 perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
8941}
8942
8943struct perf_text_poke_event {
8944 const void *old_bytes;
8945 const void *new_bytes;
8946 size_t pad;
8947 u16 old_len;
8948 u16 new_len;
8949
8950 struct {
8951 struct perf_event_header header;
8952
8953 u64 addr;
8954 } event_id;
8955};
8956
8957static int perf_event_text_poke_match(struct perf_event *event)
8958{
8959 return event->attr.text_poke;
8960}
8961
8962static void perf_event_text_poke_output(struct perf_event *event, void *data)
8963{
8964 struct perf_text_poke_event *text_poke_event = data;
8965 struct perf_output_handle handle;
8966 struct perf_sample_data sample;
8967 u64 padding = 0;
8968 int ret;
8969
8970 if (!perf_event_text_poke_match(event))
8971 return;
8972
8973 perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
8974
8975 ret = perf_output_begin(&handle, &sample, event,
8976 text_poke_event->event_id.header.size);
8977 if (ret)
8978 return;
8979
8980 perf_output_put(&handle, text_poke_event->event_id);
8981 perf_output_put(&handle, text_poke_event->old_len);
8982 perf_output_put(&handle, text_poke_event->new_len);
8983
8984 __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
8985 __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
8986
8987 if (text_poke_event->pad)
8988 __output_copy(&handle, &padding, text_poke_event->pad);
8989
8990 perf_event__output_id_sample(event, &handle, &sample);
8991
8992 perf_output_end(&handle);
8993}
8994
8995void perf_event_text_poke(const void *addr, const void *old_bytes,
8996 size_t old_len, const void *new_bytes, size_t new_len)
8997{
8998 struct perf_text_poke_event text_poke_event;
8999 size_t tot, pad;
9000
9001 if (!atomic_read(&nr_text_poke_events))
9002 return;
9003
9004 tot = sizeof(text_poke_event.old_len) + old_len;
9005 tot += sizeof(text_poke_event.new_len) + new_len;
9006 pad = ALIGN(tot, sizeof(u64)) - tot;
9007
9008 text_poke_event = (struct perf_text_poke_event){
9009 .old_bytes = old_bytes,
9010 .new_bytes = new_bytes,
9011 .pad = pad,
9012 .old_len = old_len,
9013 .new_len = new_len,
9014 .event_id = {
9015 .header = {
9016 .type = PERF_RECORD_TEXT_POKE,
9017 .misc = PERF_RECORD_MISC_KERNEL,
9018 .size = sizeof(text_poke_event.event_id) + tot + pad,
9019 },
9020 .addr = (unsigned long)addr,
9021 },
9022 };
9023
9024 perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
9025}
9026
9027void perf_event_itrace_started(struct perf_event *event)
9028{
9029 event->attach_state |= PERF_ATTACH_ITRACE;
9030}
9031
9032static void perf_log_itrace_start(struct perf_event *event)
9033{
9034 struct perf_output_handle handle;
9035 struct perf_sample_data sample;
9036 struct perf_aux_event {
9037 struct perf_event_header header;
9038 u32 pid;
9039 u32 tid;
9040 } rec;
9041 int ret;
9042
9043 if (event->parent)
9044 event = event->parent;
9045
9046 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
9047 event->attach_state & PERF_ATTACH_ITRACE)
9048 return;
9049
9050 rec.header.type = PERF_RECORD_ITRACE_START;
9051 rec.header.misc = 0;
9052 rec.header.size = sizeof(rec);
9053 rec.pid = perf_event_pid(event, current);
9054 rec.tid = perf_event_tid(event, current);
9055
9056 perf_event_header__init_id(&rec.header, &sample, event);
9057 ret = perf_output_begin(&handle, &sample, event, rec.header.size);
9058
9059 if (ret)
9060 return;
9061
9062 perf_output_put(&handle, rec);
9063 perf_event__output_id_sample(event, &handle, &sample);
9064
9065 perf_output_end(&handle);
9066}
9067
9068static int
9069__perf_event_account_interrupt(struct perf_event *event, int throttle)
9070{
9071 struct hw_perf_event *hwc = &event->hw;
9072 int ret = 0;
9073 u64 seq;
9074
9075 seq = __this_cpu_read(perf_throttled_seq);
9076 if (seq != hwc->interrupts_seq) {
9077 hwc->interrupts_seq = seq;
9078 hwc->interrupts = 1;
9079 } else {
9080 hwc->interrupts++;
9081 if (unlikely(throttle
9082 && hwc->interrupts >= max_samples_per_tick)) {
9083 __this_cpu_inc(perf_throttled_count);
9084 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
9085 hwc->interrupts = MAX_INTERRUPTS;
9086 perf_log_throttle(event, 0);
9087 ret = 1;
9088 }
9089 }
9090
9091 if (event->attr.freq) {
9092 u64 now = perf_clock();
9093 s64 delta = now - hwc->freq_time_stamp;
9094
9095 hwc->freq_time_stamp = now;
9096
9097 if (delta > 0 && delta < 2*TICK_NSEC)
9098 perf_adjust_period(event, delta, hwc->last_period, true);
9099 }
9100
9101 return ret;
9102}
9103
9104int perf_event_account_interrupt(struct perf_event *event)
9105{
9106 return __perf_event_account_interrupt(event, 1);
9107}
9108
9109
9110
9111
9112
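/*
 * Generic event overflow handling, sampling.
 */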
9113static int __perf_event_overflow(struct perf_event *event,
9114 int throttle, struct perf_sample_data *data,
9115 struct pt_regs *regs)
9116{
9117 int events = atomic_read(&event->event_limit);
9118 int ret = 0;
9119
9120
9121
9122
9123
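	/*
	 * Non-sampling counters might still use the PMI to fold short
	 * hardware counters, ignore those.
	 */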
9124 if (unlikely(!is_sampling_event(event)))
9125 return 0;
9126
9127 ret = __perf_event_account_interrupt(event, throttle);
9128
9129
9130
9131
9132
9133
9134 event->pending_kill = POLL_IN;
9135 if (events && atomic_dec_and_test(&event->event_limit)) {
9136 ret = 1;
9137 event->pending_kill = POLL_HUP;
9138
9139 perf_event_disable_inatomic(event);
9140 }
9141
9142 READ_ONCE(event->overflow_handler)(event, data, regs);
9143
9144 if (*perf_event_fasync(event) && event->pending_kill) {
9145 event->pending_wakeup = 1;
9146 irq_work_queue(&event->pending);
9147 }
9148
9149 return ret;
9150}
9151
9152int perf_event_overflow(struct perf_event *event,
9153 struct perf_sample_data *data,
9154 struct pt_regs *regs)
9155{
9156 return __perf_event_overflow(event, 1, data, regs);
9157}
9158
9159
9160
9161
9162
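/*
 * Generic software event infrastructure
 */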
9163struct swevent_htable {
9164 struct swevent_hlist *swevent_hlist;
9165 struct mutex hlist_mutex;
9166 int hlist_refcount;
9167
9168
9169 int recursion[PERF_NR_CONTEXTS];
9170};
9171
9172static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
9173
9174
9175
9176
9177
9178
9179
9180
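/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */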
9181u64 perf_swevent_set_period(struct perf_event *event)
9182{
9183 struct hw_perf_event *hwc = &event->hw;
9184 u64 period = hwc->last_period;
9185 u64 nr, offset;
9186 s64 old, val;
9187
9188 hwc->last_period = hwc->sample_period;
9189
9190again:
9191 old = val = local64_read(&hwc->period_left);
9192 if (val < 0)
9193 return 0;
9194
9195 nr = div64_u64(period + val, period);
9196 offset = nr * period;
9197 val -= offset;
9198 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
9199 goto again;
9200
9201 return nr;
9202}
9203
9204static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
9205 struct perf_sample_data *data,
9206 struct pt_regs *regs)
9207{
9208 struct hw_perf_event *hwc = &event->hw;
9209 int throttle = 0;
9210
9211 if (!overflow)
9212 overflow = perf_swevent_set_period(event);
9213
9214 if (hwc->interrupts == MAX_INTERRUPTS)
9215 return;
9216
9217 for (; overflow; overflow--) {
9218 if (__perf_event_overflow(event, throttle,
9219 data, regs)) {
9220
9221
9222
9223
9224 break;
9225 }
9226 throttle = 1;
9227 }
9228}
9229
9230static void perf_swevent_event(struct perf_event *event, u64 nr,
9231 struct perf_sample_data *data,
9232 struct pt_regs *regs)
9233{
9234 struct hw_perf_event *hwc = &event->hw;
9235
9236 local64_add(nr, &event->count);
9237
9238 if (!regs)
9239 return;
9240
9241 if (!is_sampling_event(event))
9242 return;
9243
9244 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
9245 data->period = nr;
9246 return perf_swevent_overflow(event, 1, data, regs);
9247 } else
9248 data->period = event->hw.last_period;
9249
9250 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
9251 return perf_swevent_overflow(event, 1, data, regs);
9252
9253 if (local64_add_negative(nr, &hwc->period_left))
9254 return;
9255
9256 perf_swevent_overflow(event, 0, data, regs);
9257}
9258
9259static int perf_exclude_event(struct perf_event *event,
9260 struct pt_regs *regs)
9261{
9262 if (event->hw.state & PERF_HES_STOPPED)
9263 return 1;
9264
9265 if (regs) {
9266 if (event->attr.exclude_user && user_mode(regs))
9267 return 1;
9268
9269 if (event->attr.exclude_kernel && !user_mode(regs))
9270 return 1;
9271 }
9272
9273 return 0;
9274}
9275
9276static int perf_swevent_match(struct perf_event *event,
9277 enum perf_type_id type,
9278 u32 event_id,
9279 struct perf_sample_data *data,
9280 struct pt_regs *regs)
9281{
9282 if (event->attr.type != type)
9283 return 0;
9284
9285 if (event->attr.config != event_id)
9286 return 0;
9287
9288 if (perf_exclude_event(event, regs))
9289 return 0;
9290
9291 return 1;
9292}
9293
9294static inline u64 swevent_hash(u64 type, u32 event_id)
9295{
9296 u64 val = event_id | (type << 32);
9297
9298 return hash_64(val, SWEVENT_HLIST_BITS);
9299}
9300
9301static inline struct hlist_head *
9302__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
9303{
9304 u64 hash = swevent_hash(type, event_id);
9305
9306 return &hlist->heads[hash];
9307}
9308
9309
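/* For the read side: events when they trigger */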
9310static inline struct hlist_head *
9311find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
9312{
9313 struct swevent_hlist *hlist;
9314
9315 hlist = rcu_dereference(swhash->swevent_hlist);
9316 if (!hlist)
9317 return NULL;
9318
9319 return __find_swevent_head(hlist, type, event_id);
9320}
9321
9322
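/* For the event head insertion and removal in the hlist */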
9323static inline struct hlist_head *
9324find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
9325{
9326 struct swevent_hlist *hlist;
9327 u32 event_id = event->attr.config;
9328 u64 type = event->attr.type;
9329
9330
9331
9332
9333
9334
9335 hlist = rcu_dereference_protected(swhash->swevent_hlist,
9336 lockdep_is_held(&event->ctx->lock));
9337 if (!hlist)
9338 return NULL;
9339
9340 return __find_swevent_head(hlist, type, event_id);
9341}
9342
9343static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
9344 u64 nr,
9345 struct perf_sample_data *data,
9346 struct pt_regs *regs)
9347{
9348 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9349 struct perf_event *event;
9350 struct hlist_head *head;
9351
9352 rcu_read_lock();
9353 head = find_swevent_head_rcu(swhash, type, event_id);
9354 if (!head)
9355 goto end;
9356
9357 hlist_for_each_entry_rcu(event, head, hlist_entry) {
9358 if (perf_swevent_match(event, type, event_id, data, regs))
9359 perf_swevent_event(event, nr, data, regs);
9360 }
9361end:
9362 rcu_read_unlock();
9363}
9364
9365DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
9366
9367int perf_swevent_get_recursion_context(void)
9368{
9369 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9370
9371 return get_recursion_context(swhash->recursion);
9372}
9373EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
9374
9375void perf_swevent_put_recursion_context(int rctx)
9376{
9377 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9378
9379 put_recursion_context(swhash->recursion, rctx);
9380}
9381
9382void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9383{
9384 struct perf_sample_data data;
9385
9386 if (WARN_ON_ONCE(!regs))
9387 return;
9388
9389 perf_sample_data_init(&data, addr, 0);
9390 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
9391}
9392
9393void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9394{
9395 int rctx;
9396
9397 preempt_disable_notrace();
9398 rctx = perf_swevent_get_recursion_context();
9399 if (unlikely(rctx < 0))
9400 goto fail;
9401
9402 ___perf_sw_event(event_id, nr, regs, addr);
9403
9404 perf_swevent_put_recursion_context(rctx);
9405fail:
9406 preempt_enable_notrace();
9407}
9408
9409static void perf_swevent_read(struct perf_event *event)
9410{
9411}
9412
9413static int perf_swevent_add(struct perf_event *event, int flags)
9414{
9415 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9416 struct hw_perf_event *hwc = &event->hw;
9417 struct hlist_head *head;
9418
9419 if (is_sampling_event(event)) {
9420 hwc->last_period = hwc->sample_period;
9421 perf_swevent_set_period(event);
9422 }
9423
9424 hwc->state = !(flags & PERF_EF_START);
9425
9426 head = find_swevent_head(swhash, event);
9427 if (WARN_ON_ONCE(!head))
9428 return -EINVAL;
9429
9430 hlist_add_head_rcu(&event->hlist_entry, head);
9431 perf_event_update_userpage(event);
9432
9433 return 0;
9434}
9435
9436static void perf_swevent_del(struct perf_event *event, int flags)
9437{
9438 hlist_del_rcu(&event->hlist_entry);
9439}
9440
9441static void perf_swevent_start(struct perf_event *event, int flags)
9442{
9443 event->hw.state = 0;
9444}
9445
9446static void perf_swevent_stop(struct perf_event *event, int flags)
9447{
9448 event->hw.state = PERF_HES_STOPPED;
9449}
9450
9451
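/* Deref the hlist from the update side */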
9452static inline struct swevent_hlist *
9453swevent_hlist_deref(struct swevent_htable *swhash)
9454{
9455 return rcu_dereference_protected(swhash->swevent_hlist,
9456 lockdep_is_held(&swhash->hlist_mutex));
9457}
9458
9459static void swevent_hlist_release(struct swevent_htable *swhash)
9460{
9461 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
9462
9463 if (!hlist)
9464 return;
9465
9466 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
9467 kfree_rcu(hlist, rcu_head);
9468}
9469
9470static void swevent_hlist_put_cpu(int cpu)
9471{
9472 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9473
9474 mutex_lock(&swhash->hlist_mutex);
9475
9476 if (!--swhash->hlist_refcount)
9477 swevent_hlist_release(swhash);
9478
9479 mutex_unlock(&swhash->hlist_mutex);
9480}
9481
9482static void swevent_hlist_put(void)
9483{
9484 int cpu;
9485
9486 for_each_possible_cpu(cpu)
9487 swevent_hlist_put_cpu(cpu);
9488}
9489
9490static int swevent_hlist_get_cpu(int cpu)
9491{
9492 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9493 int err = 0;
9494
9495 mutex_lock(&swhash->hlist_mutex);
9496 if (!swevent_hlist_deref(swhash) &&
9497 cpumask_test_cpu(cpu, perf_online_mask)) {
9498 struct swevent_hlist *hlist;
9499
9500 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
9501 if (!hlist) {
9502 err = -ENOMEM;
9503 goto exit;
9504 }
9505 rcu_assign_pointer(swhash->swevent_hlist, hlist);
9506 }
9507 swhash->hlist_refcount++;
9508exit:
9509 mutex_unlock(&swhash->hlist_mutex);
9510
9511 return err;
9512}
9513
9514static int swevent_hlist_get(void)
9515{
9516 int err, cpu, failed_cpu;
9517
9518 mutex_lock(&pmus_lock);
9519 for_each_possible_cpu(cpu) {
9520 err = swevent_hlist_get_cpu(cpu);
9521 if (err) {
9522 failed_cpu = cpu;
9523 goto fail;
9524 }
9525 }
9526 mutex_unlock(&pmus_lock);
9527 return 0;
9528fail:
9529 for_each_possible_cpu(cpu) {
9530 if (cpu == failed_cpu)
9531 break;
9532 swevent_hlist_put_cpu(cpu);
9533 }
9534 mutex_unlock(&pmus_lock);
9535 return err;
9536}
9537
9538struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
9539
9540static void sw_perf_event_destroy(struct perf_event *event)
9541{
9542 u64 event_id = event->attr.config;
9543
9544 WARN_ON(event->parent);
9545
9546 static_key_slow_dec(&perf_swevent_enabled[event_id]);
9547 swevent_hlist_put();
9548}
9549
9550static int perf_swevent_init(struct perf_event *event)
9551{
9552 u64 event_id = event->attr.config;
9553
9554 if (event->attr.type != PERF_TYPE_SOFTWARE)
9555 return -ENOENT;
9556
9557
9558
9559
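	/*
	 * no branch sampling for software events
	 */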
9560 if (has_branch_stack(event))
9561 return -EOPNOTSUPP;
9562
9563 switch (event_id) {
9564 case PERF_COUNT_SW_CPU_CLOCK:
9565 case PERF_COUNT_SW_TASK_CLOCK:
9566 return -ENOENT;
9567
9568 default:
9569 break;
9570 }
9571
9572 if (event_id >= PERF_COUNT_SW_MAX)
9573 return -ENOENT;
9574
9575 if (!event->parent) {
9576 int err;
9577
9578 err = swevent_hlist_get();
9579 if (err)
9580 return err;
9581
9582 static_key_slow_inc(&perf_swevent_enabled[event_id]);
9583 event->destroy = sw_perf_event_destroy;
9584 }
9585
9586 return 0;
9587}
9588
9589static struct pmu perf_swevent = {
9590 .task_ctx_nr = perf_sw_context,
9591
9592 .capabilities = PERF_PMU_CAP_NO_NMI,
9593
9594 .event_init = perf_swevent_init,
9595 .add = perf_swevent_add,
9596 .del = perf_swevent_del,
9597 .start = perf_swevent_start,
9598 .stop = perf_swevent_stop,
9599 .read = perf_swevent_read,
9600};
9601
9602#ifdef CONFIG_EVENT_TRACING
9603
9604static int perf_tp_filter_match(struct perf_event *event,
9605 struct perf_sample_data *data)
9606{
9607 void *record = data->raw->frag.data;
9608
9609
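	/* only top level events have filters set */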
9610 if (event->parent)
9611 event = event->parent;
9612
9613 if (likely(!event->filter) || filter_match_preds(event->filter, record))
9614 return 1;
9615 return 0;
9616}
9617
9618static int perf_tp_event_match(struct perf_event *event,
9619 struct perf_sample_data *data,
9620 struct pt_regs *regs)
9621{
9622 if (event->hw.state & PERF_HES_STOPPED)
9623 return 0;
9624
9625
9626
9627 if (event->attr.exclude_kernel && !user_mode(regs))
9628 return 0;
9629
9630 if (!perf_tp_filter_match(event, data))
9631 return 0;
9632
9633 return 1;
9634}
9635
9636void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
9637 struct trace_event_call *call, u64 count,
9638 struct pt_regs *regs, struct hlist_head *head,
9639 struct task_struct *task)
9640{
9641 if (bpf_prog_array_valid(call)) {
9642 *(struct pt_regs **)raw_data = regs;
9643 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
9644 perf_swevent_put_recursion_context(rctx);
9645 return;
9646 }
9647 }
9648 perf_tp_event(call->event.type, count, raw_data, size, regs, head,
9649 rctx, task);
9650}
9651EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
9652
9653void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
9654 struct pt_regs *regs, struct hlist_head *head, int rctx,
9655 struct task_struct *task)
9656{
9657 struct perf_sample_data data;
9658 struct perf_event *event;
9659
9660 struct perf_raw_record raw = {
9661 .frag = {
9662 .size = entry_size,
9663 .data = record,
9664 },
9665 };
9666
9667 perf_sample_data_init(&data, 0, 0);
9668 data.raw = &raw;
9669
9670 perf_trace_buf_update(record, event_type);
9671
9672 hlist_for_each_entry_rcu(event, head, hlist_entry) {
9673 if (perf_tp_event_match(event, &data, regs))
9674 perf_swevent_event(event, count, &data, regs);
9675 }
9676
9677
9678
9679
9680
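	/*
	 * If we got specified a target task, also iterate its context and
	 * deliver this event there too.
	 */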
9681 if (task && task != current) {
9682 struct perf_event_context *ctx;
9683 struct trace_entry *entry = record;
9684
9685 rcu_read_lock();
9686 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
9687 if (!ctx)
9688 goto unlock;
9689
9690 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
9691 if (event->cpu != smp_processor_id())
9692 continue;
9693 if (event->attr.type != PERF_TYPE_TRACEPOINT)
9694 continue;
9695 if (event->attr.config != entry->type)
9696 continue;
9697 if (perf_tp_event_match(event, &data, regs))
9698 perf_swevent_event(event, count, &data, regs);
9699 }
9700unlock:
9701 rcu_read_unlock();
9702 }
9703
9704 perf_swevent_put_recursion_context(rctx);
9705}
9706EXPORT_SYMBOL_GPL(perf_tp_event);
9707
9708static void tp_perf_event_destroy(struct perf_event *event)
9709{
9710 perf_trace_destroy(event);
9711}
9712
9713static int perf_tp_event_init(struct perf_event *event)
9714{
9715 int err;
9716
9717 if (event->attr.type != PERF_TYPE_TRACEPOINT)
9718 return -ENOENT;
9719
9720
9721
9722
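	/*
	 * no branch sampling for tracepoint events
	 */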
9723 if (has_branch_stack(event))
9724 return -EOPNOTSUPP;
9725
9726 err = perf_trace_init(event);
9727 if (err)
9728 return err;
9729
9730 event->destroy = tp_perf_event_destroy;
9731
9732 return 0;
9733}
9734
9735static struct pmu perf_tracepoint = {
9736 .task_ctx_nr = perf_sw_context,
9737
9738 .event_init = perf_tp_event_init,
9739 .add = perf_trace_add,
9740 .del = perf_trace_del,
9741 .start = perf_swevent_start,
9742 .stop = perf_swevent_stop,
9743 .read = perf_swevent_read,
9744};
9745
9746#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
9747
9748
9749
9750
9751
9752
9753
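/*
 * Flags in config, used by dynamic PMU kprobe and uprobe
 * The flags should match the following PMU_FORMAT_ATTR().
 *
 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 *                               if not set, create kprobe/uprobe
 */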
9754enum perf_probe_config {
9755 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,
9756};
9757
9758PMU_FORMAT_ATTR(retprobe, "config:0");
9759
9760static struct attribute *probe_attrs[] = {
9761 &format_attr_retprobe.attr,
9762 NULL,
9763};
9764
9765static struct attribute_group probe_format_group = {
9766 .name = "format",
9767 .attrs = probe_attrs,
9768};
9769
9770static const struct attribute_group *probe_attr_groups[] = {
9771 &probe_format_group,
9772 NULL,
9773};
9774#endif
9775
9776#ifdef CONFIG_KPROBE_EVENTS
9777static int perf_kprobe_event_init(struct perf_event *event);
9778static struct pmu perf_kprobe = {
9779 .task_ctx_nr = perf_sw_context,
9780 .event_init = perf_kprobe_event_init,
9781 .add = perf_trace_add,
9782 .del = perf_trace_del,
9783 .start = perf_swevent_start,
9784 .stop = perf_swevent_stop,
9785 .read = perf_swevent_read,
9786 .attr_groups = probe_attr_groups,
9787};
9788
9789static int perf_kprobe_event_init(struct perf_event *event)
9790{
9791 int err;
9792 bool is_retprobe;
9793
9794 if (event->attr.type != perf_kprobe.type)
9795 return -ENOENT;
9796
9797 if (!perfmon_capable())
9798 return -EACCES;
9799
9800
9801
9802
9803 if (has_branch_stack(event))
9804 return -EOPNOTSUPP;
9805
9806 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9807 err = perf_kprobe_init(event, is_retprobe);
9808 if (err)
9809 return err;
9810
9811 event->destroy = perf_kprobe_destroy;
9812
9813 return 0;
9814}
9815#endif
9816
9817#ifdef CONFIG_UPROBE_EVENTS
9818static int perf_uprobe_event_init(struct perf_event *event);
9819static struct pmu perf_uprobe = {
9820 .task_ctx_nr = perf_sw_context,
9821 .event_init = perf_uprobe_event_init,
9822 .add = perf_trace_add,
9823 .del = perf_trace_del,
9824 .start = perf_swevent_start,
9825 .stop = perf_swevent_stop,
9826 .read = perf_swevent_read,
9827 .attr_groups = probe_attr_groups,
9828};
9829
9830static int perf_uprobe_event_init(struct perf_event *event)
9831{
9832 int err;
9833 bool is_retprobe;
9834
9835 if (event->attr.type != perf_uprobe.type)
9836 return -ENOENT;
9837
9838 if (!perfmon_capable())
9839 return -EACCES;
9840
9841
9842
9843
9844 if (has_branch_stack(event))
9845 return -EOPNOTSUPP;
9846
9847 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9848 err = perf_uprobe_init(event, is_retprobe);
9849 if (err)
9850 return err;
9851
9852 event->destroy = perf_uprobe_destroy;
9853
9854 return 0;
9855}
9856#endif
9857
9858static inline void perf_tp_register(void)
9859{
9860 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
9861#ifdef CONFIG_KPROBE_EVENTS
9862 perf_pmu_register(&perf_kprobe, "kprobe", -1);
9863#endif
9864#ifdef CONFIG_UPROBE_EVENTS
9865 perf_pmu_register(&perf_uprobe, "uprobe", -1);
9866#endif
9867}
9868
9869static void perf_event_free_filter(struct perf_event *event)
9870{
9871 ftrace_profile_free_filter(event);
9872}
9873
9874#ifdef CONFIG_BPF_SYSCALL
9875static void bpf_overflow_handler(struct perf_event *event,
9876 struct perf_sample_data *data,
9877 struct pt_regs *regs)
9878{
9879 struct bpf_perf_event_data_kern ctx = {
9880 .data = data,
9881 .event = event,
9882 };
9883 int ret = 0;
9884
9885 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
9886 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
9887 goto out;
9888 rcu_read_lock();
9889 ret = BPF_PROG_RUN(event->prog, &ctx);
9890 rcu_read_unlock();
9891out:
9892 __this_cpu_dec(bpf_prog_active);
9893 if (!ret)
9894 return;
9895
9896 event->orig_overflow_handler(event, data, regs);
9897}
9898
9899static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
9900{
9901 struct bpf_prog *prog;
9902
9903 if (event->overflow_handler_context)
9904
9905 return -EINVAL;
9906
9907 if (event->prog)
9908 return -EEXIST;
9909
9910 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
9911 if (IS_ERR(prog))
9912 return PTR_ERR(prog);
9913
9914 if (event->attr.precise_ip &&
9915 prog->call_get_stack &&
9916 (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
9917 event->attr.exclude_callchain_kernel ||
9918 event->attr.exclude_callchain_user)) {
9919
9920
9921
9922
9923
9924
9925
9926
9927
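		/*
		 * On perf_event with precise_ip, calling bpf_get_stack()
		 * may trigger unwinder warnings and occasional crashes.
		 * bpf_get_[stack|stackid] works around this issue by using
		 * the callchain attached to perf_sample_data. If the
		 * perf_event does not have a full (kernel and user) callchain
		 * attached to perf_sample_data, do not allow attaching BPF
		 * programs that call bpf_get_[stack|stackid].
		 */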
9928 bpf_prog_put(prog);
9929 return -EPROTO;
9930 }
9931
9932 event->prog = prog;
9933 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
9934 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
9935 return 0;
9936}
9937
9938static void perf_event_free_bpf_handler(struct perf_event *event)
9939{
9940 struct bpf_prog *prog = event->prog;
9941
9942 if (!prog)
9943 return;
9944
9945 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
9946 event->prog = NULL;
9947 bpf_prog_put(prog);
9948}
9949#else
9950static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
9951{
9952 return -EOPNOTSUPP;
9953}
9954static void perf_event_free_bpf_handler(struct perf_event *event)
9955{
9956}
9957#endif
9958
9959
9960
9961
9962
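/*
 * Returns true if the event is a tracepoint, or a kprobe/uprobe created
 * with perf_event_open()
 */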
9963static inline bool perf_event_is_tracing(struct perf_event *event)
9964{
9965 if (event->pmu == &perf_tracepoint)
9966 return true;
9967#ifdef CONFIG_KPROBE_EVENTS
9968 if (event->pmu == &perf_kprobe)
9969 return true;
9970#endif
9971#ifdef CONFIG_UPROBE_EVENTS
9972 if (event->pmu == &perf_uprobe)
9973 return true;
9974#endif
9975 return false;
9976}
9977
9978static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
9979{
9980 bool is_kprobe, is_tracepoint, is_syscall_tp;
9981 struct bpf_prog *prog;
9982 int ret;
9983
9984 rh_add_flag("eBPF/event");
9985
9986 if (!perf_event_is_tracing(event))
9987 return perf_event_set_bpf_handler(event, prog_fd);
9988
9989 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
9990 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
9991 is_syscall_tp = is_syscall_trace_event(event->tp_event);
9992 if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
9993
9994 return -EINVAL;
9995
9996 prog = bpf_prog_get(prog_fd);
9997 if (IS_ERR(prog))
9998 return PTR_ERR(prog);
9999
10000 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
10001 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
10002 (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
10003
10004 bpf_prog_put(prog);
10005 return -EINVAL;
10006 }
10007
10008
10009 if (prog->kprobe_override &&
10010 !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
10011 bpf_prog_put(prog);
10012 return -EINVAL;
10013 }
10014
10015 if (is_tracepoint || is_syscall_tp) {
10016 int off = trace_event_get_offsets(event->tp_event);
10017
10018 if (prog->aux->max_ctx_offset > off) {
10019 bpf_prog_put(prog);
10020 return -EACCES;
10021 }
10022 }
10023
10024 ret = perf_event_attach_bpf_prog(event, prog);
10025 if (ret)
10026 bpf_prog_put(prog);
10027 return ret;
10028}
10029
10030static void perf_event_free_bpf_prog(struct perf_event *event)
10031{
10032 if (!perf_event_is_tracing(event)) {
10033 perf_event_free_bpf_handler(event);
10034 return;
10035 }
10036 perf_event_detach_bpf_prog(event);
10037}
10038
10039#else
10040
10041static inline void perf_tp_register(void)
10042{
10043}
10044
10045static void perf_event_free_filter(struct perf_event *event)
10046{
10047}
10048
10049static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
10050{
10051 return -ENOENT;
10052}
10053
10054static void perf_event_free_bpf_prog(struct perf_event *event)
10055{
10056}
10057#endif
10058
10059#ifdef CONFIG_HAVE_HW_BREAKPOINT
10060void perf_bp_event(struct perf_event *bp, void *data)
10061{
10062 struct perf_sample_data sample;
10063 struct pt_regs *regs = data;
10064
10065 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
10066
10067 if (!bp->hw.state && !perf_exclude_event(bp, regs))
10068 perf_swevent_event(bp, 1, &sample, regs);
10069}
10070#endif
10071
10072
10073
10074
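/*
 * Allocate a new address filter
 */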
10075static struct perf_addr_filter *
10076perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
10077{
10078 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
10079 struct perf_addr_filter *filter;
10080
10081 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
10082 if (!filter)
10083 return NULL;
10084
10085 INIT_LIST_HEAD(&filter->entry);
10086 list_add_tail(&filter->entry, filters);
10087
10088 return filter;
10089}
10090
10091static void free_filters_list(struct list_head *filters)
10092{
10093 struct perf_addr_filter *filter, *iter;
10094
10095 list_for_each_entry_safe(filter, iter, filters, entry) {
10096 path_put(&filter->path);
10097 list_del(&filter->entry);
10098 kfree(filter);
10099 }
10100}
10101
10102
10103
10104
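/*
 * Free existing address filters and optionally install new ones
 */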
10105static void perf_addr_filters_splice(struct perf_event *event,
10106 struct list_head *head)
10107{
10108 unsigned long flags;
10109 LIST_HEAD(list);
10110
10111 if (!has_addr_filter(event))
10112 return;
10113
10114
10115 if (event->parent)
10116 return;
10117
10118 raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
10119
10120 list_splice_init(&event->addr_filters.list, &list);
10121 if (head)
10122 list_splice(head, &event->addr_filters.list);
10123
10124 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
10125
10126 free_filters_list(&list);
10127}
10128
10129
10130
10131
10132
10133
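/*
 * Scan through mm's vmas and see if one of them matches the
 * @filter; if so, adjust filter's address range.
 * Called with mm::mmap_lock down for reading.
 */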
10134static void perf_addr_filter_apply(struct perf_addr_filter *filter,
10135 struct mm_struct *mm,
10136 struct perf_addr_filter_range *fr)
10137{
10138 struct vm_area_struct *vma;
10139
10140 for (vma = mm->mmap; vma; vma = vma->vm_next) {
10141 if (!vma->vm_file)
10142 continue;
10143
10144 if (perf_addr_filter_vma_adjust(filter, vma, fr))
10145 return;
10146 }
10147}
10148
10149
10150
10151
10152
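/*
 * Update event's address range filters based on the
 * task's existing mappings, if any.
 */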
10153static void perf_event_addr_filters_apply(struct perf_event *event)
10154{
10155 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
10156 struct task_struct *task = READ_ONCE(event->ctx->task);
10157 struct perf_addr_filter *filter;
10158 struct mm_struct *mm = NULL;
10159 unsigned int count = 0;
10160 unsigned long flags;
10161
10162
10163
10164
10165
10166 if (task == TASK_TOMBSTONE)
10167 return;
10168
10169 if (ifh->nr_file_filters) {
10170 mm = get_task_mm(event->ctx->task);
10171 if (!mm)
10172 goto restart;
10173
10174 mmap_read_lock(mm);
10175 }
10176
10177 raw_spin_lock_irqsave(&ifh->lock, flags);
10178 list_for_each_entry(filter, &ifh->list, entry) {
10179 if (filter->path.dentry) {
10180
10181
10182
10183
10184 event->addr_filter_ranges[count].start = 0;
10185 event->addr_filter_ranges[count].size = 0;
10186
10187 perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
10188 } else {
10189 event->addr_filter_ranges[count].start = filter->offset;
10190 event->addr_filter_ranges[count].size = filter->size;
10191 }
10192
10193 count++;
10194 }
10195
10196 event->addr_filters_gen++;
10197 raw_spin_unlock_irqrestore(&ifh->lock, flags);
10198
10199 if (ifh->nr_file_filters) {
10200 mmap_read_unlock(mm);
10201
10202 mmput(mm);
10203 }
10204
10205restart:
10206 perf_event_stop(event, 1);
10207}
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
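/*
 * Address range filtering: limiting the data to certain
 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ascii strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of the
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region;
 * RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * if <size> is not specified or is zero, the range is treated as
 * a single address; not valid for ACTION=="filter".
 */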
10228enum {
10229 IF_ACT_NONE = -1,
10230 IF_ACT_FILTER,
10231 IF_ACT_START,
10232 IF_ACT_STOP,
10233 IF_SRC_FILE,
10234 IF_SRC_KERNEL,
10235 IF_SRC_FILEADDR,
10236 IF_SRC_KERNELADDR,
10237};
10238
10239enum {
10240 IF_STATE_ACTION = 0,
10241 IF_STATE_SOURCE,
10242 IF_STATE_END,
10243};
10244
10245static const match_table_t if_tokens = {
10246 { IF_ACT_FILTER, "filter" },
10247 { IF_ACT_START, "start" },
10248 { IF_ACT_STOP, "stop" },
10249 { IF_SRC_FILE, "%u/%u@%s" },
10250 { IF_SRC_KERNEL, "%u/%u" },
10251 { IF_SRC_FILEADDR, "%u@%s" },
10252 { IF_SRC_KERNELADDR, "%u" },
10253 { IF_ACT_NONE, NULL },
10254};
10255
10256
10257
10258
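/*
 * Address filter string parser.
 */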
10259static int
10260perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
10261 struct list_head *filters)
10262{
10263 struct perf_addr_filter *filter = NULL;
10264 char *start, *orig, *filename = NULL;
10265 substring_t args[MAX_OPT_ARGS];
10266 int state = IF_STATE_ACTION, token;
10267 unsigned int kernel = 0;
10268 int ret = -EINVAL;
10269
10270 orig = fstr = kstrdup(fstr, GFP_KERNEL);
10271 if (!fstr)
10272 return -ENOMEM;
10273
10274 while ((start = strsep(&fstr, " ,\n")) != NULL) {
10275 static const enum perf_addr_filter_action_t actions[] = {
10276 [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
10277 [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
10278 [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
10279 };
10280 ret = -EINVAL;
10281
10282 if (!*start)
10283 continue;
10284
10285
10286 if (state == IF_STATE_ACTION) {
10287 filter = perf_addr_filter_new(event, filters);
10288 if (!filter)
10289 goto fail;
10290 }
10291
10292 token = match_token(start, if_tokens, args);
10293 switch (token) {
10294 case IF_ACT_FILTER:
10295 case IF_ACT_START:
10296 case IF_ACT_STOP:
10297 if (state != IF_STATE_ACTION)
10298 goto fail;
10299
10300 filter->action = actions[token];
10301 state = IF_STATE_SOURCE;
10302 break;
10303
10304 case IF_SRC_KERNELADDR:
10305 case IF_SRC_KERNEL:
10306 kernel = 1;
10307 /* fall through */
10308
10309 case IF_SRC_FILEADDR:
10310 case IF_SRC_FILE:
10311 if (state != IF_STATE_SOURCE)
10312 goto fail;
10313
10314 *args[0].to = 0;
10315 ret = kstrtoul(args[0].from, 0, &filter->offset);
10316 if (ret)
10317 goto fail;
10318
10319 if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
10320 *args[1].to = 0;
10321 ret = kstrtoul(args[1].from, 0, &filter->size);
10322 if (ret)
10323 goto fail;
10324 }
10325
10326 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
10327 int fpos = token == IF_SRC_FILE ? 2 : 1;
10328
10329 kfree(filename);
10330 filename = match_strdup(&args[fpos]);
10331 if (!filename) {
10332 ret = -ENOMEM;
10333 goto fail;
10334 }
10335 }
10336
10337 state = IF_STATE_END;
10338 break;
10339
10340 default:
10341 goto fail;
10342 }
10343
10344
10345
10346
10347
10348
10349 if (state == IF_STATE_END) {
10350 ret = -EINVAL;
10351 if (kernel && event->attr.exclude_kernel)
10352 goto fail;
10353
10354
10355
10356
10357
10358 if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
10359 !filter->size)
10360 goto fail;
10361
10362 if (!kernel) {
10363 if (!filename)
10364 goto fail;
10365
10366
10367 /*
10368  * For now, only file-based filters in per-task events are
10369  * supported; doing this for CPU-wide events would require extra
10370  * context-switch handling, since the same object code is mapped
10371  * at different virtual addresses in different processes.
10372  */
10373
10374 ret = -EOPNOTSUPP;
10375 if (!event->ctx->task)
10376 goto fail;
10377
10378
10379 ret = kern_path(filename, LOOKUP_FOLLOW,
10380 &filter->path);
10381 if (ret)
10382 goto fail;
10383
10384 ret = -EINVAL;
10385 if (!filter->path.dentry ||
10386 !S_ISREG(d_inode(filter->path.dentry)
10387 ->i_mode))
10388 goto fail;
10389
10390 event->addr_filters.nr_file_filters++;
10391 }
10392
10393
10394 state = IF_STATE_ACTION;
10395 filter = NULL;
10396 }
10397 }
10398
10399 if (state != IF_STATE_ACTION)
10400 goto fail;
10401
10402 kfree(filename);
10403 kfree(orig);
10404
10405 return 0;
10406
10407fail:
10408 kfree(filename);
10409 free_filters_list(filters);
10410 kfree(orig);
10411
10412 return ret;
10413}
10414
10415static int
10416perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
10417{
10418 LIST_HEAD(filters);
10419 int ret;
10420
10421
10422
10423
10424
10425 lockdep_assert_held(&event->ctx->mutex);
10426
10427 if (WARN_ON_ONCE(event->parent))
10428 return -EINVAL;
10429
10430 ret = perf_event_parse_addr_filter(event, filter_str, &filters);
10431 if (ret)
10432 goto fail_clear_files;
10433
10434 ret = event->pmu->addr_filters_validate(&filters);
10435 if (ret)
10436 goto fail_free_filters;
10437
10438
10439 perf_addr_filters_splice(event, &filters);
10440
10441
10442 perf_event_for_each_child(event, perf_event_addr_filters_apply);
10443
10444 return ret;
10445
10446fail_free_filters:
10447 free_filters_list(&filters);
10448
10449fail_clear_files:
10450 event->addr_filters.nr_file_filters = 0;
10451
10452 return ret;
10453}
10454
10455static int perf_event_set_filter(struct perf_event *event, void __user *arg)
10456{
10457 int ret = -EINVAL;
10458 char *filter_str;
10459
10460 filter_str = strndup_user(arg, PAGE_SIZE);
10461 if (IS_ERR(filter_str))
10462 return PTR_ERR(filter_str);
10463
10464#ifdef CONFIG_EVENT_TRACING
10465 if (perf_event_is_tracing(event)) {
10466 struct perf_event_context *ctx = event->ctx;
10467
10468
10469 /*
10470  * Beware: the tracepoint filter code takes locks that nest
10471  * against ctx->mutex, but it does not actually need ctx->mutex.
10472  * Temporarily drop it to avoid the deadlock; as per
10473  * perf_event_ctx_lock() we already hold a reference on ctx.
10474  *
10475  * This can result in the event moving to a different context,
10476  * but that does not affect the tracepoint state.
10477  */
10478
10479 mutex_unlock(&ctx->mutex);
10480 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
10481 mutex_lock(&ctx->mutex);
10482 } else
10483#endif
10484 if (has_addr_filter(event))
10485 ret = perf_event_set_addr_filter(event, filter_str);
10486
10487 kfree(filter_str);
10488 return ret;
10489}
10490
10491
10492
10493
10494
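/*
 * hrtimer-based sampling for software events: the timer fires periodically,
 * reads the event and feeds __perf_event_overflow(), emulating a sampling
 * interrupt for events that cannot generate one themselves.
 */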
10495static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
10496{
10497 enum hrtimer_restart ret = HRTIMER_RESTART;
10498 struct perf_sample_data data;
10499 struct pt_regs *regs;
10500 struct perf_event *event;
10501 u64 period;
10502
10503 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
10504
10505 if (event->state != PERF_EVENT_STATE_ACTIVE)
10506 return HRTIMER_NORESTART;
10507
10508 event->pmu->read(event);
10509
10510 perf_sample_data_init(&data, 0, event->hw.last_period);
10511 regs = get_irq_regs();
10512
10513 if (regs && !perf_exclude_event(event, regs)) {
10514 if (!(event->attr.exclude_idle && is_idle_task(current)))
10515 if (__perf_event_overflow(event, 1, &data, regs))
10516 ret = HRTIMER_NORESTART;
10517 }
10518
10519 period = max_t(u64, 10000, event->hw.sample_period);
10520 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
10521
10522 return ret;
10523}
10524
10525static void perf_swevent_start_hrtimer(struct perf_event *event)
10526{
10527 struct hw_perf_event *hwc = &event->hw;
10528 s64 period;
10529
10530 if (!is_sampling_event(event))
10531 return;
10532
10533 period = local64_read(&hwc->period_left);
10534 if (period) {
10535 if (period < 0)
10536 period = 10000;
10537
10538 local64_set(&hwc->period_left, 0);
10539 } else {
10540 period = max_t(u64, 10000, hwc->sample_period);
10541 }
10542 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
10543 HRTIMER_MODE_REL_PINNED_HARD);
10544}
10545
10546static void perf_swevent_cancel_hrtimer(struct perf_event *event)
10547{
10548 struct hw_perf_event *hwc = &event->hw;
10549
10550 if (is_sampling_event(event)) {
10551 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
10552 local64_set(&hwc->period_left, ktime_to_ns(remaining));
10553
10554 hrtimer_cancel(&hwc->hrtimer);
10555 }
10556}
10557
10558static void perf_swevent_init_hrtimer(struct perf_event *event)
10559{
10560 struct hw_perf_event *hwc = &event->hw;
10561
10562 if (!is_sampling_event(event))
10563 return;
10564
10565 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
10566 hwc->hrtimer.function = perf_swevent_hrtimer;
10567
10568
10569
10570
10571
10572 if (event->attr.freq) {
10573 long freq = event->attr.sample_freq;
10574
10575 event->attr.sample_period = NSEC_PER_SEC / freq;
10576 hwc->sample_period = event->attr.sample_period;
10577 local64_set(&hwc->period_left, hwc->sample_period);
10578 hwc->last_period = hwc->sample_period;
10579 event->attr.freq = 0;
10580 }
10581}
10582
10583
10584
10585
10586
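/*
 * Software event: cpu wall-time clock
 */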
10587static void cpu_clock_event_update(struct perf_event *event)
10588{
10589 s64 prev;
10590 u64 now;
10591
10592 now = local_clock();
10593 prev = local64_xchg(&event->hw.prev_count, now);
10594 local64_add(now - prev, &event->count);
10595}
10596
10597static void cpu_clock_event_start(struct perf_event *event, int flags)
10598{
10599 local64_set(&event->hw.prev_count, local_clock());
10600 perf_swevent_start_hrtimer(event);
10601}
10602
10603static void cpu_clock_event_stop(struct perf_event *event, int flags)
10604{
10605 perf_swevent_cancel_hrtimer(event);
10606 cpu_clock_event_update(event);
10607}
10608
10609static int cpu_clock_event_add(struct perf_event *event, int flags)
10610{
10611 if (flags & PERF_EF_START)
10612 cpu_clock_event_start(event, flags);
10613 perf_event_update_userpage(event);
10614
10615 return 0;
10616}
10617
10618static void cpu_clock_event_del(struct perf_event *event, int flags)
10619{
10620 cpu_clock_event_stop(event, flags);
10621}
10622
10623static void cpu_clock_event_read(struct perf_event *event)
10624{
10625 cpu_clock_event_update(event);
10626}
10627
10628static int cpu_clock_event_init(struct perf_event *event)
10629{
10630 if (event->attr.type != PERF_TYPE_SOFTWARE)
10631 return -ENOENT;
10632
10633 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
10634 return -ENOENT;
10635
10636
10637
10638
10639 if (has_branch_stack(event))
10640 return -EOPNOTSUPP;
10641
10642 perf_swevent_init_hrtimer(event);
10643
10644 return 0;
10645}
10646
10647static struct pmu perf_cpu_clock = {
10648 .task_ctx_nr = perf_sw_context,
10649
10650 .capabilities = PERF_PMU_CAP_NO_NMI,
10651
10652 .event_init = cpu_clock_event_init,
10653 .add = cpu_clock_event_add,
10654 .del = cpu_clock_event_del,
10655 .start = cpu_clock_event_start,
10656 .stop = cpu_clock_event_stop,
10657 .read = cpu_clock_event_read,
10658};
10659
10660
10661
10662
10663
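/*
 * Software event: task time clock
 */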
10664static void task_clock_event_update(struct perf_event *event, u64 now)
10665{
10666 u64 prev;
10667 s64 delta;
10668
10669 prev = local64_xchg(&event->hw.prev_count, now);
10670 delta = now - prev;
10671 local64_add(delta, &event->count);
10672}
10673
10674static void task_clock_event_start(struct perf_event *event, int flags)
10675{
10676 local64_set(&event->hw.prev_count, event->ctx->time);
10677 perf_swevent_start_hrtimer(event);
10678}
10679
10680static void task_clock_event_stop(struct perf_event *event, int flags)
10681{
10682 perf_swevent_cancel_hrtimer(event);
10683 task_clock_event_update(event, event->ctx->time);
10684}
10685
10686static int task_clock_event_add(struct perf_event *event, int flags)
10687{
10688 if (flags & PERF_EF_START)
10689 task_clock_event_start(event, flags);
10690 perf_event_update_userpage(event);
10691
10692 return 0;
10693}
10694
10695static void task_clock_event_del(struct perf_event *event, int flags)
10696{
10697 task_clock_event_stop(event, PERF_EF_UPDATE);
10698}
10699
10700static void task_clock_event_read(struct perf_event *event)
10701{
10702 u64 now = perf_clock();
10703 u64 delta = now - event->ctx->timestamp;
10704 u64 time = event->ctx->time + delta;
10705
10706 task_clock_event_update(event, time);
10707}
10708
10709static int task_clock_event_init(struct perf_event *event)
10710{
10711 if (event->attr.type != PERF_TYPE_SOFTWARE)
10712 return -ENOENT;
10713
10714 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
10715 return -ENOENT;
10716
10717
10718
10719
10720 if (has_branch_stack(event))
10721 return -EOPNOTSUPP;
10722
10723 perf_swevent_init_hrtimer(event);
10724
10725 return 0;
10726}
10727
10728static struct pmu perf_task_clock = {
10729 .task_ctx_nr = perf_sw_context,
10730
10731 .capabilities = PERF_PMU_CAP_NO_NMI,
10732
10733 .event_init = task_clock_event_init,
10734 .add = task_clock_event_add,
10735 .del = task_clock_event_del,
10736 .start = task_clock_event_start,
10737 .stop = task_clock_event_stop,
10738 .read = task_clock_event_read,
10739};
10740
10741static void perf_pmu_nop_void(struct pmu *pmu)
10742{
10743}
10744
10745static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
10746{
10747}
10748
10749static int perf_pmu_nop_int(struct pmu *pmu)
10750{
10751 return 0;
10752}
10753
10754static int perf_event_nop_int(struct perf_event *event, u64 value)
10755{
10756 return 0;
10757}
10758
10759static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
10760
10761static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
10762{
10763 __this_cpu_write(nop_txn_flags, flags);
10764
10765 if (flags & ~PERF_PMU_TXN_ADD)
10766 return;
10767
10768 perf_pmu_disable(pmu);
10769}
10770
10771static int perf_pmu_commit_txn(struct pmu *pmu)
10772{
10773 unsigned int flags = __this_cpu_read(nop_txn_flags);
10774
10775 __this_cpu_write(nop_txn_flags, 0);
10776
10777 if (flags & ~PERF_PMU_TXN_ADD)
10778 return 0;
10779
10780 perf_pmu_enable(pmu);
10781 return 0;
10782}
10783
10784static void perf_pmu_cancel_txn(struct pmu *pmu)
10785{
10786 unsigned int flags = __this_cpu_read(nop_txn_flags);
10787
10788 __this_cpu_write(nop_txn_flags, 0);
10789
10790 if (flags & ~PERF_PMU_TXN_ADD)
10791 return;
10792
10793 perf_pmu_enable(pmu);
10794}
10795
10796static int perf_event_idx_default(struct perf_event *event)
10797{
10798 return 0;
10799}
10800
10801
10802
10803
10804
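/*
 * PMUs that share the same task context type (task_ctx_nr) also share their
 * per-CPU context; look up an already-allocated one, if any.
 */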
10805static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
10806{
10807 struct pmu *pmu;
10808
10809 if (ctxn < 0)
10810 return NULL;
10811
10812 list_for_each_entry(pmu, &pmus, entry) {
10813 if (pmu->task_ctx_nr == ctxn)
10814 return pmu->pmu_cpu_context;
10815 }
10816
10817 return NULL;
10818}
10819
10820static void free_pmu_context(struct pmu *pmu)
10821{
10822
10823
10824
10825
10826
10827 if (pmu->task_ctx_nr > perf_invalid_context)
10828 return;
10829
10830 free_percpu(pmu->pmu_cpu_context);
10831}
10832
10833
10834
10835
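/*
 * Let userspace know how many address filters this PMU supports:
 */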
10836static ssize_t nr_addr_filters_show(struct device *dev,
10837 struct device_attribute *attr,
10838 char *page)
10839{
10840 struct pmu *pmu = dev_get_drvdata(dev);
10841
10842 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
10843}
10844DEVICE_ATTR_RO(nr_addr_filters);
10845
10846static struct idr pmu_idr;
10847
10848static ssize_t
10849type_show(struct device *dev, struct device_attribute *attr, char *page)
10850{
10851 struct pmu *pmu = dev_get_drvdata(dev);
10852
10853 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
10854}
10855static DEVICE_ATTR_RO(type);
10856
10857static ssize_t
10858perf_event_mux_interval_ms_show(struct device *dev,
10859 struct device_attribute *attr,
10860 char *page)
10861{
10862 struct pmu *pmu = dev_get_drvdata(dev);
10863
10864 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
10865}
10866
10867static DEFINE_MUTEX(mux_interval_mutex);
10868
10869static ssize_t
10870perf_event_mux_interval_ms_store(struct device *dev,
10871 struct device_attribute *attr,
10872 const char *buf, size_t count)
10873{
10874 struct pmu *pmu = dev_get_drvdata(dev);
10875 int timer, cpu, ret;
10876
10877 ret = kstrtoint(buf, 0, &timer);
10878 if (ret)
10879 return ret;
10880
10881 if (timer < 1)
10882 return -EINVAL;
10883
10884
10885 if (timer == pmu->hrtimer_interval_ms)
10886 return count;
10887
10888 mutex_lock(&mux_interval_mutex);
10889 pmu->hrtimer_interval_ms = timer;
10890
10891
10892 cpus_read_lock();
10893 for_each_online_cpu(cpu) {
10894 struct perf_cpu_context *cpuctx;
10895 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
10896 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
10897
10898 cpu_function_call(cpu,
10899 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
10900 }
10901 cpus_read_unlock();
10902 mutex_unlock(&mux_interval_mutex);
10903
10904 return count;
10905}
10906static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
10907
10908static struct attribute *pmu_dev_attrs[] = {
10909 &dev_attr_type.attr,
10910 &dev_attr_perf_event_mux_interval_ms.attr,
10911 NULL,
10912};
10913ATTRIBUTE_GROUPS(pmu_dev);
10914
10915static int pmu_bus_running;
10916static struct bus_type pmu_bus = {
10917 .name = "event_source",
10918 .dev_groups = pmu_dev_groups,
10919};
10920
10921static void pmu_dev_release(struct device *dev)
10922{
10923 kfree(dev);
10924}
10925
10926static int pmu_dev_alloc(struct pmu *pmu)
10927{
10928 int ret = -ENOMEM;
10929
10930 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
10931 if (!pmu->dev)
10932 goto out;
10933
10934 pmu->dev->groups = pmu->attr_groups;
10935 device_initialize(pmu->dev);
10936 ret = dev_set_name(pmu->dev, "%s", pmu->name);
10937 if (ret)
10938 goto free_dev;
10939
10940 dev_set_drvdata(pmu->dev, pmu);
10941 pmu->dev->bus = &pmu_bus;
10942 pmu->dev->release = pmu_dev_release;
10943 ret = device_add(pmu->dev);
10944 if (ret)
10945 goto free_dev;
10946
10947
10948 if (pmu->nr_addr_filters)
10949 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
10950
10951 if (ret)
10952 goto del_dev;
10953
10954 if (pmu->attr_update)
10955 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
10956
10957 if (ret)
10958 goto del_dev;
10959
10960out:
10961 return ret;
10962
10963del_dev:
10964 device_del(pmu->dev);
10965
10966free_dev:
10967 put_device(pmu->dev);
10968 goto out;
10969}
10970
10971static struct lock_class_key cpuctx_mutex;
10972static struct lock_class_key cpuctx_lock;
10973
10974int perf_pmu_register(struct pmu *pmu, const char *name, int type)
10975{
10976 int cpu, ret, max = PERF_TYPE_MAX;
10977
10978 mutex_lock(&pmus_lock);
10979 ret = -ENOMEM;
10980 pmu->pmu_disable_count = alloc_percpu(int);
10981 if (!pmu->pmu_disable_count)
10982 goto unlock;
10983
10984 pmu->type = -1;
10985 if (!name)
10986 goto skip_type;
10987 pmu->name = name;
10988
10989 if (type != PERF_TYPE_SOFTWARE) {
10990 if (type >= 0)
10991 max = type;
10992
10993 ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
10994 if (ret < 0)
10995 goto free_pdc;
10996
10997 WARN_ON(type >= 0 && ret != type);
10998
10999 type = ret;
11000 }
11001 pmu->type = type;
11002
11003 if (pmu_bus_running) {
11004 ret = pmu_dev_alloc(pmu);
11005 if (ret)
11006 goto free_idr;
11007 }
11008
11009skip_type:
11010 if (pmu->task_ctx_nr == perf_hw_context) {
11011 static int hw_context_taken = 0;
11012
11013
11014
11015
11016
11017
11018 if (WARN_ON_ONCE(hw_context_taken &&
11019 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
11020 pmu->task_ctx_nr = perf_invalid_context;
11021
11022 hw_context_taken = 1;
11023 }
11024
11025 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
11026 if (pmu->pmu_cpu_context)
11027 goto got_cpu_context;
11028
11029 ret = -ENOMEM;
11030 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
11031 if (!pmu->pmu_cpu_context)
11032 goto free_dev;
11033
11034 for_each_possible_cpu(cpu) {
11035 struct perf_cpu_context *cpuctx;
11036
11037 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11038 __perf_event_init_context(&cpuctx->ctx);
11039 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
11040 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
11041 cpuctx->ctx.pmu = pmu;
11042 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
11043
11044 __perf_mux_hrtimer_init(cpuctx, cpu);
11045
11046 cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
11047 cpuctx->heap = cpuctx->heap_default;
11048 }
11049
11050got_cpu_context:
11051 if (!pmu->start_txn) {
11052 if (pmu->pmu_enable) {
11053
11054 /*
11055  * If we have pmu_enable/pmu_disable calls, install transaction
11056  * stubs that use them to batch hardware accesses.
11057  */
11058 pmu->start_txn = perf_pmu_start_txn;
11059 pmu->commit_txn = perf_pmu_commit_txn;
11060 pmu->cancel_txn = perf_pmu_cancel_txn;
11061 } else {
11062 pmu->start_txn = perf_pmu_nop_txn;
11063 pmu->commit_txn = perf_pmu_nop_int;
11064 pmu->cancel_txn = perf_pmu_nop_void;
11065 }
11066 }
11067
11068 if (!pmu->pmu_enable) {
11069 pmu->pmu_enable = perf_pmu_nop_void;
11070 pmu->pmu_disable = perf_pmu_nop_void;
11071 }
11072
11073 if (!pmu->check_period)
11074 pmu->check_period = perf_event_nop_int;
11075
11076 if (!pmu->event_idx)
11077 pmu->event_idx = perf_event_idx_default;
11078
11079
11080
11081
11082
11083
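/*
 * Software PMUs are not entered into the IDR (see above), so keep them at
 * the head of the list to make the linear fallback search in
 * perf_init_event() find them quickly.
 */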
11084 if (type == PERF_TYPE_SOFTWARE || !name)
11085 list_add_rcu(&pmu->entry, &pmus);
11086 else
11087 list_add_tail_rcu(&pmu->entry, &pmus);
11088
11089 atomic_set(&pmu->exclusive_cnt, 0);
11090 ret = 0;
11091unlock:
11092 mutex_unlock(&pmus_lock);
11093
11094 return ret;
11095
11096free_dev:
11097 device_del(pmu->dev);
11098 put_device(pmu->dev);
11099
11100free_idr:
11101 if (pmu->type != PERF_TYPE_SOFTWARE)
11102 idr_remove(&pmu_idr, pmu->type);
11103
11104free_pdc:
11105 free_percpu(pmu->pmu_disable_count);
11106 goto unlock;
11107}
11108EXPORT_SYMBOL_GPL(perf_pmu_register);
11109
11110void perf_pmu_unregister(struct pmu *pmu)
11111{
11112 mutex_lock(&pmus_lock);
11113 list_del_rcu(&pmu->entry);
11114
11115
11116
11117
11118
11119 synchronize_srcu(&pmus_srcu);
11120 synchronize_rcu();
11121
11122 free_percpu(pmu->pmu_disable_count);
11123 if (pmu->type != PERF_TYPE_SOFTWARE)
11124 idr_remove(&pmu_idr, pmu->type);
11125 if (pmu_bus_running) {
11126 if (pmu->nr_addr_filters)
11127 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
11128 device_del(pmu->dev);
11129 put_device(pmu->dev);
11130 }
11131 free_pmu_context(pmu);
11132 mutex_unlock(&pmus_lock);
11133}
11134EXPORT_SYMBOL_GPL(perf_pmu_unregister);
11135
11136static inline bool has_extended_regs(struct perf_event *event)
11137{
11138 return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
11139 (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
11140}
11141
11142static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
11143{
11144 struct perf_event_context *ctx = NULL;
11145 int ret;
11146
11147 if (!try_module_get(pmu->module))
11148 return -ENODEV;
11149
11150
11151
11152
11153
11154
11155
11156 if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
11157
11158
11159
11160
11161 ctx = perf_event_ctx_lock_nested(event->group_leader,
11162 SINGLE_DEPTH_NESTING);
11163 BUG_ON(!ctx);
11164 }
11165
11166 event->pmu = pmu;
11167 ret = pmu->event_init(event);
11168
11169 if (ctx)
11170 perf_event_ctx_unlock(event->group_leader, ctx);
11171
11172 if (!ret) {
11173 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
11174 has_extended_regs(event))
11175 ret = -EOPNOTSUPP;
11176
11177 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
11178 event_has_any_exclude_flag(event))
11179 ret = -EINVAL;
11180
11181 if (ret && event->destroy)
11182 event->destroy(event);
11183 }
11184
11185 if (ret)
11186 module_put(pmu->module);
11187
11188 return ret;
11189}
11190
11191static struct pmu *perf_init_event(struct perf_event *event)
11192{
11193 bool extended_type = false;
11194 int idx, type, ret;
11195 struct pmu *pmu;
11196
11197 idx = srcu_read_lock(&pmus_srcu);
11198
11199
11200 if (event->parent && event->parent->pmu) {
11201 pmu = event->parent->pmu;
11202 ret = perf_try_init_event(pmu, event);
11203 if (!ret)
11204 goto unlock;
11205 }
11206
11207
11208
11209
11210
11211 type = event->attr.type;
11212 if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
11213 type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
11214 if (!type) {
11215 type = PERF_TYPE_RAW;
11216 } else {
11217 extended_type = true;
11218 event->attr.config &= PERF_HW_EVENT_MASK;
11219 }
11220 }
11221
11222again:
11223 rcu_read_lock();
11224 pmu = idr_find(&pmu_idr, type);
11225 rcu_read_unlock();
11226 if (pmu) {
11227 if (event->attr.type != type && type != PERF_TYPE_RAW &&
11228 !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
11229 goto fail;
11230
11231 ret = perf_try_init_event(pmu, event);
11232 if (ret == -ENOENT && event->attr.type != type && !extended_type) {
11233 type = event->attr.type;
11234 goto again;
11235 }
11236
11237 if (ret)
11238 pmu = ERR_PTR(ret);
11239
11240 goto unlock;
11241 }
11242
11243 list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
11244 ret = perf_try_init_event(pmu, event);
11245 if (!ret)
11246 goto unlock;
11247
11248 if (ret != -ENOENT) {
11249 pmu = ERR_PTR(ret);
11250 goto unlock;
11251 }
11252 }
11253fail:
11254 pmu = ERR_PTR(-ENOENT);
11255unlock:
11256 srcu_read_unlock(&pmus_srcu, idx);
11257
11258 return pmu;
11259}
11260
11261static void attach_sb_event(struct perf_event *event)
11262{
11263 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
11264
11265 raw_spin_lock(&pel->lock);
11266 list_add_rcu(&event->sb_list, &pel->list);
11267 raw_spin_unlock(&pel->lock);
11268}
11269
11270
11271
11272
11273
11274
11275
11276
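/*
 * Per-CPU list of events that want side-band records (mmap, comm, task,
 * etc.) delivered to them; see is_sb_event().
 */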
11277static void account_pmu_sb_event(struct perf_event *event)
11278{
11279 if (is_sb_event(event))
11280 attach_sb_event(event);
11281}
11282
11283static void account_event_cpu(struct perf_event *event, int cpu)
11284{
11285 if (event->parent)
11286 return;
11287
11288 if (is_cgroup_event(event))
11289 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
11290}
11291
11292
11293static void account_freq_event_nohz(void)
11294{
11295#ifdef CONFIG_NO_HZ_FULL
11296
11297 spin_lock(&nr_freq_lock);
11298 if (atomic_inc_return(&nr_freq_events) == 1)
11299 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
11300 spin_unlock(&nr_freq_lock);
11301#endif
11302}
11303
11304static void account_freq_event(void)
11305{
11306 if (tick_nohz_full_enabled())
11307 account_freq_event_nohz();
11308 else
11309 atomic_inc(&nr_freq_events);
11310}
11311
11312
11313static void account_event(struct perf_event *event)
11314{
11315 bool inc = false;
11316
11317 if (event->parent)
11318 return;
11319
11320 if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
11321 inc = true;
11322 if (event->attr.mmap || event->attr.mmap_data)
11323 atomic_inc(&nr_mmap_events);
11324 if (event->attr.build_id)
11325 atomic_inc(&nr_build_id_events);
11326 if (event->attr.comm)
11327 atomic_inc(&nr_comm_events);
11328 if (event->attr.namespaces)
11329 atomic_inc(&nr_namespaces_events);
11330 if (event->attr.cgroup)
11331 atomic_inc(&nr_cgroup_events);
11332 if (event->attr.task)
11333 atomic_inc(&nr_task_events);
11334 if (event->attr.freq)
11335 account_freq_event();
11336 if (event->attr.context_switch) {
11337 atomic_inc(&nr_switch_events);
11338 inc = true;
11339 }
11340 if (has_branch_stack(event))
11341 inc = true;
11342 if (is_cgroup_event(event))
11343 inc = true;
11344 if (event->attr.ksymbol)
11345 atomic_inc(&nr_ksymbol_events);
11346 if (event->attr.bpf_event)
11347 atomic_inc(&nr_bpf_events);
11348 if (event->attr.text_poke)
11349 atomic_inc(&nr_text_poke_events);
11350
11351 if (inc) {
11352
11353
11354
11355
11356
11357 if (atomic_inc_not_zero(&perf_sched_count))
11358 goto enabled;
11359
11360 mutex_lock(&perf_sched_mutex);
11361 if (!atomic_read(&perf_sched_count)) {
11362 static_branch_enable(&perf_sched_events);
11363
11364
11365
11366
11367
11368 synchronize_rcu();
11369 }
11370
11371
11372
11373
11374 atomic_inc(&perf_sched_count);
11375 mutex_unlock(&perf_sched_mutex);
11376 }
11377enabled:
11378
11379 account_event_cpu(event, event->cpu);
11380
11381 account_pmu_sb_event(event);
11382}
11383
11384
11385
11386
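/*
 * Allocate and initialize an event structure.
 */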
11387static struct perf_event *
11388perf_event_alloc(struct perf_event_attr *attr, int cpu,
11389 struct task_struct *task,
11390 struct perf_event *group_leader,
11391 struct perf_event *parent_event,
11392 perf_overflow_handler_t overflow_handler,
11393 void *context, int cgroup_fd)
11394{
11395 struct pmu *pmu;
11396 struct perf_event *event;
11397 struct hw_perf_event *hwc;
11398 long err = -EINVAL;
11399 int node;
11400
11401 if ((unsigned)cpu >= nr_cpu_ids) {
11402 if (!task || cpu != -1)
11403 return ERR_PTR(-EINVAL);
11404 }
11405
11406 node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
11407 event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
11408 node);
11409 if (!event)
11410 return ERR_PTR(-ENOMEM);
11411
11412
11413
11414
11415
11416 if (!group_leader)
11417 group_leader = event;
11418
11419 mutex_init(&event->child_mutex);
11420 INIT_LIST_HEAD(&event->child_list);
11421
11422 INIT_LIST_HEAD(&event->event_entry);
11423 INIT_LIST_HEAD(&event->sibling_list);
11424 INIT_LIST_HEAD(&event->active_list);
11425 init_event_group(event);
11426 INIT_LIST_HEAD(&event->rb_entry);
11427 INIT_LIST_HEAD(&event->active_entry);
11428 INIT_LIST_HEAD(&event->addr_filters.list);
11429 INIT_HLIST_NODE(&event->hlist_entry);
11430
11431
11432 init_waitqueue_head(&event->waitq);
11433 event->pending_disable = -1;
11434 init_irq_work(&event->pending, perf_pending_event);
11435
11436 mutex_init(&event->mmap_mutex);
11437 raw_spin_lock_init(&event->addr_filters.lock);
11438
11439 atomic_long_set(&event->refcount, 1);
11440 event->cpu = cpu;
11441 event->attr = *attr;
11442 event->group_leader = group_leader;
11443 event->pmu = NULL;
11444 event->oncpu = -1;
11445
11446 event->parent = parent_event;
11447
11448 event->ns = get_pid_ns(task_active_pid_ns(current));
11449 event->id = atomic64_inc_return(&perf_event_id);
11450
11451 event->state = PERF_EVENT_STATE_INACTIVE;
11452
11453 if (task) {
11454 event->attach_state = PERF_ATTACH_TASK;
11455
11456
11457
11458
11459
11460 event->hw.target = get_task_struct(task);
11461 }
11462
11463 event->clock = &local_clock;
11464 if (parent_event)
11465 event->clock = parent_event->clock;
11466
11467 if (!overflow_handler && parent_event) {
11468 overflow_handler = parent_event->overflow_handler;
11469 context = parent_event->overflow_handler_context;
11470#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
11471 if (overflow_handler == bpf_overflow_handler) {
11472 struct bpf_prog *prog = parent_event->prog;
11473
11474 bpf_prog_inc(prog);
11475 event->prog = prog;
11476 event->orig_overflow_handler =
11477 parent_event->orig_overflow_handler;
11478 }
11479#endif
11480 }
11481
11482 if (overflow_handler) {
11483 event->overflow_handler = overflow_handler;
11484 event->overflow_handler_context = context;
11485 } else if (is_write_backward(event)){
11486 event->overflow_handler = perf_event_output_backward;
11487 event->overflow_handler_context = NULL;
11488 } else {
11489 event->overflow_handler = perf_event_output_forward;
11490 event->overflow_handler_context = NULL;
11491 }
11492
11493 perf_event__state_init(event);
11494
11495 pmu = NULL;
11496
11497 hwc = &event->hw;
11498 hwc->sample_period = attr->sample_period;
11499 if (attr->freq && attr->sample_freq)
11500 hwc->sample_period = 1;
11501 hwc->last_period = hwc->sample_period;
11502
11503 local64_set(&hwc->period_left, hwc->sample_period);
11504
11505
11506
11507
11508
11509 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
11510 goto err_ns;
11511
11512 if (!has_branch_stack(event))
11513 event->attr.branch_sample_type = 0;
11514
11515 pmu = perf_init_event(event);
11516 if (IS_ERR(pmu)) {
11517 err = PTR_ERR(pmu);
11518 goto err_ns;
11519 }
11520
11521
11522
11523
11524
11525 if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
11526 err = -EINVAL;
11527 goto err_pmu;
11528 }
11529
11530 if (event->attr.aux_output &&
11531 !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
11532 err = -EOPNOTSUPP;
11533 goto err_pmu;
11534 }
11535
11536 if (cgroup_fd != -1) {
11537 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
11538 if (err)
11539 goto err_pmu;
11540 }
11541
11542 err = exclusive_event_init(event);
11543 if (err)
11544 goto err_pmu;
11545
11546 if (has_addr_filter(event)) {
11547 event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
11548 sizeof(struct perf_addr_filter_range),
11549 GFP_KERNEL);
11550 if (!event->addr_filter_ranges) {
11551 err = -ENOMEM;
11552 goto err_per_task;
11553 }
11554
11555
11556
11557
11558
11559 if (event->parent) {
11560 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
11561
11562 raw_spin_lock_irq(&ifh->lock);
11563 memcpy(event->addr_filter_ranges,
11564 event->parent->addr_filter_ranges,
11565 pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
11566 raw_spin_unlock_irq(&ifh->lock);
11567 }
11568
11569
11570 event->addr_filters_gen = 1;
11571 }
11572
11573 if (!event->parent) {
11574 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
11575 err = get_callchain_buffers(attr->sample_max_stack);
11576 if (err)
11577 goto err_addr_filters;
11578 }
11579 }
11580
11581 err = security_perf_event_alloc(event);
11582 if (err)
11583 goto err_callchain_buffer;
11584
11585
11586 account_event(event);
11587
11588 return event;
11589
11590err_callchain_buffer:
11591 if (!event->parent) {
11592 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
11593 put_callchain_buffers();
11594 }
11595err_addr_filters:
11596 kfree(event->addr_filter_ranges);
11597
11598err_per_task:
11599 exclusive_event_destroy(event);
11600
11601err_pmu:
11602 if (is_cgroup_event(event))
11603 perf_detach_cgroup(event);
11604 if (event->destroy)
11605 event->destroy(event);
11606 module_put(pmu->module);
11607err_ns:
11608 if (event->ns)
11609 put_pid_ns(event->ns);
11610 if (event->hw.target)
11611 put_task_struct(event->hw.target);
11612 kmem_cache_free(perf_event_cache, event);
11613
11614 return ERR_PTR(err);
11615}
11616
11617static int perf_copy_attr(struct perf_event_attr __user *uattr,
11618 struct perf_event_attr *attr)
11619{
11620 u32 size;
11621 int ret;
11622
11623 if (!access_ok(uattr, PERF_ATTR_SIZE_VER0))
11624 return -EFAULT;
11625
11626
11627
11628
11629 memset(attr, 0, sizeof(*attr));
11630
11631 ret = get_user(size, &uattr->size);
11632 if (ret)
11633 return ret;
11634
11635 if (size > PAGE_SIZE)
11636 goto err_size;
11637
11638 if (!size)
11639 size = PERF_ATTR_SIZE_VER0;
11640
11641 if (size < PERF_ATTR_SIZE_VER0)
11642 goto err_size;
11643
11644
11645
11646
11647
11648
11649
11650 if (size > sizeof(*attr)) {
11651 unsigned char __user *addr;
11652 unsigned char __user *end;
11653 unsigned char val;
11654
11655 addr = (void __user *)uattr + sizeof(*attr);
11656 end = (void __user *)uattr + size;
11657
11658 for (; addr < end; addr++) {
11659 ret = get_user(val, addr);
11660 if (ret)
11661 return ret;
11662 if (val)
11663 goto err_size;
11664 }
11665 size = sizeof(*attr);
11666 }
11667
11668 ret = copy_from_user(attr, uattr, size);
11669 if (ret)
11670 return -EFAULT;
11671
11672 attr->size = size;
11673
11674 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
11675 return -EINVAL;
11676
11677 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
11678 return -EINVAL;
11679
11680 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
11681 return -EINVAL;
11682
11683 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
11684 u64 mask = attr->branch_sample_type;
11685
11686
11687 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
11688 return -EINVAL;
11689
11690
11691 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
11692 return -EINVAL;
11693
11694
11695 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
11696
11697
11698 if (!attr->exclude_kernel)
11699 mask |= PERF_SAMPLE_BRANCH_KERNEL;
11700
11701 if (!attr->exclude_user)
11702 mask |= PERF_SAMPLE_BRANCH_USER;
11703
11704 if (!attr->exclude_hv)
11705 mask |= PERF_SAMPLE_BRANCH_HV;
11706
11707
11708
11709 attr->branch_sample_type = mask;
11710 }
11711
11712 if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
11713 ret = perf_allow_kernel(attr);
11714 if (ret)
11715 return ret;
11716 }
11717 }
11718
11719 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
11720 ret = perf_reg_validate(attr->sample_regs_user);
11721 if (ret)
11722 return ret;
11723 }
11724
11725 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
11726 if (!arch_perf_have_user_stack_dump())
11727 return -ENOSYS;
11728
11729
11730
11731
11732
11733
11734 if (attr->sample_stack_user >= USHRT_MAX)
11735 return -EINVAL;
11736 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
11737 return -EINVAL;
11738 }
11739
11740 if (!attr->sample_max_stack)
11741 attr->sample_max_stack = sysctl_perf_event_max_stack;
11742
11743 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
11744 ret = perf_reg_validate(attr->sample_regs_intr);
11745
11746#ifndef CONFIG_CGROUP_PERF
11747 if (attr->sample_type & PERF_SAMPLE_CGROUP)
11748 return -EINVAL;
11749#endif
11750 if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
11751 (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
11752 return -EINVAL;
11753
11754 if (!attr->inherit && attr->inherit_thread)
11755 return -EINVAL;
11756
11757 if (attr->remove_on_exec && attr->enable_on_exec)
11758 return -EINVAL;
11759
11760out:
11761 return ret;
11762
11763err_size:
11764 put_user(sizeof(*attr), &uattr->size);
11765 ret = -E2BIG;
11766 goto out;
11767}
11768
11769static int
11770perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
11771{
11772 struct perf_buffer *rb = NULL;
11773 int ret = -EINVAL;
11774
11775 if (!output_event)
11776 goto set;
11777
11778
11779 if (event == output_event)
11780 goto out;
11781
11782
11783
11784
11785 if (output_event->cpu != event->cpu)
11786 goto out;
11787
11788
11789
11790
11791 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
11792 goto out;
11793
11794
11795
11796
11797 if (output_event->clock != event->clock)
11798 goto out;
11799
11800
11801
11802
11803
11804 if (is_write_backward(output_event) != is_write_backward(event))
11805 goto out;
11806
11807
11808
11809
11810 if (has_aux(event) && has_aux(output_event) &&
11811 event->pmu != output_event->pmu)
11812 goto out;
11813
11814set:
11815 mutex_lock(&event->mmap_mutex);
11816
11817 if (atomic_read(&event->mmap_count))
11818 goto unlock;
11819
11820 if (output_event) {
11821
11822 rb = ring_buffer_get(output_event);
11823 if (!rb)
11824 goto unlock;
11825 }
11826
11827 ring_buffer_attach(event, rb);
11828
11829 ret = 0;
11830unlock:
11831 mutex_unlock(&event->mmap_mutex);
11832
11833out:
11834 return ret;
11835}
11836
11837static void mutex_lock_double(struct mutex *a, struct mutex *b)
11838{
11839 if (b < a)
11840 swap(a, b);
11841
11842 mutex_lock(a);
11843 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
11844}
11845
11846static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
11847{
11848 bool nmi_safe = false;
11849
11850 switch (clk_id) {
11851 case CLOCK_MONOTONIC:
11852 event->clock = &ktime_get_mono_fast_ns;
11853 nmi_safe = true;
11854 break;
11855
11856 case CLOCK_MONOTONIC_RAW:
11857 event->clock = &ktime_get_raw_fast_ns;
11858 nmi_safe = true;
11859 break;
11860
11861 case CLOCK_REALTIME:
11862 event->clock = &ktime_get_real_ns;
11863 break;
11864
11865 case CLOCK_BOOTTIME:
11866 event->clock = &ktime_get_boottime_ns;
11867 break;
11868
11869 case CLOCK_TAI:
11870 event->clock = &ktime_get_clocktai_ns;
11871 break;
11872
11873 default:
11874 return -EINVAL;
11875 }
11876
11877 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
11878 return -EINVAL;
11879
11880 return 0;
11881}
11882
11883
11884
11885
11886
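/*
 * Variation on perf_event_ctx_lock_nested(), except we take two context
 * mutexes: the group leader's and @ctx's.
 */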
11887static struct perf_event_context *
11888__perf_event_ctx_lock_double(struct perf_event *group_leader,
11889 struct perf_event_context *ctx)
11890{
11891 struct perf_event_context *gctx;
11892
11893again:
11894 rcu_read_lock();
11895 gctx = READ_ONCE(group_leader->ctx);
11896 if (!refcount_inc_not_zero(&gctx->refcount)) {
11897 rcu_read_unlock();
11898 goto again;
11899 }
11900 rcu_read_unlock();
11901
11902 mutex_lock_double(&gctx->mutex, &ctx->mutex);
11903
11904 if (group_leader->ctx != gctx) {
11905 mutex_unlock(&ctx->mutex);
11906 mutex_unlock(&gctx->mutex);
11907 put_ctx(gctx);
11908 goto again;
11909 }
11910
11911 return gctx;
11912}
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
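/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr: event_id type attributes for monitoring/sampling
 * @pid:       target pid
 * @cpu:       target cpu
 * @group_fd:  group leader event fd
 * @flags:     perf event open flags
 */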
11923SYSCALL_DEFINE5(perf_event_open,
11924 struct perf_event_attr __user *, attr_uptr,
11925 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
11926{
11927 struct perf_event *group_leader = NULL, *output_event = NULL;
11928 struct perf_event *event, *sibling;
11929 struct perf_event_attr attr;
11930 struct perf_event_context *ctx, *uninitialized_var(gctx);
11931 struct file *event_file = NULL;
11932 struct fd group = {NULL, 0};
11933 struct task_struct *task = NULL;
11934 struct pmu *pmu;
11935 int event_fd;
11936 int move_group = 0;
11937 int err;
11938 int f_flags = O_RDWR;
11939 int cgroup_fd = -1;
11940
11941
11942 if (flags & ~PERF_FLAG_ALL)
11943 return -EINVAL;
11944
11945
11946 err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
11947 if (err)
11948 return err;
11949
11950 err = perf_copy_attr(attr_uptr, &attr);
11951 if (err)
11952 return err;
11953
11954 if (!attr.exclude_kernel) {
11955 err = perf_allow_kernel(&attr);
11956 if (err)
11957 return err;
11958 }
11959
11960 if (attr.namespaces) {
11961 if (!perfmon_capable())
11962 return -EACCES;
11963 }
11964
11965 if (attr.freq) {
11966 if (attr.sample_freq > sysctl_perf_event_sample_rate)
11967 return -EINVAL;
11968 } else {
11969 if (attr.sample_period & (1ULL << 63))
11970 return -EINVAL;
11971 }
11972
11973
11974 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
11975 err = perf_allow_kernel(&attr);
11976 if (err)
11977 return err;
11978 }
11979
11980
11981
11982
11983
11984
11985
11986 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
11987 return -EINVAL;
11988
11989 if (flags & PERF_FLAG_FD_CLOEXEC)
11990 f_flags |= O_CLOEXEC;
11991
11992 event_fd = get_unused_fd_flags(f_flags);
11993 if (event_fd < 0)
11994 return event_fd;
11995
11996 if (group_fd != -1) {
11997 err = perf_fget_light(group_fd, &group);
11998 if (err)
11999 goto err_fd;
12000 group_leader = group.file->private_data;
12001 if (flags & PERF_FLAG_FD_OUTPUT)
12002 output_event = group_leader;
12003 if (flags & PERF_FLAG_FD_NO_GROUP)
12004 group_leader = NULL;
12005 }
12006
12007 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
12008 task = find_lively_task_by_vpid(pid);
12009 if (IS_ERR(task)) {
12010 err = PTR_ERR(task);
12011 goto err_group_fd;
12012 }
12013 }
12014
12015 if (task && group_leader &&
12016 group_leader->attr.inherit != attr.inherit) {
12017 err = -EINVAL;
12018 goto err_task;
12019 }
12020
12021 if (flags & PERF_FLAG_PID_CGROUP)
12022 cgroup_fd = pid;
12023
12024 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
12025 NULL, NULL, cgroup_fd);
12026 if (IS_ERR(event)) {
12027 err = PTR_ERR(event);
12028 goto err_task;
12029 }
12030
12031 if (is_sampling_event(event)) {
12032 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
12033 err = -EOPNOTSUPP;
12034 goto err_alloc;
12035 }
12036 }
12037
12038
12039
12040
12041
12042 pmu = event->pmu;
12043
12044 if (attr.use_clockid) {
12045 err = perf_event_set_clock(event, attr.clockid);
12046 if (err)
12047 goto err_alloc;
12048 }
12049
12050 if (pmu->task_ctx_nr == perf_sw_context)
12051 event->event_caps |= PERF_EV_CAP_SOFTWARE;
12052
12053 if (group_leader) {
12054 if (is_software_event(event) &&
12055 !in_software_context(group_leader)) {
12056
12057
12058
12059
12060
12061
12062
12063
12064 pmu = group_leader->ctx->pmu;
12065 } else if (!is_software_event(event) &&
12066 is_software_event(group_leader) &&
12067 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
12068
12069
12070
12071
12072
12073 move_group = 1;
12074 }
12075 }
12076
12077
12078
12079
12080 ctx = find_get_context(pmu, task, event);
12081 if (IS_ERR(ctx)) {
12082 err = PTR_ERR(ctx);
12083 goto err_alloc;
12084 }
12085
12086
12087
12088
12089 if (group_leader) {
12090 err = -EINVAL;
12091
12092
12093
12094
12095
12096 if (group_leader->group_leader != group_leader)
12097 goto err_context;
12098
12099
12100 if (group_leader->clock != event->clock)
12101 goto err_context;
12102
12103
12104
12105
12106
12107
12108 if (group_leader->cpu != event->cpu)
12109 goto err_context;
12110
12111
12112
12113
12114
12115 if (group_leader->ctx->task != ctx->task)
12116 goto err_context;
12117
12118
12119
12120
12121
12122
12123 if (!move_group && group_leader->ctx != ctx)
12124 goto err_context;
12125
12126
12127
12128
12129 if (attr.exclusive || attr.pinned)
12130 goto err_context;
12131 }
12132
12133 if (output_event) {
12134 err = perf_event_set_output(event, output_event);
12135 if (err)
12136 goto err_context;
12137 }
12138
12139 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
12140 f_flags);
12141 if (IS_ERR(event_file)) {
12142 err = PTR_ERR(event_file);
12143 event_file = NULL;
12144 goto err_context;
12145 }
12146
12147 if (task) {
12148 err = down_read_interruptible(&task->signal->exec_update_lock);
12149 if (err)
12150 goto err_file;
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160 err = -EACCES;
12161 if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
12162 goto err_cred;
12163 }
12164
12165 if (move_group) {
12166 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
12167
12168 if (gctx->task == TASK_TOMBSTONE) {
12169 err = -ESRCH;
12170 goto err_locked;
12171 }
12172
12173
12174
12175
12176
12177 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
12178
12179
12180
12181
12182
12183 if (gctx != ctx) {
12184 err = -EINVAL;
12185 goto err_locked;
12186 } else {
12187 perf_event_ctx_unlock(group_leader, gctx);
12188 move_group = 0;
12189 }
12190 }
12191
12192
12193
12194
12195 err = -EBUSY;
12196 if (!exclusive_event_installable(group_leader, ctx))
12197 goto err_locked;
12198
12199 for_each_sibling_event(sibling, group_leader) {
12200 if (!exclusive_event_installable(sibling, ctx))
12201 goto err_locked;
12202 }
12203 } else {
12204 mutex_lock(&ctx->mutex);
12205 }
12206
12207 if (ctx->task == TASK_TOMBSTONE) {
12208 err = -ESRCH;
12209 goto err_locked;
12210 }
12211
12212 if (!perf_event_validate_size(event)) {
12213 err = -E2BIG;
12214 goto err_locked;
12215 }
12216
12217 if (!task) {
12218
12219
12220
12221
12222
12223
12224 struct perf_cpu_context *cpuctx =
12225 container_of(ctx, struct perf_cpu_context, ctx);
12226
12227 if (!cpuctx->online) {
12228 err = -ENODEV;
12229 goto err_locked;
12230 }
12231 }
12232
12233 if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
12234 err = -EINVAL;
12235 goto err_locked;
12236 }
12237
12238
12239
12240
12241
12242 if (!exclusive_event_installable(event, ctx)) {
12243 err = -EBUSY;
12244 goto err_locked;
12245 }
12246
12247 WARN_ON_ONCE(ctx->parent_ctx);
12248
12249
12250
12251
12252
12253
12254 if (move_group) {
12255
12256
12257
12258
12259 perf_remove_from_context(group_leader, 0);
12260 put_ctx(gctx);
12261
12262 for_each_sibling_event(sibling, group_leader) {
12263 perf_remove_from_context(sibling, 0);
12264 put_ctx(gctx);
12265 }
12266
12267
12268
12269
12270
12271 synchronize_rcu();
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283 for_each_sibling_event(sibling, group_leader) {
12284 perf_event__state_init(sibling);
12285 perf_install_in_context(ctx, sibling, sibling->cpu);
12286 get_ctx(ctx);
12287 }
12288
12289
12290
12291
12292
12293
12294 perf_event__state_init(group_leader);
12295 perf_install_in_context(ctx, group_leader, group_leader->cpu);
12296 get_ctx(ctx);
12297 }
12298
12299
12300
12301
12302
12303
12304
12305 perf_event__header_size(event);
12306 perf_event__id_header_size(event);
12307
12308 event->owner = current;
12309
12310 perf_install_in_context(ctx, event, event->cpu);
12311 perf_unpin_context(ctx);
12312
12313 if (move_group)
12314 perf_event_ctx_unlock(group_leader, gctx);
12315 mutex_unlock(&ctx->mutex);
12316
12317 if (task) {
12318 up_read(&task->signal->exec_update_lock);
12319 put_task_struct(task);
12320 }
12321
12322 mutex_lock(&current->perf_event_mutex);
12323 list_add_tail(&event->owner_entry, &current->perf_event_list);
12324 mutex_unlock(&current->perf_event_mutex);
12325
12326
12327
12328
12329
12330
12331
12332 fdput(group);
12333 fd_install(event_fd, event_file);
12334 return event_fd;
12335
12336err_locked:
12337 if (move_group)
12338 perf_event_ctx_unlock(group_leader, gctx);
12339 mutex_unlock(&ctx->mutex);
12340err_cred:
12341 if (task)
12342 up_read(&task->signal->exec_update_lock);
12343err_file:
12344 fput(event_file);
12345err_context:
12346 perf_unpin_context(ctx);
12347 put_ctx(ctx);
12348err_alloc:
12349
12350
12351
12352
12353 if (!event_file)
12354 free_event(event);
12355err_task:
12356 if (task)
12357 put_task_struct(task);
12358err_group_fd:
12359 fdput(group);
12360err_fd:
12361 put_unused_fd(event_fd);
12362 return err;
12363}
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
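/**
 * perf_event_create_kernel_counter - create and attach a counter from the kernel
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu on which the counter is bound
 * @task: task to profile (NULL for per-cpu)
 * @overflow_handler: callback to trigger when we hit the event
 * @context: context data passed to the overflow handler
 */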
12374struct perf_event *
12375perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
12376 struct task_struct *task,
12377 perf_overflow_handler_t overflow_handler,
12378 void *context)
12379{
12380 struct perf_event_context *ctx;
12381 struct perf_event *event;
12382 int err;
12383
12384
12385
12386
12387
12388 if (attr->aux_output)
12389 return ERR_PTR(-EINVAL);
12390
12391 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
12392 overflow_handler, context, -1);
12393 if (IS_ERR(event)) {
12394 err = PTR_ERR(event);
12395 goto err;
12396 }
12397
12398
12399 event->owner = TASK_TOMBSTONE;
12400
12401
12402
12403
12404 ctx = find_get_context(event->pmu, task, event);
12405 if (IS_ERR(ctx)) {
12406 err = PTR_ERR(ctx);
12407 goto err_free;
12408 }
12409
12410 WARN_ON_ONCE(ctx->parent_ctx);
12411 mutex_lock(&ctx->mutex);
12412 if (ctx->task == TASK_TOMBSTONE) {
12413 err = -ESRCH;
12414 goto err_unlock;
12415 }
12416
12417 if (!task) {
12418
12419
12420
12421
12422
12423
12424 struct perf_cpu_context *cpuctx =
12425 container_of(ctx, struct perf_cpu_context, ctx);
12426 if (!cpuctx->online) {
12427 err = -ENODEV;
12428 goto err_unlock;
12429 }
12430 }
12431
12432 if (!exclusive_event_installable(event, ctx)) {
12433 err = -EBUSY;
12434 goto err_unlock;
12435 }
12436
12437 perf_install_in_context(ctx, event, event->cpu);
12438 perf_unpin_context(ctx);
12439 mutex_unlock(&ctx->mutex);
12440
12441 return event;
12442
12443err_unlock:
12444 mutex_unlock(&ctx->mutex);
12445 perf_unpin_context(ctx);
12446 put_ctx(ctx);
12447err_free:
12448 free_event(event);
12449err:
12450 return ERR_PTR(err);
12451}
12452EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
12453
12454void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
12455{
12456 struct perf_event_context *src_ctx;
12457 struct perf_event_context *dst_ctx;
12458 struct perf_event *event, *tmp;
12459 LIST_HEAD(events);
12460
12461 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
12462 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
12463
12464
12465
12466
12467
12468 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
12469 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
12470 event_entry) {
12471 perf_remove_from_context(event, 0);
12472 unaccount_event_cpu(event, src_cpu);
12473 put_ctx(src_ctx);
12474 list_add(&event->migrate_entry, &events);
12475 }
12476
12477
12478 /* Wait for the events to quiesce before re-instating them. */
12479
12480 synchronize_rcu();
12481
12482
12483 /*
12484  * Re-instate the events in two passes.
12485  *
12486  * Skip group leaders and only install siblings on this first pass;
12487  * siblings will not get enabled without a leader, but a leader will
12488  * enable its siblings, even if those are still on the old context.
12489  */
12490 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12491 if (event->group_leader == event)
12492 continue;
12493
12494 list_del(&event->migrate_entry);
12495 if (event->state >= PERF_EVENT_STATE_OFF)
12496 event->state = PERF_EVENT_STATE_INACTIVE;
12497 account_event_cpu(event, dst_cpu);
12498 perf_install_in_context(dst_ctx, event, dst_cpu);
12499 get_ctx(dst_ctx);
12500 }
12501
12502
12503 /*
12504  * Once all the siblings are set up, install the group leaders to make it go.
12505  */
12506 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12507 list_del(&event->migrate_entry);
12508 if (event->state >= PERF_EVENT_STATE_OFF)
12509 event->state = PERF_EVENT_STATE_INACTIVE;
12510 account_event_cpu(event, dst_cpu);
12511 perf_install_in_context(dst_ctx, event, dst_cpu);
12512 get_ctx(dst_ctx);
12513 }
12514 mutex_unlock(&dst_ctx->mutex);
12515 mutex_unlock(&src_ctx->mutex);
12516}
12517EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
12518
12519static void sync_child_event(struct perf_event *child_event)
12520{
12521 struct perf_event *parent_event = child_event->parent;
12522 u64 child_val;
12523
12524 if (child_event->attr.inherit_stat) {
12525 struct task_struct *task = child_event->ctx->task;
12526
12527 if (task && task != TASK_TOMBSTONE)
12528 perf_event_read_event(child_event, task);
12529 }
12530
12531 child_val = perf_event_count(child_event);
12532
12533
12534
12535
12536 atomic64_add(child_val, &parent_event->child_count);
12537 atomic64_add(child_event->total_time_enabled,
12538 &parent_event->child_total_time_enabled);
12539 atomic64_add(child_event->total_time_running,
12540 &parent_event->child_total_time_running);
12541}
12542
12543static void
12544perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
12545{
12546 struct perf_event *parent_event = event->parent;
12547 unsigned long detach_flags = 0;
12548
12549 if (parent_event) {
12550
12551
12552
12553
12554
12555
12556
12557
12558
12559
12560
12561
12562 detach_flags = DETACH_GROUP | DETACH_CHILD;
12563 mutex_lock(&parent_event->child_mutex);
12564 }
12565
12566 perf_remove_from_context(event, detach_flags);
12567
12568 raw_spin_lock_irq(&ctx->lock);
12569 if (event->state > PERF_EVENT_STATE_EXIT)
12570 perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
12571 raw_spin_unlock_irq(&ctx->lock);
12572
12573
12574
12575
12576 if (parent_event) {
12577 mutex_unlock(&parent_event->child_mutex);
12578
12579
12580
12581 perf_event_wakeup(parent_event);
12582 free_event(event);
12583 put_event(parent_event);
12584 return;
12585 }
12586
12587
12588
12589
12590 perf_event_wakeup(event);
12591}
12592
12593static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
12594{
12595 struct perf_event_context *child_ctx, *clone_ctx = NULL;
12596 struct perf_event *child_event, *next;
12597
12598 WARN_ON_ONCE(child != current);
12599
12600 child_ctx = perf_pin_task_context(child, ctxn);
12601 if (!child_ctx)
12602 return;
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614 mutex_lock(&child_ctx->mutex);
12615
12616
12617
12618
12619
12620
12621 raw_spin_lock_irq(&child_ctx->lock);
12622 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
12623
12624
12625
12626
12627
12628 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
12629 put_ctx(child_ctx);
12630 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
12631 put_task_struct(current);
12632
12633 clone_ctx = unclone_ctx(child_ctx);
12634 raw_spin_unlock_irq(&child_ctx->lock);
12635
12636 if (clone_ctx)
12637 put_ctx(clone_ctx);
12638
12639
12640
12641
12642
12643
12644 perf_event_task(child, child_ctx, 0);
12645
12646 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
12647 perf_event_exit_event(child_event, child_ctx);
12648
12649 mutex_unlock(&child_ctx->mutex);
12650
12651 put_ctx(child_ctx);
12652}
12653
12654
12655
12656
12657
12658
12659
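/*
 * When a child task exits, feed back event values to parent events.
 */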
12660void perf_event_exit_task(struct task_struct *child)
12661{
12662 struct perf_event *event, *tmp;
12663 int ctxn;
12664
12665 mutex_lock(&child->perf_event_mutex);
12666 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
12667 owner_entry) {
12668 list_del_init(&event->owner_entry);
12669
12670
12671
12672
12673
12674
12675 smp_store_release(&event->owner, NULL);
12676 }
12677 mutex_unlock(&child->perf_event_mutex);
12678
12679 for_each_task_context_nr(ctxn)
12680 perf_event_exit_task_context(child, ctxn);
12681
12682
12683
12684
12685
12686
12687
12688 perf_event_task(child, NULL, 0);
12689}
12690
12691static void perf_free_event(struct perf_event *event,
12692 struct perf_event_context *ctx)
12693{
12694 struct perf_event *parent = event->parent;
12695
12696 if (WARN_ON_ONCE(!parent))
12697 return;
12698
12699 mutex_lock(&parent->child_mutex);
12700 list_del_init(&event->child_list);
12701 mutex_unlock(&parent->child_mutex);
12702
12703 put_event(parent);
12704
12705 raw_spin_lock_irq(&ctx->lock);
12706 perf_group_detach(event);
12707 list_del_event(event, ctx);
12708 raw_spin_unlock_irq(&ctx->lock);
12709 free_event(event);
12710}
12711
12712
12713
12714
12715
12716
12717
12718
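/*
 * Free the perf event contexts attached to @task, as created by inheritance;
 * used by fork() in case of failure. Even though the task never ran, its
 * events may already be visible on the parent's child_list, so tear them
 * down fully.
 */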
12719void perf_event_free_task(struct task_struct *task)
12720{
12721 struct perf_event_context *ctx;
12722 struct perf_event *event, *tmp;
12723 int ctxn;
12724
12725 for_each_task_context_nr(ctxn) {
12726 ctx = task->perf_event_ctxp[ctxn];
12727 if (!ctx)
12728 continue;
12729
12730 mutex_lock(&ctx->mutex);
12731 raw_spin_lock_irq(&ctx->lock);
12732
12733
12734
12735
12736
12737
12738 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
12739 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
12740 put_task_struct(task);
12741 raw_spin_unlock_irq(&ctx->lock);
12742
12743 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
12744 perf_free_event(event, ctx);
12745
12746 mutex_unlock(&ctx->mutex);
12747
12748
12749
12750
12751
12752
12753
12754
12755
12756
12757
12758
12759
12760
12761
12762 wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
12763 put_ctx(ctx);
12764 }
12765}
12766
12767void perf_event_delayed_put(struct task_struct *task)
12768{
12769 int ctxn;
12770
12771 for_each_task_context_nr(ctxn)
12772 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
12773}

struct file *perf_event_get(unsigned int fd)
{
	struct file *file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_op != &perf_fops) {
		fput(file);
		return ERR_PTR(-EBADF);
	}

	return file;
}

const struct perf_event *perf_get_event(struct file *file)
{
	if (file->f_op != &perf_fops)
		return ERR_PTR(-EINVAL);

	return file->private_data;
}

const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
	if (!event)
		return ERR_PTR(-EINVAL);

	return &event->attr;
}
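
/*
 * Illustrative sketch (an assumption, not code copied from any in-kernel
 * caller) of how the three helpers above are meant to be combined:
 *
 *	struct file *file = perf_event_get(fd);	// takes a file reference
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);
 *
 *	event = file->private_data;		// what perf_get_event() validates
 *	attr  = perf_event_attrs(event);
 *	if (!IS_ERR(attr))
 *		... inspect attr->type / attr->config ...
 *
 *	fput(file);				// drop the reference again
 */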

/*
 * Inherit an event from parent task to child task.
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL when the parent event is already being released (orphaned) and
 *    the inheritance is quietly skipped
 *  - IS_ERR() on error
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event *group_leader,
	      struct perf_event_context *child_ctx)
{
	enum perf_event_state parent_state = parent_event->state;
	struct perf_event *child_event;
	unsigned long flags;

	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_event->parent)
		parent_event = parent_event->parent;

	child_event = perf_event_alloc(&parent_event->attr,
					   parent_event->cpu,
					   child,
					   group_leader, parent_event,
					   NULL, NULL, -1);
	if (IS_ERR(child_event))
		return child_event;

	if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
	    !child_ctx->task_ctx_data) {
		struct pmu *pmu = child_event->pmu;

		child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
		if (!child_ctx->task_ctx_data) {
			free_event(child_event);
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
	 * must be under the same lock in order to serialize against
	 * perf_event_release_kernel(), such that either we must observe
	 * is_orphaned_event() or they will observe us on the child_list.
	 */
	mutex_lock(&parent_event->child_mutex);
	if (is_orphaned_event(parent_event) ||
	    !atomic_long_inc_not_zero(&parent_event->refcount)) {
		mutex_unlock(&parent_event->child_mutex);
		/* task_ctx_data is freed with child_ctx */
		free_event(child_event);
		return NULL;
	}

	get_ctx(child_ctx);

	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
	if (parent_state >= PERF_EVENT_STATE_INACTIVE)
		child_event->state = PERF_EVENT_STATE_INACTIVE;
	else
		child_event->state = PERF_EVENT_STATE_OFF;

	if (parent_event->attr.freq) {
		u64 sample_period = parent_event->hw.sample_period;
		struct hw_perf_event *hwc = &child_event->hw;

		hwc->sample_period = sample_period;
		hwc->last_period   = sample_period;

		local64_set(&hwc->period_left, sample_period);
	}

	child_event->ctx = child_ctx;
	child_event->overflow_handler = parent_event->overflow_handler;
	child_event->overflow_handler_context
		= parent_event->overflow_handler_context;

	/*
	 * Precalculate sample_data sizes
	 */
	perf_event__header_size(child_event);
	perf_event__id_header_size(child_event);

	/*
	 * Link it up in the child's context:
	 */
	raw_spin_lock_irqsave(&child_ctx->lock, flags);
	add_event_to_ctx(child_event, child_ctx);
	child_event->attach_state |= PERF_ATTACH_CHILD;
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Link this into the parent event's child list
	 */
	list_add_tail(&child_event->child_list, &parent_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	return child_event;
}

/*
 * Inherits an event group.
 *
 * This will quietly suppress orphaned events; !inherit_event() is not an error.
 * This matches with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int inherit_group(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	/*
	 * Inherit the rest of the group: each sibling of the parent event
	 * gets its own child counter, grouped under the child leader
	 * created above.
	 */
	for_each_sibling_event(sub, parent_event) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					    child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);

		if (sub->aux_event == parent_event && child_ctr &&
		    !perf_get_aux_event(child_ctr, leader))
			return -EINVAL;
	}
	return 0;
}

/*
 * Creates the child task context and tries to inherit the event-group.
 *
 * Clears @inherited_all on !attr.inherited or error. Note that we'll
 * leave @inherited_all set when we 'fail' to inherit an orphaned event;
 * this is consistent with perf_event_release_kernel() removing all child
 * events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child, int ctxn,
		   u64 clone_flags, int *inherited_all)
{
	int ret;
	struct perf_event_context *child_ctx;

	if (!event->attr.inherit ||
	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) {
		*inherited_all = 0;
		return 0;
	}

	child_ctx = child->perf_event_ctxp[ctxn];
	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
		if (!child_ctx)
			return -ENOMEM;

		child->perf_event_ctxp[ctxn] = child_ctx;
	}

	ret = inherit_group(event, parent, parent_ctx,
			    child, child_ctx);

	if (ret)
		*inherited_all = 0;

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
static int perf_event_init_context(struct task_struct *child, int ctxn,
				   u64 clone_flags)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	unsigned long flags;
	int ret = 0;

	if (likely(!parent->perf_event_ctxp[ctxn]))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent, ctxn);
	if (!parent_ctx)
		return 0;

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We dont have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, clone_flags,
					 &inherited_all);
		if (ret)
			goto out_unlock;
	}

	/*
	 * We can't hold ctx->lock when iterating the ->flexible_group list due
	 * to allocations, but we need to prevent rotation because
	 * rotate_ctx() will change the list from interrupt context.
	 */
	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 1;
	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

	perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, clone_flags,
					 &inherited_all);
		if (ret)
			goto out_unlock;
	}

	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 0;

	child_ctx = child->perf_event_ctxp[ctxn];

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 *
		 * Note that if the parent is a clone, the holding of
		 * parent_ctx->lock avoids it from being uncloned.
		 */
		cloned_ctx = parent_ctx->parent_ctx;
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
out_unlock:
	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);
	put_ctx(parent_ctx);

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
	int ctxn, ret;

	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	for_each_task_context_nr(ctxn) {
		ret = perf_event_init_context(child, ctxn, clone_flags);
		if (ret) {
			perf_event_free_task(child);
			return ret;
		}
	}

	return 0;
}
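
/*
 * Illustrative note (assumption about the caller, which lives in
 * kernel/fork.c in mainline): copy_process() is expected to call this once
 * per new task, roughly:
 *
 *	retval = perf_event_init_task(p, clone_flags);
 *	if (retval)
 *		goto bad_fork_cleanup_perf;
 *
 * Note that perf_event_init_task() already tears down any partially built
 * contexts itself (via perf_event_free_task() above) before returning an
 * error.
 */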

static void __init perf_event_init_all_cpus(void)
{
	struct swevent_htable *swhash;
	int cpu;

	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);

	for_each_possible_cpu(cpu) {
		swhash = &per_cpu(swevent_htable, cpu);
		mutex_init(&swhash->hlist_mutex);
		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));

		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

#ifdef CONFIG_CGROUP_PERF
		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
	}
}

static void perf_swevent_init_cpu(unsigned int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
		struct swevent_hlist *hlist;

		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
		WARN_ON(!hlist);
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
	}
	mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
	struct perf_event_context *ctx = __info;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event *event;

	raw_spin_lock(&ctx->lock);
	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
	list_for_each_entry(event, &ctx->event_list, event_entry)
		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
	raw_spin_unlock(&ctx->lock);
}

static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;

	mutex_lock(&pmus_lock);
	list_for_each_entry(pmu, &pmus, entry) {
		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		mutex_lock(&ctx->mutex);
		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
		cpuctx->online = 0;
		mutex_unlock(&ctx->mutex);
	}
	cpumask_clear_cpu(cpu, perf_online_mask);
	mutex_unlock(&pmus_lock);
}
#else

static void perf_event_exit_cpu_context(int cpu) { }

#endif

int perf_event_init_cpu(unsigned int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;

	perf_swevent_init_cpu(cpu);

	mutex_lock(&pmus_lock);
	cpumask_set_cpu(cpu, perf_online_mask);
	list_for_each_entry(pmu, &pmus, entry) {
		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		mutex_lock(&ctx->mutex);
		cpuctx->online = 1;
		mutex_unlock(&ctx->mutex);
	}
	mutex_unlock(&pmus_lock);

	return 0;
}

int perf_event_exit_cpu(unsigned int cpu)
{
	perf_event_exit_cpu_context(cpu);
	return 0;
}
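
/*
 * Illustrative note (assumption about the hotplug glue, which lives in
 * kernel/cpu.c in mainline): the two callbacks above are meant to be wired
 * into the CPU hotplug state machine roughly like:
 *
 *	[CPUHP_PERF_PREPARE] = {
 *		.name			= "perf:prepare",
 *		.startup.single		= perf_event_init_cpu,
 *		.teardown.single	= perf_event_exit_cpu,
 *	},
 *
 * so that per-CPU contexts are marked online/offline as CPUs come and go.
 */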

static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
	int cpu;

	for_each_online_cpu(cpu)
		perf_event_exit_cpu(cpu);

	return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
	.notifier_call = perf_reboot,
	.priority = INT_MIN,
};

void __init perf_event_init(void)
{
	int ret;

	idr_init(&pmu_idr);

	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
	perf_pmu_register(&perf_cpu_clock, NULL, -1);
	perf_pmu_register(&perf_task_clock, NULL, -1);
	perf_tp_register();
	perf_event_init_cpu(smp_processor_id());
	register_reboot_notifier(&perf_reboot_notifier);

	ret = init_hw_breakpoint();
	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

	perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);

	/*
	 * Build time assertion that we keep the data_head at the intended
	 * location.  IOW, validation we got the __reserved[] size right.
	 */
	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
		     != 1024);
}

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s\n", pmu_attr->event_str);

	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
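
/*
 * Illustrative sketch (an assumption, not taken from any particular driver)
 * of how a PMU driver typically hooks into perf_event_sysfs_show() via the
 * PMU_EVENT_ATTR_STRING() helper from <linux/perf_event.h>:
 *
 *	PMU_EVENT_ATTR_STRING(cycles, evattr_cycles, "event=0x3c");
 *
 *	static struct attribute *my_pmu_events[] = {
 *		&evattr_cycles.attr.attr,
 *		NULL,
 *	};
 *
 * Reading the resulting sysfs file then returns the event string through
 * the show routine above.
 */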

static int __init perf_event_sysfs_init(void)
{
	struct pmu *pmu;
	int ret;

	mutex_lock(&pmus_lock);

	ret = bus_register(&pmu_bus);
	if (ret)
		goto unlock;

	list_for_each_entry(pmu, &pmus, entry) {
		if (!pmu->name || pmu->type < 0)
			continue;

		ret = pmu_dev_alloc(pmu);
		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
	}
	pmu_bus_running = 1;
	ret = 0;

unlock:
	mutex_unlock(&pmus_lock);

	return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct perf_cgroup *jc;

	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
	if (!jc)
		return ERR_PTR(-ENOMEM);

	jc->info = alloc_percpu(struct perf_cgroup_info);
	if (!jc->info) {
		kfree(jc);
		return ERR_PTR(-ENOMEM);
	}

	return &jc->css;
}

static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);

	free_percpu(jc->info);
	kfree(jc);
}

static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
	perf_event_cgroup(css->cgroup);
	return 0;
}

static int __perf_cgroup_move(void *info)
{
	struct task_struct *task = info;
	rcu_read_lock();
	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
	rcu_read_unlock();
	return 0;
}

static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		task_function_call(task, __perf_cgroup_move, task);
}

struct cgroup_subsys perf_event_cgrp_subsys = {
	.css_alloc = perf_cgroup_css_alloc,
	.css_free = perf_cgroup_css_free,
	.css_online = perf_cgroup_css_online,
	.attach = perf_cgroup_attach,
	/*
	 * Implicitly enable on dfl hierarchy so that perf events can
	 * always be filtered by cgroup2 path as long as perf_event
	 * controller is not mounted on a legacy hierarchy.
	 */
	.implicit_on_dfl = true,
	.threaded = true,
};
#endif