/*
 * Performance events core code.
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>

#include "internal.h"

#include <asm/irq_regs.h>

58typedef int (*remote_function_f)(void *);
59
60struct remote_function_call {
61 struct task_struct *p;
62 remote_function_f func;
63 void *info;
64 int ret;
65};
66
67static void remote_function(void *data)
68{
69 struct remote_function_call *tfc = data;
70 struct task_struct *p = tfc->p;
71
72 if (p) {
		/* @p moved to another CPU; leave -EAGAIN set so the caller retries */
74 if (task_cpu(p) != smp_processor_id())
75 return;
76
		/*
		 * Now that we're on the right CPU with IRQs disabled, we can
		 * verify that we hit the task we meant to, without racing
		 * against a concurrent migration.
		 */
82 tfc->ret = -ESRCH;
83 if (p != current)
84 return;
85 }
86
87 tfc->ret = tfc->func(tfc->info);
88}
89
/**
 * task_function_call - call a function on the CPU on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls @func on the CPU where @p is currently running.  Retries as long as
 * the remote call keeps failing with -EAGAIN, e.g. because @p migrated to
 * another CPU in the meantime.
 *
 * Returns @func's return value, or -ESRCH when @p is not running.
 */
103static int
104task_function_call(struct task_struct *p, remote_function_f func, void *info)
105{
106 struct remote_function_call data = {
107 .p = p,
108 .func = func,
109 .info = info,
110 .ret = -EAGAIN,
111 };
112 int ret;
113
114 do {
115 ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
116 if (!ret)
117 ret = data.ret;
118 } while (ret == -EAGAIN);
119
120 return ret;
121}
122
/**
 * cpu_function_call - call a function on a given CPU
 * @cpu:	target CPU
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls @func on @cpu with IRQs disabled.
 *
 * Returns @func's return value, or -ENXIO when @cpu is not online.
 */
132static int cpu_function_call(int cpu, remote_function_f func, void *info)
133{
134 struct remote_function_call data = {
135 .p = NULL,
136 .func = func,
137 .info = info,
138 .ret = -ENXIO,
139 };
140
141 smp_call_function_single(cpu, remote_function, &data, 1);
142
143 return data.ret;
144}
145
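/*
 * Map an event context to the per-CPU context of its PMU on the local CPU.
 */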
146static inline struct perf_cpu_context *
147__get_cpu_context(struct perf_event_context *ctx)
148{
149 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
150}
151
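/*
 * Lock the CPU context and, when given, the task context nested inside it;
 * perf_ctx_unlock() drops the locks in the opposite order.
 */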
152static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
153 struct perf_event_context *ctx)
154{
155 raw_spin_lock(&cpuctx->ctx.lock);
156 if (ctx)
157 raw_spin_lock(&ctx->lock);
158}
159
160static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
161 struct perf_event_context *ctx)
162{
163 if (ctx)
164 raw_spin_unlock(&ctx->lock);
165 raw_spin_unlock(&cpuctx->ctx.lock);
166}
167
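/*
 * TASK_TOMBSTONE is written to ctx->task when the owning task goes away, and
 * to event->owner to mark events owned by the kernel rather than by a task;
 * it is never dereferenced.
 */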
168#define TASK_TOMBSTONE ((void *)-1L)
169
170static bool is_kernel_event(struct perf_event *event)
171{
172 return READ_ONCE(event->owner) == TASK_TOMBSTONE;
173}
174
/*
 * Event state changes (install, remove, enable, disable) must happen on the
 * CPU where the event's context is (or can become) active, with IRQs disabled
 * and the relevant cpuctx->ctx.lock and ctx->lock held.  The event_function*()
 * helpers below get us onto the right CPU, take the locks and then invoke an
 * event_f callback, retrying when the target task is scheduled in or out
 * underneath us.
 */
194typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
195 struct perf_event_context *, void *);
196
197struct event_function_struct {
198 struct perf_event *event;
199 event_f func;
200 void *data;
201};
202
203static int event_function(void *info)
204{
205 struct event_function_struct *efs = info;
206 struct perf_event *event = efs->event;
207 struct perf_event_context *ctx = event->ctx;
208 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
209 struct perf_event_context *task_ctx = cpuctx->task_ctx;
210 int ret = 0;
211
212 lockdep_assert_irqs_disabled();
213
214 perf_ctx_lock(cpuctx, task_ctx);
215
	/*
	 * Since the IPI was sent without holding ctx->lock, things may have
	 * changed; double check that we hit the task we set out to hit.
	 */
219 if (ctx->task) {
220 if (ctx->task != current) {
221 ret = -ESRCH;
222 goto unlock;
223 }
224
		/*
		 * We only use event_function_call() on established contexts,
		 * and event_function() only ever runs via an IPI on the
		 * context's CPU, so by the time we get here the context must
		 * be active.
		 */
		WARN_ON_ONCE(!ctx->is_active);
		/*
		 * And since we have ctx->is_active, cpuctx->task_ctx must
		 * point at this context.
		 */
		WARN_ON_ONCE(task_ctx != ctx);
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}
241
242 efs->func(event, cpuctx, ctx, efs->data);
243unlock:
244 perf_ctx_unlock(cpuctx, task_ctx);
245
246 return ret;
247}
248
249static void event_function_call(struct perf_event *event, event_f func, void *data)
250{
251 struct perf_event_context *ctx = event->ctx;
252 struct task_struct *task = READ_ONCE(ctx->task);
253 struct event_function_struct efs = {
254 .event = event,
255 .func = func,
256 .data = data,
257 };
258
259 if (!event->parent) {
		/*
		 * If this is a !child event, we must hold ctx::mutex to
		 * stabilize the event->ctx relation.  See
		 * perf_event_ctx_lock().
		 */
265 lockdep_assert_held(&ctx->mutex);
266 }
267
268 if (!task) {
269 cpu_function_call(event->cpu, event_function, &efs);
270 return;
271 }
272
273 if (task == TASK_TOMBSTONE)
274 return;
275
276again:
277 if (!task_function_call(task, event_function, &efs))
278 return;
279
280 raw_spin_lock_irq(&ctx->lock);
	/*
	 * Reload the task pointer; it might have been changed by a concurrent
	 * context switch before we took the lock.
	 */
285 task = ctx->task;
286 if (task == TASK_TOMBSTONE) {
287 raw_spin_unlock_irq(&ctx->lock);
288 return;
289 }
290 if (ctx->is_active) {
291 raw_spin_unlock_irq(&ctx->lock);
292 goto again;
293 }
294 func(event, NULL, ctx, data);
295 raw_spin_unlock_irq(&ctx->lock);
296}
297
/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 */
302static void event_function_local(struct perf_event *event, event_f func, void *data)
303{
304 struct perf_event_context *ctx = event->ctx;
305 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
306 struct task_struct *task = READ_ONCE(ctx->task);
307 struct perf_event_context *task_ctx = NULL;
308
309 lockdep_assert_irqs_disabled();
310
311 if (task) {
312 if (task == TASK_TOMBSTONE)
313 return;
314
315 task_ctx = ctx;
316 }
317
318 perf_ctx_lock(cpuctx, task_ctx);
319
320 task = ctx->task;
321 if (task == TASK_TOMBSTONE)
322 goto unlock;
323
324 if (task) {
325
326
327
328
329
330 if (ctx->is_active) {
331 if (WARN_ON_ONCE(task != current))
332 goto unlock;
333
334 if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
335 goto unlock;
336 }
337 } else {
338 WARN_ON_ONCE(&cpuctx->ctx != ctx);
339 }
340
341 func(event, cpuctx, ctx, data);
342unlock:
343 perf_ctx_unlock(cpuctx, task_ctx);
344}
345
346#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
347 PERF_FLAG_FD_OUTPUT |\
348 PERF_FLAG_PID_CGROUP |\
349 PERF_FLAG_FD_CLOEXEC)
350
/*
 * Branch sample privilege levels that require a permission check:
 */
354#define PERF_SAMPLE_BRANCH_PERM_PLM \
355 (PERF_SAMPLE_BRANCH_KERNEL |\
356 PERF_SAMPLE_BRANCH_HV)
357
358enum event_type_t {
359 EVENT_FLEXIBLE = 0x1,
360 EVENT_PINNED = 0x2,
361 EVENT_TIME = 0x4,
362
363 EVENT_CPU = 0x8,
364 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
365};
366
/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-CPU cgroup events exist on this CPU
 */
372static void perf_sched_delayed(struct work_struct *work);
373DEFINE_STATIC_KEY_FALSE(perf_sched_events);
374static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
375static DEFINE_MUTEX(perf_sched_mutex);
376static atomic_t perf_sched_count;
377
378static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
379static DEFINE_PER_CPU(int, perf_sched_cb_usages);
380static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
381
382static atomic_t nr_mmap_events __read_mostly;
383static atomic_t nr_comm_events __read_mostly;
384static atomic_t nr_namespaces_events __read_mostly;
385static atomic_t nr_task_events __read_mostly;
386static atomic_t nr_freq_events __read_mostly;
387static atomic_t nr_switch_events __read_mostly;
388
389static LIST_HEAD(pmus);
390static DEFINE_MUTEX(pmus_lock);
391static struct srcu_struct pmus_srcu;
392static cpumask_var_t perf_online_mask;
393
/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
401int sysctl_perf_event_paranoid __read_mostly = 2;
402
403
404int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
405
406
407
408
409#define DEFAULT_MAX_SAMPLE_RATE 100000
410#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
411#define DEFAULT_CPU_TIME_MAX_PERCENT 25
412
413int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
414
415static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
416static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
417
418static int perf_sample_allowed_ns __read_mostly =
419 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
420
421static void update_perf_cpu_limits(void)
422{
423 u64 tmp = perf_sample_period_ns;
424
425 tmp *= sysctl_perf_cpu_time_max_percent;
426 tmp = div_u64(tmp, 100);
427 if (!tmp)
428 tmp = 1;
429
430 WRITE_ONCE(perf_sample_allowed_ns, tmp);
431}
432
433static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
434
435int perf_proc_update_handler(struct ctl_table *table, int write,
436 void __user *buffer, size_t *lenp,
437 loff_t *ppos)
438{
439 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
440
441 if (ret || !write)
442 return ret;
443
	/*
	 * If throttling is disabled don't allow the write:
	 */
447 if (sysctl_perf_cpu_time_max_percent == 100 ||
448 sysctl_perf_cpu_time_max_percent == 0)
449 return -EINVAL;
450
451 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
452 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
453 update_perf_cpu_limits();
454
455 return 0;
456}
457
458int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
459
460int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
461 void __user *buffer, size_t *lenp,
462 loff_t *ppos)
463{
464 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
465
466 if (ret || !write)
467 return ret;
468
469 if (sysctl_perf_cpu_time_max_percent == 100 ||
470 sysctl_perf_cpu_time_max_percent == 0) {
471 printk(KERN_WARNING
472 "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
473 WRITE_ONCE(perf_sample_allowed_ns, 0);
474 } else {
475 update_perf_cpu_limits();
476 }
477
478 return 0;
479}
480
/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
487#define NR_ACCUMULATED_SAMPLES 128
488static DEFINE_PER_CPU(u64, running_sample_length);
489
490static u64 __report_avg;
491static u64 __report_allowed;
492
493static void perf_duration_warn(struct irq_work *w)
494{
495 printk_ratelimited(KERN_INFO
496 "perf: interrupt took too long (%lld > %lld), lowering "
497 "kernel.perf_event_max_sample_rate to %d\n",
498 __report_avg, __report_allowed,
499 sysctl_perf_event_sample_rate);
500}
501
502static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
503
504void perf_sample_event_took(u64 sample_len_ns)
505{
506 u64 max_len = READ_ONCE(perf_sample_allowed_ns);
507 u64 running_len;
508 u64 avg_len;
509 u32 max;
510
511 if (max_len == 0)
512 return;
513
514
515 running_len = __this_cpu_read(running_sample_length);
516 running_len -= running_len/NR_ACCUMULATED_SAMPLES;
517 running_len += sample_len_ns;
518 __this_cpu_write(running_sample_length, running_len);
519
	/*
	 * Note: this will be biased artificially low until we have seen
	 * NR_ACCUMULATED_SAMPLES samples.  Doing it this way keeps us from
	 * having to maintain a count.
	 */
525 avg_len = running_len/NR_ACCUMULATED_SAMPLES;
526 if (avg_len <= max_len)
527 return;
528
529 __report_avg = avg_len;
530 __report_allowed = max_len;
531
532
533
534
535 avg_len += avg_len / 4;
536 max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
537 if (avg_len < max)
538 max /= (u32)avg_len;
539 else
540 max = 1;
541
542 WRITE_ONCE(perf_sample_allowed_ns, avg_len);
543 WRITE_ONCE(max_samples_per_tick, max);
544
545 sysctl_perf_event_sample_rate = max * HZ;
546 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
547
548 if (!irq_work_queue(&perf_duration_work)) {
549 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
550 "kernel.perf_event_max_sample_rate to %d\n",
551 __report_avg, __report_allowed,
552 sysctl_perf_event_sample_rate);
553 }
554}
555
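/* Monotonically increasing source of unique event IDs. */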
556static atomic64_t perf_event_id;
557
558static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
559 enum event_type_t event_type);
560
561static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
562 enum event_type_t event_type,
563 struct task_struct *task);
564
565static void update_context_time(struct perf_event_context *ctx);
566static u64 perf_event_time(struct perf_event *event);
567
568void __weak perf_event_print_debug(void) { }
569
570extern __weak const char *perf_pmu_name(void)
571{
572 return "pmu";
573}
574
575static inline u64 perf_clock(void)
576{
577 return local_clock();
578}
579
580static inline u64 perf_event_clock(struct perf_event *event)
581{
582 return event->clock();
583}
584
/*
 * State based event timekeeping.
 *
 * Each event carries the timestamp of its last state change in event->tstamp.
 * total_time_enabled accumulates time while the event is at least INACTIVE,
 * total_time_running while it is ACTIVE.  Because an OFF (or ERROR) group
 * leader also keeps its siblings from running, a sibling's effective state is
 * capped by its leader's state; see __perf_effective_state().
 */
607static __always_inline enum perf_event_state
608__perf_effective_state(struct perf_event *event)
609{
610 struct perf_event *leader = event->group_leader;
611
612 if (leader->state <= PERF_EVENT_STATE_OFF)
613 return leader->state;
614
615 return event->state;
616}
617
618static __always_inline void
619__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
620{
621 enum perf_event_state state = __perf_effective_state(event);
622 u64 delta = now - event->tstamp;
623
624 *enabled = event->total_time_enabled;
625 if (state >= PERF_EVENT_STATE_INACTIVE)
626 *enabled += delta;
627
628 *running = event->total_time_running;
629 if (state >= PERF_EVENT_STATE_ACTIVE)
630 *running += delta;
631}
632
633static void perf_event_update_time(struct perf_event *event)
634{
635 u64 now = perf_event_time(event);
636
637 __perf_update_times(event, now, &event->total_time_enabled,
638 &event->total_time_running);
639 event->tstamp = now;
640}
641
642static void perf_event_update_sibling_time(struct perf_event *leader)
643{
644 struct perf_event *sibling;
645
646 for_each_sibling_event(sibling, leader)
647 perf_event_update_time(sibling);
648}
649
650static void
651perf_event_set_state(struct perf_event *event, enum perf_event_state state)
652{
653 if (event->state == state)
654 return;
655
656 perf_event_update_time(event);
657
	/*
	 * If a group leader gets enabled/disabled, all its siblings are
	 * affected too; keep their times in sync.
	 */
661 if ((event->state < 0) ^ (state < 0))
662 perf_event_update_sibling_time(event);
663
664 WRITE_ONCE(event->state, state);
665}
666
667#ifdef CONFIG_CGROUP_PERF
668
669static inline bool
670perf_cgroup_match(struct perf_event *event)
671{
672 struct perf_event_context *ctx = event->ctx;
673 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
674
675
676 if (!event->cgrp)
677 return true;
678
679
680 if (!cpuctx->cgrp)
681 return false;
682
	/*
	 * Cgroup scoping is recursive: an event enabled for a cgroup is also
	 * enabled for all its descendant cgroups.  If @cpuctx's cgroup is a
	 * descendant of @event's (the test also covers the identity case),
	 * it's a match.
	 */
689 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
690 event->cgrp->css.cgroup);
691}
692
693static inline void perf_detach_cgroup(struct perf_event *event)
694{
695 css_put(&event->cgrp->css);
696 event->cgrp = NULL;
697}
698
699static inline int is_cgroup_event(struct perf_event *event)
700{
701 return event->cgrp != NULL;
702}
703
704static inline u64 perf_cgroup_event_time(struct perf_event *event)
705{
706 struct perf_cgroup_info *t;
707
708 t = per_cpu_ptr(event->cgrp->info, event->cpu);
709 return t->time;
710}
711
712static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
713{
714 struct perf_cgroup_info *info;
715 u64 now;
716
717 now = perf_clock();
718
719 info = this_cpu_ptr(cgrp->info);
720
721 info->time += now - info->timestamp;
722 info->timestamp = now;
723}
724
725static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
726{
727 struct perf_cgroup *cgrp = cpuctx->cgrp;
728 struct cgroup_subsys_state *css;
729
730 if (cgrp) {
731 for (css = &cgrp->css; css; css = css->parent) {
732 cgrp = container_of(css, struct perf_cgroup, css);
733 __update_cgrp_time(cgrp);
734 }
735 }
736}
737
738static inline void update_cgrp_time_from_event(struct perf_event *event)
739{
740 struct perf_cgroup *cgrp;
741
742
743
744
745
746 if (!is_cgroup_event(event))
747 return;
748
749 cgrp = perf_cgroup_from_task(current, event->ctx);
750
751
752
753 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
754 __update_cgrp_time(event->cgrp);
755}
756
757static inline void
758perf_cgroup_set_timestamp(struct task_struct *task,
759 struct perf_event_context *ctx)
760{
761 struct perf_cgroup *cgrp;
762 struct perf_cgroup_info *info;
763 struct cgroup_subsys_state *css;
764
765
766
767
768
769
770 if (!task || !ctx->nr_cgroups)
771 return;
772
773 cgrp = perf_cgroup_from_task(task, ctx);
774
775 for (css = &cgrp->css; css; css = css->parent) {
776 cgrp = container_of(css, struct perf_cgroup, css);
777 info = this_cpu_ptr(cgrp->info);
778 info->timestamp = ctx->timestamp;
779 }
780}
781
782static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
783
784#define PERF_CGROUP_SWOUT 0x1
785#define PERF_CGROUP_SWIN 0x2
786
/*
 * Reschedule cgroup events when the current task changes cgroup.
 *
 * mode PERF_CGROUP_SWOUT: schedule out all cgroup events
 * mode PERF_CGROUP_SWIN : schedule events back in for the task's cgroup
 */
793static void perf_cgroup_switch(struct task_struct *task, int mode)
794{
795 struct perf_cpu_context *cpuctx;
796 struct list_head *list;
797 unsigned long flags;
798
799
800
801
802
803 local_irq_save(flags);
804
805 list = this_cpu_ptr(&cgrp_cpuctx_list);
806 list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
807 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
808
809 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
810 perf_pmu_disable(cpuctx->ctx.pmu);
811
812 if (mode & PERF_CGROUP_SWOUT) {
813 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
814
815
816
817
818 cpuctx->cgrp = NULL;
819 }
820
821 if (mode & PERF_CGROUP_SWIN) {
822 WARN_ON_ONCE(cpuctx->cgrp);
823
824
825
826
827
828
829
830 cpuctx->cgrp = perf_cgroup_from_task(task,
831 &cpuctx->ctx);
832 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
833 }
834 perf_pmu_enable(cpuctx->ctx.pmu);
835 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
836 }
837
838 local_irq_restore(flags);
839}
840
841static inline void perf_cgroup_sched_out(struct task_struct *task,
842 struct task_struct *next)
843{
844 struct perf_cgroup *cgrp1;
845 struct perf_cgroup *cgrp2 = NULL;
846
847 rcu_read_lock();
848
849
850
851
852
853 cgrp1 = perf_cgroup_from_task(task, NULL);
854 cgrp2 = perf_cgroup_from_task(next, NULL);
855
856
857
858
859
860
861 if (cgrp1 != cgrp2)
862 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
863
864 rcu_read_unlock();
865}
866
867static inline void perf_cgroup_sched_in(struct task_struct *prev,
868 struct task_struct *task)
869{
870 struct perf_cgroup *cgrp1;
871 struct perf_cgroup *cgrp2 = NULL;
872
873 rcu_read_lock();
874
875
876
877
878
879 cgrp1 = perf_cgroup_from_task(task, NULL);
880 cgrp2 = perf_cgroup_from_task(prev, NULL);
881
882
883
884
885
886
887 if (cgrp1 != cgrp2)
888 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
889
890 rcu_read_unlock();
891}
892
893static inline int perf_cgroup_connect(int fd, struct perf_event *event,
894 struct perf_event_attr *attr,
895 struct perf_event *group_leader)
896{
897 struct perf_cgroup *cgrp;
898 struct cgroup_subsys_state *css;
899 struct fd f = fdget(fd);
900 int ret = 0;
901
902 if (!f.file)
903 return -EBADF;
904
905 css = css_tryget_online_from_dir(f.file->f_path.dentry,
906 &perf_event_cgrp_subsys);
907 if (IS_ERR(css)) {
908 ret = PTR_ERR(css);
909 goto out;
910 }
911
912 cgrp = container_of(css, struct perf_cgroup, css);
913 event->cgrp = cgrp;
914
915
916
917
918
919
920 if (group_leader && group_leader->cgrp != cgrp) {
921 perf_detach_cgroup(event);
922 ret = -EINVAL;
923 }
924out:
925 fdput(f);
926 return ret;
927}
928
929static inline void
930perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
931{
932 struct perf_cgroup_info *t;
933 t = per_cpu_ptr(event->cgrp->info, event->cpu);
934 event->shadow_ctx_time = now - t->timestamp;
935}
936
937
938
939
940
941static inline void
942list_update_cgroup_event(struct perf_event *event,
943 struct perf_event_context *ctx, bool add)
944{
945 struct perf_cpu_context *cpuctx;
946 struct list_head *cpuctx_entry;
947
948 if (!is_cgroup_event(event))
949 return;
950
951
952
953
954
955 cpuctx = __get_cpu_context(ctx);
956
957
958
959
960
961
962
963 if (add && !cpuctx->cgrp) {
964 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
965
966 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
967 cpuctx->cgrp = cgrp;
968 }
969
970 if (add && ctx->nr_cgroups++)
971 return;
972 else if (!add && --ctx->nr_cgroups)
973 return;
974
975
976 if (!add)
977 cpuctx->cgrp = NULL;
978
979 cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
980 if (add)
981 list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
982 else
983 list_del(cpuctx_entry);
984}
985
986#else
987
988static inline bool
989perf_cgroup_match(struct perf_event *event)
990{
991 return true;
992}
993
994static inline void perf_detach_cgroup(struct perf_event *event)
995{}
996
997static inline int is_cgroup_event(struct perf_event *event)
998{
999 return 0;
1000}
1001
1002static inline void update_cgrp_time_from_event(struct perf_event *event)
1003{
1004}
1005
1006static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
1007{
1008}
1009
1010static inline void perf_cgroup_sched_out(struct task_struct *task,
1011 struct task_struct *next)
1012{
1013}
1014
1015static inline void perf_cgroup_sched_in(struct task_struct *prev,
1016 struct task_struct *task)
1017{
1018}
1019
1020static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1021 struct perf_event_attr *attr,
1022 struct perf_event *group_leader)
1023{
1024 return -EINVAL;
1025}
1026
1027static inline void
1028perf_cgroup_set_timestamp(struct task_struct *task,
1029 struct perf_event_context *ctx)
1030{
1031}
1032
1033void
1034perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
1035{
1036}
1037
1038static inline void
1039perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
1040{
1041}
1042
1043static inline u64 perf_cgroup_event_time(struct perf_event *event)
1044{
1045 return 0;
1046}
1047
1048static inline void
1049list_update_cgroup_event(struct perf_event *event,
1050 struct perf_event_context *ctx, bool add)
1051{
1052}
1053
1054#endif
1055
1056
1057
1058
1059
1060#define PERF_CPU_HRTIMER (1000 / HZ)
1061
1062
1063
1064static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1065{
1066 struct perf_cpu_context *cpuctx;
1067 bool rotations;
1068
1069 lockdep_assert_irqs_disabled();
1070
1071 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1072 rotations = perf_rotate_context(cpuctx);
1073
1074 raw_spin_lock(&cpuctx->hrtimer_lock);
1075 if (rotations)
1076 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1077 else
1078 cpuctx->hrtimer_active = 0;
1079 raw_spin_unlock(&cpuctx->hrtimer_lock);
1080
1081 return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1082}
1083
1084static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1085{
1086 struct hrtimer *timer = &cpuctx->hrtimer;
1087 struct pmu *pmu = cpuctx->ctx.pmu;
1088 u64 interval;
1089
1090
1091 if (pmu->task_ctx_nr == perf_sw_context)
1092 return;
1093
1094
1095
1096
1097
1098 interval = pmu->hrtimer_interval_ms;
1099 if (interval < 1)
1100 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1101
1102 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1103
1104 raw_spin_lock_init(&cpuctx->hrtimer_lock);
1105 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1106 timer->function = perf_mux_hrtimer_handler;
1107}
1108
1109static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1110{
1111 struct hrtimer *timer = &cpuctx->hrtimer;
1112 struct pmu *pmu = cpuctx->ctx.pmu;
1113 unsigned long flags;
1114
1115
1116 if (pmu->task_ctx_nr == perf_sw_context)
1117 return 0;
1118
1119 raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1120 if (!cpuctx->hrtimer_active) {
1121 cpuctx->hrtimer_active = 1;
1122 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1123 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1124 }
1125 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1126
1127 return 0;
1128}
1129
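/*
 * perf_pmu_disable()/perf_pmu_enable() nest: the PMU is disabled on the first
 * call and only re-enabled once the per-CPU disable count drops back to zero.
 */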
1130void perf_pmu_disable(struct pmu *pmu)
1131{
1132 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1133 if (!(*count)++)
1134 pmu->pmu_disable(pmu);
1135}
1136
1137void perf_pmu_enable(struct pmu *pmu)
1138{
1139 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1140 if (!--(*count))
1141 pmu->pmu_enable(pmu);
1142}
1143
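/* Per-CPU list of contexts that currently have active events. */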
1144static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1145
/*
 * perf_event_ctx_activate() and perf_event_ctx_deactivate() maintain the
 * per-CPU list of contexts with active events; both are strictly CPU-local
 * and called with IRQs disabled, which is what serializes the list.
 */
1152static void perf_event_ctx_activate(struct perf_event_context *ctx)
1153{
1154 struct list_head *head = this_cpu_ptr(&active_ctx_list);
1155
1156 lockdep_assert_irqs_disabled();
1157
1158 WARN_ON(!list_empty(&ctx->active_ctx_list));
1159
1160 list_add(&ctx->active_ctx_list, head);
1161}
1162
1163static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1164{
1165 lockdep_assert_irqs_disabled();
1166
1167 WARN_ON(list_empty(&ctx->active_ctx_list));
1168
1169 list_del_init(&ctx->active_ctx_list);
1170}
1171
1172static void get_ctx(struct perf_event_context *ctx)
1173{
1174 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1175}
1176
1177static void free_ctx(struct rcu_head *head)
1178{
1179 struct perf_event_context *ctx;
1180
1181 ctx = container_of(head, struct perf_event_context, rcu_head);
1182 kfree(ctx->task_ctx_data);
1183 kfree(ctx);
1184}
1185
1186static void put_ctx(struct perf_event_context *ctx)
1187{
1188 if (atomic_dec_and_test(&ctx->refcount)) {
1189 if (ctx->parent_ctx)
1190 put_ctx(ctx->parent_ctx);
1191 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1192 put_task_struct(ctx->task);
1193 call_rcu(&ctx->rcu_head, free_ctx);
1194 }
1195}
1196
/*
 * Because of perf_event::ctx migration (e.g. perf_pmu_migrate_context() or
 * moving an event into a group in another context), event->ctx can change
 * while we sleep.  The places that change it hold the ctx::mutex of both the
 * old and the new context, so to get a stable event->ctx we take a reference
 * on the context, acquire its mutex and re-check that event->ctx still points
 * at it, retrying otherwise.  That is what perf_event_ctx_lock_nested() below
 * implements.
 */
1262static struct perf_event_context *
1263perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1264{
1265 struct perf_event_context *ctx;
1266
1267again:
1268 rcu_read_lock();
1269 ctx = READ_ONCE(event->ctx);
1270 if (!atomic_inc_not_zero(&ctx->refcount)) {
1271 rcu_read_unlock();
1272 goto again;
1273 }
1274 rcu_read_unlock();
1275
1276 mutex_lock_nested(&ctx->mutex, nesting);
1277 if (event->ctx != ctx) {
1278 mutex_unlock(&ctx->mutex);
1279 put_ctx(ctx);
1280 goto again;
1281 }
1282
1283 return ctx;
1284}
1285
1286static inline struct perf_event_context *
1287perf_event_ctx_lock(struct perf_event *event)
1288{
1289 return perf_event_ctx_lock_nested(event, 0);
1290}
1291
1292static void perf_event_ctx_unlock(struct perf_event *event,
1293 struct perf_event_context *ctx)
1294{
1295 mutex_unlock(&ctx->mutex);
1296 put_ctx(ctx);
1297}
1298
/*
 * Disassociate the context from its parent (clone) context; must be called
 * with ctx->lock held.
 */
1304static __must_check struct perf_event_context *
1305unclone_ctx(struct perf_event_context *ctx)
1306{
1307 struct perf_event_context *parent_ctx = ctx->parent_ctx;
1308
1309 lockdep_assert_held(&ctx->lock);
1310
1311 if (parent_ctx)
1312 ctx->parent_ctx = NULL;
1313 ctx->generation++;
1314
1315 return parent_ctx;
1316}
1317
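/*
 * Resolve the pid or tid of @p in the pid namespace the (top-level) event was
 * created in; returns 0 when @p is not visible in that namespace and (u32)-1
 * when @p has already exited.
 */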
1318static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1319 enum pid_type type)
1320{
1321 u32 nr;
1322
1323
1324
1325 if (event->parent)
1326 event = event->parent;
1327
1328 nr = __task_pid_nr_ns(p, type, event->ns);
1329
1330 if (!nr && !pid_alive(p))
1331 nr = -1;
1332 return nr;
1333}
1334
1335static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1336{
1337 return perf_event_pid_type(event, p, __PIDTYPE_TGID);
1338}
1339
1340static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1341{
1342 return perf_event_pid_type(event, p, PIDTYPE_PID);
1343}
1344
/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
1349static u64 primary_event_id(struct perf_event *event)
1350{
1351 u64 id = event->id;
1352
1353 if (event->parent)
1354 id = event->parent->id;
1355
1356 return id;
1357}
1358
/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked, the context could
 * get moved to another task.
 */
1365static struct perf_event_context *
1366perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1367{
1368 struct perf_event_context *ctx;
1369
1370retry:
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380 local_irq_save(*flags);
1381 rcu_read_lock();
1382 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1383 if (ctx) {
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394 raw_spin_lock(&ctx->lock);
1395 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1396 raw_spin_unlock(&ctx->lock);
1397 rcu_read_unlock();
1398 local_irq_restore(*flags);
1399 goto retry;
1400 }
1401
1402 if (ctx->task == TASK_TOMBSTONE ||
1403 !atomic_inc_not_zero(&ctx->refcount)) {
1404 raw_spin_unlock(&ctx->lock);
1405 ctx = NULL;
1406 } else {
1407 WARN_ON_ONCE(ctx->task != task);
1408 }
1409 }
1410 rcu_read_unlock();
1411 if (!ctx)
1412 local_irq_restore(*flags);
1413 return ctx;
1414}
1415
/*
 * Get the context for a task and increment its pin_count so it cannot get
 * swapped to another task while the caller works with it.
 */
1421static struct perf_event_context *
1422perf_pin_task_context(struct task_struct *task, int ctxn)
1423{
1424 struct perf_event_context *ctx;
1425 unsigned long flags;
1426
1427 ctx = perf_lock_task_context(task, ctxn, &flags);
1428 if (ctx) {
1429 ++ctx->pin_count;
1430 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1431 }
1432 return ctx;
1433}
1434
1435static void perf_unpin_context(struct perf_event_context *ctx)
1436{
1437 unsigned long flags;
1438
1439 raw_spin_lock_irqsave(&ctx->lock, flags);
1440 --ctx->pin_count;
1441 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1442}
1443
/*
 * Update the record of the current time in a context.
 */
1447static void update_context_time(struct perf_event_context *ctx)
1448{
1449 u64 now = perf_clock();
1450
1451 ctx->time += now - ctx->timestamp;
1452 ctx->timestamp = now;
1453}
1454
1455static u64 perf_event_time(struct perf_event *event)
1456{
1457 struct perf_event_context *ctx = event->ctx;
1458
1459 if (is_cgroup_event(event))
1460 return perf_cgroup_event_time(event);
1461
1462 return ctx ? ctx->time : 0;
1463}
1464
1465static enum event_type_t get_event_type(struct perf_event *event)
1466{
1467 struct perf_event_context *ctx = event->ctx;
1468 enum event_type_t event_type;
1469
1470 lockdep_assert_held(&ctx->lock);
1471
1472
1473
1474
1475
1476 if (event->group_leader != event)
1477 event = event->group_leader;
1478
1479 event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1480 if (!ctx->task)
1481 event_type |= EVENT_CPU;
1482
1483 return event_type;
1484}
1485
1486
1487
1488
1489static void init_event_group(struct perf_event *event)
1490{
1491 RB_CLEAR_NODE(&event->group_node);
1492 event->group_index = 0;
1493}
1494
1495
1496
1497
1498
1499static struct perf_event_groups *
1500get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1501{
1502 if (event->attr.pinned)
1503 return &ctx->pinned_groups;
1504 else
1505 return &ctx->flexible_groups;
1506}
1507
1508
1509
1510
1511static void perf_event_groups_init(struct perf_event_groups *groups)
1512{
1513 groups->tree = RB_ROOT;
1514 groups->index = 0;
1515}
1516
/*
 * Compare two events for the groups rbtree: order by CPU first, then by
 * insertion order (group_index) within the same CPU.
 */
1523static bool
1524perf_event_groups_less(struct perf_event *left, struct perf_event *right)
1525{
1526 if (left->cpu < right->cpu)
1527 return true;
1528 if (left->cpu > right->cpu)
1529 return false;
1530
1531 if (left->group_index < right->group_index)
1532 return true;
1533 if (left->group_index > right->group_index)
1534 return false;
1535
1536 return false;
1537}
1538
1539
1540
1541
1542
1543
1544static void
1545perf_event_groups_insert(struct perf_event_groups *groups,
1546 struct perf_event *event)
1547{
1548 struct perf_event *node_event;
1549 struct rb_node *parent;
1550 struct rb_node **node;
1551
1552 event->group_index = ++groups->index;
1553
1554 node = &groups->tree.rb_node;
1555 parent = *node;
1556
1557 while (*node) {
1558 parent = *node;
1559 node_event = container_of(*node, struct perf_event, group_node);
1560
1561 if (perf_event_groups_less(event, node_event))
1562 node = &parent->rb_left;
1563 else
1564 node = &parent->rb_right;
1565 }
1566
1567 rb_link_node(&event->group_node, parent, node);
1568 rb_insert_color(&event->group_node, &groups->tree);
1569}
1570
1571
1572
1573
1574static void
1575add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1576{
1577 struct perf_event_groups *groups;
1578
1579 groups = get_event_groups(event, ctx);
1580 perf_event_groups_insert(groups, event);
1581}
1582
1583
1584
1585
1586static void
1587perf_event_groups_delete(struct perf_event_groups *groups,
1588 struct perf_event *event)
1589{
1590 WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1591 RB_EMPTY_ROOT(&groups->tree));
1592
1593 rb_erase(&event->group_node, &groups->tree);
1594 init_event_group(event);
1595}
1596
1597
1598
1599
1600static void
1601del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1602{
1603 struct perf_event_groups *groups;
1604
1605 groups = get_event_groups(event, ctx);
1606 perf_event_groups_delete(groups, event);
1607}
1608
1609
1610
1611
1612static struct perf_event *
1613perf_event_groups_first(struct perf_event_groups *groups, int cpu)
1614{
1615 struct perf_event *node_event = NULL, *match = NULL;
1616 struct rb_node *node = groups->tree.rb_node;
1617
1618 while (node) {
1619 node_event = container_of(node, struct perf_event, group_node);
1620
1621 if (cpu < node_event->cpu) {
1622 node = node->rb_left;
1623 } else if (cpu > node_event->cpu) {
1624 node = node->rb_right;
1625 } else {
1626 match = node_event;
1627 node = node->rb_left;
1628 }
1629 }
1630
1631 return match;
1632}
1633
1634
1635
1636
1637static struct perf_event *
1638perf_event_groups_next(struct perf_event *event)
1639{
1640 struct perf_event *next;
1641
1642 next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1643 if (next && next->cpu == event->cpu)
1644 return next;
1645
1646 return NULL;
1647}
1648
1649
1650
1651
1652#define perf_event_groups_for_each(event, groups) \
1653 for (event = rb_entry_safe(rb_first(&((groups)->tree)), \
1654 typeof(*event), group_node); event; \
1655 event = rb_entry_safe(rb_next(&event->group_node), \
1656 typeof(*event), group_node))
1657
/*
 * Add an event to the context's event list and, if it is a group leader, to
 * the pinned/flexible groups rbtree.  Must be called with ctx->lock held.
 */
1662static void
1663list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1664{
1665 lockdep_assert_held(&ctx->lock);
1666
1667 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1668 event->attach_state |= PERF_ATTACH_CONTEXT;
1669
1670 event->tstamp = perf_event_time(event);
1671
1672
1673
1674
1675
1676
1677 if (event->group_leader == event) {
1678 event->group_caps = event->event_caps;
1679 add_event_to_groups(event, ctx);
1680 }
1681
1682 list_update_cgroup_event(event, ctx, true);
1683
1684 list_add_rcu(&event->event_entry, &ctx->event_list);
1685 ctx->nr_events++;
1686 if (event->attr.inherit_stat)
1687 ctx->nr_stat++;
1688
1689 ctx->generation++;
1690}
1691
1692
1693
1694
1695static inline void perf_event__state_init(struct perf_event *event)
1696{
1697 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1698 PERF_EVENT_STATE_INACTIVE;
1699}
1700
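/*
 * Pre-compute the size of the buffer a read() on this event will return: one
 * u64 per counter value (plus one per value for PERF_FORMAT_ID), optional
 * enabled/running times, and when PERF_FORMAT_GROUP is set the whole group of
 * @nr_siblings + 1 events plus a leading count word.
 */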
1701static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1702{
1703 int entry = sizeof(u64);
1704 int size = 0;
1705 int nr = 1;
1706
1707 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1708 size += sizeof(u64);
1709
1710 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1711 size += sizeof(u64);
1712
1713 if (event->attr.read_format & PERF_FORMAT_ID)
1714 entry += sizeof(u64);
1715
1716 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1717 nr += nr_siblings;
1718 size += sizeof(u64);
1719 }
1720
1721 size += entry * nr;
1722 event->read_size = size;
1723}
1724
1725static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1726{
1727 struct perf_sample_data *data;
1728 u16 size = 0;
1729
1730 if (sample_type & PERF_SAMPLE_IP)
1731 size += sizeof(data->ip);
1732
1733 if (sample_type & PERF_SAMPLE_ADDR)
1734 size += sizeof(data->addr);
1735
1736 if (sample_type & PERF_SAMPLE_PERIOD)
1737 size += sizeof(data->period);
1738
1739 if (sample_type & PERF_SAMPLE_WEIGHT)
1740 size += sizeof(data->weight);
1741
1742 if (sample_type & PERF_SAMPLE_READ)
1743 size += event->read_size;
1744
1745 if (sample_type & PERF_SAMPLE_DATA_SRC)
1746 size += sizeof(data->data_src.val);
1747
1748 if (sample_type & PERF_SAMPLE_TRANSACTION)
1749 size += sizeof(data->txn);
1750
1751 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1752 size += sizeof(data->phys_addr);
1753
1754 event->header_size = size;
1755}
1756
1757
1758
1759
1760
1761static void perf_event__header_size(struct perf_event *event)
1762{
1763 __perf_event_read_size(event,
1764 event->group_leader->nr_siblings);
1765 __perf_event_header_size(event, event->attr.sample_type);
1766}
1767
1768static void perf_event__id_header_size(struct perf_event *event)
1769{
1770 struct perf_sample_data *data;
1771 u64 sample_type = event->attr.sample_type;
1772 u16 size = 0;
1773
1774 if (sample_type & PERF_SAMPLE_TID)
1775 size += sizeof(data->tid_entry);
1776
1777 if (sample_type & PERF_SAMPLE_TIME)
1778 size += sizeof(data->time);
1779
1780 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1781 size += sizeof(data->id);
1782
1783 if (sample_type & PERF_SAMPLE_ID)
1784 size += sizeof(data->id);
1785
1786 if (sample_type & PERF_SAMPLE_STREAM_ID)
1787 size += sizeof(data->stream_id);
1788
1789 if (sample_type & PERF_SAMPLE_CPU)
1790 size += sizeof(data->cpu_entry);
1791
1792 event->id_header_size = size;
1793}
1794
1795static bool perf_event_validate_size(struct perf_event *event)
1796{
1797
1798
1799
1800
1801 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1802 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1803 perf_event__id_header_size(event);
1804
1805
1806
1807
1808
1809 if (event->read_size + event->header_size +
1810 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1811 return false;
1812
1813 return true;
1814}
1815
1816static void perf_group_attach(struct perf_event *event)
1817{
1818 struct perf_event *group_leader = event->group_leader, *pos;
1819
1820 lockdep_assert_held(&event->ctx->lock);
1821
1822
1823
1824
1825 if (event->attach_state & PERF_ATTACH_GROUP)
1826 return;
1827
1828 event->attach_state |= PERF_ATTACH_GROUP;
1829
1830 if (group_leader == event)
1831 return;
1832
1833 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1834
1835 group_leader->group_caps &= event->event_caps;
1836
1837 list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1838 group_leader->nr_siblings++;
1839
1840 perf_event__header_size(group_leader);
1841
1842 for_each_sibling_event(pos, group_leader)
1843 perf_event__header_size(pos);
1844}
1845
1846
1847
1848
1849
1850static void
1851list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1852{
1853 WARN_ON_ONCE(event->ctx != ctx);
1854 lockdep_assert_held(&ctx->lock);
1855
1856
1857
1858
1859 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1860 return;
1861
1862 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1863
1864 list_update_cgroup_event(event, ctx, false);
1865
1866 ctx->nr_events--;
1867 if (event->attr.inherit_stat)
1868 ctx->nr_stat--;
1869
1870 list_del_rcu(&event->event_entry);
1871
1872 if (event->group_leader == event)
1873 del_event_from_groups(event, ctx);
1874
	/*
	 * If the event was in error state, keep it that way; otherwise bogus
	 * counts would be returned on read().  The only way out of error
	 * state is an explicit re-enable of the event.
	 */
1882 if (event->state > PERF_EVENT_STATE_OFF)
1883 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
1884
1885 ctx->generation++;
1886}
1887
1888static void perf_group_detach(struct perf_event *event)
1889{
1890 struct perf_event *sibling, *tmp;
1891 struct perf_event_context *ctx = event->ctx;
1892
1893 lockdep_assert_held(&ctx->lock);
1894
1895
1896
1897
1898 if (!(event->attach_state & PERF_ATTACH_GROUP))
1899 return;
1900
1901 event->attach_state &= ~PERF_ATTACH_GROUP;
1902
1903
1904
1905
1906 if (event->group_leader != event) {
1907 list_del_init(&event->sibling_list);
1908 event->group_leader->nr_siblings--;
1909 goto out;
1910 }
1911
1912
1913
1914
1915
1916
1917 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
1918
1919 sibling->group_leader = sibling;
1920 list_del_init(&sibling->sibling_list);
1921
1922
1923 sibling->group_caps = event->group_caps;
1924
1925 if (!RB_EMPTY_NODE(&event->group_node)) {
1926 add_event_to_groups(sibling, event->ctx);
1927
1928 if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
1929 struct list_head *list = sibling->attr.pinned ?
1930 &ctx->pinned_active : &ctx->flexible_active;
1931
1932 list_add_tail(&sibling->active_list, list);
1933 }
1934 }
1935
1936 WARN_ON_ONCE(sibling->ctx != event->ctx);
1937 }
1938
1939out:
1940 perf_event__header_size(event->group_leader);
1941
1942 for_each_sibling_event(tmp, event->group_leader)
1943 perf_event__header_size(tmp);
1944}
1945
1946static bool is_orphaned_event(struct perf_event *event)
1947{
1948 return event->state == PERF_EVENT_STATE_DEAD;
1949}
1950
1951static inline int __pmu_filter_match(struct perf_event *event)
1952{
1953 struct pmu *pmu = event->pmu;
1954 return pmu->filter_match ? pmu->filter_match(event) : 1;
1955}
1956
/*
 * Check whether we should attempt to schedule an event group based on
 * PMU-specific filtering.  A group can mix hardware and software events,
 * potentially with a software leader, so every member has to pass its PMU's
 * filter for the group to be schedulable:
 */
1963static inline int pmu_filter_match(struct perf_event *event)
1964{
1965 struct perf_event *sibling;
1966
1967 if (!__pmu_filter_match(event))
1968 return 0;
1969
1970 for_each_sibling_event(sibling, event) {
1971 if (!__pmu_filter_match(sibling))
1972 return 0;
1973 }
1974
1975 return 1;
1976}
1977
1978static inline int
1979event_filter_match(struct perf_event *event)
1980{
1981 return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1982 perf_cgroup_match(event) && pmu_filter_match(event);
1983}
1984
1985static void
1986event_sched_out(struct perf_event *event,
1987 struct perf_cpu_context *cpuctx,
1988 struct perf_event_context *ctx)
1989{
1990 enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
1991
1992 WARN_ON_ONCE(event->ctx != ctx);
1993 lockdep_assert_held(&ctx->lock);
1994
1995 if (event->state != PERF_EVENT_STATE_ACTIVE)
1996 return;
1997
	/*
	 * Asymmetry: we only schedule events _IN_ through ctx_sched_in(), but
	 * we can schedule them _OUT_ individually, e.g. through
	 * __perf_remove_from_context(); hence the active_list removal here.
	 */
2003 list_del_init(&event->active_list);
2004
2005 perf_pmu_disable(event->pmu);
2006
2007 event->pmu->del(event, 0);
2008 event->oncpu = -1;
2009
2010 if (event->pending_disable) {
2011 event->pending_disable = 0;
2012 state = PERF_EVENT_STATE_OFF;
2013 }
2014 perf_event_set_state(event, state);
2015
2016 if (!is_software_event(event))
2017 cpuctx->active_oncpu--;
2018 if (!--ctx->nr_active)
2019 perf_event_ctx_deactivate(ctx);
2020 if (event->attr.freq && event->attr.sample_freq)
2021 ctx->nr_freq--;
2022 if (event->attr.exclusive || !cpuctx->active_oncpu)
2023 cpuctx->exclusive = 0;
2024
2025 perf_pmu_enable(event->pmu);
2026}
2027
2028static void
2029group_sched_out(struct perf_event *group_event,
2030 struct perf_cpu_context *cpuctx,
2031 struct perf_event_context *ctx)
2032{
2033 struct perf_event *event;
2034
2035 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2036 return;
2037
2038 perf_pmu_disable(ctx->pmu);
2039
2040 event_sched_out(group_event, cpuctx, ctx);
2041
2042
2043
2044
2045 for_each_sibling_event(event, group_event)
2046 event_sched_out(event, cpuctx, ctx);
2047
2048 perf_pmu_enable(ctx->pmu);
2049
2050 if (group_event->attr.exclusive)
2051 cpuctx->exclusive = 0;
2052}
2053
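/* Flag for perf_remove_from_context(): also detach the event from its group */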
2054#define DETACH_GROUP 0x01UL
2055
/*
 * Cross CPU call to remove a performance event.
 *
 * We disable the event on the hardware level first, then remove it from the
 * context list.
 */
2062static void
2063__perf_remove_from_context(struct perf_event *event,
2064 struct perf_cpu_context *cpuctx,
2065 struct perf_event_context *ctx,
2066 void *info)
2067{
2068 unsigned long flags = (unsigned long)info;
2069
2070 if (ctx->is_active & EVENT_TIME) {
2071 update_context_time(ctx);
2072 update_cgrp_time_from_cpuctx(cpuctx);
2073 }
2074
2075 event_sched_out(event, cpuctx, ctx);
2076 if (flags & DETACH_GROUP)
2077 perf_group_detach(event);
2078 list_del_event(event, ctx);
2079
2080 if (!ctx->nr_events && ctx->is_active) {
2081 ctx->is_active = 0;
2082 if (ctx->task) {
2083 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2084 cpuctx->task_ctx = NULL;
2085 }
2086 }
2087}
2088
/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that every task
 * struct that event->ctx->task could possibly point to remains valid for the
 * duration of the call.
 */
2099static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2100{
2101 struct perf_event_context *ctx = event->ctx;
2102
2103 lockdep_assert_held(&ctx->mutex);
2104
2105 event_function_call(event, __perf_remove_from_context, (void *)flags);
	/*
	 * The above event_function_call() can no-op when it hits
	 * TASK_TOMBSTONE.  In that case we were already detached from the
	 * context (by the task exit path) but the grouping might still be
	 * intact; finish the DETACH_GROUP work here if so.
	 */
2113 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
2114 if ((flags & DETACH_GROUP) &&
2115 (event->attach_state & PERF_ATTACH_GROUP)) {
2116
2117
2118
2119
2120 raw_spin_lock_irq(&ctx->lock);
2121 perf_group_detach(event);
2122 raw_spin_unlock_irq(&ctx->lock);
2123 }
2124}
2125
/*
 * Cross CPU call to disable a performance event.
 */
2129static void __perf_event_disable(struct perf_event *event,
2130 struct perf_cpu_context *cpuctx,
2131 struct perf_event_context *ctx,
2132 void *info)
2133{
2134 if (event->state < PERF_EVENT_STATE_INACTIVE)
2135 return;
2136
2137 if (ctx->is_active & EVENT_TIME) {
2138 update_context_time(ctx);
2139 update_cgrp_time_from_event(event);
2140 }
2141
2142 if (event == event->group_leader)
2143 group_sched_out(event, cpuctx, ctx);
2144 else
2145 event_sched_out(event, cpuctx, ctx);
2146
2147 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2148}
2149
/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that every task
 * struct that event->ctx->task could possibly point to remains valid for the
 * duration of the call; holding the top-level event's child_mutex suffices,
 * since any child that starts to exit will then block in
 * perf_event_exit_event().
 */
2164static void _perf_event_disable(struct perf_event *event)
2165{
2166 struct perf_event_context *ctx = event->ctx;
2167
2168 raw_spin_lock_irq(&ctx->lock);
2169 if (event->state <= PERF_EVENT_STATE_OFF) {
2170 raw_spin_unlock_irq(&ctx->lock);
2171 return;
2172 }
2173 raw_spin_unlock_irq(&ctx->lock);
2174
2175 event_function_call(event, __perf_event_disable, NULL);
2176}
2177
2178void perf_event_disable_local(struct perf_event *event)
2179{
2180 event_function_local(event, __perf_event_disable, NULL);
2181}
2182
2183
2184
2185
2186
2187void perf_event_disable(struct perf_event *event)
2188{
2189 struct perf_event_context *ctx;
2190
2191 ctx = perf_event_ctx_lock(event);
2192 _perf_event_disable(event);
2193 perf_event_ctx_unlock(event, ctx);
2194}
2195EXPORT_SYMBOL_GPL(perf_event_disable);
2196
2197void perf_event_disable_inatomic(struct perf_event *event)
2198{
2199 event->pending_disable = 1;
2200 irq_work_queue(&event->pending);
2201}
2202
2203static void perf_set_shadow_time(struct perf_event *event,
2204 struct perf_event_context *ctx)
2205{
	/*
	 * event->shadow_ctx_time lets us later compute the event's enabled
	 * time from contexts (e.g. NMI) where ctx->lock cannot be taken and
	 * thus ctx->time cannot be updated: it records, at schedule-in time,
	 * the offset of the event's time base relative to the raw clock.
	 */
2231 if (is_cgroup_event(event))
2232 perf_cgroup_set_shadow_time(event, event->tstamp);
2233 else
2234 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2235}
2236
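/* hw.interrupts is set to MAX_INTERRUPTS while an event is throttled */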
2237#define MAX_INTERRUPTS (~0ULL)
2238
2239static void perf_log_throttle(struct perf_event *event, int enable);
2240static void perf_log_itrace_start(struct perf_event *event);
2241
2242static int
2243event_sched_in(struct perf_event *event,
2244 struct perf_cpu_context *cpuctx,
2245 struct perf_event_context *ctx)
2246{
2247 int ret = 0;
2248
2249 lockdep_assert_held(&ctx->lock);
2250
2251 if (event->state <= PERF_EVENT_STATE_OFF)
2252 return 0;
2253
2254 WRITE_ONCE(event->oncpu, smp_processor_id());
2255
	/*
	 * Order the event::oncpu write to happen before the ACTIVE state is
	 * visible, so that perf_event_stop()/perf_event_read() observe the
	 * correct ->oncpu when they see ACTIVE.
	 */
2260 smp_wmb();
2261 perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2262
	/*
	 * Unthrottle the event: having been scheduled out, it may have missed
	 * several unthrottling ticks.
	 */
2268 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2269 perf_log_throttle(event, 1);
2270 event->hw.interrupts = 0;
2271 }
2272
2273 perf_pmu_disable(event->pmu);
2274
2275 perf_set_shadow_time(event, ctx);
2276
2277 perf_log_itrace_start(event);
2278
2279 if (event->pmu->add(event, PERF_EF_START)) {
2280 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2281 event->oncpu = -1;
2282 ret = -EAGAIN;
2283 goto out;
2284 }
2285
2286 if (!is_software_event(event))
2287 cpuctx->active_oncpu++;
2288 if (!ctx->nr_active++)
2289 perf_event_ctx_activate(ctx);
2290 if (event->attr.freq && event->attr.sample_freq)
2291 ctx->nr_freq++;
2292
2293 if (event->attr.exclusive)
2294 cpuctx->exclusive = 1;
2295
2296out:
2297 perf_pmu_enable(event->pmu);
2298
2299 return ret;
2300}
2301
2302static int
2303group_sched_in(struct perf_event *group_event,
2304 struct perf_cpu_context *cpuctx,
2305 struct perf_event_context *ctx)
2306{
2307 struct perf_event *event, *partial_group = NULL;
2308 struct pmu *pmu = ctx->pmu;
2309
2310 if (group_event->state == PERF_EVENT_STATE_OFF)
2311 return 0;
2312
2313 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2314
2315 if (event_sched_in(group_event, cpuctx, ctx)) {
2316 pmu->cancel_txn(pmu);
2317 perf_mux_hrtimer_restart(cpuctx);
2318 return -EAGAIN;
2319 }
2320
2321
2322
2323
2324 for_each_sibling_event(event, group_event) {
2325 if (event_sched_in(event, cpuctx, ctx)) {
2326 partial_group = event;
2327 goto group_error;
2328 }
2329 }
2330
2331 if (!pmu->commit_txn(pmu))
2332 return 0;
2333
2334group_error:
2335
2336
2337
2338
2339
2340 for_each_sibling_event(event, group_event) {
2341 if (event == partial_group)
2342 break;
2343
2344 event_sched_out(event, cpuctx, ctx);
2345 }
2346 event_sched_out(group_event, cpuctx, ctx);
2347
2348 pmu->cancel_txn(pmu);
2349
2350 perf_mux_hrtimer_restart(cpuctx);
2351
2352 return -EAGAIN;
2353}
2354
2355
2356
2357
2358static int group_can_go_on(struct perf_event *event,
2359 struct perf_cpu_context *cpuctx,
2360 int can_add_hw)
2361{
	/*
	 * Groups consisting entirely of software events can always go on.
	 */
	if (event->group_caps & PERF_EV_CAP_SOFTWARE)
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * groups can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * events on the CPU, it can't go on.
	 */
	if (event->attr.exclusive && cpuctx->active_oncpu)
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
2384}
2385
2386static void add_event_to_ctx(struct perf_event *event,
2387 struct perf_event_context *ctx)
2388{
2389 list_add_event(event, ctx);
2390 perf_group_attach(event);
2391}
2392
2393static void ctx_sched_out(struct perf_event_context *ctx,
2394 struct perf_cpu_context *cpuctx,
2395 enum event_type_t event_type);
2396static void
2397ctx_sched_in(struct perf_event_context *ctx,
2398 struct perf_cpu_context *cpuctx,
2399 enum event_type_t event_type,
2400 struct task_struct *task);
2401
2402static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2403 struct perf_event_context *ctx,
2404 enum event_type_t event_type)
2405{
2406 if (!cpuctx->task_ctx)
2407 return;
2408
2409 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2410 return;
2411
2412 ctx_sched_out(ctx, cpuctx, event_type);
2413}
2414
2415static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2416 struct perf_event_context *ctx,
2417 struct task_struct *task)
2418{
2419 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2420 if (ctx)
2421 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2422 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2423 if (ctx)
2424 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2425}
2426
/*
 * We want to keep the following priority order when scheduling:
 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
 *  - task pinned (EVENT_PINNED)
 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
 *  - task flexible (EVENT_FLEXIBLE)
 *
 * To avoid unscheduling and rescheduling everything each time an event is
 * added, only groups of equal or lower priority are rescheduled.  For task
 * events @event_type may be a mask of event types; for CPU events it is
 * either EVENT_PINNED or EVENT_FLEXIBLE.
 */
2442static void ctx_resched(struct perf_cpu_context *cpuctx,
2443 struct perf_event_context *task_ctx,
2444 enum event_type_t event_type)
2445{
2446 enum event_type_t ctx_event_type;
2447 bool cpu_event = !!(event_type & EVENT_CPU);
2448
	/*
	 * If pinned groups are involved, the flexible groups have to be
	 * rescheduled as well to keep the priority order.
	 */
2453 if (event_type & EVENT_PINNED)
2454 event_type |= EVENT_FLEXIBLE;
2455
2456 ctx_event_type = event_type & EVENT_ALL;
2457
2458 perf_pmu_disable(cpuctx->ctx.pmu);
2459 if (task_ctx)
2460 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2461
	/*
	 * Decide which CPU context groups to schedule out based on the types
	 * of events that triggered the reschedule:
	 *  - EVENT_CPU: schedule out the corresponding CPU groups;
	 *  - EVENT_PINNED task events: schedule out the CPU's flexible groups;
	 *  - otherwise, leave the CPU context alone.
	 */
2469 if (cpu_event)
2470 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2471 else if (ctx_event_type & EVENT_PINNED)
2472 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2473
2474 perf_event_sched_in(cpuctx, task_ctx, current);
2475 perf_pmu_enable(cpuctx->ctx.pmu);
2476}
2477
/*
 * Cross CPU call to install and enable a performance event.
 *
 * Very similar to remote_function() + event_function(), but cannot assume
 * that things like ctx->is_active and cpuctx->task_ctx are set up yet.
 */
2484static int __perf_install_in_context(void *info)
2485{
2486 struct perf_event *event = info;
2487 struct perf_event_context *ctx = event->ctx;
2488 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2489 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2490 bool reprogram = true;
2491 int ret = 0;
2492
2493 raw_spin_lock(&cpuctx->ctx.lock);
2494 if (ctx->task) {
2495 raw_spin_lock(&ctx->lock);
2496 task_ctx = ctx;
2497
2498 reprogram = (ctx->task == current);
2499
		/*
		 * If the task is running, it must be running on this CPU,
		 * otherwise we cannot reprogram things.
		 *
		 * If it's not running we don't care; ctx->lock serializes
		 * against it becoming runnable.
		 */
2507 if (task_curr(ctx->task) && !reprogram) {
2508 ret = -ESRCH;
2509 goto unlock;
2510 }
2511
2512 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2513 } else if (task_ctx) {
2514 raw_spin_lock(&task_ctx->lock);
2515 }
2516
2517#ifdef CONFIG_CGROUP_PERF
2518 if (is_cgroup_event(event)) {
		/*
		 * If the current cgroup doesn't match the event's cgroup,
		 * don't try to schedule the event now; just add it to the
		 * context.
		 */
2523 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2524 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2525 event->cgrp->css.cgroup);
2526 }
2527#endif
2528
2529 if (reprogram) {
2530 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2531 add_event_to_ctx(event, ctx);
2532 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2533 } else {
2534 add_event_to_ctx(event, ctx);
2535 }
2536
2537unlock:
2538 perf_ctx_unlock(cpuctx, task_ctx);
2539
2540 return ret;
2541}
2542
/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call(); see the comments there.
 */
2548static void
2549perf_install_in_context(struct perf_event_context *ctx,
2550 struct perf_event *event,
2551 int cpu)
2552{
2553 struct task_struct *task = READ_ONCE(ctx->task);
2554
2555 lockdep_assert_held(&ctx->mutex);
2556
2557 if (event->cpu != -1)
2558 event->cpu = cpu;
2559
2560
2561
2562
2563
2564 smp_store_release(&event->ctx, ctx);
2565
2566 if (!task) {
2567 cpu_function_call(cpu, __perf_install_in_context, event);
2568 return;
2569 }
2570
2571
2572
2573
2574 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2575 return;
2576
	/*
	 * Installing events is tricky because we cannot rely on ctx->is_active
	 * to tell whether the target task is current, and the IPI can race
	 * with the context being scheduled in or out.  So: try the IPI first;
	 * if it did not install the event, take ctx->lock and either install
	 * directly (task not running) or retry the IPI (task running again).
	 * The smp_mb() below pairs with the ordering in the context switch
	 * path so that at least one of the two sides observes the new state.
	 */
2607 smp_mb();
2608again:
2609 if (!task_function_call(task, __perf_install_in_context, event))
2610 return;
2611
2612 raw_spin_lock_irq(&ctx->lock);
2613 task = ctx->task;
2614 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2615
2616
2617
2618
2619
2620 raw_spin_unlock_irq(&ctx->lock);
2621 return;
2622 }
2623
2624
2625
2626
2627 if (task_curr(task)) {
2628 raw_spin_unlock_irq(&ctx->lock);
2629 goto again;
2630 }
2631 add_event_to_ctx(event, ctx);
2632 raw_spin_unlock_irq(&ctx->lock);
2633}
2634
/*
 * Cross CPU call to enable a performance event.
 */
2638static void __perf_event_enable(struct perf_event *event,
2639 struct perf_cpu_context *cpuctx,
2640 struct perf_event_context *ctx,
2641 void *info)
2642{
2643 struct perf_event *leader = event->group_leader;
2644 struct perf_event_context *task_ctx;
2645
2646 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2647 event->state <= PERF_EVENT_STATE_ERROR)
2648 return;
2649
2650 if (ctx->is_active)
2651 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2652
2653 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2654
2655 if (!ctx->is_active)
2656 return;
2657
2658 if (!event_filter_match(event)) {
2659 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2660 return;
2661 }
2662
2663
2664
2665
2666
2667 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2668 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2669 return;
2670 }
2671
2672 task_ctx = cpuctx->task_ctx;
2673 if (ctx->task)
2674 WARN_ON_ONCE(task_ctx != ctx);
2675
2676 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2677}
2678
/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that every task
 * struct that event->ctx->task could possibly point to remains valid for the
 * duration of the call.
 */
2688static void _perf_event_enable(struct perf_event *event)
2689{
2690 struct perf_event_context *ctx = event->ctx;
2691
2692 raw_spin_lock_irq(&ctx->lock);
2693 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2694 event->state < PERF_EVENT_STATE_ERROR) {
2695 raw_spin_unlock_irq(&ctx->lock);
2696 return;
2697 }
2698
	/*
	 * If the event is in error state, clear that first.
	 *
	 * That way, if we see the event in error state below, we know that it
	 * has gone back into error state, as distinct from the task having
	 * been scheduled away before the cross-call arrived.
	 */
2706 if (event->state == PERF_EVENT_STATE_ERROR)
2707 event->state = PERF_EVENT_STATE_OFF;
2708 raw_spin_unlock_irq(&ctx->lock);
2709
2710 event_function_call(event, __perf_event_enable, NULL);
2711}
2712
2713
2714
2715
2716void perf_event_enable(struct perf_event *event)
2717{
2718 struct perf_event_context *ctx;
2719
2720 ctx = perf_event_ctx_lock(event);
2721 _perf_event_enable(event);
2722 perf_event_ctx_unlock(event, ctx);
2723}
2724EXPORT_SYMBOL_GPL(perf_event_enable);
2725
2726struct stop_event_data {
2727 struct perf_event *event;
2728 unsigned int restart;
2729};
2730
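/*
 * Stop (and optionally restart) an event on the CPU it is currently active
 * on; returns -EAGAIN when the event has since moved to another CPU, so that
 * perf_event_stop() can retry there.
 */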
2731static int __perf_event_stop(void *info)
2732{
2733 struct stop_event_data *sd = info;
2734 struct perf_event *event = sd->event;
2735
2736
2737 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2738 return 0;

	/* Pairs with the smp_wmb() in event_sched_in(). */
	smp_rmb();

	/*
	 * There is a window with interrupts enabled before we get here, so we
	 * must re-check lest we try to stop another CPU's event.
	 */
	if (READ_ONCE(event->oncpu) != smp_processor_id())
		return -EAGAIN;
2749
2750 event->pmu->stop(event, PERF_EF_UPDATE);
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761 if (sd->restart)
2762 event->pmu->start(event, 0);
2763
2764 return 0;
2765}
2766
2767static int perf_event_stop(struct perf_event *event, int restart)
2768{
2769 struct stop_event_data sd = {
2770 .event = event,
2771 .restart = restart,
2772 };
2773 int ret = 0;
2774
2775 do {
2776 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2777 return 0;
2778
2779
2780 smp_rmb();
2781
2782
2783
2784
2785
2786
2787 ret = cpu_function_call(READ_ONCE(event->oncpu),
2788 __perf_event_stop, &sd);
2789 } while (ret == -EAGAIN);
2790
2791 return ret;
2792}
2793
/*
 * Address filter handling is split in two parts:
 *
 *  - when the relevant userspace mappings change, the filter offsets
 *    cached in the event are recomputed and event::addr_filters_gen is
 *    bumped;
 *  - when the event is (re)scheduled, perf_event_addr_filters_sync()
 *    below notices that the generation changed and has the PMU
 *    reprogram the hardware filters.
 */
2816void perf_event_addr_filters_sync(struct perf_event *event)
2817{
2818 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2819
2820 if (!has_addr_filter(event))
2821 return;
2822
2823 raw_spin_lock(&ifh->lock);
2824 if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2825 event->pmu->addr_filters_sync(event);
2826 event->hw.addr_filters_gen = event->addr_filters_gen;
2827 }
2828 raw_spin_unlock(&ifh->lock);
2829}
2830EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2831
2832static int _perf_event_refresh(struct perf_event *event, int refresh)
2833{
/*
 * Refresh only makes sense for sampling events that are not inherited.
 */
2837 if (event->attr.inherit || !is_sampling_event(event))
2838 return -EINVAL;
2839
2840 atomic_add(refresh, &event->event_limit);
2841 _perf_event_enable(event);
2842
2843 return 0;
2844}
2845
2846
2847
2848
2849int perf_event_refresh(struct perf_event *event, int refresh)
2850{
2851 struct perf_event_context *ctx;
2852 int ret;
2853
2854 ctx = perf_event_ctx_lock(event);
2855 ret = _perf_event_refresh(event, refresh);
2856 perf_event_ctx_unlock(event, ctx);
2857
2858 return ret;
2859}
2860EXPORT_SYMBOL_GPL(perf_event_refresh);
2861
2862static int perf_event_modify_breakpoint(struct perf_event *bp,
2863 struct perf_event_attr *attr)
2864{
2865 int err;
2866
2867 _perf_event_disable(bp);
2868
2869 err = modify_user_hw_breakpoint_check(bp, attr, true);
2870 if (err) {
2871 if (!bp->attr.disabled)
2872 _perf_event_enable(bp);
2873
2874 return err;
2875 }
2876
2877 if (!attr->disabled)
2878 _perf_event_enable(bp);
2879 return 0;
2880}
2881
2882static int perf_event_modify_attr(struct perf_event *event,
2883 struct perf_event_attr *attr)
2884{
2885 if (event->attr.type != attr->type)
2886 return -EINVAL;
2887
2888 switch (event->attr.type) {
2889 case PERF_TYPE_BREAKPOINT:
2890 return perf_event_modify_breakpoint(event, attr);
2891 default:
2892
2893 return -EOPNOTSUPP;
2894 }
2895}
2896
2897static void ctx_sched_out(struct perf_event_context *ctx,
2898 struct perf_cpu_context *cpuctx,
2899 enum event_type_t event_type)
2900{
2901 struct perf_event *event, *tmp;
2902 int is_active = ctx->is_active;
2903
2904 lockdep_assert_held(&ctx->lock);
2905
2906 if (likely(!ctx->nr_events)) {
/*
 * A context without events cannot be active, nor be installed as the
 * CPU's current task context.
 */
2910 WARN_ON_ONCE(ctx->is_active);
2911 if (ctx->task)
2912 WARN_ON_ONCE(cpuctx->task_ctx);
2913 return;
2914 }
2915
2916 ctx->is_active &= ~event_type;
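/*
 * When the last of the pinned/flexible bits goes away the context is
 * completely idle, so clear EVENT_TIME as well.
 */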
2917 if (!(ctx->is_active & EVENT_ALL))
2918 ctx->is_active = 0;
2919
2920 if (ctx->task) {
2921 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2922 if (!ctx->is_active)
2923 cpuctx->task_ctx = NULL;
2924 }
2925
/*
 * Update the context (and cgroup) time whenever it was running, even
 * when this call only schedules out part of the events; waiting for
 * the final sched-out would lose the time in between.
 */
2936 if (is_active & EVENT_TIME) {
2937
2938 update_context_time(ctx);
2939 update_cgrp_time_from_cpuctx(cpuctx);
2940 }
2941
2942 is_active ^= ctx->is_active; /* changed bits */
2943
2944 if (!ctx->nr_active || !(is_active & EVENT_ALL))
2945 return;
2946
2947 perf_pmu_disable(ctx->pmu);
2948 if (is_active & EVENT_PINNED) {
2949 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
2950 group_sched_out(event, cpuctx, ctx);
2951 }
2952
2953 if (is_active & EVENT_FLEXIBLE) {
2954 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
2955 group_sched_out(event, cpuctx, ctx);
2956 }
2957 perf_pmu_enable(ctx->pmu);
2958}
2959
/*
 * Two contexts are equivalent when they are clones of the same context
 * and neither has been modified since the clone was made; modifications
 * are tracked with the generation numbers.
 */
2968static int context_equiv(struct perf_event_context *ctx1,
2969 struct perf_event_context *ctx2)
2970{
2971 lockdep_assert_held(&ctx1->lock);
2972 lockdep_assert_held(&ctx2->lock);
2973
2974
2975 if (ctx1->pin_count || ctx2->pin_count)
2976 return 0;
2977
2978
2979 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2980 return 1;
2981
2982
2983 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2984 return 1;
2985
2986
2987
2988
2989
2990 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2991 ctx1->parent_gen == ctx2->parent_gen)
2992 return 1;
2993
2994
2995 return 0;
2996}
2997
2998static void __perf_event_sync_stat(struct perf_event *event,
2999 struct perf_event *next_event)
3000{
3001 u64 value;
3002
3003 if (!event->attr.inherit_stat)
3004 return;
3005
3006
3007
3008
3009
3010
3011
3012
3013 if (event->state == PERF_EVENT_STATE_ACTIVE)
3014 event->pmu->read(event);
3015
3016 perf_event_update_time(event);
3017
3018
3019
3020
3021
3022 value = local64_read(&next_event->count);
3023 value = local64_xchg(&event->count, value);
3024 local64_set(&next_event->count, value);
3025
3026 swap(event->total_time_enabled, next_event->total_time_enabled);
3027 swap(event->total_time_running, next_event->total_time_running);
3028
3029
3030
3031
3032 perf_event_update_userpage(event);
3033 perf_event_update_userpage(next_event);
3034}
3035
3036static void perf_event_sync_stat(struct perf_event_context *ctx,
3037 struct perf_event_context *next_ctx)
3038{
3039 struct perf_event *event, *next_event;
3040
3041 if (!ctx->nr_stat)
3042 return;
3043
3044 update_context_time(ctx);
3045
3046 event = list_first_entry(&ctx->event_list,
3047 struct perf_event, event_entry);
3048
3049 next_event = list_first_entry(&next_ctx->event_list,
3050 struct perf_event, event_entry);
3051
3052 while (&event->event_entry != &ctx->event_list &&
3053 &next_event->event_entry != &next_ctx->event_list) {
3054
3055 __perf_event_sync_stat(event, next_event);
3056
3057 event = list_next_entry(event, event_entry);
3058 next_event = list_next_entry(next_event, event_entry);
3059 }
3060}
3061
3062static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3063 struct task_struct *next)
3064{
3065 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3066 struct perf_event_context *next_ctx;
3067 struct perf_event_context *parent, *next_parent;
3068 struct perf_cpu_context *cpuctx;
3069 int do_switch = 1;
3070
3071 if (likely(!ctx))
3072 return;
3073
3074 cpuctx = __get_cpu_context(ctx);
3075 if (!cpuctx->task_ctx)
3076 return;
3077
3078 rcu_read_lock();
3079 next_ctx = next->perf_event_ctxp[ctxn];
3080 if (!next_ctx)
3081 goto unlock;
3082
3083 parent = rcu_dereference(ctx->parent_ctx);
3084 next_parent = rcu_dereference(next_ctx->parent_ctx);
3085
3086
3087 if (!parent && !next_parent)
3088 goto unlock;
3089
3090 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
/*
 * The two contexts look like clones of each other.  Lock both and
 * re-check under the locks; if they really are equivalent we can
 * simply swap them between the tasks instead of doing a full
 * sched-out/sched-in, and nothing can modify or unclone them while
 * the locks are held.
 */
3100 raw_spin_lock(&ctx->lock);
3101 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3102 if (context_equiv(ctx, next_ctx)) {
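/*
 * Still clones under both locks: swap which task owns which context
 * (and the associated task_ctx_data) so neither task needs a full
 * reschedule of its events.
 */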
3103 WRITE_ONCE(ctx->task, next);
3104 WRITE_ONCE(next_ctx->task, task);
3105
3106 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3107
3108
3109
3110
3111
3112
3113
3114
3115 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3116 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3117
3118 do_switch = 0;
3119
3120 perf_event_sync_stat(ctx, next_ctx);
3121 }
3122 raw_spin_unlock(&next_ctx->lock);
3123 raw_spin_unlock(&ctx->lock);
3124 }
3125unlock:
3126 rcu_read_unlock();
3127
3128 if (do_switch) {
3129 raw_spin_lock(&ctx->lock);
3130 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3131 raw_spin_unlock(&ctx->lock);
3132 }
3133}
3134
3135static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3136
3137void perf_sched_cb_dec(struct pmu *pmu)
3138{
3139 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3140
3141 this_cpu_dec(perf_sched_cb_usages);
3142
3143 if (!--cpuctx->sched_cb_usage)
3144 list_del(&cpuctx->sched_cb_entry);
3145}
3146
3147
3148void perf_sched_cb_inc(struct pmu *pmu)
3149{
3150 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3151
3152 if (!cpuctx->sched_cb_usage++)
3153 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3154
3155 this_cpu_inc(perf_sched_cb_usages);
3156}
3157
/*
 * Run the pmu::sched_task() callback of every PMU on this CPU that
 * registered interest via perf_sched_cb_inc(); called from the context
 * switch path for both the sched-out and the sched-in direction.
 */
3166static void perf_pmu_sched_task(struct task_struct *prev,
3167 struct task_struct *next,
3168 bool sched_in)
3169{
3170 struct perf_cpu_context *cpuctx;
3171 struct pmu *pmu;
3172
3173 if (prev == next)
3174 return;
3175
3176 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3177 pmu = cpuctx->ctx.pmu;
3178
3179 if (WARN_ON_ONCE(!pmu->sched_task))
3180 continue;
3181
3182 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3183 perf_pmu_disable(pmu);
3184
3185 pmu->sched_task(cpuctx->task_ctx, sched_in);
3186
3187 perf_pmu_enable(pmu);
3188 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3189 }
3190}
3191
3192static void perf_event_switch(struct task_struct *task,
3193 struct task_struct *next_prev, bool sched_in);
3194
3195#define for_each_task_context_nr(ctxn) \
3196 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3197
/*
 * Called from the scheduler when @task is being switched out: notify
 * interested PMUs, emit switch events and schedule out each of the
 * task's event contexts.
 */
3209void __perf_event_task_sched_out(struct task_struct *task,
3210 struct task_struct *next)
3211{
3212 int ctxn;
3213
3214 if (__this_cpu_read(perf_sched_cb_usages))
3215 perf_pmu_sched_task(task, next, false);
3216
3217 if (atomic_read(&nr_switch_events))
3218 perf_event_switch(task, next, false);
3219
3220 for_each_task_context_nr(ctxn)
3221 perf_event_context_sched_out(task, ctxn, next);
3222
3223
3224
3225
3226
3227
3228 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3229 perf_cgroup_sched_out(task, next);
3230}
3231
3232
3233
3234
3235static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3236 enum event_type_t event_type)
3237{
3238 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3239}
3240
3241static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
3242 int (*func)(struct perf_event *, void *), void *data)
3243{
3244 struct perf_event **evt, *evt1, *evt2;
3245 int ret;
3246
3247 evt1 = perf_event_groups_first(groups, -1);
3248 evt2 = perf_event_groups_first(groups, cpu);
3249
3250 while (evt1 || evt2) {
3251 if (evt1 && evt2) {
3252 if (evt1->group_index < evt2->group_index)
3253 evt = &evt1;
3254 else
3255 evt = &evt2;
3256 } else if (evt1) {
3257 evt = &evt1;
3258 } else {
3259 evt = &evt2;
3260 }
3261
3262 ret = func(*evt, data);
3263 if (ret)
3264 return ret;
3265
3266 *evt = perf_event_groups_next(*evt);
3267 }
3268
3269 return 0;
3270}
3271
3272struct sched_in_data {
3273 struct perf_event_context *ctx;
3274 struct perf_cpu_context *cpuctx;
3275 int can_add_hw;
3276};
3277
3278static int pinned_sched_in(struct perf_event *event, void *data)
3279{
3280 struct sched_in_data *sid = data;
3281
3282 if (event->state <= PERF_EVENT_STATE_OFF)
3283 return 0;
3284
3285 if (!event_filter_match(event))
3286 return 0;
3287
3288 if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3289 if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3290 list_add_tail(&event->active_list, &sid->ctx->pinned_active);
3291 }
3292
/*
 * A pinned group that could not be scheduled goes into error state;
 * it will not be tried again until it is explicitly re-enabled.
 */
3297 if (event->state == PERF_EVENT_STATE_INACTIVE)
3298 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3299
3300 return 0;
3301}
3302
3303static int flexible_sched_in(struct perf_event *event, void *data)
3304{
3305 struct sched_in_data *sid = data;
3306
3307 if (event->state <= PERF_EVENT_STATE_OFF)
3308 return 0;
3309
3310 if (!event_filter_match(event))
3311 return 0;
3312
3313 if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3314 if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3315 list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3316 else
3317 sid->can_add_hw = 0;
3318 }
3319
3320 return 0;
3321}
3322
3323static void
3324ctx_pinned_sched_in(struct perf_event_context *ctx,
3325 struct perf_cpu_context *cpuctx)
3326{
3327 struct sched_in_data sid = {
3328 .ctx = ctx,
3329 .cpuctx = cpuctx,
3330 .can_add_hw = 1,
3331 };
3332
3333 visit_groups_merge(&ctx->pinned_groups,
3334 smp_processor_id(),
3335 pinned_sched_in, &sid);
3336}
3337
3338static void
3339ctx_flexible_sched_in(struct perf_event_context *ctx,
3340 struct perf_cpu_context *cpuctx)
3341{
3342 struct sched_in_data sid = {
3343 .ctx = ctx,
3344 .cpuctx = cpuctx,
3345 .can_add_hw = 1,
3346 };
3347
3348 visit_groups_merge(&ctx->flexible_groups,
3349 smp_processor_id(),
3350 flexible_sched_in, &sid);
3351}
3352
3353static void
3354ctx_sched_in(struct perf_event_context *ctx,
3355 struct perf_cpu_context *cpuctx,
3356 enum event_type_t event_type,
3357 struct task_struct *task)
3358{
3359 int is_active = ctx->is_active;
3360 u64 now;
3361
3362 lockdep_assert_held(&ctx->lock);
3363
3364 if (likely(!ctx->nr_events))
3365 return;
3366
3367 ctx->is_active |= (event_type | EVENT_TIME);
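/*
 * The first activation of a task context installs it as this CPU's
 * current task context.
 */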
3368 if (ctx->task) {
3369 if (!is_active)
3370 cpuctx->task_ctx = ctx;
3371 else
3372 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3373 }
3374
3375 is_active ^= ctx->is_active; /* changed bits */
3376
3377 if (is_active & EVENT_TIME) {
3378
3379 now = perf_clock();
3380 ctx->timestamp = now;
3381 perf_cgroup_set_timestamp(task, ctx);
3382 }
3383
/*
 * Schedule the pinned groups first so they get the best chance of
 * going on.
 */
3388 if (is_active & EVENT_PINNED)
3389 ctx_pinned_sched_in(ctx, cpuctx);

/* Then the lower-priority flexible groups. */
3392 if (is_active & EVENT_FLEXIBLE)
3393 ctx_flexible_sched_in(ctx, cpuctx);
3394}
3395
3396static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3397 enum event_type_t event_type,
3398 struct task_struct *task)
3399{
3400 struct perf_event_context *ctx = &cpuctx->ctx;
3401
3402 ctx_sched_in(ctx, cpuctx, event_type, task);
3403}
3404
3405static void perf_event_context_sched_in(struct perf_event_context *ctx,
3406 struct task_struct *task)
3407{
3408 struct perf_cpu_context *cpuctx;
3409
3410 cpuctx = __get_cpu_context(ctx);
3411 if (cpuctx->task_ctx == ctx)
3412 return;
3413
3414 perf_ctx_lock(cpuctx, ctx);
3415
3416
3417
3418
3419 if (!ctx->nr_events)
3420 goto unlock;
3421
3422 perf_pmu_disable(ctx->pmu);
3423
/*
 * Keep the priority order cpu-pinned, task-pinned, cpu-flexible,
 * task-flexible: if the incoming task context has pinned events, evict
 * the CPU's flexible events before scheduling it in.
 */
3431 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3432 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3433 perf_event_sched_in(cpuctx, ctx, task);
3434 perf_pmu_enable(ctx->pmu);
3435
3436unlock:
3437 perf_ctx_unlock(cpuctx, ctx);
3438}
3439
/*
 * Called from the scheduler when @task is being switched in: schedule
 * in its cgroup events (if any) and each of its event contexts, emit
 * switch events and notify interested PMUs.
 */
3451void __perf_event_task_sched_in(struct task_struct *prev,
3452 struct task_struct *task)
3453{
3454 struct perf_event_context *ctx;
3455 int ctxn;
3456
3457
3458
3459
3460
3461
3462
3463
3464 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3465 perf_cgroup_sched_in(prev, task);
3466
3467 for_each_task_context_nr(ctxn) {
3468 ctx = task->perf_event_ctxp[ctxn];
3469 if (likely(!ctx))
3470 continue;
3471
3472 perf_event_context_sched_in(ctx, task);
3473 }
3474
3475 if (atomic_read(&nr_switch_events))
3476 perf_event_switch(task, prev, true);
3477
3478 if (__this_cpu_read(perf_sched_cb_usages))
3479 perf_pmu_sched_task(prev, task, true);
3480}
3481
3482static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3483{
3484 u64 frequency = event->attr.sample_freq;
3485 u64 sec = NSEC_PER_SEC;
3486 u64 divisor, dividend;
3487
3488 int count_fls, nsec_fls, frequency_fls, sec_fls;
3489
3490 count_fls = fls64(count);
3491 nsec_fls = fls64(nsec);
3492 frequency_fls = fls64(frequency);
3493 sec_fls = 30;
3494
/*
 * The target period satisfies:
 *
 *            count * NSEC_PER_SEC
 *   period = --------------------
 *            nsec * sample_freq
 *
 * Both products can overflow 64 bits, so the fls() values computed
 * above are used below to shift the operands down until the
 * multiplications are safe.
 */
3509#define REDUCE_FLS(a, b) \
3510do { \
3511 if (a##_fls > b##_fls) { \
3512 a >>= 1; \
3513 a##_fls--; \
3514 } else { \
3515 b >>= 1; \
3516 b##_fls--; \
3517 } \
3518} while (0)
3519
3520
3521
3522
3523
3524 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3525 REDUCE_FLS(nsec, frequency);
3526 REDUCE_FLS(sec, count);
3527 }
3528
3529 if (count_fls + sec_fls > 64) {
3530 divisor = nsec * frequency;
3531
3532 while (count_fls + sec_fls > 64) {
3533 REDUCE_FLS(count, sec);
3534 divisor >>= 1;
3535 }
3536
3537 dividend = count * sec;
3538 } else {
3539 dividend = count * sec;
3540
3541 while (nsec_fls + frequency_fls > 64) {
3542 REDUCE_FLS(nsec, frequency);
3543 dividend >>= 1;
3544 }
3545
3546 divisor = nsec * frequency;
3547 }
3548
3549 if (!divisor)
3550 return dividend;
3551
3552 return div64_u64(dividend, divisor);
3553}
3554
3555static DEFINE_PER_CPU(int, perf_throttled_count);
3556static DEFINE_PER_CPU(u64, perf_throttled_seq);
3557
3558static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3559{
3560 struct hw_perf_event *hwc = &event->hw;
3561 s64 period, sample_period;
3562 s64 delta;
3563
3564 period = perf_calculate_period(event, nsec, count);
3565
3566 delta = (s64)(period - hwc->sample_period);
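/*
 * Move only 1/8th of the way towards the freshly computed period: a
 * cheap low-pass filter that damps oscillation of the sample period.
 */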
3567 delta = (delta + 7) / 8;
3568
3569 sample_period = hwc->sample_period + delta;
3570
3571 if (!sample_period)
3572 sample_period = 1;
3573
3574 hwc->sample_period = sample_period;
3575
3576 if (local64_read(&hwc->period_left) > 8*sample_period) {
3577 if (disable)
3578 event->pmu->stop(event, PERF_EF_UPDATE);
3579
3580 local64_set(&hwc->period_left, 0);
3581
3582 if (disable)
3583 event->pmu->start(event, PERF_EF_RELOAD);
3584 }
3585}
3586
/*
 * Combine frequency adjustment with unthrottling so the event list is
 * only walked once per tick, and throttled events are restarted at the
 * same rate whether or not frequency events are present.
 */
3592static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3593 int needs_unthr)
3594{
3595 struct perf_event *event;
3596 struct hw_perf_event *hwc;
3597 u64 now, period = TICK_NSEC;
3598 s64 delta;
3599
3600
3601
3602
3603
3604
3605 if (!(ctx->nr_freq || needs_unthr))
3606 return;
3607
3608 raw_spin_lock(&ctx->lock);
3609 perf_pmu_disable(ctx->pmu);
3610
3611 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3612 if (event->state != PERF_EVENT_STATE_ACTIVE)
3613 continue;
3614
3615 if (!event_filter_match(event))
3616 continue;
3617
3618 perf_pmu_disable(event->pmu);
3619
3620 hwc = &event->hw;
3621
3622 if (hwc->interrupts == MAX_INTERRUPTS) {
3623 hwc->interrupts = 0;
3624 perf_log_throttle(event, 1);
3625 event->pmu->start(event, 0);
3626 }
3627
3628 if (!event->attr.freq || !event->attr.sample_freq)
3629 goto next;
3630
3631
3632
3633
3634 event->pmu->stop(event, PERF_EF_UPDATE);
3635
3636 now = local64_read(&event->count);
3637 delta = now - hwc->freq_count_stamp;
3638 hwc->freq_count_stamp = now;
3639
/*
 * Restart the event; request a reload only if the count actually
 * moved, and tell perf_adjust_period() that the event is already
 * stopped so it is not stopped and started a second time.
 */
3647 if (delta > 0)
3648 perf_adjust_period(event, period, delta, false);
3649
3650 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3651 next:
3652 perf_pmu_enable(event->pmu);
3653 }
3654
3655 perf_pmu_enable(ctx->pmu);
3656 raw_spin_unlock(&ctx->lock);
3657}
3658
3659
3660
3661
3662static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
3663{
3664
3665
3666
3667
3668 if (ctx->rotate_disable)
3669 return;
3670
3671 perf_event_groups_delete(&ctx->flexible_groups, event);
3672 perf_event_groups_insert(&ctx->flexible_groups, event);
3673}
3674
3675static inline struct perf_event *
3676ctx_first_active(struct perf_event_context *ctx)
3677{
3678 return list_first_entry_or_null(&ctx->flexible_active,
3679 struct perf_event, active_list);
3680}
3681
3682static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3683{
3684 struct perf_event *cpu_event = NULL, *task_event = NULL;
3685 bool cpu_rotate = false, task_rotate = false;
3686 struct perf_event_context *ctx = NULL;
3687
3688
3689
3690
3691
3692
3693 if (cpuctx->ctx.nr_events) {
3694 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3695 cpu_rotate = true;
3696 }
3697
3698 ctx = cpuctx->task_ctx;
3699 if (ctx && ctx->nr_events) {
3700 if (ctx->nr_events != ctx->nr_active)
3701 task_rotate = true;
3702 }
3703
3704 if (!(cpu_rotate || task_rotate))
3705 return false;
3706
3707 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3708 perf_pmu_disable(cpuctx->ctx.pmu);
3709
3710 if (task_rotate)
3711 task_event = ctx_first_active(ctx);
3712 if (cpu_rotate)
3713 cpu_event = ctx_first_active(&cpuctx->ctx);
3714
3715
3716
3717
3718
3719 if (task_event || (ctx && cpu_event))
3720 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3721 if (cpu_event)
3722 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3723
3724 if (task_event)
3725 rotate_ctx(ctx, task_event);
3726 if (cpu_event)
3727 rotate_ctx(&cpuctx->ctx, cpu_event);
3728
3729 perf_event_sched_in(cpuctx, ctx, current);
3730
3731 perf_pmu_enable(cpuctx->ctx.pmu);
3732 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3733
3734 return true;
3735}
3736
3737void perf_event_task_tick(void)
3738{
3739 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3740 struct perf_event_context *ctx, *tmp;
3741 int throttled;
3742
3743 lockdep_assert_irqs_disabled();
3744
3745 __this_cpu_inc(perf_throttled_seq);
3746 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3747 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3748
3749 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3750 perf_adjust_freq_unthr_context(ctx, throttled);
3751}
3752
3753static int event_enable_on_exec(struct perf_event *event,
3754 struct perf_event_context *ctx)
3755{
3756 if (!event->attr.enable_on_exec)
3757 return 0;
3758
3759 event->attr.enable_on_exec = 0;
3760 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3761 return 0;
3762
3763 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
3764
3765 return 1;
3766}
3767
/*
 * Enable all of current's events that were created with
 * attr.enable_on_exec; called from the exec path.
 */
3772static void perf_event_enable_on_exec(int ctxn)
3773{
3774 struct perf_event_context *ctx, *clone_ctx = NULL;
3775 enum event_type_t event_type = 0;
3776 struct perf_cpu_context *cpuctx;
3777 struct perf_event *event;
3778 unsigned long flags;
3779 int enabled = 0;
3780
3781 local_irq_save(flags);
3782 ctx = current->perf_event_ctxp[ctxn];
3783 if (!ctx || !ctx->nr_events)
3784 goto out;
3785
3786 cpuctx = __get_cpu_context(ctx);
3787 perf_ctx_lock(cpuctx, ctx);
3788 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3789 list_for_each_entry(event, &ctx->event_list, event_entry) {
3790 enabled |= event_enable_on_exec(event, ctx);
3791 event_type |= get_event_type(event);
3792 }
3793
3794
3795
3796
3797 if (enabled) {
3798 clone_ctx = unclone_ctx(ctx);
3799 ctx_resched(cpuctx, ctx, event_type);
3800 } else {
3801 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
3802 }
3803 perf_ctx_unlock(cpuctx, ctx);
3804
3805out:
3806 local_irq_restore(flags);
3807
3808 if (clone_ctx)
3809 put_ctx(clone_ctx);
3810}
3811
3812struct perf_read_data {
3813 struct perf_event *event;
3814 bool group;
3815 int ret;
3816};
3817
3818static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3819{
3820 u16 local_pkg, event_pkg;
3821
3822 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3823 int local_cpu = smp_processor_id();
3824
3825 event_pkg = topology_physical_package_id(event_cpu);
3826 local_pkg = topology_physical_package_id(local_cpu);
3827
3828 if (event_pkg == local_pkg)
3829 return local_cpu;
3830 }
3831
3832 return event_cpu;
3833}
3834
3835
3836
3837
3838static void __perf_event_read(void *info)
3839{
3840 struct perf_read_data *data = info;
3841 struct perf_event *sub, *event = data->event;
3842 struct perf_event_context *ctx = event->ctx;
3843 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3844 struct pmu *pmu = event->pmu;
3845
/*
 * For a task context, only proceed if it is still the current context
 * on this CPU; otherwise the event was scheduled out (and its count
 * updated) before the IPI arrived.
 */
3853 if (ctx->task && cpuctx->task_ctx != ctx)
3854 return;
3855
3856 raw_spin_lock(&ctx->lock);
3857 if (ctx->is_active & EVENT_TIME) {
3858 update_context_time(ctx);
3859 update_cgrp_time_from_event(event);
3860 }
3861
3862 perf_event_update_time(event);
3863 if (data->group)
3864 perf_event_update_sibling_time(event);
3865
3866 if (event->state != PERF_EVENT_STATE_ACTIVE)
3867 goto unlock;
3868
3869 if (!data->group) {
3870 pmu->read(event);
3871 data->ret = 0;
3872 goto unlock;
3873 }
3874
3875 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3876
3877 pmu->read(event);
3878
3879 for_each_sibling_event(sub, event) {
3880 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3881
3882
3883
3884
3885 sub->pmu->read(sub);
3886 }
3887 }
3888
3889 data->ret = pmu->commit_txn(pmu);
3890
3891unlock:
3892 raw_spin_unlock(&ctx->lock);
3893}
3894
3895static inline u64 perf_event_count(struct perf_event *event)
3896{
3897 return local64_read(&event->count) + atomic64_read(&event->child_count);
3898}
3899
/*
 * Read a "local" event without any cross-CPU calls: the event must
 * count the current task or the current CPU and must not be inherited.
 * The checks below reject anything that cannot be read this way.
 */
3908int perf_event_read_local(struct perf_event *event, u64 *value,
3909 u64 *enabled, u64 *running)
3910{
3911 unsigned long flags;
3912 int ret = 0;
3913
3914
3915
3916
3917
3918 local_irq_save(flags);
3919
3920
3921
3922
3923
3924 if (event->attr.inherit) {
3925 ret = -EOPNOTSUPP;
3926 goto out;
3927 }
3928
3929
3930 if ((event->attach_state & PERF_ATTACH_TASK) &&
3931 event->hw.target != current) {
3932 ret = -EINVAL;
3933 goto out;
3934 }
3935
3936
3937 if (!(event->attach_state & PERF_ATTACH_TASK) &&
3938 event->cpu != smp_processor_id()) {
3939 ret = -EINVAL;
3940 goto out;
3941 }
3942
3943
3944
3945
3946
3947
3948 if (event->oncpu == smp_processor_id())
3949 event->pmu->read(event);
3950
3951 *value = local64_read(&event->count);
3952 if (enabled || running) {
3953 u64 now = event->shadow_ctx_time + perf_clock();
3954 u64 __enabled, __running;
3955
3956 __perf_update_times(event, now, &__enabled, &__running);
3957 if (enabled)
3958 *enabled = __enabled;
3959 if (running)
3960 *running = __running;
3961 }
3962out:
3963 local_irq_restore(flags);
3964
3965 return ret;
3966}
3967
3968static int perf_event_read(struct perf_event *event, bool group)
3969{
3970 enum perf_event_state state = READ_ONCE(event->state);
3971 int event_cpu, ret = 0;
3972
3973
3974
3975
3976
3977again:
3978 if (state == PERF_EVENT_STATE_ACTIVE) {
3979 struct perf_read_data data;
3980
3981
3982
3983
3984
3985
3986
3987 smp_rmb();
3988
3989 event_cpu = READ_ONCE(event->oncpu);
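/*
 * If oncpu is no longer a valid CPU, the event has been scheduled out
 * in the meantime and event->count is already up to date.
 */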
3990 if ((unsigned)event_cpu >= nr_cpu_ids)
3991 return 0;
3992
3993 data = (struct perf_read_data){
3994 .event = event,
3995 .group = group,
3996 .ret = 0,
3997 };
3998
3999 preempt_disable();
4000 event_cpu = __perf_event_read_cpu(event, event_cpu);
4001
/*
 * The return value of the IPI is deliberately ignored: if the target
 * CPU went away or the event was scheduled out in the meantime, the
 * count has already been updated on the sched-out path, so either way
 * we end up with a current value.
 */
4012 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4013 preempt_enable();
4014 ret = data.ret;
4015
4016 } else if (state == PERF_EVENT_STATE_INACTIVE) {
4017 struct perf_event_context *ctx = event->ctx;
4018 unsigned long flags;
4019
4020 raw_spin_lock_irqsave(&ctx->lock, flags);
4021 state = event->state;
4022 if (state != PERF_EVENT_STATE_INACTIVE) {
4023 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4024 goto again;
4025 }
4026
4027
4028
4029
4030
4031 if (ctx->is_active & EVENT_TIME) {
4032 update_context_time(ctx);
4033 update_cgrp_time_from_event(event);
4034 }
4035
4036 perf_event_update_time(event);
4037 if (group)
4038 perf_event_update_sibling_time(event);
4039 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4040 }
4041
4042 return ret;
4043}
4044
4045
4046
4047
4048static void __perf_event_init_context(struct perf_event_context *ctx)
4049{
4050 raw_spin_lock_init(&ctx->lock);
4051 mutex_init(&ctx->mutex);
4052 INIT_LIST_HEAD(&ctx->active_ctx_list);
4053 perf_event_groups_init(&ctx->pinned_groups);
4054 perf_event_groups_init(&ctx->flexible_groups);
4055 INIT_LIST_HEAD(&ctx->event_list);
4056 INIT_LIST_HEAD(&ctx->pinned_active);
4057 INIT_LIST_HEAD(&ctx->flexible_active);
4058 atomic_set(&ctx->refcount, 1);
4059}
4060
4061static struct perf_event_context *
4062alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4063{
4064 struct perf_event_context *ctx;
4065
4066 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4067 if (!ctx)
4068 return NULL;
4069
4070 __perf_event_init_context(ctx);
4071 if (task) {
4072 ctx->task = task;
4073 get_task_struct(task);
4074 }
4075 ctx->pmu = pmu;
4076
4077 return ctx;
4078}
4079
4080static struct task_struct *
4081find_lively_task_by_vpid(pid_t vpid)
4082{
4083 struct task_struct *task;
4084
4085 rcu_read_lock();
4086 if (!vpid)
4087 task = current;
4088 else
4089 task = find_task_by_vpid(vpid);
4090 if (task)
4091 get_task_struct(task);
4092 rcu_read_unlock();
4093
4094 if (!task)
4095 return ERR_PTR(-ESRCH);
4096
4097 return task;
4098}
4099
/*
 * Returns a matching context with its refcount and pin count raised.
 */
4103static struct perf_event_context *
4104find_get_context(struct pmu *pmu, struct task_struct *task,
4105 struct perf_event *event)
4106{
4107 struct perf_event_context *ctx, *clone_ctx = NULL;
4108 struct perf_cpu_context *cpuctx;
4109 void *task_ctx_data = NULL;
4110 unsigned long flags;
4111 int ctxn, err;
4112 int cpu = event->cpu;
4113
4114 if (!task) {
4115
4116 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
4117 return ERR_PTR(-EACCES);
4118
4119 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4120 ctx = &cpuctx->ctx;
4121 get_ctx(ctx);
4122 ++ctx->pin_count;
4123
4124 return ctx;
4125 }
4126
4127 err = -EINVAL;
4128 ctxn = pmu->task_ctx_nr;
4129 if (ctxn < 0)
4130 goto errout;
4131
4132 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4133 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
4134 if (!task_ctx_data) {
4135 err = -ENOMEM;
4136 goto errout;
4137 }
4138 }
4139
4140retry:
4141 ctx = perf_lock_task_context(task, ctxn, &flags);
4142 if (ctx) {
4143 clone_ctx = unclone_ctx(ctx);
4144 ++ctx->pin_count;
4145
4146 if (task_ctx_data && !ctx->task_ctx_data) {
4147 ctx->task_ctx_data = task_ctx_data;
4148 task_ctx_data = NULL;
4149 }
4150 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4151
4152 if (clone_ctx)
4153 put_ctx(clone_ctx);
4154 } else {
4155 ctx = alloc_perf_context(pmu, task);
4156 err = -ENOMEM;
4157 if (!ctx)
4158 goto errout;
4159
4160 if (task_ctx_data) {
4161 ctx->task_ctx_data = task_ctx_data;
4162 task_ctx_data = NULL;
4163 }
4164
4165 err = 0;
4166 mutex_lock(&task->perf_event_mutex);
4167
4168
4169
4170
4171 if (task->flags & PF_EXITING)
4172 err = -ESRCH;
4173 else if (task->perf_event_ctxp[ctxn])
4174 err = -EAGAIN;
4175 else {
4176 get_ctx(ctx);
4177 ++ctx->pin_count;
4178 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4179 }
4180 mutex_unlock(&task->perf_event_mutex);
4181
4182 if (unlikely(err)) {
4183 put_ctx(ctx);
4184
4185 if (err == -EAGAIN)
4186 goto retry;
4187 goto errout;
4188 }
4189 }
4190
4191 kfree(task_ctx_data);
4192 return ctx;
4193
4194errout:
4195 kfree(task_ctx_data);
4196 return ERR_PTR(err);
4197}
4198
4199static void perf_event_free_filter(struct perf_event *event);
4200static void perf_event_free_bpf_prog(struct perf_event *event);
4201
4202static void free_event_rcu(struct rcu_head *head)
4203{
4204 struct perf_event *event;
4205
4206 event = container_of(head, struct perf_event, rcu_head);
4207 if (event->ns)
4208 put_pid_ns(event->ns);
4209 perf_event_free_filter(event);
4210 kfree(event);
4211}
4212
4213static void ring_buffer_attach(struct perf_event *event,
4214 struct ring_buffer *rb);
4215
4216static void detach_sb_event(struct perf_event *event)
4217{
4218 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4219
4220 raw_spin_lock(&pel->lock);
4221 list_del_rcu(&event->sb_list);
4222 raw_spin_unlock(&pel->lock);
4223}
4224
4225static bool is_sb_event(struct perf_event *event)
4226{
4227 struct perf_event_attr *attr = &event->attr;
4228
4229 if (event->parent)
4230 return false;
4231
4232 if (event->attach_state & PERF_ATTACH_TASK)
4233 return false;
4234
4235 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4236 attr->comm || attr->comm_exec ||
4237 attr->task ||
4238 attr->context_switch)
4239 return true;
4240 return false;
4241}
4242
4243static void unaccount_pmu_sb_event(struct perf_event *event)
4244{
4245 if (is_sb_event(event))
4246 detach_sb_event(event);
4247}
4248
4249static void unaccount_event_cpu(struct perf_event *event, int cpu)
4250{
4251 if (event->parent)
4252 return;
4253
4254 if (is_cgroup_event(event))
4255 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4256}
4257
4258#ifdef CONFIG_NO_HZ_FULL
4259static DEFINE_SPINLOCK(nr_freq_lock);
4260#endif
4261
4262static void unaccount_freq_event_nohz(void)
4263{
4264#ifdef CONFIG_NO_HZ_FULL
4265 spin_lock(&nr_freq_lock);
4266 if (atomic_dec_and_test(&nr_freq_events))
4267 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4268 spin_unlock(&nr_freq_lock);
4269#endif
4270}
4271
4272static void unaccount_freq_event(void)
4273{
4274 if (tick_nohz_full_enabled())
4275 unaccount_freq_event_nohz();
4276 else
4277 atomic_dec(&nr_freq_events);
4278}
4279
4280static void unaccount_event(struct perf_event *event)
4281{
4282 bool dec = false;
4283
4284 if (event->parent)
4285 return;
4286
4287 if (event->attach_state & PERF_ATTACH_TASK)
4288 dec = true;
4289 if (event->attr.mmap || event->attr.mmap_data)
4290 atomic_dec(&nr_mmap_events);
4291 if (event->attr.comm)
4292 atomic_dec(&nr_comm_events);
4293 if (event->attr.namespaces)
4294 atomic_dec(&nr_namespaces_events);
4295 if (event->attr.task)
4296 atomic_dec(&nr_task_events);
4297 if (event->attr.freq)
4298 unaccount_freq_event();
4299 if (event->attr.context_switch) {
4300 dec = true;
4301 atomic_dec(&nr_switch_events);
4302 }
4303 if (is_cgroup_event(event))
4304 dec = true;
4305 if (has_branch_stack(event))
4306 dec = true;
4307
4308 if (dec) {
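/*
 * Never drop perf_sched_count to zero from here; the final decrement
 * (and the static key disable) happens in perf_sched_delayed().
 */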
4309 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4310 schedule_delayed_work(&perf_sched_work, HZ);
4311 }
4312
4313 unaccount_event_cpu(event, event->cpu);
4314
4315 unaccount_pmu_sb_event(event);
4316}
4317
4318static void perf_sched_delayed(struct work_struct *work)
4319{
4320 mutex_lock(&perf_sched_mutex);
4321 if (atomic_dec_and_test(&perf_sched_count))
4322 static_branch_disable(&perf_sched_events);
4323 mutex_unlock(&perf_sched_mutex);
4324}
4325
/*
 * A PMU with PERF_PMU_CAP_EXCLUSIVE can have either per-task events or
 * CPU-wide events at any one time, but not both.  The helpers below
 * account for and enforce that; see also exclusive_event_installable().
 */
4338static int exclusive_event_init(struct perf_event *event)
4339{
4340 struct pmu *pmu = event->pmu;
4341
4342 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4343 return 0;
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358 if (event->attach_state & PERF_ATTACH_TASK) {
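/*
 * exclusive_cnt is kept positive while only per-task events exist and
 * negative while only CPU-wide events exist, so the two kinds can
 * never be mixed on an exclusive PMU.
 */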
4359 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4360 return -EBUSY;
4361 } else {
4362 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4363 return -EBUSY;
4364 }
4365
4366 return 0;
4367}
4368
4369static void exclusive_event_destroy(struct perf_event *event)
4370{
4371 struct pmu *pmu = event->pmu;
4372
4373 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4374 return;
4375
4376
4377 if (event->attach_state & PERF_ATTACH_TASK)
4378 atomic_dec(&pmu->exclusive_cnt);
4379 else
4380 atomic_inc(&pmu->exclusive_cnt);
4381}
4382
4383static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4384{
4385 if ((e1->pmu == e2->pmu) &&
4386 (e1->cpu == e2->cpu ||
4387 e1->cpu == -1 ||
4388 e2->cpu == -1))
4389 return true;
4390 return false;
4391}
4392
4393
4394static bool exclusive_event_installable(struct perf_event *event,
4395 struct perf_event_context *ctx)
4396{
4397 struct perf_event *iter_event;
4398 struct pmu *pmu = event->pmu;
4399
4400 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4401 return true;
4402
4403 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4404 if (exclusive_event_match(iter_event, event))
4405 return false;
4406 }
4407
4408 return true;
4409}
4410
4411static void perf_addr_filters_splice(struct perf_event *event,
4412 struct list_head *head);
4413
4414static void _free_event(struct perf_event *event)
4415{
4416 irq_work_sync(&event->pending);
4417
4418 unaccount_event(event);
4419
4420 if (event->rb) {
4421
4422
4423
4424
4425
4426
4427 mutex_lock(&event->mmap_mutex);
4428 ring_buffer_attach(event, NULL);
4429 mutex_unlock(&event->mmap_mutex);
4430 }
4431
4432 if (is_cgroup_event(event))
4433 perf_detach_cgroup(event);
4434
4435 if (!event->parent) {
4436 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4437 put_callchain_buffers();
4438 }
4439
4440 perf_event_free_bpf_prog(event);
4441 perf_addr_filters_splice(event, NULL);
4442 kfree(event->addr_filters_offs);
4443
4444 if (event->destroy)
4445 event->destroy(event);
4446
4447 if (event->ctx)
4448 put_ctx(event->ctx);
4449
4450 if (event->hw.target)
4451 put_task_struct(event->hw.target);
4452
4453 exclusive_event_destroy(event);
4454 module_put(event->pmu->module);
4455
4456 call_rcu(&event->rcu_head, free_event_rcu);
4457}
4458
4459
4460
4461
4462
4463static void free_event(struct perf_event *event)
4464{
4465 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4466 "unexpected event refcount: %ld; ptr=%p\n",
4467 atomic_long_read(&event->refcount), event)) {
4468
4469 return;
4470 }
4471
4472 _free_event(event);
4473}
4474
4475
4476
4477
4478static void perf_remove_from_owner(struct perf_event *event)
4479{
4480 struct task_struct *owner;
4481
4482 rcu_read_lock();
4483
4484
4485
4486
4487
4488
4489 owner = READ_ONCE(event->owner);
4490 if (owner) {
4491
4492
4493
4494
4495
4496 get_task_struct(owner);
4497 }
4498 rcu_read_unlock();
4499
4500 if (owner) {
4501
4502
4503
4504
4505
4506
4507
4508
4509 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4510
4511
4512
4513
4514
4515
4516
4517 if (event->owner) {
4518 list_del_init(&event->owner_entry);
4519 smp_store_release(&event->owner, NULL);
4520 }
4521 mutex_unlock(&owner->perf_event_mutex);
4522 put_task_struct(owner);
4523 }
4524}
4525
4526static void put_event(struct perf_event *event)
4527{
4528 if (!atomic_long_dec_and_test(&event->refcount))
4529 return;
4530
4531 _free_event(event);
4532}
4533
/*
 * Kill an event dead: remove it from its context, tear down all of its
 * inherited children and drop the file's reference.
 */
4539int perf_event_release_kernel(struct perf_event *event)
4540{
4541 struct perf_event_context *ctx = event->ctx;
4542 struct perf_event *child, *tmp;
4543 LIST_HEAD(free_list);
4544
4545
4546
4547
4548
4549 if (!ctx) {
4550 WARN_ON_ONCE(event->attach_state &
4551 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4552 goto no_ctx;
4553 }
4554
4555 if (!is_kernel_event(event))
4556 perf_remove_from_owner(event);
4557
4558 ctx = perf_event_ctx_lock(event);
4559 WARN_ON_ONCE(ctx->parent_ctx);
4560 perf_remove_from_context(event, DETACH_GROUP);
4561
4562 raw_spin_lock_irq(&ctx->lock);
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574 event->state = PERF_EVENT_STATE_DEAD;
4575 raw_spin_unlock_irq(&ctx->lock);
4576
4577 perf_event_ctx_unlock(event, ctx);
4578
4579again:
4580 mutex_lock(&event->child_mutex);
4581 list_for_each_entry(child, &event->child_list, child_list) {
4582
4583
4584
4585
4586
4587 ctx = READ_ONCE(child->ctx);
4588
4589
4590
4591
4592
4593
4594
4595
4596 get_ctx(ctx);
4597
4598
4599
4600
4601
4602
4603 mutex_unlock(&event->child_mutex);
4604 mutex_lock(&ctx->mutex);
4605 mutex_lock(&event->child_mutex);
4606
4607
4608
4609
4610
4611
4612 tmp = list_first_entry_or_null(&event->child_list,
4613 struct perf_event, child_list);
4614 if (tmp == child) {
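/*
 * The child is still first on the list, so it was not freed while
 * child_mutex was dropped; detach it and queue it for freeing.
 */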
4615 perf_remove_from_context(child, DETACH_GROUP);
4616 list_move(&child->child_list, &free_list);
4617
4618
4619
4620
4621 put_event(event);
4622 }
4623
4624 mutex_unlock(&event->child_mutex);
4625 mutex_unlock(&ctx->mutex);
4626 put_ctx(ctx);
4627 goto again;
4628 }
4629 mutex_unlock(&event->child_mutex);
4630
4631 list_for_each_entry_safe(child, tmp, &free_list, child_list) {
4632 list_del(&child->child_list);
4633 free_event(child);
4634 }
4635
4636no_ctx:
4637 put_event(event);
4638 return 0;
4639}
4640EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4641
4642
4643
4644
4645static int perf_release(struct inode *inode, struct file *file)
4646{
4647 perf_event_release_kernel(file->private_data);
4648 return 0;
4649}
4650
4651static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4652{
4653 struct perf_event *child;
4654 u64 total = 0;
4655
4656 *enabled = 0;
4657 *running = 0;
4658
4659 mutex_lock(&event->child_mutex);
4660
4661 (void)perf_event_read(event, false);
4662 total += perf_event_count(event);
4663
4664 *enabled += event->total_time_enabled +
4665 atomic64_read(&event->child_total_time_enabled);
4666 *running += event->total_time_running +
4667 atomic64_read(&event->child_total_time_running);
4668
4669 list_for_each_entry(child, &event->child_list, child_list) {
4670 (void)perf_event_read(child, false);
4671 total += perf_event_count(child);
4672 *enabled += child->total_time_enabled;
4673 *running += child->total_time_running;
4674 }
4675 mutex_unlock(&event->child_mutex);
4676
4677 return total;
4678}
4679
4680u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4681{
4682 struct perf_event_context *ctx;
4683 u64 count;
4684
4685 ctx = perf_event_ctx_lock(event);
4686 count = __perf_event_read_value(event, enabled, running);
4687 perf_event_ctx_unlock(event, ctx);
4688
4689 return count;
4690}
4691EXPORT_SYMBOL_GPL(perf_event_read_value);
4692
4693static int __perf_read_group_add(struct perf_event *leader,
4694 u64 read_format, u64 *values)
4695{
4696 struct perf_event_context *ctx = leader->ctx;
4697 struct perf_event *sub;
4698 unsigned long flags;
4699 int n = 1;
4700 int ret;
4701
4702 ret = perf_event_read(leader, true);
4703 if (ret)
4704 return ret;
4705
4706 raw_spin_lock_irqsave(&ctx->lock, flags);
4707
4708
4709
4710
4711
4712
4713 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4714 values[n++] += leader->total_time_enabled +
4715 atomic64_read(&leader->child_total_time_enabled);
4716 }
4717
4718 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4719 values[n++] += leader->total_time_running +
4720 atomic64_read(&leader->child_total_time_running);
4721 }
4722
4723
4724
4725
4726 values[n++] += perf_event_count(leader);
4727 if (read_format & PERF_FORMAT_ID)
4728 values[n++] = primary_event_id(leader);
4729
4730 for_each_sibling_event(sub, leader) {
4731 values[n++] += perf_event_count(sub);
4732 if (read_format & PERF_FORMAT_ID)
4733 values[n++] = primary_event_id(sub);
4734 }
4735
4736 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4737 return 0;
4738}
4739
4740static int perf_read_group(struct perf_event *event,
4741 u64 read_format, char __user *buf)
4742{
4743 struct perf_event *leader = event->group_leader, *child;
4744 struct perf_event_context *ctx = leader->ctx;
4745 int ret;
4746 u64 *values;
4747
4748 lockdep_assert_held(&ctx->mutex);
4749
4750 values = kzalloc(event->read_size, GFP_KERNEL);
4751 if (!values)
4752 return -ENOMEM;
4753
4754 values[0] = 1 + leader->nr_siblings;
4755
4756
4757
4758
4759
4760 mutex_lock(&leader->child_mutex);
4761
4762 ret = __perf_read_group_add(leader, read_format, values);
4763 if (ret)
4764 goto unlock;
4765
4766 list_for_each_entry(child, &leader->child_list, child_list) {
4767 ret = __perf_read_group_add(child, read_format, values);
4768 if (ret)
4769 goto unlock;
4770 }
4771
4772 mutex_unlock(&leader->child_mutex);
4773
4774 ret = event->read_size;
4775 if (copy_to_user(buf, values, event->read_size))
4776 ret = -EFAULT;
4777 goto out;
4778
4779unlock:
4780 mutex_unlock(&leader->child_mutex);
4781out:
4782 kfree(values);
4783 return ret;
4784}
4785
4786static int perf_read_one(struct perf_event *event,
4787 u64 read_format, char __user *buf)
4788{
4789 u64 enabled, running;
4790 u64 values[4];
4791 int n = 0;
4792
4793 values[n++] = __perf_event_read_value(event, &enabled, &running);
4794 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4795 values[n++] = enabled;
4796 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4797 values[n++] = running;
4798 if (read_format & PERF_FORMAT_ID)
4799 values[n++] = primary_event_id(event);
4800
4801 if (copy_to_user(buf, values, n * sizeof(u64)))
4802 return -EFAULT;
4803
4804 return n * sizeof(u64);
4805}
4806
4807static bool is_event_hup(struct perf_event *event)
4808{
4809 bool no_children;
4810
4811 if (event->state > PERF_EVENT_STATE_EXIT)
4812 return false;
4813
4814 mutex_lock(&event->child_mutex);
4815 no_children = list_empty(&event->child_list);
4816 mutex_unlock(&event->child_mutex);
4817 return no_children;
4818}
4819
4820
4821
4822
4823static ssize_t
4824__perf_read(struct perf_event *event, char __user *buf, size_t count)
4825{
4826 u64 read_format = event->attr.read_format;
4827 int ret;
4828
4829
4830
4831
4832
4833
4834 if (event->state == PERF_EVENT_STATE_ERROR)
4835 return 0;
4836
4837 if (count < event->read_size)
4838 return -ENOSPC;
4839
4840 WARN_ON_ONCE(event->ctx->parent_ctx);
4841 if (read_format & PERF_FORMAT_GROUP)
4842 ret = perf_read_group(event, read_format, buf);
4843 else
4844 ret = perf_read_one(event, read_format, buf);
4845
4846 return ret;
4847}
4848
4849static ssize_t
4850perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4851{
4852 struct perf_event *event = file->private_data;
4853 struct perf_event_context *ctx;
4854 int ret;
4855
4856 ctx = perf_event_ctx_lock(event);
4857 ret = __perf_read(event, buf, count);
4858 perf_event_ctx_unlock(event, ctx);
4859
4860 return ret;
4861}
4862
4863static __poll_t perf_poll(struct file *file, poll_table *wait)
4864{
4865 struct perf_event *event = file->private_data;
4866 struct ring_buffer *rb;
4867 __poll_t events = EPOLLHUP;
4868
4869 poll_wait(file, &event->waitq, wait);
4870
4871 if (is_event_hup(event))
4872 return events;
4873
4874
4875
4876
4877
4878 mutex_lock(&event->mmap_mutex);
4879 rb = event->rb;
4880 if (rb)
4881 events = atomic_xchg(&rb->poll, 0);
4882 mutex_unlock(&event->mmap_mutex);
4883 return events;
4884}
4885
4886static void _perf_event_reset(struct perf_event *event)
4887{
4888 (void)perf_event_read(event, false);
4889 local64_set(&event->count, 0);
4890 perf_event_update_userpage(event);
4891}
4892
/*
 * Holding the parent event's child_mutex keeps its inherited children
 * from going away while @func is applied to each of them.
 */
4899static void perf_event_for_each_child(struct perf_event *event,
4900 void (*func)(struct perf_event *))
4901{
4902 struct perf_event *child;
4903
4904 WARN_ON_ONCE(event->ctx->parent_ctx);
4905
4906 mutex_lock(&event->child_mutex);
4907 func(event);
4908 list_for_each_entry(child, &event->child_list, child_list)
4909 func(child);
4910 mutex_unlock(&event->child_mutex);
4911}
4912
4913static void perf_event_for_each(struct perf_event *event,
4914 void (*func)(struct perf_event *))
4915{
4916 struct perf_event_context *ctx = event->ctx;
4917 struct perf_event *sibling;
4918
4919 lockdep_assert_held(&ctx->mutex);
4920
4921 event = event->group_leader;
4922
4923 perf_event_for_each_child(event, func);
4924 for_each_sibling_event(sibling, event)
4925 perf_event_for_each_child(sibling, func);
4926}
4927
4928static void __perf_event_period(struct perf_event *event,
4929 struct perf_cpu_context *cpuctx,
4930 struct perf_event_context *ctx,
4931 void *info)
4932{
4933 u64 value = *((u64 *)info);
4934 bool active;
4935
4936 if (event->attr.freq) {
4937 event->attr.sample_freq = value;
4938 } else {
4939 event->attr.sample_period = value;
4940 event->hw.sample_period = value;
4941 }
4942
4943 active = (event->state == PERF_EVENT_STATE_ACTIVE);
4944 if (active) {
4945 perf_pmu_disable(ctx->pmu);
4946
4947
4948
4949
4950 if (event->hw.interrupts == MAX_INTERRUPTS) {
4951 event->hw.interrupts = 0;
4952 perf_log_throttle(event, 1);
4953 }
4954 event->pmu->stop(event, PERF_EF_UPDATE);
4955 }
4956
4957 local64_set(&event->hw.period_left, 0);
4958
4959 if (active) {
4960 event->pmu->start(event, PERF_EF_RELOAD);
4961 perf_pmu_enable(ctx->pmu);
4962 }
4963}
4964
4965static int perf_event_period(struct perf_event *event, u64 __user *arg)
4966{
4967 u64 value;
4968
4969 if (!is_sampling_event(event))
4970 return -EINVAL;
4971
4972 if (copy_from_user(&value, arg, sizeof(value)))
4973 return -EFAULT;
4974
4975 if (!value)
4976 return -EINVAL;
4977
4978 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4979 return -EINVAL;
4980
4981 event_function_call(event, __perf_event_period, &value);
4982
4983 return 0;
4984}
4985
4986static const struct file_operations perf_fops;
4987
4988static inline int perf_fget_light(int fd, struct fd *p)
4989{
4990 struct fd f = fdget(fd);
4991 if (!f.file)
4992 return -EBADF;
4993
4994 if (f.file->f_op != &perf_fops) {
4995 fdput(f);
4996 return -EBADF;
4997 }
4998 *p = f;
4999 return 0;
5000}
5001
5002static int perf_event_set_output(struct perf_event *event,
5003 struct perf_event *output_event);
5004static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5005static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5006static int perf_copy_attr(struct perf_event_attr __user *uattr,
5007 struct perf_event_attr *attr);
5008
5009static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5010{
5011 void (*func)(struct perf_event *);
5012 u32 flags = arg;
5013
5014 switch (cmd) {
5015 case PERF_EVENT_IOC_ENABLE:
5016 func = _perf_event_enable;
5017 break;
5018 case PERF_EVENT_IOC_DISABLE:
5019 func = _perf_event_disable;
5020 break;
5021 case PERF_EVENT_IOC_RESET:
5022 func = _perf_event_reset;
5023 break;
5024
5025 case PERF_EVENT_IOC_REFRESH:
5026 return _perf_event_refresh(event, arg);
5027
5028 case PERF_EVENT_IOC_PERIOD:
5029 return perf_event_period(event, (u64 __user *)arg);
5030
5031 case PERF_EVENT_IOC_ID:
5032 {
5033 u64 id = primary_event_id(event);
5034
5035 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5036 return -EFAULT;
5037 return 0;
5038 }
5039
5040 case PERF_EVENT_IOC_SET_OUTPUT:
5041 {
5042 int ret;
5043 if (arg != -1) {
5044 struct perf_event *output_event;
5045 struct fd output;
5046 ret = perf_fget_light(arg, &output);
5047 if (ret)
5048 return ret;
5049 output_event = output.file->private_data;
5050 ret = perf_event_set_output(event, output_event);
5051 fdput(output);
5052 } else {
5053 ret = perf_event_set_output(event, NULL);
5054 }
5055 return ret;
5056 }
5057
5058 case PERF_EVENT_IOC_SET_FILTER:
5059 return perf_event_set_filter(event, (void __user *)arg);
5060
5061 case PERF_EVENT_IOC_SET_BPF:
5062 return perf_event_set_bpf_prog(event, arg);
5063
5064 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5065 struct ring_buffer *rb;
5066
5067 rcu_read_lock();
5068 rb = rcu_dereference(event->rb);
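/*
 * Pausing or resuming output only makes sense for a ring buffer that
 * actually has data pages allocated.
 */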
5069 if (!rb || !rb->nr_pages) {
5070 rcu_read_unlock();
5071 return -EINVAL;
5072 }
5073 rb_toggle_paused(rb, !!arg);
5074 rcu_read_unlock();
5075 return 0;
5076 }
5077
5078 case PERF_EVENT_IOC_QUERY_BPF:
5079 return perf_event_query_prog_array(event, (void __user *)arg);
5080
5081 case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5082 struct perf_event_attr new_attr;
5083 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5084 &new_attr);
5085
5086 if (err)
5087 return err;
5088
5089 return perf_event_modify_attr(event, &new_attr);
5090 }
5091 default:
5092 return -ENOTTY;
5093 }
5094
5095 if (flags & PERF_IOC_FLAG_GROUP)
5096 perf_event_for_each(event, func);
5097 else
5098 perf_event_for_each_child(event, func);
5099
5100 return 0;
5101}
5102
5103static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5104{
5105 struct perf_event *event = file->private_data;
5106 struct perf_event_context *ctx;
5107 long ret;
5108
5109 ctx = perf_event_ctx_lock(event);
5110 ret = _perf_ioctl(event, cmd, arg);
5111 perf_event_ctx_unlock(event, ctx);
5112
5113 return ret;
5114}
5115
5116#ifdef CONFIG_COMPAT
5117static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5118 unsigned long arg)
5119{
5120 switch (_IOC_NR(cmd)) {
5121 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5122 case _IOC_NR(PERF_EVENT_IOC_ID):
5123
5124 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
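/*
 * A compat task passes a 32-bit pointer; rewrite the encoded argument
 * size so the native handler accepts the command.
 */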
5125 cmd &= ~IOCSIZE_MASK;
5126 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5127 }
5128 break;
5129 }
5130 return perf_ioctl(file, cmd, arg);
5131}
5132#else
5133# define perf_compat_ioctl NULL
5134#endif
5135
5136int perf_event_task_enable(void)
5137{
5138 struct perf_event_context *ctx;
5139 struct perf_event *event;
5140
5141 mutex_lock(&current->perf_event_mutex);
5142 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5143 ctx = perf_event_ctx_lock(event);
5144 perf_event_for_each_child(event, _perf_event_enable);
5145 perf_event_ctx_unlock(event, ctx);
5146 }
5147 mutex_unlock(&current->perf_event_mutex);
5148
5149 return 0;
5150}
5151
5152int perf_event_task_disable(void)
5153{
5154 struct perf_event_context *ctx;
5155 struct perf_event *event;
5156
5157 mutex_lock(&current->perf_event_mutex);
5158 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5159 ctx = perf_event_ctx_lock(event);
5160 perf_event_for_each_child(event, _perf_event_disable);
5161 perf_event_ctx_unlock(event, ctx);
5162 }
5163 mutex_unlock(&current->perf_event_mutex);
5164
5165 return 0;
5166}
5167
5168static int perf_event_index(struct perf_event *event)
5169{
5170 if (event->hw.state & PERF_HES_STOPPED)
5171 return 0;
5172
5173 if (event->state != PERF_EVENT_STATE_ACTIVE)
5174 return 0;
5175
5176 return event->pmu->event_idx(event);
5177}
5178
5179static void calc_timer_values(struct perf_event *event,
5180 u64 *now,
5181 u64 *enabled,
5182 u64 *running)
5183{
5184 u64 ctx_time;
5185
5186 *now = perf_clock();
5187 ctx_time = event->shadow_ctx_time + *now;
5188 __perf_update_times(event, ctx_time, enabled, running);
5189}
5190
5191static void perf_event_init_userpage(struct perf_event *event)
5192{
5193 struct perf_event_mmap_page *userpg;
5194 struct ring_buffer *rb;
5195
5196 rcu_read_lock();
5197 rb = rcu_dereference(event->rb);
5198 if (!rb)
5199 goto unlock;
5200
5201 userpg = rb->user_page;
5202
5203
5204 userpg->cap_bit0_is_deprecated = 1;
5205 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5206 userpg->data_offset = PAGE_SIZE;
5207 userpg->data_size = perf_data_size(rb);
5208
5209unlock:
5210 rcu_read_unlock();
5211}
5212
5213void __weak arch_perf_update_userpage(
5214 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5215{
5216}
5217
/*
 * Update the counter state that userspace can read through the
 * mmap()ed control page.  Calls must not nest, or the lock/seqcount
 * scheme on the user page breaks down.
 */
5223void perf_event_update_userpage(struct perf_event *event)
5224{
5225 struct perf_event_mmap_page *userpg;
5226 struct ring_buffer *rb;
5227 u64 enabled, running, now;
5228
5229 rcu_read_lock();
5230 rb = rcu_dereference(event->rb);
5231 if (!rb)
5232 goto unlock;
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243 calc_timer_values(event, &now, &enabled, &running);
5244
5245 userpg = rb->user_page;
5246
5247
5248
5249
5250 preempt_disable();
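/*
 * userpg->lock is bumped before and after the update (with compiler
 * barriers in between), seqcount-style, so userspace can detect and
 * retry a torn read; preemption stays disabled to keep the timestamps
 * consistent.
 */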
5251 ++userpg->lock;
5252 barrier();
5253 userpg->index = perf_event_index(event);
5254 userpg->offset = perf_event_count(event);
5255 if (userpg->index)
5256 userpg->offset -= local64_read(&event->hw.prev_count);
5257
5258 userpg->time_enabled = enabled +
5259 atomic64_read(&event->child_total_time_enabled);
5260
5261 userpg->time_running = running +
5262 atomic64_read(&event->child_total_time_running);
5263
5264 arch_perf_update_userpage(event, userpg, now);
5265
5266 barrier();
5267 ++userpg->lock;
5268 preempt_enable();
5269unlock:
5270 rcu_read_unlock();
5271}
5272EXPORT_SYMBOL_GPL(perf_event_update_userpage);
5273
5274static int perf_mmap_fault(struct vm_fault *vmf)
5275{
5276 struct perf_event *event = vmf->vma->vm_file->private_data;
5277 struct ring_buffer *rb;
5278 int ret = VM_FAULT_SIGBUS;
5279
5280 if (vmf->flags & FAULT_FLAG_MKWRITE) {
5281 if (vmf->pgoff == 0)
5282 ret = 0;
5283 return ret;
5284 }
5285
5286 rcu_read_lock();
5287 rb = rcu_dereference(event->rb);
5288 if (!rb)
5289 goto unlock;
5290
5291 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5292 goto unlock;
5293
5294 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5295 if (!vmf->page)
5296 goto unlock;
5297
5298 get_page(vmf->page);
5299 vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5300 vmf->page->index = vmf->pgoff;
5301
5302 ret = 0;
5303unlock:
5304 rcu_read_unlock();
5305
5306 return ret;
5307}
5308
5309static void ring_buffer_attach(struct perf_event *event,
5310 struct ring_buffer *rb)
5311{
5312 struct ring_buffer *old_rb = NULL;
5313 unsigned long flags;
5314
5315 if (event->rb) {
5316
5317
5318
5319
5320 WARN_ON_ONCE(event->rcu_pending);
5321
5322 old_rb = event->rb;
5323 spin_lock_irqsave(&old_rb->event_lock, flags);
5324 list_del_rcu(&event->rb_entry);
5325 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5326
5327 event->rcu_batches = get_state_synchronize_rcu();
5328 event->rcu_pending = 1;
5329 }
5330
5331 if (rb) {
5332 if (event->rcu_pending) {
5333 cond_synchronize_rcu(event->rcu_batches);
5334 event->rcu_pending = 0;
5335 }
5336
5337 spin_lock_irqsave(&rb->event_lock, flags);
5338 list_add_rcu(&event->rb_entry, &rb->event_list);
5339 spin_unlock_irqrestore(&rb->event_lock, flags);
5340 }
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352 if (has_aux(event))
5353 perf_event_stop(event, 0);
5354
5355 rcu_assign_pointer(event->rb, rb);
5356
5357 if (old_rb) {
5358 ring_buffer_put(old_rb);
5359
5360
5361
5362
5363
5364 wake_up_all(&event->waitq);
5365 }
5366}
5367
5368static void ring_buffer_wakeup(struct perf_event *event)
5369{
5370 struct ring_buffer *rb;
5371
5372 rcu_read_lock();
5373 rb = rcu_dereference(event->rb);
5374 if (rb) {
5375 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5376 wake_up_all(&event->waitq);
5377 }
5378 rcu_read_unlock();
5379}
5380
5381struct ring_buffer *ring_buffer_get(struct perf_event *event)
5382{
5383 struct ring_buffer *rb;
5384
5385 rcu_read_lock();
5386 rb = rcu_dereference(event->rb);
5387 if (rb) {
5388 if (!atomic_inc_not_zero(&rb->refcount))
5389 rb = NULL;
5390 }
5391 rcu_read_unlock();
5392
5393 return rb;
5394}
5395
5396void ring_buffer_put(struct ring_buffer *rb)
5397{
5398 if (!atomic_dec_and_test(&rb->refcount))
5399 return;
5400
5401 WARN_ON_ONCE(!list_empty(&rb->event_list));
5402
5403 call_rcu(&rb->rcu_head, rb_free_rcu);
5404}
5405
5406static void perf_mmap_open(struct vm_area_struct *vma)
5407{
5408 struct perf_event *event = vma->vm_file->private_data;
5409
5410 atomic_inc(&event->mmap_count);
5411 atomic_inc(&event->rb->mmap_count);
5412
5413 if (vma->vm_pgoff)
5414 atomic_inc(&event->rb->aux_mmap_count);
5415
5416 if (event->pmu->event_mapped)
5417 event->pmu->event_mapped(event, vma->vm_mm);
5418}
5419
5420static void perf_pmu_output_stop(struct perf_event *event);
5421
5422
5423
5424
5425
5426
5427
5428
5429
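/*
 * A buffer can be mmap()ed multiple times; either directly through the same
 * event, or through other events by use of perf_event_set_output().
 *
 * In order to undo the VM accounting done by perf_mmap() we need to destroy
 * the ring buffer and detach all events redirecting into it, except the one
 * doing the unmapping.
 */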
5430static void perf_mmap_close(struct vm_area_struct *vma)
5431{
5432 struct perf_event *event = vma->vm_file->private_data;
5433
5434 struct ring_buffer *rb = ring_buffer_get(event);
5435 struct user_struct *mmap_user = rb->mmap_user;
5436 int mmap_locked = rb->mmap_locked;
5437 unsigned long size = perf_data_size(rb);
5438
5439 if (event->pmu->event_unmapped)
5440 event->pmu->event_unmapped(event, vma->vm_mm);
5441
5442
5443
5444
5445
5446
5447 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5448 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5449
5450
5451
5452
5453
5454
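		/*
		 * Stop all AUX events that are still writing to this buffer
		 * so its AUX pages can be freed; once aux_mmap_count is zero
		 * they will not restart.
		 */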
5455 perf_pmu_output_stop(event);
5456
5457
5458 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5459 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5460
5461
5462 rb_free_aux(rb);
5463 WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5464
5465 mutex_unlock(&event->mmap_mutex);
5466 }
5467
5468 atomic_dec(&rb->mmap_count);
5469
5470 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5471 goto out_put;
5472
5473 ring_buffer_attach(event, NULL);
5474 mutex_unlock(&event->mmap_mutex);
5475
5476
5477 if (atomic_read(&rb->mmap_count))
5478 goto out_put;
5479
5480
5481
5482
5483
5484
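	/*
	 * No other mmap()s: detach every event that might redirect into the
	 * now unreachable buffer. Complicated by the fact that rb->event_lock
	 * otherwise nests inside mmap_mutex.
	 */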
5485again:
5486 rcu_read_lock();
5487 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5488 if (!atomic_long_inc_not_zero(&event->refcount)) {
5489
5490
5491
5492
5493 continue;
5494 }
5495 rcu_read_unlock();
5496
5497 mutex_lock(&event->mmap_mutex);
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
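		/*
		 * Re-check under mmap_mutex: perf_event_set_output() may have
		 * swizzled the rb from under us while we waited for the lock.
		 * If the event now uses a different rb, skip it; restarting
		 * the iteration below keeps the list walk consistent.
		 */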
5508 if (event->rb == rb)
5509 ring_buffer_attach(event, NULL);
5510
5511 mutex_unlock(&event->mmap_mutex);
5512 put_event(event);
5513
5514
5515
5516
5517
5518 goto again;
5519 }
5520 rcu_read_unlock();
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
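	/*
	 * The buffer is now fully detached and unmapped; undo the VM
	 * accounting done in perf_mmap(). Any remaining 0-ref events on the
	 * list will be cleaned up by free_event() and drop their rb
	 * reference themselves.
	 */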
5531 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5532 vma->vm_mm->pinned_vm -= mmap_locked;
5533 free_uid(mmap_user);
5534
5535out_put:
5536 ring_buffer_put(rb);
5537}
5538
5539static const struct vm_operations_struct perf_mmap_vmops = {
5540 .open = perf_mmap_open,
5541 .close = perf_mmap_close,
5542 .fault = perf_mmap_fault,
5543 .page_mkwrite = perf_mmap_fault,
5544};
5545
5546static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5547{
5548 struct perf_event *event = file->private_data;
5549 unsigned long user_locked, user_lock_limit;
5550 struct user_struct *user = current_user();
5551 unsigned long locked, lock_limit;
5552 struct ring_buffer *rb = NULL;
5553 unsigned long vma_size;
5554 unsigned long nr_pages;
5555 long user_extra = 0, extra = 0;
5556 int ret = 0, flags = 0;
5557
5558
5559
5560
5561
5562
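	/*
	 * Don't allow mmap() of inherited per-task counters: all children
	 * would end up writing into the same buffer, which scales badly.
	 */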
5563 if (event->cpu == -1 && event->attr.inherit)
5564 return -EINVAL;
5565
5566 if (!(vma->vm_flags & VM_SHARED))
5567 return -EINVAL;
5568
5569 vma_size = vma->vm_end - vma->vm_start;
5570
5571 if (vma->vm_pgoff == 0) {
5572 nr_pages = (vma_size / PAGE_SIZE) - 1;
5573 } else {
5574
5575
5576
5577
5578
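		/*
		 * AUX area mapping: if rb->aux_nr_pages != 0 it is already
		 * mapped, and all subsequent mappings must match its size and
		 * offset; the AUX area must land above the normal data pages.
		 */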
5579 u64 aux_offset, aux_size;
5580
5581 if (!event->rb)
5582 return -EINVAL;
5583
5584 nr_pages = vma_size / PAGE_SIZE;
5585
5586 mutex_lock(&event->mmap_mutex);
5587 ret = -EINVAL;
5588
5589 rb = event->rb;
5590 if (!rb)
5591 goto aux_unlock;
5592
5593 aux_offset = READ_ONCE(rb->user_page->aux_offset);
5594 aux_size = READ_ONCE(rb->user_page->aux_size);
5595
5596 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5597 goto aux_unlock;
5598
5599 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5600 goto aux_unlock;
5601
5602
5603 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5604 goto aux_unlock;
5605
5606 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5607 goto aux_unlock;
5608
5609
5610 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5611 goto aux_unlock;
5612
5613 if (!is_power_of_2(nr_pages))
5614 goto aux_unlock;
5615
5616 if (!atomic_inc_not_zero(&rb->mmap_count))
5617 goto aux_unlock;
5618
5619 if (rb_has_aux(rb)) {
5620 atomic_inc(&rb->aux_mmap_count);
5621 ret = 0;
5622 goto unlock;
5623 }
5624
5625 atomic_set(&rb->aux_mmap_count, 1);
5626 user_extra = nr_pages;
5627
5628 goto accounting;
5629 }
5630
5631
5632
5633
5634
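	/*
	 * Require a power-of-two number of data pages so the ring buffer
	 * can use bitmasks instead of modulo arithmetic.
	 */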
5635 if (nr_pages != 0 && !is_power_of_2(nr_pages))
5636 return -EINVAL;
5637
5638 if (vma_size != PAGE_SIZE * (1 + nr_pages))
5639 return -EINVAL;
5640
5641 WARN_ON_ONCE(event->ctx->parent_ctx);
5642again:
5643 mutex_lock(&event->mmap_mutex);
5644 if (event->rb) {
5645 if (event->rb->nr_pages != nr_pages) {
5646 ret = -EINVAL;
5647 goto unlock;
5648 }
5649
5650 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5651
5652
5653
5654
5655
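			/*
			 * Raced against perf_mmap_close() through
			 * perf_event_set_output(); drop the mutex and retry.
			 */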
5656 mutex_unlock(&event->mmap_mutex);
5657 goto again;
5658 }
5659
5660 goto unlock;
5661 }
5662
5663 user_extra = nr_pages + 1;
5664
5665accounting:
5666 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5667
5668
5669
5670
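	/* Scale the per-user mlock limit linearly with the number of CPUs. */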
5671 user_lock_limit *= num_online_cpus();
5672
5673 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5674
5675 if (user_locked > user_lock_limit)
5676 extra = user_locked - user_lock_limit;
5677
5678 lock_limit = rlimit(RLIMIT_MEMLOCK);
5679 lock_limit >>= PAGE_SHIFT;
5680 locked = vma->vm_mm->pinned_vm + extra;
5681
5682 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5683 !capable(CAP_IPC_LOCK)) {
5684 ret = -EPERM;
5685 goto unlock;
5686 }
5687
5688 WARN_ON(!rb && event->rb);
5689
5690 if (vma->vm_flags & VM_WRITE)
5691 flags |= RING_BUFFER_WRITABLE;
5692
5693 if (!rb) {
5694 rb = rb_alloc(nr_pages,
5695 event->attr.watermark ? event->attr.wakeup_watermark : 0,
5696 event->cpu, flags);
5697
5698 if (!rb) {
5699 ret = -ENOMEM;
5700 goto unlock;
5701 }
5702
5703 atomic_set(&rb->mmap_count, 1);
5704 rb->mmap_user = get_current_user();
5705 rb->mmap_locked = extra;
5706
5707 ring_buffer_attach(event, rb);
5708
5709 perf_event_init_userpage(event);
5710 perf_event_update_userpage(event);
5711 } else {
5712 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5713 event->attr.aux_watermark, flags);
5714 if (!ret)
5715 rb->aux_mmap_locked = extra;
5716 }
5717
5718unlock:
5719 if (!ret) {
5720 atomic_long_add(user_extra, &user->locked_vm);
5721 vma->vm_mm->pinned_vm += extra;
5722
5723 atomic_inc(&event->mmap_count);
5724 } else if (rb) {
5725 atomic_dec(&rb->mmap_count);
5726 }
5727aux_unlock:
5728 mutex_unlock(&event->mmap_mutex);
5729
5730
5731
5732
5733
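	/*
	 * The pinned-memory accounting is per mm, so fork() must not copy
	 * this vma; also forbid expanding or dumping it.
	 */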
5734 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5735 vma->vm_ops = &perf_mmap_vmops;
5736
5737 if (event->pmu->event_mapped)
5738 event->pmu->event_mapped(event, vma->vm_mm);
5739
5740 return ret;
5741}
5742
5743static int perf_fasync(int fd, struct file *filp, int on)
5744{
5745 struct inode *inode = file_inode(filp);
5746 struct perf_event *event = filp->private_data;
5747 int retval;
5748
5749 inode_lock(inode);
5750 retval = fasync_helper(fd, filp, on, &event->fasync);
5751 inode_unlock(inode);
5752
5753 if (retval < 0)
5754 return retval;
5755
5756 return 0;
5757}
5758
5759static const struct file_operations perf_fops = {
5760 .llseek = no_llseek,
5761 .release = perf_release,
5762 .read = perf_read,
5763 .poll = perf_poll,
5764 .unlocked_ioctl = perf_ioctl,
5765 .compat_ioctl = perf_compat_ioctl,
5766 .mmap = perf_mmap,
5767 .fasync = perf_fasync,
5768};
5769
5770
5771
5772
5773
5774
5775
5776
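/*
 * Perf event wakeup
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */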
5777static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5778{
5779
5780 if (event->parent)
5781 event = event->parent;
5782 return &event->fasync;
5783}
5784
5785void perf_event_wakeup(struct perf_event *event)
5786{
5787 ring_buffer_wakeup(event);
5788
5789 if (event->pending_kill) {
5790 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5791 event->pending_kill = 0;
5792 }
5793}
5794
5795static void perf_pending_event(struct irq_work *entry)
5796{
5797 struct perf_event *event = container_of(entry,
5798 struct perf_event, pending);
5799 int rctx;
5800
5801 rctx = perf_swevent_get_recursion_context();
5802
5803
5804
5805
5806
5807 if (event->pending_disable) {
5808 event->pending_disable = 0;
5809 perf_event_disable_local(event);
5810 }
5811
5812 if (event->pending_wakeup) {
5813 event->pending_wakeup = 0;
5814 perf_event_wakeup(event);
5815 }
5816
5817 if (rctx >= 0)
5818 perf_swevent_put_recursion_context(rctx);
5819}
5820
5821
5822
5823
5824
5825
5826struct perf_guest_info_callbacks *perf_guest_cbs;
5827
5828int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5829{
5830 perf_guest_cbs = cbs;
5831 return 0;
5832}
5833EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5834
5835int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5836{
5837 perf_guest_cbs = NULL;
5838 return 0;
5839}
5840EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5841
5842static void
5843perf_output_sample_regs(struct perf_output_handle *handle,
5844 struct pt_regs *regs, u64 mask)
5845{
5846 int bit;
5847 DECLARE_BITMAP(_mask, 64);
5848
5849 bitmap_from_u64(_mask, mask);
5850 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5851 u64 val;
5852
5853 val = perf_reg_value(regs, bit);
5854 perf_output_put(handle, val);
5855 }
5856}
5857
5858static void perf_sample_regs_user(struct perf_regs *regs_user,
5859 struct pt_regs *regs,
5860 struct pt_regs *regs_user_copy)
5861{
5862 if (user_mode(regs)) {
5863 regs_user->abi = perf_reg_abi(current);
5864 regs_user->regs = regs;
5865 } else if (current->mm) {
5866 perf_get_regs_user(regs_user, regs, regs_user_copy);
5867 } else {
5868 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5869 regs_user->regs = NULL;
5870 }
5871}
5872
5873static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5874 struct pt_regs *regs)
5875{
5876 regs_intr->regs = regs;
5877 regs_intr->abi = perf_reg_abi(current);
5878}
5879
5880
5881
5882
5883
5884
5885
5886
5887
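/*
 * Remaining user stack size below the current user stack pointer.
 * Taking the precise stack vma boundaries is not safe from interrupt
 * context, so TASK_SIZE is used as the upper limit.
 */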
5888static u64 perf_ustack_task_size(struct pt_regs *regs)
5889{
5890 unsigned long addr = perf_user_stack_pointer(regs);
5891
5892 if (!addr || addr >= TASK_SIZE)
5893 return 0;
5894
5895 return TASK_SIZE - addr;
5896}
5897
5898static u16
5899perf_sample_ustack_size(u16 stack_size, u16 header_size,
5900 struct pt_regs *regs)
5901{
5902 u64 task_size;
5903
5904
5905 if (!regs)
5906 return 0;
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
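	/*
	 * Clamp the dump to what actually lies below the user stack pointer
	 * (bounded by TASK_SIZE) and, further down, to what still fits in
	 * the remaining u16 sample size.
	 */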
5918 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5919 stack_size = min(stack_size, (u16) task_size);
5920
5921
5922 header_size += 2 * sizeof(u64);
5923
5924
5925 if ((u16) (header_size + stack_size) < header_size) {
5926
5927
5928
5929
5930 stack_size = USHRT_MAX - header_size - sizeof(u64);
5931 stack_size = round_up(stack_size, sizeof(u64));
5932 }
5933
5934 return stack_size;
5935}
5936
5937static void
5938perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5939 struct pt_regs *regs)
5940{
5941
5942 if (!regs) {
5943 u64 size = 0;
5944 perf_output_put(handle, size);
5945 } else {
5946 unsigned long sp;
5947 unsigned int rem;
5948 u64 dyn_size;
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
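		/*
		 * Layout of the dump: the static size (as requested/clamped),
		 * the stack data itself, then the dynamic size that was
		 * actually copied from user space.
		 */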
5962 perf_output_put(handle, dump_size);
5963
5964
5965 sp = perf_user_stack_pointer(regs);
5966 rem = __output_copy_user(handle, (void *) sp, dump_size);
5967 dyn_size = dump_size - rem;
5968
5969 perf_output_skip(handle, rem);
5970
5971
5972 perf_output_put(handle, dyn_size);
5973 }
5974}
5975
5976static void __perf_event_header__init_id(struct perf_event_header *header,
5977 struct perf_sample_data *data,
5978 struct perf_event *event)
5979{
5980 u64 sample_type = event->attr.sample_type;
5981
5982 data->type = sample_type;
5983 header->size += event->id_header_size;
5984
5985 if (sample_type & PERF_SAMPLE_TID) {
5986
5987 data->tid_entry.pid = perf_event_pid(event, current);
5988 data->tid_entry.tid = perf_event_tid(event, current);
5989 }
5990
5991 if (sample_type & PERF_SAMPLE_TIME)
5992 data->time = perf_event_clock(event);
5993
5994 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5995 data->id = primary_event_id(event);
5996
5997 if (sample_type & PERF_SAMPLE_STREAM_ID)
5998 data->stream_id = event->id;
5999
6000 if (sample_type & PERF_SAMPLE_CPU) {
6001 data->cpu_entry.cpu = raw_smp_processor_id();
6002 data->cpu_entry.reserved = 0;
6003 }
6004}
6005
6006void perf_event_header__init_id(struct perf_event_header *header,
6007 struct perf_sample_data *data,
6008 struct perf_event *event)
6009{
6010 if (event->attr.sample_id_all)
6011 __perf_event_header__init_id(header, data, event);
6012}
6013
6014static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6015 struct perf_sample_data *data)
6016{
6017 u64 sample_type = data->type;
6018
6019 if (sample_type & PERF_SAMPLE_TID)
6020 perf_output_put(handle, data->tid_entry);
6021
6022 if (sample_type & PERF_SAMPLE_TIME)
6023 perf_output_put(handle, data->time);
6024
6025 if (sample_type & PERF_SAMPLE_ID)
6026 perf_output_put(handle, data->id);
6027
6028 if (sample_type & PERF_SAMPLE_STREAM_ID)
6029 perf_output_put(handle, data->stream_id);
6030
6031 if (sample_type & PERF_SAMPLE_CPU)
6032 perf_output_put(handle, data->cpu_entry);
6033
6034 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6035 perf_output_put(handle, data->id);
6036}
6037
6038void perf_event__output_id_sample(struct perf_event *event,
6039 struct perf_output_handle *handle,
6040 struct perf_sample_data *sample)
6041{
6042 if (event->attr.sample_id_all)
6043 __perf_event__output_id_sample(handle, sample);
6044}
6045
6046static void perf_output_read_one(struct perf_output_handle *handle,
6047 struct perf_event *event,
6048 u64 enabled, u64 running)
6049{
6050 u64 read_format = event->attr.read_format;
6051 u64 values[4];
6052 int n = 0;
6053
6054 values[n++] = perf_event_count(event);
6055 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6056 values[n++] = enabled +
6057 atomic64_read(&event->child_total_time_enabled);
6058 }
6059 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6060 values[n++] = running +
6061 atomic64_read(&event->child_total_time_running);
6062 }
6063 if (read_format & PERF_FORMAT_ID)
6064 values[n++] = primary_event_id(event);
6065
6066 __output_copy(handle, values, n * sizeof(u64));
6067}
6068
6069static void perf_output_read_group(struct perf_output_handle *handle,
6070 struct perf_event *event,
6071 u64 enabled, u64 running)
6072{
6073 struct perf_event *leader = event->group_leader, *sub;
6074 u64 read_format = event->attr.read_format;
6075 u64 values[5];
6076 int n = 0;
6077
6078 values[n++] = 1 + leader->nr_siblings;
6079
6080 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6081 values[n++] = enabled;
6082
6083 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6084 values[n++] = running;
6085
6086 if ((leader != event) &&
6087 (leader->state == PERF_EVENT_STATE_ACTIVE))
6088 leader->pmu->read(leader);
6089
6090 values[n++] = perf_event_count(leader);
6091 if (read_format & PERF_FORMAT_ID)
6092 values[n++] = primary_event_id(leader);
6093
6094 __output_copy(handle, values, n * sizeof(u64));
6095
6096 for_each_sibling_event(sub, leader) {
6097 n = 0;
6098
6099 if ((sub != event) &&
6100 (sub->state == PERF_EVENT_STATE_ACTIVE))
6101 sub->pmu->read(sub);
6102
6103 values[n++] = perf_event_count(sub);
6104 if (read_format & PERF_FORMAT_ID)
6105 values[n++] = primary_event_id(sub);
6106
6107 __output_copy(handle, values, n * sizeof(u64));
6108 }
6109}
6110
6111#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
6112 PERF_FORMAT_TOTAL_TIME_RUNNING)
6113
6114
6115
6116
6117
6118
6119
6120
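/*
 * XXX PERF_SAMPLE_READ vs inherited events is awkward: iterating the child
 * list here would be both expensive and racy, and children running on other
 * CPUs cannot be read atomically. Inherited events therefore report
 * per-thread values.
 */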
6121static void perf_output_read(struct perf_output_handle *handle,
6122 struct perf_event *event)
6123{
6124 u64 enabled = 0, running = 0, now;
6125 u64 read_format = event->attr.read_format;
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
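	/*
	 * Compute enabled/running from the snapshot values taken when the
	 * event was last scheduled in; update_context_time() cannot be used
	 * here because this can be called from NMI context.
	 */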
6136 if (read_format & PERF_FORMAT_TOTAL_TIMES)
6137 calc_timer_values(event, &now, &enabled, &running);
6138
6139 if (event->attr.read_format & PERF_FORMAT_GROUP)
6140 perf_output_read_group(handle, event, enabled, running);
6141 else
6142 perf_output_read_one(handle, event, enabled, running);
6143}
6144
6145void perf_output_sample(struct perf_output_handle *handle,
6146 struct perf_event_header *header,
6147 struct perf_sample_data *data,
6148 struct perf_event *event)
6149{
6150 u64 sample_type = data->type;
6151
6152 perf_output_put(handle, *header);
6153
6154 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6155 perf_output_put(handle, data->id);
6156
6157 if (sample_type & PERF_SAMPLE_IP)
6158 perf_output_put(handle, data->ip);
6159
6160 if (sample_type & PERF_SAMPLE_TID)
6161 perf_output_put(handle, data->tid_entry);
6162
6163 if (sample_type & PERF_SAMPLE_TIME)
6164 perf_output_put(handle, data->time);
6165
6166 if (sample_type & PERF_SAMPLE_ADDR)
6167 perf_output_put(handle, data->addr);
6168
6169 if (sample_type & PERF_SAMPLE_ID)
6170 perf_output_put(handle, data->id);
6171
6172 if (sample_type & PERF_SAMPLE_STREAM_ID)
6173 perf_output_put(handle, data->stream_id);
6174
6175 if (sample_type & PERF_SAMPLE_CPU)
6176 perf_output_put(handle, data->cpu_entry);
6177
6178 if (sample_type & PERF_SAMPLE_PERIOD)
6179 perf_output_put(handle, data->period);
6180
6181 if (sample_type & PERF_SAMPLE_READ)
6182 perf_output_read(handle, event);
6183
6184 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6185 int size = 1;
6186
6187 size += data->callchain->nr;
6188 size *= sizeof(u64);
6189 __output_copy(handle, data->callchain, size);
6190 }
6191
6192 if (sample_type & PERF_SAMPLE_RAW) {
6193 struct perf_raw_record *raw = data->raw;
6194
6195 if (raw) {
6196 struct perf_raw_frag *frag = &raw->frag;
6197
6198 perf_output_put(handle, raw->size);
6199 do {
6200 if (frag->copy) {
6201 __output_custom(handle, frag->copy,
6202 frag->data, frag->size);
6203 } else {
6204 __output_copy(handle, frag->data,
6205 frag->size);
6206 }
6207 if (perf_raw_frag_last(frag))
6208 break;
6209 frag = frag->next;
6210 } while (1);
6211 if (frag->pad)
6212 __output_skip(handle, NULL, frag->pad);
6213 } else {
6214 struct {
6215 u32 size;
6216 u32 data;
6217 } raw = {
6218 .size = sizeof(u32),
6219 .data = 0,
6220 };
6221 perf_output_put(handle, raw);
6222 }
6223 }
6224
6225 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6226 if (data->br_stack) {
6227 size_t size;
6228
6229 size = data->br_stack->nr
6230 * sizeof(struct perf_branch_entry);
6231
6232 perf_output_put(handle, data->br_stack->nr);
6233 perf_output_copy(handle, data->br_stack->entries, size);
6234 } else {
6235
6236
6237
6238 u64 nr = 0;
6239 perf_output_put(handle, nr);
6240 }
6241 }
6242
6243 if (sample_type & PERF_SAMPLE_REGS_USER) {
6244 u64 abi = data->regs_user.abi;
6245
6246
6247
6248
6249
6250 perf_output_put(handle, abi);
6251
6252 if (abi) {
6253 u64 mask = event->attr.sample_regs_user;
6254 perf_output_sample_regs(handle,
6255 data->regs_user.regs,
6256 mask);
6257 }
6258 }
6259
6260 if (sample_type & PERF_SAMPLE_STACK_USER) {
6261 perf_output_sample_ustack(handle,
6262 data->stack_user_size,
6263 data->regs_user.regs);
6264 }
6265
6266 if (sample_type & PERF_SAMPLE_WEIGHT)
6267 perf_output_put(handle, data->weight);
6268
6269 if (sample_type & PERF_SAMPLE_DATA_SRC)
6270 perf_output_put(handle, data->data_src.val);
6271
6272 if (sample_type & PERF_SAMPLE_TRANSACTION)
6273 perf_output_put(handle, data->txn);
6274
6275 if (sample_type & PERF_SAMPLE_REGS_INTR) {
6276 u64 abi = data->regs_intr.abi;
6277
6278
6279
6280
6281 perf_output_put(handle, abi);
6282
6283 if (abi) {
6284 u64 mask = event->attr.sample_regs_intr;
6285
6286 perf_output_sample_regs(handle,
6287 data->regs_intr.regs,
6288 mask);
6289 }
6290 }
6291
6292 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6293 perf_output_put(handle, data->phys_addr);
6294
6295 if (!event->attr.watermark) {
6296 int wakeup_events = event->attr.wakeup_events;
6297
6298 if (wakeup_events) {
6299 struct ring_buffer *rb = handle->rb;
6300 int events = local_inc_return(&rb->events);
6301
6302 if (events >= wakeup_events) {
6303 local_sub(wakeup_events, &rb->events);
6304 local_inc(&rb->wakeup);
6305 }
6306 }
6307 }
6308}
6309
6310static u64 perf_virt_to_phys(u64 virt)
6311{
6312 u64 phys_addr = 0;
6313 struct page *p = NULL;
6314
6315 if (!virt)
6316 return 0;
6317
6318 if (virt >= TASK_SIZE) {
6319
6320 if (virt_addr_valid((void *)(uintptr_t)virt) &&
6321 !(virt >= VMALLOC_START && virt < VMALLOC_END))
6322 phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
6323 } else {
6324
6325
6326
6327
6328
6329
6330
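		/*
		 * Walk the user page tables with the IRQ-safe
		 * __get_user_pages_fast(); interrupts are disabled here,
		 * which prevents the page tables from being torn down.
		 * On failure, phys_addr stays 0.
		 */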
6331 if ((current->mm != NULL) &&
6332 (__get_user_pages_fast(virt, 1, 0, &p) == 1))
6333 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
6334
6335 if (p)
6336 put_page(p);
6337 }
6338
6339 return phys_addr;
6340}
6341
6342static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
6343
6344static struct perf_callchain_entry *
6345perf_callchain(struct perf_event *event, struct pt_regs *regs)
6346{
6347 bool kernel = !event->attr.exclude_callchain_kernel;
6348 bool user = !event->attr.exclude_callchain_user;
6349
6350 bool crosstask = event->ctx->task && event->ctx->task != current;
6351 const u32 max_stack = event->attr.sample_max_stack;
6352 struct perf_callchain_entry *callchain;
6353
6354 if (!kernel && !user)
6355 return &__empty_callchain;
6356
6357 callchain = get_perf_callchain(regs, 0, kernel, user,
6358 max_stack, crosstask, true);
6359 return callchain ?: &__empty_callchain;
6360}
6361
6362void perf_prepare_sample(struct perf_event_header *header,
6363 struct perf_sample_data *data,
6364 struct perf_event *event,
6365 struct pt_regs *regs)
6366{
6367 u64 sample_type = event->attr.sample_type;
6368
6369 header->type = PERF_RECORD_SAMPLE;
6370 header->size = sizeof(*header) + event->header_size;
6371
6372 header->misc = 0;
6373 header->misc |= perf_misc_flags(regs);
6374
6375 __perf_event_header__init_id(header, data, event);
6376
6377 if (sample_type & PERF_SAMPLE_IP)
6378 data->ip = perf_instruction_pointer(regs);
6379
6380 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6381 int size = 1;
6382
6383 data->callchain = perf_callchain(event, regs);
6384 size += data->callchain->nr;
6385
6386 header->size += size * sizeof(u64);
6387 }
6388
6389 if (sample_type & PERF_SAMPLE_RAW) {
6390 struct perf_raw_record *raw = data->raw;
6391 int size;
6392
6393 if (raw) {
6394 struct perf_raw_frag *frag = &raw->frag;
6395 u32 sum = 0;
6396
6397 do {
6398 sum += frag->size;
6399 if (perf_raw_frag_last(frag))
6400 break;
6401 frag = frag->next;
6402 } while (1);
6403
6404 size = round_up(sum + sizeof(u32), sizeof(u64));
6405 raw->size = size - sizeof(u32);
6406 frag->pad = raw->size - sum;
6407 } else {
6408 size = sizeof(u64);
6409 }
6410
6411 header->size += size;
6412 }
6413
6414 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6415 int size = sizeof(u64);
6416 if (data->br_stack) {
6417 size += data->br_stack->nr
6418 * sizeof(struct perf_branch_entry);
6419 }
6420 header->size += size;
6421 }
6422
6423 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6424 perf_sample_regs_user(&data->regs_user, regs,
6425 &data->regs_user_copy);
6426
6427 if (sample_type & PERF_SAMPLE_REGS_USER) {
6428
6429 int size = sizeof(u64);
6430
6431 if (data->regs_user.regs) {
6432 u64 mask = event->attr.sample_regs_user;
6433 size += hweight64(mask) * sizeof(u64);
6434 }
6435
6436 header->size += size;
6437 }
6438
6439 if (sample_type & PERF_SAMPLE_STACK_USER) {
6440
6441
6442
6443
6444
6445
6446 u16 stack_size = event->attr.sample_stack_user;
6447 u16 size = sizeof(u64);
6448
6449 stack_size = perf_sample_ustack_size(stack_size, header->size,
6450 data->regs_user.regs);
6451
6452
6453
6454
6455
6456
6457 if (stack_size)
6458 size += sizeof(u64) + stack_size;
6459
6460 data->stack_user_size = stack_size;
6461 header->size += size;
6462 }
6463
6464 if (sample_type & PERF_SAMPLE_REGS_INTR) {
6465
6466 int size = sizeof(u64);
6467
6468 perf_sample_regs_intr(&data->regs_intr, regs);
6469
6470 if (data->regs_intr.regs) {
6471 u64 mask = event->attr.sample_regs_intr;
6472
6473 size += hweight64(mask) * sizeof(u64);
6474 }
6475
6476 header->size += size;
6477 }
6478
6479 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6480 data->phys_addr = perf_virt_to_phys(data->addr);
6481}
6482
6483static __always_inline void
6484__perf_event_output(struct perf_event *event,
6485 struct perf_sample_data *data,
6486 struct pt_regs *regs,
6487 int (*output_begin)(struct perf_output_handle *,
6488 struct perf_event *,
6489 unsigned int))
6490{
6491 struct perf_output_handle handle;
6492 struct perf_event_header header;
6493
6494
6495 rcu_read_lock();
6496
6497 perf_prepare_sample(&header, data, event, regs);
6498
6499 if (output_begin(&handle, event, header.size))
6500 goto exit;
6501
6502 perf_output_sample(&handle, &header, data, event);
6503
6504 perf_output_end(&handle);
6505
6506exit:
6507 rcu_read_unlock();
6508}
6509
6510void
6511perf_event_output_forward(struct perf_event *event,
6512 struct perf_sample_data *data,
6513 struct pt_regs *regs)
6514{
6515 __perf_event_output(event, data, regs, perf_output_begin_forward);
6516}
6517
6518void
6519perf_event_output_backward(struct perf_event *event,
6520 struct perf_sample_data *data,
6521 struct pt_regs *regs)
6522{
6523 __perf_event_output(event, data, regs, perf_output_begin_backward);
6524}
6525
6526void
6527perf_event_output(struct perf_event *event,
6528 struct perf_sample_data *data,
6529 struct pt_regs *regs)
6530{
6531 __perf_event_output(event, data, regs, perf_output_begin);
6532}
6533
6534
6535
6536
6537
6538struct perf_read_event {
6539 struct perf_event_header header;
6540
6541 u32 pid;
6542 u32 tid;
6543};
6544
6545static void
6546perf_event_read_event(struct perf_event *event,
6547 struct task_struct *task)
6548{
6549 struct perf_output_handle handle;
6550 struct perf_sample_data sample;
6551 struct perf_read_event read_event = {
6552 .header = {
6553 .type = PERF_RECORD_READ,
6554 .misc = 0,
6555 .size = sizeof(read_event) + event->read_size,
6556 },
6557 .pid = perf_event_pid(event, task),
6558 .tid = perf_event_tid(event, task),
6559 };
6560 int ret;
6561
6562 perf_event_header__init_id(&read_event.header, &sample, event);
6563 ret = perf_output_begin(&handle, event, read_event.header.size);
6564 if (ret)
6565 return;
6566
6567 perf_output_put(&handle, read_event);
6568 perf_output_read(&handle, event);
6569 perf_event__output_id_sample(event, &handle, &sample);
6570
6571 perf_output_end(&handle);
6572}
6573
6574typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6575
6576static void
6577perf_iterate_ctx(struct perf_event_context *ctx,
6578 perf_iterate_f output,
6579 void *data, bool all)
6580{
6581 struct perf_event *event;
6582
6583 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6584 if (!all) {
6585 if (event->state < PERF_EVENT_STATE_INACTIVE)
6586 continue;
6587 if (!event_filter_match(event))
6588 continue;
6589 }
6590
6591 output(event, data);
6592 }
6593}
6594
6595static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6596{
6597 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6598 struct perf_event *event;
6599
6600 list_for_each_entry_rcu(event, &pel->list, sb_list) {
6601
6602
6603
6604
6605
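		/*
		 * Skip events that are not fully formed yet; ensure that if
		 * we observe event->ctx, both event and ctx are complete
		 * enough (pairs with perf_install_in_context()).
		 */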
6606 if (!smp_load_acquire(&event->ctx))
6607 continue;
6608
6609 if (event->state < PERF_EVENT_STATE_INACTIVE)
6610 continue;
6611 if (!event_filter_match(event))
6612 continue;
6613 output(event, data);
6614 }
6615}
6616
6617
6618
6619
6620
6621
6622
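/*
 * Iterate all events that need to receive side-band events.
 *
 * New callers must make sure account_pmu_sb_event() covers their event,
 * otherwise it might not get delivered.
 */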
6623static void
6624perf_iterate_sb(perf_iterate_f output, void *data,
6625 struct perf_event_context *task_ctx)
6626{
6627 struct perf_event_context *ctx;
6628 int ctxn;
6629
6630 rcu_read_lock();
6631 preempt_disable();
6632
6633
6634
6635
6636
6637
6638 if (task_ctx) {
6639 perf_iterate_ctx(task_ctx, output, data, false);
6640 goto done;
6641 }
6642
6643 perf_iterate_sb_cpu(output, data);
6644
6645 for_each_task_context_nr(ctxn) {
6646 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6647 if (ctx)
6648 perf_iterate_ctx(ctx, output, data, false);
6649 }
6650done:
6651 preempt_enable();
6652 rcu_read_unlock();
6653}
6654
6655
6656
6657
6658
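/*
 * Clear all file-based address filters at exec; they will be re-instated
 * when (and if) these objects are mmap()ed again.
 */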
6659static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6660{
6661 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6662 struct perf_addr_filter *filter;
6663 unsigned int restart = 0, count = 0;
6664 unsigned long flags;
6665
6666 if (!has_addr_filter(event))
6667 return;
6668
6669 raw_spin_lock_irqsave(&ifh->lock, flags);
6670 list_for_each_entry(filter, &ifh->list, entry) {
6671 if (filter->inode) {
6672 event->addr_filters_offs[count] = 0;
6673 restart++;
6674 }
6675
6676 count++;
6677 }
6678
6679 if (restart)
6680 event->addr_filters_gen++;
6681 raw_spin_unlock_irqrestore(&ifh->lock, flags);
6682
6683 if (restart)
6684 perf_event_stop(event, 1);
6685}
6686
6687void perf_event_exec(void)
6688{
6689 struct perf_event_context *ctx;
6690 int ctxn;
6691
6692 rcu_read_lock();
6693 for_each_task_context_nr(ctxn) {
6694 ctx = current->perf_event_ctxp[ctxn];
6695 if (!ctx)
6696 continue;
6697
6698 perf_event_enable_on_exec(ctxn);
6699
6700 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6701 true);
6702 }
6703 rcu_read_unlock();
6704}
6705
6706struct remote_output {
6707 struct ring_buffer *rb;
6708 int err;
6709};
6710
6711static void __perf_event_output_stop(struct perf_event *event, void *data)
6712{
6713 struct perf_event *parent = event->parent;
6714 struct remote_output *ro = data;
6715 struct ring_buffer *rb = ro->rb;
6716 struct stop_event_data sd = {
6717 .event = event,
6718 };
6719
6720 if (!has_aux(event))
6721 return;
6722
6723 if (!parent)
6724 parent = event;
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
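	/*
	 * With inheritance the parent links to the ring buffer while the
	 * child actually uses it, so compare against the parent's rb. This
	 * may race with ring_buffer_attach() (via set_output), which is why
	 * ring_buffer_attach() stops an AUX event before re-assigning its
	 * rb pointer.
	 */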
6736 if (rcu_dereference(parent->rb) == rb)
6737 ro->err = __perf_event_stop(&sd);
6738}
6739
6740static int __perf_pmu_output_stop(void *info)
6741{
6742 struct perf_event *event = info;
6743 struct pmu *pmu = event->pmu;
6744 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6745 struct remote_output ro = {
6746 .rb = event->rb,
6747 };
6748
6749 rcu_read_lock();
6750 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6751 if (cpuctx->task_ctx)
6752 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6753 &ro, false);
6754 rcu_read_unlock();
6755
6756 return ro.err;
6757}
6758
6759static void perf_pmu_output_stop(struct perf_event *event)
6760{
6761 struct perf_event *iter;
6762 int err, cpu;
6763
6764restart:
6765 rcu_read_lock();
6766 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6767
6768
6769
6770
6771
6772
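		/*
		 * For per-CPU events use their CPU; for task events
		 * (cpu == -1) it is sufficient to IPI whichever CPU the
		 * event is currently running on, if any.
		 */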
6773 cpu = iter->cpu;
6774 if (cpu == -1)
6775 cpu = READ_ONCE(iter->oncpu);
6776
6777 if (cpu == -1)
6778 continue;
6779
6780 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6781 if (err == -EAGAIN) {
6782 rcu_read_unlock();
6783 goto restart;
6784 }
6785 }
6786 rcu_read_unlock();
6787}
6788
6789
6790
6791
6792
6793
6794
6795struct perf_task_event {
6796 struct task_struct *task;
6797 struct perf_event_context *task_ctx;
6798
6799 struct {
6800 struct perf_event_header header;
6801
6802 u32 pid;
6803 u32 ppid;
6804 u32 tid;
6805 u32 ptid;
6806 u64 time;
6807 } event_id;
6808};
6809
6810static int perf_event_task_match(struct perf_event *event)
6811{
6812 return event->attr.comm || event->attr.mmap ||
6813 event->attr.mmap2 || event->attr.mmap_data ||
6814 event->attr.task;
6815}
6816
6817static void perf_event_task_output(struct perf_event *event,
6818 void *data)
6819{
6820 struct perf_task_event *task_event = data;
6821 struct perf_output_handle handle;
6822 struct perf_sample_data sample;
6823 struct task_struct *task = task_event->task;
6824 int ret, size = task_event->event_id.header.size;
6825
6826 if (!perf_event_task_match(event))
6827 return;
6828
6829 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6830
6831 ret = perf_output_begin(&handle, event,
6832 task_event->event_id.header.size);
6833 if (ret)
6834 goto out;
6835
6836 task_event->event_id.pid = perf_event_pid(event, task);
6837 task_event->event_id.ppid = perf_event_pid(event, current);
6838
6839 task_event->event_id.tid = perf_event_tid(event, task);
6840 task_event->event_id.ptid = perf_event_tid(event, current);
6841
6842 task_event->event_id.time = perf_event_clock(event);
6843
6844 perf_output_put(&handle, task_event->event_id);
6845
6846 perf_event__output_id_sample(event, &handle, &sample);
6847
6848 perf_output_end(&handle);
6849out:
6850 task_event->event_id.header.size = size;
6851}
6852
6853static void perf_event_task(struct task_struct *task,
6854 struct perf_event_context *task_ctx,
6855 int new)
6856{
6857 struct perf_task_event task_event;
6858
6859 if (!atomic_read(&nr_comm_events) &&
6860 !atomic_read(&nr_mmap_events) &&
6861 !atomic_read(&nr_task_events))
6862 return;
6863
6864 task_event = (struct perf_task_event){
6865 .task = task,
6866 .task_ctx = task_ctx,
6867 .event_id = {
6868 .header = {
6869 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6870 .misc = 0,
6871 .size = sizeof(task_event.event_id),
6872 },
6873
6874
6875
6876
6877
6878 },
6879 };
6880
6881 perf_iterate_sb(perf_event_task_output,
6882 &task_event,
6883 task_ctx);
6884}
6885
6886void perf_event_fork(struct task_struct *task)
6887{
6888 perf_event_task(task, NULL, 1);
6889 perf_event_namespaces(task);
6890}
6891
6892
6893
6894
6895
6896struct perf_comm_event {
6897 struct task_struct *task;
6898 char *comm;
6899 int comm_size;
6900
6901 struct {
6902 struct perf_event_header header;
6903
6904 u32 pid;
6905 u32 tid;
6906 } event_id;
6907};
6908
6909static int perf_event_comm_match(struct perf_event *event)
6910{
6911 return event->attr.comm;
6912}
6913
6914static void perf_event_comm_output(struct perf_event *event,
6915 void *data)
6916{
6917 struct perf_comm_event *comm_event = data;
6918 struct perf_output_handle handle;
6919 struct perf_sample_data sample;
6920 int size = comm_event->event_id.header.size;
6921 int ret;
6922
6923 if (!perf_event_comm_match(event))
6924 return;
6925
6926 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
6927 ret = perf_output_begin(&handle, event,
6928 comm_event->event_id.header.size);
6929
6930 if (ret)
6931 goto out;
6932
6933 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
6934 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
6935
6936 perf_output_put(&handle, comm_event->event_id);
6937 __output_copy(&handle, comm_event->comm,
6938 comm_event->comm_size);
6939
6940 perf_event__output_id_sample(event, &handle, &sample);
6941
6942 perf_output_end(&handle);
6943out:
6944 comm_event->event_id.header.size = size;
6945}
6946
6947static void perf_event_comm_event(struct perf_comm_event *comm_event)
6948{
6949 char comm[TASK_COMM_LEN];
6950 unsigned int size;
6951
6952 memset(comm, 0, sizeof(comm));
6953 strlcpy(comm, comm_event->task->comm, sizeof(comm));
6954 size = ALIGN(strlen(comm)+1, sizeof(u64));
6955
6956 comm_event->comm = comm;
6957 comm_event->comm_size = size;
6958
6959 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
6960
6961 perf_iterate_sb(perf_event_comm_output,
6962 comm_event,
6963 NULL);
6964}
6965
6966void perf_event_comm(struct task_struct *task, bool exec)
6967{
6968 struct perf_comm_event comm_event;
6969
6970 if (!atomic_read(&nr_comm_events))
6971 return;
6972
6973 comm_event = (struct perf_comm_event){
6974 .task = task,
6975
6976
6977 .event_id = {
6978 .header = {
6979 .type = PERF_RECORD_COMM,
6980 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
6981
6982 },
6983
6984
6985 },
6986 };
6987
6988 perf_event_comm_event(&comm_event);
6989}
6990
6991
6992
6993
6994
6995struct perf_namespaces_event {
6996 struct task_struct *task;
6997
6998 struct {
6999 struct perf_event_header header;
7000
7001 u32 pid;
7002 u32 tid;
7003 u64 nr_namespaces;
7004 struct perf_ns_link_info link_info[NR_NAMESPACES];
7005 } event_id;
7006};
7007
7008static int perf_event_namespaces_match(struct perf_event *event)
7009{
7010 return event->attr.namespaces;
7011}
7012
7013static void perf_event_namespaces_output(struct perf_event *event,
7014 void *data)
7015{
7016 struct perf_namespaces_event *namespaces_event = data;
7017 struct perf_output_handle handle;
7018 struct perf_sample_data sample;
7019 u16 header_size = namespaces_event->event_id.header.size;
7020 int ret;
7021
7022 if (!perf_event_namespaces_match(event))
7023 return;
7024
7025 perf_event_header__init_id(&namespaces_event->event_id.header,
7026 &sample, event);
7027 ret = perf_output_begin(&handle, event,
7028 namespaces_event->event_id.header.size);
7029 if (ret)
7030 goto out;
7031
7032 namespaces_event->event_id.pid = perf_event_pid(event,
7033 namespaces_event->task);
7034 namespaces_event->event_id.tid = perf_event_tid(event,
7035 namespaces_event->task);
7036
7037 perf_output_put(&handle, namespaces_event->event_id);
7038
7039 perf_event__output_id_sample(event, &handle, &sample);
7040
7041 perf_output_end(&handle);
7042out:
7043 namespaces_event->event_id.header.size = header_size;
7044}
7045
7046static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
7047 struct task_struct *task,
7048 const struct proc_ns_operations *ns_ops)
7049{
7050 struct path ns_path;
7051 struct inode *ns_inode;
7052 void *error;
7053
7054 error = ns_get_path(&ns_path, task, ns_ops);
7055 if (!error) {
7056 ns_inode = ns_path.dentry->d_inode;
7057 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
7058 ns_link_info->ino = ns_inode->i_ino;
7059 path_put(&ns_path);
7060 }
7061}
7062
7063void perf_event_namespaces(struct task_struct *task)
7064{
7065 struct perf_namespaces_event namespaces_event;
7066 struct perf_ns_link_info *ns_link_info;
7067
7068 if (!atomic_read(&nr_namespaces_events))
7069 return;
7070
7071 namespaces_event = (struct perf_namespaces_event){
7072 .task = task,
7073 .event_id = {
7074 .header = {
7075 .type = PERF_RECORD_NAMESPACES,
7076 .misc = 0,
7077 .size = sizeof(namespaces_event.event_id),
7078 },
7079
7080
7081 .nr_namespaces = NR_NAMESPACES,
7082
7083 },
7084 };
7085
7086 ns_link_info = namespaces_event.event_id.link_info;
7087
7088 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
7089 task, &mntns_operations);
7090
7091#ifdef CONFIG_USER_NS
7092 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
7093 task, &userns_operations);
7094#endif
7095#ifdef CONFIG_NET_NS
7096 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
7097 task, &netns_operations);
7098#endif
7099#ifdef CONFIG_UTS_NS
7100 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
7101 task, &utsns_operations);
7102#endif
7103#ifdef CONFIG_IPC_NS
7104 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
7105 task, &ipcns_operations);
7106#endif
7107#ifdef CONFIG_PID_NS
7108 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
7109 task, &pidns_operations);
7110#endif
7111#ifdef CONFIG_CGROUPS
7112 perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
7113 task, &cgroupns_operations);
7114#endif
7115
7116 perf_iterate_sb(perf_event_namespaces_output,
7117 &namespaces_event,
7118 NULL);
7119}
7120
7121
7122
7123
7124
7125struct perf_mmap_event {
7126 struct vm_area_struct *vma;
7127
7128 const char *file_name;
7129 int file_size;
7130 int maj, min;
7131 u64 ino;
7132 u64 ino_generation;
7133 u32 prot, flags;
7134
7135 struct {
7136 struct perf_event_header header;
7137
7138 u32 pid;
7139 u32 tid;
7140 u64 start;
7141 u64 len;
7142 u64 pgoff;
7143 } event_id;
7144};
7145
7146static int perf_event_mmap_match(struct perf_event *event,
7147 void *data)
7148{
7149 struct perf_mmap_event *mmap_event = data;
7150 struct vm_area_struct *vma = mmap_event->vma;
7151 int executable = vma->vm_flags & VM_EXEC;
7152
7153 return (!executable && event->attr.mmap_data) ||
7154 (executable && (event->attr.mmap || event->attr.mmap2));
7155}
7156
7157static void perf_event_mmap_output(struct perf_event *event,
7158 void *data)
7159{
7160 struct perf_mmap_event *mmap_event = data;
7161 struct perf_output_handle handle;
7162 struct perf_sample_data sample;
7163 int size = mmap_event->event_id.header.size;
7164 int ret;
7165
7166 if (!perf_event_mmap_match(event, data))
7167 return;
7168
7169 if (event->attr.mmap2) {
7170 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
7171 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
7172 mmap_event->event_id.header.size += sizeof(mmap_event->min);
7173 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
7174 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
7175 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
7176 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
7177 }
7178
7179 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
7180 ret = perf_output_begin(&handle, event,
7181 mmap_event->event_id.header.size);
7182 if (ret)
7183 goto out;
7184
7185 mmap_event->event_id.pid = perf_event_pid(event, current);
7186 mmap_event->event_id.tid = perf_event_tid(event, current);
7187
7188 perf_output_put(&handle, mmap_event->event_id);
7189
7190 if (event->attr.mmap2) {
7191 perf_output_put(&handle, mmap_event->maj);
7192 perf_output_put(&handle, mmap_event->min);
7193 perf_output_put(&handle, mmap_event->ino);
7194 perf_output_put(&handle, mmap_event->ino_generation);
7195 perf_output_put(&handle, mmap_event->prot);
7196 perf_output_put(&handle, mmap_event->flags);
7197 }
7198
7199 __output_copy(&handle, mmap_event->file_name,
7200 mmap_event->file_size);
7201
7202 perf_event__output_id_sample(event, &handle, &sample);
7203
7204 perf_output_end(&handle);
7205out:
7206 mmap_event->event_id.header.size = size;
7207}
7208
7209static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
7210{
7211 struct vm_area_struct *vma = mmap_event->vma;
7212 struct file *file = vma->vm_file;
7213 int maj = 0, min = 0;
7214 u64 ino = 0, gen = 0;
7215 u32 prot = 0, flags = 0;
7216 unsigned int size;
7217 char tmp[16];
7218 char *buf = NULL;
7219 char *name;
7220
7221 if (vma->vm_flags & VM_READ)
7222 prot |= PROT_READ;
7223 if (vma->vm_flags & VM_WRITE)
7224 prot |= PROT_WRITE;
7225 if (vma->vm_flags & VM_EXEC)
7226 prot |= PROT_EXEC;
7227
7228 if (vma->vm_flags & VM_MAYSHARE)
7229 flags = MAP_SHARED;
7230 else
7231 flags = MAP_PRIVATE;
7232
7233 if (vma->vm_flags & VM_DENYWRITE)
7234 flags |= MAP_DENYWRITE;
7235 if (vma->vm_flags & VM_MAYEXEC)
7236 flags |= MAP_EXECUTABLE;
7237 if (vma->vm_flags & VM_LOCKED)
7238 flags |= MAP_LOCKED;
7239 if (vma->vm_flags & VM_HUGETLB)
7240 flags |= MAP_HUGETLB;
7241
7242 if (file) {
7243 struct inode *inode;
7244 dev_t dev;
7245
7246 buf = kmalloc(PATH_MAX, GFP_KERNEL);
7247 if (!buf) {
7248 name = "//enomem";
7249 goto cpy_name;
7250 }
7251
7252
7253
7254
7255
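		/*
		 * file_path() fills the buffer from the end backwards; leave
		 * room for the zero padding added for the u64 alignment done
		 * below.
		 */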
7256 name = file_path(file, buf, PATH_MAX - sizeof(u64));
7257 if (IS_ERR(name)) {
7258 name = "//toolong";
7259 goto cpy_name;
7260 }
7261 inode = file_inode(vma->vm_file);
7262 dev = inode->i_sb->s_dev;
7263 ino = inode->i_ino;
7264 gen = inode->i_generation;
7265 maj = MAJOR(dev);
7266 min = MINOR(dev);
7267
7268 goto got_name;
7269 } else {
7270 if (vma->vm_ops && vma->vm_ops->name) {
7271 name = (char *) vma->vm_ops->name(vma);
7272 if (name)
7273 goto cpy_name;
7274 }
7275
7276 name = (char *)arch_vma_name(vma);
7277 if (name)
7278 goto cpy_name;
7279
7280 if (vma->vm_start <= vma->vm_mm->start_brk &&
7281 vma->vm_end >= vma->vm_mm->brk) {
7282 name = "[heap]";
7283 goto cpy_name;
7284 }
7285 if (vma->vm_start <= vma->vm_mm->start_stack &&
7286 vma->vm_end >= vma->vm_mm->start_stack) {
7287 name = "[stack]";
7288 goto cpy_name;
7289 }
7290
7291 name = "//anon";
7292 goto cpy_name;
7293 }
7294
7295cpy_name:
7296 strlcpy(tmp, name, sizeof(tmp));
7297 name = tmp;
7298got_name:
7299
7300
7301
7302
7303
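	/*
	 * The output buffer works in 8 byte units, so pad the name to a
	 * multiple of 8 with explicit zero bytes to avoid leaking random
	 * bits to userspace.
	 */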
7304 size = strlen(name)+1;
7305 while (!IS_ALIGNED(size, sizeof(u64)))
7306 name[size++] = '\0';
7307
7308 mmap_event->file_name = name;
7309 mmap_event->file_size = size;
7310 mmap_event->maj = maj;
7311 mmap_event->min = min;
7312 mmap_event->ino = ino;
7313 mmap_event->ino_generation = gen;
7314 mmap_event->prot = prot;
7315 mmap_event->flags = flags;
7316
7317 if (!(vma->vm_flags & VM_EXEC))
7318 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
7319
7320 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
7321
7322 perf_iterate_sb(perf_event_mmap_output,
7323 mmap_event,
7324 NULL);
7325
7326 kfree(buf);
7327}
7328
7329
7330
7331
7332static bool perf_addr_filter_match(struct perf_addr_filter *filter,
7333 struct file *file, unsigned long offset,
7334 unsigned long size)
7335{
7336 if (filter->inode != file_inode(file))
7337 return false;
7338
7339 if (filter->offset > offset + size)
7340 return false;
7341
7342 if (filter->offset + filter->size < offset)
7343 return false;
7344
7345 return true;
7346}
7347
7348static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
7349{
7350 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7351 struct vm_area_struct *vma = data;
7352 unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
7353 struct file *file = vma->vm_file;
7354 struct perf_addr_filter *filter;
7355 unsigned int restart = 0, count = 0;
7356
7357 if (!has_addr_filter(event))
7358 return;
7359
7360 if (!file)
7361 return;
7362
7363 raw_spin_lock_irqsave(&ifh->lock, flags);
7364 list_for_each_entry(filter, &ifh->list, entry) {
7365 if (perf_addr_filter_match(filter, file, off,
7366 vma->vm_end - vma->vm_start)) {
7367 event->addr_filters_offs[count] = vma->vm_start;
7368 restart++;
7369 }
7370
7371 count++;
7372 }
7373
7374 if (restart)
7375 event->addr_filters_gen++;
7376 raw_spin_unlock_irqrestore(&ifh->lock, flags);
7377
7378 if (restart)
7379 perf_event_stop(event, 1);
7380}
7381
7382
7383
7384
7385static void perf_addr_filters_adjust(struct vm_area_struct *vma)
7386{
7387 struct perf_event_context *ctx;
7388 int ctxn;
7389
7390
7391
7392
7393
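	/*
	 * Data tracing isn't supported yet, so only mappings of executable
	 * code are of interest here.
	 */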
7394 if (!(vma->vm_flags & VM_EXEC))
7395 return;
7396
7397 rcu_read_lock();
7398 for_each_task_context_nr(ctxn) {
7399 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7400 if (!ctx)
7401 continue;
7402
7403 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
7404 }
7405 rcu_read_unlock();
7406}
7407
7408void perf_event_mmap(struct vm_area_struct *vma)
7409{
7410 struct perf_mmap_event mmap_event;
7411
7412 if (!atomic_read(&nr_mmap_events))
7413 return;
7414
7415 mmap_event = (struct perf_mmap_event){
7416 .vma = vma,
7417
7418
7419 .event_id = {
7420 .header = {
7421 .type = PERF_RECORD_MMAP,
7422 .misc = PERF_RECORD_MISC_USER,
7423
7424 },
7425
7426
7427 .start = vma->vm_start,
7428 .len = vma->vm_end - vma->vm_start,
7429 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
7430 },
7431
7432
7433
7434
7435
7436
7437 };
7438
7439 perf_addr_filters_adjust(vma);
7440 perf_event_mmap_event(&mmap_event);
7441}
7442
7443void perf_event_aux_event(struct perf_event *event, unsigned long head,
7444 unsigned long size, u64 flags)
7445{
7446 struct perf_output_handle handle;
7447 struct perf_sample_data sample;
7448 struct perf_aux_event {
7449 struct perf_event_header header;
7450 u64 offset;
7451 u64 size;
7452 u64 flags;
7453 } rec = {
7454 .header = {
7455 .type = PERF_RECORD_AUX,
7456 .misc = 0,
7457 .size = sizeof(rec),
7458 },
7459 .offset = head,
7460 .size = size,
7461 .flags = flags,
7462 };
7463 int ret;
7464
7465 perf_event_header__init_id(&rec.header, &sample, event);
7466 ret = perf_output_begin(&handle, event, rec.header.size);
7467
7468 if (ret)
7469 return;
7470
7471 perf_output_put(&handle, rec);
7472 perf_event__output_id_sample(event, &handle, &sample);
7473
7474 perf_output_end(&handle);
7475}
7476
7477
7478
7479
7480void perf_log_lost_samples(struct perf_event *event, u64 lost)
7481{
7482 struct perf_output_handle handle;
7483 struct perf_sample_data sample;
7484 int ret;
7485
7486 struct {
7487 struct perf_event_header header;
7488 u64 lost;
7489 } lost_samples_event = {
7490 .header = {
7491 .type = PERF_RECORD_LOST_SAMPLES,
7492 .misc = 0,
7493 .size = sizeof(lost_samples_event),
7494 },
7495 .lost = lost,
7496 };
7497
7498 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
7499
7500 ret = perf_output_begin(&handle, event,
7501 lost_samples_event.header.size);
7502 if (ret)
7503 return;
7504
7505 perf_output_put(&handle, lost_samples_event);
7506 perf_event__output_id_sample(event, &handle, &sample);
7507 perf_output_end(&handle);
7508}
7509
7510
7511
7512
7513
7514struct perf_switch_event {
7515 struct task_struct *task;
7516 struct task_struct *next_prev;
7517
7518 struct {
7519 struct perf_event_header header;
7520 u32 next_prev_pid;
7521 u32 next_prev_tid;
7522 } event_id;
7523};
7524
7525static int perf_event_switch_match(struct perf_event *event)
7526{
7527 return event->attr.context_switch;
7528}
7529
7530static void perf_event_switch_output(struct perf_event *event, void *data)
7531{
7532 struct perf_switch_event *se = data;
7533 struct perf_output_handle handle;
7534 struct perf_sample_data sample;
7535 int ret;
7536
7537 if (!perf_event_switch_match(event))
7538 return;
7539
7540
7541 if (event->ctx->task) {
7542 se->event_id.header.type = PERF_RECORD_SWITCH;
7543 se->event_id.header.size = sizeof(se->event_id.header);
7544 } else {
7545 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
7546 se->event_id.header.size = sizeof(se->event_id);
7547 se->event_id.next_prev_pid =
7548 perf_event_pid(event, se->next_prev);
7549 se->event_id.next_prev_tid =
7550 perf_event_tid(event, se->next_prev);
7551 }
7552
7553 perf_event_header__init_id(&se->event_id.header, &sample, event);
7554
7555 ret = perf_output_begin(&handle, event, se->event_id.header.size);
7556 if (ret)
7557 return;
7558
7559 if (event->ctx->task)
7560 perf_output_put(&handle, se->event_id.header);
7561 else
7562 perf_output_put(&handle, se->event_id);
7563
7564 perf_event__output_id_sample(event, &handle, &sample);
7565
7566 perf_output_end(&handle);
7567}
7568
7569static void perf_event_switch(struct task_struct *task,
7570 struct task_struct *next_prev, bool sched_in)
7571{
7572 struct perf_switch_event switch_event;
7573
7574
7575
7576 switch_event = (struct perf_switch_event){
7577 .task = task,
7578 .next_prev = next_prev,
7579 .event_id = {
7580 .header = {
7581
7582 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
7583
7584 },
7585
7586
7587 },
7588 };
7589
7590 if (!sched_in && task->state == TASK_RUNNING)
7591 switch_event.event_id.header.misc |=
7592 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
7593
7594 perf_iterate_sb(perf_event_switch_output,
7595 &switch_event,
7596 NULL);
7597}
7598
7599
7600
7601
7602
7603static void perf_log_throttle(struct perf_event *event, int enable)
7604{
7605 struct perf_output_handle handle;
7606 struct perf_sample_data sample;
7607 int ret;
7608
7609 struct {
7610 struct perf_event_header header;
7611 u64 time;
7612 u64 id;
7613 u64 stream_id;
7614 } throttle_event = {
7615 .header = {
7616 .type = PERF_RECORD_THROTTLE,
7617 .misc = 0,
7618 .size = sizeof(throttle_event),
7619 },
7620 .time = perf_event_clock(event),
7621 .id = primary_event_id(event),
7622 .stream_id = event->id,
7623 };
7624
7625 if (enable)
7626 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
7627
7628 perf_event_header__init_id(&throttle_event.header, &sample, event);
7629
7630 ret = perf_output_begin(&handle, event,
7631 throttle_event.header.size);
7632 if (ret)
7633 return;
7634
7635 perf_output_put(&handle, throttle_event);
7636 perf_event__output_id_sample(event, &handle, &sample);
7637 perf_output_end(&handle);
7638}
7639
7640void perf_event_itrace_started(struct perf_event *event)
7641{
7642 event->attach_state |= PERF_ATTACH_ITRACE;
7643}
7644
7645static void perf_log_itrace_start(struct perf_event *event)
7646{
7647 struct perf_output_handle handle;
7648 struct perf_sample_data sample;
7649 struct perf_aux_event {
7650 struct perf_event_header header;
7651 u32 pid;
7652 u32 tid;
7653 } rec;
7654 int ret;
7655
7656 if (event->parent)
7657 event = event->parent;
7658
7659 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7660 event->attach_state & PERF_ATTACH_ITRACE)
7661 return;
7662
7663 rec.header.type = PERF_RECORD_ITRACE_START;
7664 rec.header.misc = 0;
7665 rec.header.size = sizeof(rec);
7666 rec.pid = perf_event_pid(event, current);
7667 rec.tid = perf_event_tid(event, current);
7668
7669 perf_event_header__init_id(&rec.header, &sample, event);
7670 ret = perf_output_begin(&handle, event, rec.header.size);
7671
7672 if (ret)
7673 return;
7674
7675 perf_output_put(&handle, rec);
7676 perf_event__output_id_sample(event, &handle, &sample);
7677
7678 perf_output_end(&handle);
7679}
7680
7681static int
7682__perf_event_account_interrupt(struct perf_event *event, int throttle)
7683{
7684 struct hw_perf_event *hwc = &event->hw;
7685 int ret = 0;
7686 u64 seq;
7687
7688 seq = __this_cpu_read(perf_throttled_seq);
7689 if (seq != hwc->interrupts_seq) {
7690 hwc->interrupts_seq = seq;
7691 hwc->interrupts = 1;
7692 } else {
7693 hwc->interrupts++;
7694 if (unlikely(throttle
7695 && hwc->interrupts >= max_samples_per_tick)) {
7696 __this_cpu_inc(perf_throttled_count);
7697 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
7698 hwc->interrupts = MAX_INTERRUPTS;
7699 perf_log_throttle(event, 0);
7700 ret = 1;
7701 }
7702 }
7703
7704 if (event->attr.freq) {
7705 u64 now = perf_clock();
7706 s64 delta = now - hwc->freq_time_stamp;
7707
7708 hwc->freq_time_stamp = now;
7709
7710 if (delta > 0 && delta < 2*TICK_NSEC)
7711 perf_adjust_period(event, delta, hwc->last_period, true);
7712 }
7713
7714 return ret;
7715}
7716
7717int perf_event_account_interrupt(struct perf_event *event)
7718{
7719 return __perf_event_account_interrupt(event, 1);
7720}
7721
7722
7723
7724
7725
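/*
 * Generic event overflow handling, sampling.
 */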
7726static int __perf_event_overflow(struct perf_event *event,
7727 int throttle, struct perf_sample_data *data,
7728 struct pt_regs *regs)
7729{
7730 int events = atomic_read(&event->event_limit);
7731 int ret = 0;
7732
7733
7734
7735
7736
7737 if (unlikely(!is_sampling_event(event)))
7738 return 0;
7739
7740 ret = __perf_event_account_interrupt(event, throttle);
7741
7742
7743
7744
7745
7746
7747 event->pending_kill = POLL_IN;
7748 if (events && atomic_dec_and_test(&event->event_limit)) {
7749 ret = 1;
7750 event->pending_kill = POLL_HUP;
7751
7752 perf_event_disable_inatomic(event);
7753 }
7754
7755 READ_ONCE(event->overflow_handler)(event, data, regs);
7756
7757 if (*perf_event_fasync(event) && event->pending_kill) {
7758 event->pending_wakeup = 1;
7759 irq_work_queue(&event->pending);
7760 }
7761
7762 return ret;
7763}
7764
7765int perf_event_overflow(struct perf_event *event,
7766 struct perf_sample_data *data,
7767 struct pt_regs *regs)
7768{
7769 return __perf_event_overflow(event, 1, data, regs);
7770}
7771
7772
7773
7774
7775
7776struct swevent_htable {
7777 struct swevent_hlist *swevent_hlist;
7778 struct mutex hlist_mutex;
7779 int hlist_refcount;
7780
7781
7782 int recursion[PERF_NR_CONTEXTS];
7783};
7784
7785static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
7786
7787
7788
7789
7790
7791
7792
7793
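/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period value is kept in
 * the range [-sample_period, 0] so that the sign can be used as the
 * overflow trigger.
 */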
7794u64 perf_swevent_set_period(struct perf_event *event)
7795{
7796 struct hw_perf_event *hwc = &event->hw;
7797 u64 period = hwc->last_period;
7798 u64 nr, offset;
7799 s64 old, val;
7800
7801 hwc->last_period = hwc->sample_period;
7802
7803again:
7804 old = val = local64_read(&hwc->period_left);
7805 if (val < 0)
7806 return 0;
7807
7808 nr = div64_u64(period + val, period);
7809 offset = nr * period;
7810 val -= offset;
7811 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
7812 goto again;
7813
7814 return nr;
7815}
7816
7817static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
7818 struct perf_sample_data *data,
7819 struct pt_regs *regs)
7820{
7821 struct hw_perf_event *hwc = &event->hw;
7822 int throttle = 0;
7823
7824 if (!overflow)
7825 overflow = perf_swevent_set_period(event);
7826
7827 if (hwc->interrupts == MAX_INTERRUPTS)
7828 return;
7829
7830 for (; overflow; overflow--) {
7831 if (__perf_event_overflow(event, throttle,
7832 data, regs)) {
7833
7834
7835
7836
7837 break;
7838 }
7839 throttle = 1;
7840 }
7841}
7842
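/*
 * Add @nr to the software event's count and, for sampling events with
 * valid regs, decide whether the accumulated period warrants an overflow.
 */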
7843static void perf_swevent_event(struct perf_event *event, u64 nr,
7844 struct perf_sample_data *data,
7845 struct pt_regs *regs)
7846{
7847 struct hw_perf_event *hwc = &event->hw;
7848
7849 local64_add(nr, &event->count);
7850
7851 if (!regs)
7852 return;
7853
7854 if (!is_sampling_event(event))
7855 return;
7856
7857 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
7858 data->period = nr;
7859 return perf_swevent_overflow(event, 1, data, regs);
7860 } else
7861 data->period = event->hw.last_period;
7862
7863 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
7864 return perf_swevent_overflow(event, 1, data, regs);
7865
7866 if (local64_add_negative(nr, &hwc->period_left))
7867 return;
7868
7869 perf_swevent_overflow(event, 0, data, regs);
7870}
7871
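/*
 * Returns 1 if the event must not count in the current context, either
 * because it is stopped or because of exclude_user/exclude_kernel.
 */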
7872static int perf_exclude_event(struct perf_event *event,
7873 struct pt_regs *regs)
7874{
7875 if (event->hw.state & PERF_HES_STOPPED)
7876 return 1;
7877
7878 if (regs) {
7879 if (event->attr.exclude_user && user_mode(regs))
7880 return 1;
7881
7882 if (event->attr.exclude_kernel && !user_mode(regs))
7883 return 1;
7884 }
7885
7886 return 0;
7887}
7888
7889static int perf_swevent_match(struct perf_event *event,
7890 enum perf_type_id type,
7891 u32 event_id,
7892 struct perf_sample_data *data,
7893 struct pt_regs *regs)
7894{
7895 if (event->attr.type != type)
7896 return 0;
7897
7898 if (event->attr.config != event_id)
7899 return 0;
7900
7901 if (perf_exclude_event(event, regs))
7902 return 0;
7903
7904 return 1;
7905}
7906
7907static inline u64 swevent_hash(u64 type, u32 event_id)
7908{
7909 u64 val = event_id | (type << 32);
7910
7911 return hash_64(val, SWEVENT_HLIST_BITS);
7912}
7913
7914static inline struct hlist_head *
7915__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
7916{
7917 u64 hash = swevent_hash(type, event_id);
7918
7919 return &hlist->heads[hash];
7920}
7921
7922
7923static inline struct hlist_head *
7924find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
7925{
7926 struct swevent_hlist *hlist;
7927
7928 hlist = rcu_dereference(swhash->swevent_hlist);
7929 if (!hlist)
7930 return NULL;
7931
7932 return __find_swevent_head(hlist, type, event_id);
7933}
7934
7935
7936static inline struct hlist_head *
7937find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
7938{
7939 struct swevent_hlist *hlist;
7940 u32 event_id = event->attr.config;
7941 u64 type = event->attr.type;
7942
	/*
	 * Event scheduling is always serialized against hlist allocation
	 * and release. Which makes the protected version suitable here.
	 * The context lock guarantees that.
	 */
7948 hlist = rcu_dereference_protected(swhash->swevent_hlist,
7949 lockdep_is_held(&event->ctx->lock));
7950 if (!hlist)
7951 return NULL;
7952
7953 return __find_swevent_head(hlist, type, event_id);
7954}
7955
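/*
 * Walk this CPU's hash bucket for (type, event_id) and deliver the sample
 * to every matching software event.
 */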
7956static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
7957 u64 nr,
7958 struct perf_sample_data *data,
7959 struct pt_regs *regs)
7960{
7961 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7962 struct perf_event *event;
7963 struct hlist_head *head;
7964
7965 rcu_read_lock();
7966 head = find_swevent_head_rcu(swhash, type, event_id);
7967 if (!head)
7968 goto end;
7969
7970 hlist_for_each_entry_rcu(event, head, hlist_entry) {
7971 if (perf_swevent_match(event, type, event_id, data, regs))
7972 perf_swevent_event(event, nr, data, regs);
7973 }
7974end:
7975 rcu_read_unlock();
7976}
7977
7978DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
7979
7980int perf_swevent_get_recursion_context(void)
7981{
7982 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7983
7984 return get_recursion_context(swhash->recursion);
7985}
7986EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
7987
7988void perf_swevent_put_recursion_context(int rctx)
7989{
7990 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7991
7992 put_recursion_context(swhash->recursion, rctx);
7993}
7994
7995void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7996{
7997 struct perf_sample_data data;
7998
7999 if (WARN_ON_ONCE(!regs))
8000 return;
8001
8002 perf_sample_data_init(&data, addr, 0);
8003 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
8004}
8005
8006void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
8007{
8008 int rctx;
8009
8010 preempt_disable_notrace();
8011 rctx = perf_swevent_get_recursion_context();
8012 if (unlikely(rctx < 0))
8013 goto fail;
8014
8015 ___perf_sw_event(event_id, nr, regs, addr);
8016
8017 perf_swevent_put_recursion_context(rctx);
8018fail:
8019 preempt_enable_notrace();
8020}
8021
8022static void perf_swevent_read(struct perf_event *event)
8023{
8024}
8025
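/*
 * pmu::add callback for software events: prime the sample period, set the
 * initial state and hook the event into this CPU's swevent hash so that
 * do_perf_sw_event() can find it.
 */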
8026static int perf_swevent_add(struct perf_event *event, int flags)
8027{
8028 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8029 struct hw_perf_event *hwc = &event->hw;
8030 struct hlist_head *head;
8031
8032 if (is_sampling_event(event)) {
8033 hwc->last_period = hwc->sample_period;
8034 perf_swevent_set_period(event);
8035 }
8036
8037 hwc->state = !(flags & PERF_EF_START);
8038
8039 head = find_swevent_head(swhash, event);
8040 if (WARN_ON_ONCE(!head))
8041 return -EINVAL;
8042
8043 hlist_add_head_rcu(&event->hlist_entry, head);
8044 perf_event_update_userpage(event);
8045
8046 return 0;
8047}
8048
8049static void perf_swevent_del(struct perf_event *event, int flags)
8050{
8051 hlist_del_rcu(&event->hlist_entry);
8052}
8053
8054static void perf_swevent_start(struct perf_event *event, int flags)
8055{
8056 event->hw.state = 0;
8057}
8058
8059static void perf_swevent_stop(struct perf_event *event, int flags)
8060{
8061 event->hw.state = PERF_HES_STOPPED;
8062}
8063
8064
8065static inline struct swevent_hlist *
8066swevent_hlist_deref(struct swevent_htable *swhash)
8067{
8068 return rcu_dereference_protected(swhash->swevent_hlist,
8069 lockdep_is_held(&swhash->hlist_mutex));
8070}
8071
8072static void swevent_hlist_release(struct swevent_htable *swhash)
8073{
8074 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
8075
8076 if (!hlist)
8077 return;
8078
8079 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
8080 kfree_rcu(hlist, rcu_head);
8081}
8082
8083static void swevent_hlist_put_cpu(int cpu)
8084{
8085 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8086
8087 mutex_lock(&swhash->hlist_mutex);
8088
8089 if (!--swhash->hlist_refcount)
8090 swevent_hlist_release(swhash);
8091
8092 mutex_unlock(&swhash->hlist_mutex);
8093}
8094
8095static void swevent_hlist_put(void)
8096{
8097 int cpu;
8098
8099 for_each_possible_cpu(cpu)
8100 swevent_hlist_put_cpu(cpu);
8101}
8102
8103static int swevent_hlist_get_cpu(int cpu)
8104{
8105 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8106 int err = 0;
8107
8108 mutex_lock(&swhash->hlist_mutex);
8109 if (!swevent_hlist_deref(swhash) &&
8110 cpumask_test_cpu(cpu, perf_online_mask)) {
8111 struct swevent_hlist *hlist;
8112
8113 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
8114 if (!hlist) {
8115 err = -ENOMEM;
8116 goto exit;
8117 }
8118 rcu_assign_pointer(swhash->swevent_hlist, hlist);
8119 }
8120 swhash->hlist_refcount++;
8121exit:
8122 mutex_unlock(&swhash->hlist_mutex);
8123
8124 return err;
8125}
8126
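/*
 * Take a hashlist reference on every possible CPU, allocating the per-CPU
 * hlist for CPUs that are currently online; on failure, the references
 * taken so far are dropped again.
 */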
8127static int swevent_hlist_get(void)
8128{
8129 int err, cpu, failed_cpu;
8130
8131 mutex_lock(&pmus_lock);
8132 for_each_possible_cpu(cpu) {
8133 err = swevent_hlist_get_cpu(cpu);
8134 if (err) {
8135 failed_cpu = cpu;
8136 goto fail;
8137 }
8138 }
8139 mutex_unlock(&pmus_lock);
8140 return 0;
8141fail:
8142 for_each_possible_cpu(cpu) {
8143 if (cpu == failed_cpu)
8144 break;
8145 swevent_hlist_put_cpu(cpu);
8146 }
8147 mutex_unlock(&pmus_lock);
8148 return err;
8149}
8150
8151struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
8152
8153static void sw_perf_event_destroy(struct perf_event *event)
8154{
8155 u64 event_id = event->attr.config;
8156
8157 WARN_ON(event->parent);
8158
8159 static_key_slow_dec(&perf_swevent_enabled[event_id]);
8160 swevent_hlist_put();
8161}
8162
8163static int perf_swevent_init(struct perf_event *event)
8164{
8165 u64 event_id = event->attr.config;
8166
8167 if (event->attr.type != PERF_TYPE_SOFTWARE)
8168 return -ENOENT;
8169
	/*
	 * no branch sampling for software events
	 */
8173 if (has_branch_stack(event))
8174 return -EOPNOTSUPP;
8175
8176 switch (event_id) {
8177 case PERF_COUNT_SW_CPU_CLOCK:
8178 case PERF_COUNT_SW_TASK_CLOCK:
8179 return -ENOENT;
8180
8181 default:
8182 break;
8183 }
8184
8185 if (event_id >= PERF_COUNT_SW_MAX)
8186 return -ENOENT;
8187
8188 if (!event->parent) {
8189 int err;
8190
8191 err = swevent_hlist_get();
8192 if (err)
8193 return err;
8194
8195 static_key_slow_inc(&perf_swevent_enabled[event_id]);
8196 event->destroy = sw_perf_event_destroy;
8197 }
8198
8199 return 0;
8200}
8201
8202static struct pmu perf_swevent = {
8203 .task_ctx_nr = perf_sw_context,
8204
8205 .capabilities = PERF_PMU_CAP_NO_NMI,
8206
8207 .event_init = perf_swevent_init,
8208 .add = perf_swevent_add,
8209 .del = perf_swevent_del,
8210 .start = perf_swevent_start,
8211 .stop = perf_swevent_stop,
8212 .read = perf_swevent_read,
8213};
8214
8215#ifdef CONFIG_EVENT_TRACING
8216
8217static int perf_tp_filter_match(struct perf_event *event,
8218 struct perf_sample_data *data)
8219{
8220 void *record = data->raw->frag.data;
8221
8222
8223 if (event->parent)
8224 event = event->parent;
8225
8226 if (likely(!event->filter) || filter_match_preds(event->filter, record))
8227 return 1;
8228 return 0;
8229}
8230
8231static int perf_tp_event_match(struct perf_event *event,
8232 struct perf_sample_data *data,
8233 struct pt_regs *regs)
8234{
8235 if (event->hw.state & PERF_HES_STOPPED)
8236 return 0;
8237
8238
8239
8240 if (event->attr.exclude_kernel)
8241 return 0;
8242
8243 if (!perf_tp_filter_match(event, data))
8244 return 0;
8245
8246 return 1;
8247}
8248
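/*
 * Entry point used by the tracepoint glue: run any attached BPF programs
 * first and only hand the record to perf_tp_event() if they let it through
 * and somebody is actually listening on @head.
 */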
8249void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
8250 struct trace_event_call *call, u64 count,
8251 struct pt_regs *regs, struct hlist_head *head,
8252 struct task_struct *task)
8253{
8254 if (bpf_prog_array_valid(call)) {
8255 *(struct pt_regs **)raw_data = regs;
8256 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
8257 perf_swevent_put_recursion_context(rctx);
8258 return;
8259 }
8260 }
8261 perf_tp_event(call->event.type, count, raw_data, size, regs, head,
8262 rctx, task);
8263}
8264EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
8265
8266void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
8267 struct pt_regs *regs, struct hlist_head *head, int rctx,
8268 struct task_struct *task)
8269{
8270 struct perf_sample_data data;
8271 struct perf_event *event;
8272
8273 struct perf_raw_record raw = {
8274 .frag = {
8275 .size = entry_size,
8276 .data = record,
8277 },
8278 };
8279
8280 perf_sample_data_init(&data, 0, 0);
8281 data.raw = &raw;
8282
8283 perf_trace_buf_update(record, event_type);
8284
8285 hlist_for_each_entry_rcu(event, head, hlist_entry) {
8286 if (perf_tp_event_match(event, &data, regs))
8287 perf_swevent_event(event, count, &data, regs);
8288 }
8289
	/*
	 * If we got specified a target task, also iterate its context and
	 * deliver this event there too.
	 */
8294 if (task && task != current) {
8295 struct perf_event_context *ctx;
8296 struct trace_entry *entry = record;
8297
8298 rcu_read_lock();
8299 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
8300 if (!ctx)
8301 goto unlock;
8302
8303 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
8304 if (event->attr.type != PERF_TYPE_TRACEPOINT)
8305 continue;
8306 if (event->attr.config != entry->type)
8307 continue;
8308 if (perf_tp_event_match(event, &data, regs))
8309 perf_swevent_event(event, count, &data, regs);
8310 }
8311unlock:
8312 rcu_read_unlock();
8313 }
8314
8315 perf_swevent_put_recursion_context(rctx);
8316}
8317EXPORT_SYMBOL_GPL(perf_tp_event);
8318
8319static void tp_perf_event_destroy(struct perf_event *event)
8320{
8321 perf_trace_destroy(event);
8322}
8323
8324static int perf_tp_event_init(struct perf_event *event)
8325{
8326 int err;
8327
8328 if (event->attr.type != PERF_TYPE_TRACEPOINT)
8329 return -ENOENT;
8330
8331
8332
8333
8334 if (has_branch_stack(event))
8335 return -EOPNOTSUPP;
8336
8337 err = perf_trace_init(event);
8338 if (err)
8339 return err;
8340
8341 event->destroy = tp_perf_event_destroy;
8342
8343 return 0;
8344}
8345
8346static struct pmu perf_tracepoint = {
8347 .task_ctx_nr = perf_sw_context,
8348
8349 .event_init = perf_tp_event_init,
8350 .add = perf_trace_add,
8351 .del = perf_trace_del,
8352 .start = perf_swevent_start,
8353 .stop = perf_swevent_stop,
8354 .read = perf_swevent_read,
8355};
8356
8357#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
8358
/*
 * Flags in event->attr.config, shared by kprobe and uprobe events.
 * Bit 0 selects a return probe ([ku]retprobe) instead of an entry probe;
 * see the "retprobe" format attribute below.
 */
8365enum perf_probe_config {
8366 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,
8367};
8368
8369PMU_FORMAT_ATTR(retprobe, "config:0");
8370
8371static struct attribute *probe_attrs[] = {
8372 &format_attr_retprobe.attr,
8373 NULL,
8374};
8375
8376static struct attribute_group probe_format_group = {
8377 .name = "format",
8378 .attrs = probe_attrs,
8379};
8380
8381static const struct attribute_group *probe_attr_groups[] = {
8382 &probe_format_group,
8383 NULL,
8384};
8385#endif
8386
8387#ifdef CONFIG_KPROBE_EVENTS
8388static int perf_kprobe_event_init(struct perf_event *event);
8389static struct pmu perf_kprobe = {
8390 .task_ctx_nr = perf_sw_context,
8391 .event_init = perf_kprobe_event_init,
8392 .add = perf_trace_add,
8393 .del = perf_trace_del,
8394 .start = perf_swevent_start,
8395 .stop = perf_swevent_stop,
8396 .read = perf_swevent_read,
8397 .attr_groups = probe_attr_groups,
8398};
8399
8400static int perf_kprobe_event_init(struct perf_event *event)
8401{
8402 int err;
8403 bool is_retprobe;
8404
8405 if (event->attr.type != perf_kprobe.type)
8406 return -ENOENT;
8407
8408 if (!capable(CAP_SYS_ADMIN))
8409 return -EACCES;
8410
8411
8412
8413
8414 if (has_branch_stack(event))
8415 return -EOPNOTSUPP;
8416
8417 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8418 err = perf_kprobe_init(event, is_retprobe);
8419 if (err)
8420 return err;
8421
8422 event->destroy = perf_kprobe_destroy;
8423
8424 return 0;
8425}
8426#endif
8427
8428#ifdef CONFIG_UPROBE_EVENTS
8429static int perf_uprobe_event_init(struct perf_event *event);
8430static struct pmu perf_uprobe = {
8431 .task_ctx_nr = perf_sw_context,
8432 .event_init = perf_uprobe_event_init,
8433 .add = perf_trace_add,
8434 .del = perf_trace_del,
8435 .start = perf_swevent_start,
8436 .stop = perf_swevent_stop,
8437 .read = perf_swevent_read,
8438 .attr_groups = probe_attr_groups,
8439};
8440
8441static int perf_uprobe_event_init(struct perf_event *event)
8442{
8443 int err;
8444 bool is_retprobe;
8445
8446 if (event->attr.type != perf_uprobe.type)
8447 return -ENOENT;
8448
8449 if (!capable(CAP_SYS_ADMIN))
8450 return -EACCES;
8451
8452
8453
8454
8455 if (has_branch_stack(event))
8456 return -EOPNOTSUPP;
8457
8458 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8459 err = perf_uprobe_init(event, is_retprobe);
8460 if (err)
8461 return err;
8462
8463 event->destroy = perf_uprobe_destroy;
8464
8465 return 0;
8466}
8467#endif
8468
8469static inline void perf_tp_register(void)
8470{
8471 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
8472#ifdef CONFIG_KPROBE_EVENTS
8473 perf_pmu_register(&perf_kprobe, "kprobe", -1);
8474#endif
8475#ifdef CONFIG_UPROBE_EVENTS
8476 perf_pmu_register(&perf_uprobe, "uprobe", -1);
8477#endif
8478}
8479
8480static void perf_event_free_filter(struct perf_event *event)
8481{
8482 ftrace_profile_free_filter(event);
8483}
8484
8485#ifdef CONFIG_BPF_SYSCALL
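/*
 * Overflow handler installed when a BPF program is attached to the event:
 * run the program on each sample and only chain to the original overflow
 * handler if the program returns non-zero.
 */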
8486static void bpf_overflow_handler(struct perf_event *event,
8487 struct perf_sample_data *data,
8488 struct pt_regs *regs)
8489{
8490 struct bpf_perf_event_data_kern ctx = {
8491 .data = data,
8492 .event = event,
8493 };
8494 int ret = 0;
8495
8496 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
8497 preempt_disable();
8498 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
8499 goto out;
8500 rcu_read_lock();
8501 ret = BPF_PROG_RUN(event->prog, &ctx);
8502 rcu_read_unlock();
8503out:
8504 __this_cpu_dec(bpf_prog_active);
8505 preempt_enable();
8506 if (!ret)
8507 return;
8508
8509 event->orig_overflow_handler(event, data, regs);
8510}
8511
8512static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8513{
8514 struct bpf_prog *prog;
8515
8516 if (event->overflow_handler_context)
8517
8518 return -EINVAL;
8519
8520 if (event->prog)
8521 return -EEXIST;
8522
8523 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
8524 if (IS_ERR(prog))
8525 return PTR_ERR(prog);
8526
8527 event->prog = prog;
8528 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
8529 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
8530 return 0;
8531}
8532
8533static void perf_event_free_bpf_handler(struct perf_event *event)
8534{
8535 struct bpf_prog *prog = event->prog;
8536
8537 if (!prog)
8538 return;
8539
8540 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
8541 event->prog = NULL;
8542 bpf_prog_put(prog);
8543}
8544#else
8545static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8546{
8547 return -EOPNOTSUPP;
8548}
8549static void perf_event_free_bpf_handler(struct perf_event *event)
8550{
8551}
8552#endif
8553
/*
 * Returns true if the event uses one of the tracing PMUs:
 * tracepoint, kprobe or uprobe.
 */
8558static inline bool perf_event_is_tracing(struct perf_event *event)
8559{
8560 if (event->pmu == &perf_tracepoint)
8561 return true;
8562#ifdef CONFIG_KPROBE_EVENTS
8563 if (event->pmu == &perf_kprobe)
8564 return true;
8565#endif
8566#ifdef CONFIG_UPROBE_EVENTS
8567 if (event->pmu == &perf_uprobe)
8568 return true;
8569#endif
8570 return false;
8571}
8572
8573static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8574{
8575 bool is_kprobe, is_tracepoint, is_syscall_tp;
8576 struct bpf_prog *prog;
8577 int ret;
8578
8579 if (!perf_event_is_tracing(event))
8580 return perf_event_set_bpf_handler(event, prog_fd);
8581
8582 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
8583 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
8584 is_syscall_tp = is_syscall_trace_event(event->tp_event);
8585 if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
8586
8587 return -EINVAL;
8588
8589 prog = bpf_prog_get(prog_fd);
8590 if (IS_ERR(prog))
8591 return PTR_ERR(prog);
8592
8593 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
8594 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
8595 (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
8596
8597 bpf_prog_put(prog);
8598 return -EINVAL;
8599 }
8600
8601
8602 if (prog->kprobe_override &&
8603 !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
8604 bpf_prog_put(prog);
8605 return -EINVAL;
8606 }
8607
8608 if (is_tracepoint || is_syscall_tp) {
8609 int off = trace_event_get_offsets(event->tp_event);
8610
8611 if (prog->aux->max_ctx_offset > off) {
8612 bpf_prog_put(prog);
8613 return -EACCES;
8614 }
8615 }
8616
8617 ret = perf_event_attach_bpf_prog(event, prog);
8618 if (ret)
8619 bpf_prog_put(prog);
8620 return ret;
8621}
8622
8623static void perf_event_free_bpf_prog(struct perf_event *event)
8624{
8625 if (!perf_event_is_tracing(event)) {
8626 perf_event_free_bpf_handler(event);
8627 return;
8628 }
8629 perf_event_detach_bpf_prog(event);
8630}
8631
8632#else
8633
8634static inline void perf_tp_register(void)
8635{
8636}
8637
8638static void perf_event_free_filter(struct perf_event *event)
8639{
8640}
8641
8642static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8643{
8644 return -ENOENT;
8645}
8646
8647static void perf_event_free_bpf_prog(struct perf_event *event)
8648{
8649}
8650#endif
8651
8652#ifdef CONFIG_HAVE_HW_BREAKPOINT
8653void perf_bp_event(struct perf_event *bp, void *data)
8654{
8655 struct perf_sample_data sample;
8656 struct pt_regs *regs = data;
8657
8658 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
8659
8660 if (!bp->hw.state && !perf_exclude_event(bp, regs))
8661 perf_swevent_event(bp, 1, &sample, regs);
8662}
8663#endif
8664
/*
 * Allocate a new address filter
 */
8668static struct perf_addr_filter *
8669perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
8670{
8671 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
8672 struct perf_addr_filter *filter;
8673
8674 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
8675 if (!filter)
8676 return NULL;
8677
8678 INIT_LIST_HEAD(&filter->entry);
8679 list_add_tail(&filter->entry, filters);
8680
8681 return filter;
8682}
8683
8684static void free_filters_list(struct list_head *filters)
8685{
8686 struct perf_addr_filter *filter, *iter;
8687
8688 list_for_each_entry_safe(filter, iter, filters, entry) {
8689 if (filter->inode)
8690 iput(filter->inode);
8691 list_del(&filter->entry);
8692 kfree(filter);
8693 }
8694}
8695
/*
 * Free existing address filters and optionally install new ones
 */
8699static void perf_addr_filters_splice(struct perf_event *event,
8700 struct list_head *head)
8701{
8702 unsigned long flags;
8703 LIST_HEAD(list);
8704
8705 if (!has_addr_filter(event))
8706 return;
8707
8708
8709 if (event->parent)
8710 return;
8711
8712 raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
8713
8714 list_splice_init(&event->addr_filters.list, &list);
8715 if (head)
8716 list_splice(head, &event->addr_filters.list);
8717
8718 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
8719
8720 free_filters_list(&list);
8721}
8722
/*
 * Scan through mm's vmas and see if one of them matches the
 * @filter; if so, adjust filter's address range.
 * Called with mm::mmap_sem down for reading.
 */
8728static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
8729 struct mm_struct *mm)
8730{
8731 struct vm_area_struct *vma;
8732
8733 for (vma = mm->mmap; vma; vma = vma->vm_next) {
8734 struct file *file = vma->vm_file;
8735 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8736 unsigned long vma_size = vma->vm_end - vma->vm_start;
8737
8738 if (!file)
8739 continue;
8740
8741 if (!perf_addr_filter_match(filter, file, off, vma_size))
8742 continue;
8743
8744 return vma->vm_start;
8745 }
8746
8747 return 0;
8748}
8749
/*
 * Update event's address range filters based on the
 * task's existing mappings, if any.
 */
8754static void perf_event_addr_filters_apply(struct perf_event *event)
8755{
8756 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8757 struct task_struct *task = READ_ONCE(event->ctx->task);
8758 struct perf_addr_filter *filter;
8759 struct mm_struct *mm = NULL;
8760 unsigned int count = 0;
8761 unsigned long flags;
8762
8763
8764
8765
8766
8767 if (task == TASK_TOMBSTONE)
8768 return;
8769
8770 if (!ifh->nr_file_filters)
8771 return;
8772
8773 mm = get_task_mm(event->ctx->task);
8774 if (!mm)
8775 goto restart;
8776
8777 down_read(&mm->mmap_sem);
8778
8779 raw_spin_lock_irqsave(&ifh->lock, flags);
8780 list_for_each_entry(filter, &ifh->list, entry) {
8781 event->addr_filters_offs[count] = 0;
		/*
		 * Adjust base offset if the filter is associated to a binary
		 * that needs to be mapped:
		 */
8787 if (filter->inode)
8788 event->addr_filters_offs[count] =
8789 perf_addr_filter_apply(filter, mm);
8790
8791 count++;
8792 }
8793
8794 event->addr_filters_gen++;
8795 raw_spin_unlock_irqrestore(&ifh->lock, flags);
8796
8797 up_read(&mm->mmap_sem);
8798
8799 mmput(mm);
8800
8801restart:
8802 perf_event_stop(event, 1);
8803}
8804
/*
 * Address range filtering: limiting the data to certain instruction
 * address ranges. Filters are ioctl()ed to us from userspace as ascii
 * strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address
 * and RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * If <size> is not specified or is zero, the range is treated as a single
 * address; that is not valid for ACTION=="filter".
 *
 * Example (path is illustrative): "filter 0x1000/0x2000@/lib/libfoo.so".
 */
8824enum {
8825 IF_ACT_NONE = -1,
8826 IF_ACT_FILTER,
8827 IF_ACT_START,
8828 IF_ACT_STOP,
8829 IF_SRC_FILE,
8830 IF_SRC_KERNEL,
8831 IF_SRC_FILEADDR,
8832 IF_SRC_KERNELADDR,
8833};
8834
8835enum {
8836 IF_STATE_ACTION = 0,
8837 IF_STATE_SOURCE,
8838 IF_STATE_END,
8839};
8840
8841static const match_table_t if_tokens = {
8842 { IF_ACT_FILTER, "filter" },
8843 { IF_ACT_START, "start" },
8844 { IF_ACT_STOP, "stop" },
8845 { IF_SRC_FILE, "%u/%u@%s" },
8846 { IF_SRC_KERNEL, "%u/%u" },
8847 { IF_SRC_FILEADDR, "%u@%s" },
8848 { IF_SRC_KERNELADDR, "%u" },
8849 { IF_ACT_NONE, NULL },
8850};
8851
/*
 * Address filter string parser
 */
8855static int
8856perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8857 struct list_head *filters)
8858{
8859 struct perf_addr_filter *filter = NULL;
8860 char *start, *orig, *filename = NULL;
8861 struct path path;
8862 substring_t args[MAX_OPT_ARGS];
8863 int state = IF_STATE_ACTION, token;
8864 unsigned int kernel = 0;
8865 int ret = -EINVAL;
8866
8867 orig = fstr = kstrdup(fstr, GFP_KERNEL);
8868 if (!fstr)
8869 return -ENOMEM;
8870
8871 while ((start = strsep(&fstr, " ,\n")) != NULL) {
8872 static const enum perf_addr_filter_action_t actions[] = {
8873 [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
8874 [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
8875 [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
8876 };
8877 ret = -EINVAL;
8878
8879 if (!*start)
8880 continue;
8881
8882
8883 if (state == IF_STATE_ACTION) {
8884 filter = perf_addr_filter_new(event, filters);
8885 if (!filter)
8886 goto fail;
8887 }
8888
8889 token = match_token(start, if_tokens, args);
8890 switch (token) {
8891 case IF_ACT_FILTER:
8892 case IF_ACT_START:
8893 case IF_ACT_STOP:
8894 if (state != IF_STATE_ACTION)
8895 goto fail;
8896
8897 filter->action = actions[token];
8898 state = IF_STATE_SOURCE;
8899 break;
8900
8901 case IF_SRC_KERNELADDR:
8902 case IF_SRC_KERNEL:
8903 kernel = 1;
			/* fall through */
8905 case IF_SRC_FILEADDR:
8906 case IF_SRC_FILE:
8907 if (state != IF_STATE_SOURCE)
8908 goto fail;
8909
8910 *args[0].to = 0;
8911 ret = kstrtoul(args[0].from, 0, &filter->offset);
8912 if (ret)
8913 goto fail;
8914
8915 if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
8916 *args[1].to = 0;
8917 ret = kstrtoul(args[1].from, 0, &filter->size);
8918 if (ret)
8919 goto fail;
8920 }
8921
8922 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
8923 int fpos = token == IF_SRC_FILE ? 2 : 1;
8924
8925 filename = match_strdup(&args[fpos]);
8926 if (!filename) {
8927 ret = -ENOMEM;
8928 goto fail;
8929 }
8930 }
8931
8932 state = IF_STATE_END;
8933 break;
8934
8935 default:
8936 goto fail;
8937 }
8938
		/*
		 * Filter definition is fully parsed, validate and install it.
		 * Make sure that it doesn't contradict itself or the event's
		 * attribute.
		 */
8944 if (state == IF_STATE_END) {
8945 ret = -EINVAL;
8946 if (kernel && event->attr.exclude_kernel)
8947 goto fail;
8948
			/*
			 * ACTION "filter" must have a non-zero address
			 * filter size.
			 */
8953 if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
8954 !filter->size)
8955 goto fail;
8956
8957 if (!kernel) {
8958 if (!filename)
8959 goto fail;
8960
				/*
				 * For now, we only support file-based filters
				 * in per-task events; doing so for CPU-wide
				 * events requires additional context switching
				 * trickery, since same object code will be
				 * mapped at different virtual addresses in
				 * different processes.
				 */
8969 ret = -EOPNOTSUPP;
8970 if (!event->ctx->task)
8971 goto fail_free_name;
8972
8973
8974 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
8975 if (ret)
8976 goto fail_free_name;
8977
8978 filter->inode = igrab(d_inode(path.dentry));
8979 path_put(&path);
8980 kfree(filename);
8981 filename = NULL;
8982
8983 ret = -EINVAL;
8984 if (!filter->inode ||
8985 !S_ISREG(filter->inode->i_mode))
8986
8987 goto fail;
8988
8989 event->addr_filters.nr_file_filters++;
8990 }
8991
8992
8993 state = IF_STATE_ACTION;
8994 filter = NULL;
8995 }
8996 }
8997
8998 if (state != IF_STATE_ACTION)
8999 goto fail;
9000
9001 kfree(orig);
9002
9003 return 0;
9004
9005fail_free_name:
9006 kfree(filename);
9007fail:
9008 free_filters_list(filters);
9009 kfree(orig);
9010
9011 return ret;
9012}
9013
9014static int
9015perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
9016{
9017 LIST_HEAD(filters);
9018 int ret;
9019
	/*
	 * Since this is called in perf_ioctl() path, we're already holding
	 * ctx::mutex.
	 */
9024 lockdep_assert_held(&event->ctx->mutex);
9025
9026 if (WARN_ON_ONCE(event->parent))
9027 return -EINVAL;
9028
9029 ret = perf_event_parse_addr_filter(event, filter_str, &filters);
9030 if (ret)
9031 goto fail_clear_files;
9032
9033 ret = event->pmu->addr_filters_validate(&filters);
9034 if (ret)
9035 goto fail_free_filters;
9036
9037
9038 perf_addr_filters_splice(event, &filters);
9039
9040
9041 perf_event_for_each_child(event, perf_event_addr_filters_apply);
9042
9043 return ret;
9044
9045fail_free_filters:
9046 free_filters_list(&filters);
9047
9048fail_clear_files:
9049 event->addr_filters.nr_file_filters = 0;
9050
9051 return ret;
9052}
9053
9054static int perf_event_set_filter(struct perf_event *event, void __user *arg)
9055{
9056 int ret = -EINVAL;
9057 char *filter_str;
9058
9059 filter_str = strndup_user(arg, PAGE_SIZE);
9060 if (IS_ERR(filter_str))
9061 return PTR_ERR(filter_str);
9062
9063#ifdef CONFIG_EVENT_TRACING
9064 if (perf_event_is_tracing(event)) {
9065 struct perf_event_context *ctx = event->ctx;
9066
		/*
		 * The ftrace filter code takes locks that nest inside
		 * ctx->mutex on other paths, so temporarily drop ctx->mutex
		 * around ftrace_profile_set_filter() to avoid a lock
		 * inversion.
		 */
9078 mutex_unlock(&ctx->mutex);
9079 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
9080 mutex_lock(&ctx->mutex);
9081 } else
9082#endif
9083 if (has_addr_filter(event))
9084 ret = perf_event_set_addr_filter(event, filter_str);
9085
9086 kfree(filter_str);
9087 return ret;
9088}
9089
/*
 * hrtimer based swevent callback
 */
9094static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
9095{
9096 enum hrtimer_restart ret = HRTIMER_RESTART;
9097 struct perf_sample_data data;
9098 struct pt_regs *regs;
9099 struct perf_event *event;
9100 u64 period;
9101
9102 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
9103
9104 if (event->state != PERF_EVENT_STATE_ACTIVE)
9105 return HRTIMER_NORESTART;
9106
9107 event->pmu->read(event);
9108
9109 perf_sample_data_init(&data, 0, event->hw.last_period);
9110 regs = get_irq_regs();
9111
9112 if (regs && !perf_exclude_event(event, regs)) {
9113 if (!(event->attr.exclude_idle && is_idle_task(current)))
9114 if (__perf_event_overflow(event, 1, &data, regs))
9115 ret = HRTIMER_NORESTART;
9116 }
9117
9118 period = max_t(u64, 10000, event->hw.sample_period);
9119 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
9120
9121 return ret;
9122}
9123
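/*
 * (Re)arm the hrtimer driving this software event, resuming any partially
 * elapsed period saved by perf_swevent_cancel_hrtimer() and clamping the
 * period to a 10us minimum.
 */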
9124static void perf_swevent_start_hrtimer(struct perf_event *event)
9125{
9126 struct hw_perf_event *hwc = &event->hw;
9127 s64 period;
9128
9129 if (!is_sampling_event(event))
9130 return;
9131
9132 period = local64_read(&hwc->period_left);
9133 if (period) {
9134 if (period < 0)
9135 period = 10000;
9136
9137 local64_set(&hwc->period_left, 0);
9138 } else {
9139 period = max_t(u64, 10000, hwc->sample_period);
9140 }
9141 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
9142 HRTIMER_MODE_REL_PINNED);
9143}
9144
9145static void perf_swevent_cancel_hrtimer(struct perf_event *event)
9146{
9147 struct hw_perf_event *hwc = &event->hw;
9148
9149 if (is_sampling_event(event)) {
9150 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
9151 local64_set(&hwc->period_left, ktime_to_ns(remaining));
9152
9153 hrtimer_cancel(&hwc->hrtimer);
9154 }
9155}
9156
9157static void perf_swevent_init_hrtimer(struct perf_event *event)
9158{
9159 struct hw_perf_event *hwc = &event->hw;
9160
9161 if (!is_sampling_event(event))
9162 return;
9163
9164 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
9165 hwc->hrtimer.function = perf_swevent_hrtimer;

	/*
	 * Since hrtimers have a fixed rate, we can do a static freq->period
	 * mapping and avoid the whole period adjust feedback stuff.
	 */
9171 if (event->attr.freq) {
9172 long freq = event->attr.sample_freq;
9173
9174 event->attr.sample_period = NSEC_PER_SEC / freq;
9175 hwc->sample_period = event->attr.sample_period;
9176 local64_set(&hwc->period_left, hwc->sample_period);
9177 hwc->last_period = hwc->sample_period;
9178 event->attr.freq = 0;
9179 }
9180}
9181
/*
 * Software event: cpu wall time clock
 */
9186static void cpu_clock_event_update(struct perf_event *event)
9187{
9188 s64 prev;
9189 u64 now;
9190
9191 now = local_clock();
9192 prev = local64_xchg(&event->hw.prev_count, now);
9193 local64_add(now - prev, &event->count);
9194}
9195
9196static void cpu_clock_event_start(struct perf_event *event, int flags)
9197{
9198 local64_set(&event->hw.prev_count, local_clock());
9199 perf_swevent_start_hrtimer(event);
9200}
9201
9202static void cpu_clock_event_stop(struct perf_event *event, int flags)
9203{
9204 perf_swevent_cancel_hrtimer(event);
9205 cpu_clock_event_update(event);
9206}
9207
9208static int cpu_clock_event_add(struct perf_event *event, int flags)
9209{
9210 if (flags & PERF_EF_START)
9211 cpu_clock_event_start(event, flags);
9212 perf_event_update_userpage(event);
9213
9214 return 0;
9215}
9216
9217static void cpu_clock_event_del(struct perf_event *event, int flags)
9218{
9219 cpu_clock_event_stop(event, flags);
9220}
9221
9222static void cpu_clock_event_read(struct perf_event *event)
9223{
9224 cpu_clock_event_update(event);
9225}
9226
9227static int cpu_clock_event_init(struct perf_event *event)
9228{
9229 if (event->attr.type != PERF_TYPE_SOFTWARE)
9230 return -ENOENT;
9231
9232 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
9233 return -ENOENT;
9234
9235
9236
9237
9238 if (has_branch_stack(event))
9239 return -EOPNOTSUPP;
9240
9241 perf_swevent_init_hrtimer(event);
9242
9243 return 0;
9244}
9245
9246static struct pmu perf_cpu_clock = {
9247 .task_ctx_nr = perf_sw_context,
9248
9249 .capabilities = PERF_PMU_CAP_NO_NMI,
9250
9251 .event_init = cpu_clock_event_init,
9252 .add = cpu_clock_event_add,
9253 .del = cpu_clock_event_del,
9254 .start = cpu_clock_event_start,
9255 .stop = cpu_clock_event_stop,
9256 .read = cpu_clock_event_read,
9257};
9258
/*
 * Software event: task time clock
 */
9263static void task_clock_event_update(struct perf_event *event, u64 now)
9264{
9265 u64 prev;
9266 s64 delta;
9267
9268 prev = local64_xchg(&event->hw.prev_count, now);
9269 delta = now - prev;
9270 local64_add(delta, &event->count);
9271}
9272
9273static void task_clock_event_start(struct perf_event *event, int flags)
9274{
9275 local64_set(&event->hw.prev_count, event->ctx->time);
9276 perf_swevent_start_hrtimer(event);
9277}
9278
9279static void task_clock_event_stop(struct perf_event *event, int flags)
9280{
9281 perf_swevent_cancel_hrtimer(event);
9282 task_clock_event_update(event, event->ctx->time);
9283}
9284
9285static int task_clock_event_add(struct perf_event *event, int flags)
9286{
9287 if (flags & PERF_EF_START)
9288 task_clock_event_start(event, flags);
9289 perf_event_update_userpage(event);
9290
9291 return 0;
9292}
9293
9294static void task_clock_event_del(struct perf_event *event, int flags)
9295{
9296 task_clock_event_stop(event, PERF_EF_UPDATE);
9297}
9298
9299static void task_clock_event_read(struct perf_event *event)
9300{
9301 u64 now = perf_clock();
9302 u64 delta = now - event->ctx->timestamp;
9303 u64 time = event->ctx->time + delta;
9304
9305 task_clock_event_update(event, time);
9306}
9307
9308static int task_clock_event_init(struct perf_event *event)
9309{
9310 if (event->attr.type != PERF_TYPE_SOFTWARE)
9311 return -ENOENT;
9312
9313 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
9314 return -ENOENT;
9315
9316
9317
9318
9319 if (has_branch_stack(event))
9320 return -EOPNOTSUPP;
9321
9322 perf_swevent_init_hrtimer(event);
9323
9324 return 0;
9325}
9326
9327static struct pmu perf_task_clock = {
9328 .task_ctx_nr = perf_sw_context,
9329
9330 .capabilities = PERF_PMU_CAP_NO_NMI,
9331
9332 .event_init = task_clock_event_init,
9333 .add = task_clock_event_add,
9334 .del = task_clock_event_del,
9335 .start = task_clock_event_start,
9336 .stop = task_clock_event_stop,
9337 .read = task_clock_event_read,
9338};
9339
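/*
 * Default no-op pmu callbacks, used to fill in methods a PMU does not
 * provide so the core can call them unconditionally.
 */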
9340static void perf_pmu_nop_void(struct pmu *pmu)
9341{
9342}
9343
9344static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
9345{
9346}
9347
9348static int perf_pmu_nop_int(struct pmu *pmu)
9349{
9350 return 0;
9351}
9352
9353static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
9354
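/*
 * Default transaction helpers for PMUs that provide pmu_enable()/
 * pmu_disable() but no real transaction support: a PERF_PMU_TXN_ADD
 * transaction simply brackets the group scheduling in perf_pmu_disable()/
 * perf_pmu_enable(); other transaction types pass through untouched.
 */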
9355static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
9356{
9357 __this_cpu_write(nop_txn_flags, flags);
9358
9359 if (flags & ~PERF_PMU_TXN_ADD)
9360 return;
9361
9362 perf_pmu_disable(pmu);
9363}
9364
9365static int perf_pmu_commit_txn(struct pmu *pmu)
9366{
9367 unsigned int flags = __this_cpu_read(nop_txn_flags);
9368
9369 __this_cpu_write(nop_txn_flags, 0);
9370
9371 if (flags & ~PERF_PMU_TXN_ADD)
9372 return 0;
9373
9374 perf_pmu_enable(pmu);
9375 return 0;
9376}
9377
9378static void perf_pmu_cancel_txn(struct pmu *pmu)
9379{
9380 unsigned int flags = __this_cpu_read(nop_txn_flags);
9381
9382 __this_cpu_write(nop_txn_flags, 0);
9383
9384 if (flags & ~PERF_PMU_TXN_ADD)
9385 return;
9386
9387 perf_pmu_enable(pmu);
9388}
9389
9390static int perf_event_idx_default(struct perf_event *event)
9391{
9392 return 0;
9393}
9394
/*
 * Ensures all contexts with the same task_ctx_nr have the same
 * pmu_cpu_context too.
 */
9399static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
9400{
9401 struct pmu *pmu;
9402
9403 if (ctxn < 0)
9404 return NULL;
9405
9406 list_for_each_entry(pmu, &pmus, entry) {
9407 if (pmu->task_ctx_nr == ctxn)
9408 return pmu->pmu_cpu_context;
9409 }
9410
9411 return NULL;
9412}
9413
9414static void free_pmu_context(struct pmu *pmu)
9415{
	/*
	 * Static contexts such as perf_sw_context are shared between PMUs
	 * and have a global lifetime; only per-PMU contexts are freed here.
	 */
9421 if (pmu->task_ctx_nr > perf_invalid_context)
9422 return;
9423
9424 mutex_lock(&pmus_lock);
9425 free_percpu(pmu->pmu_cpu_context);
9426 mutex_unlock(&pmus_lock);
9427}
9428
/*
 * Let userspace know that this PMU supports address range filtering:
 */
9432static ssize_t nr_addr_filters_show(struct device *dev,
9433 struct device_attribute *attr,
9434 char *page)
9435{
9436 struct pmu *pmu = dev_get_drvdata(dev);
9437
9438 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
9439}
9440DEVICE_ATTR_RO(nr_addr_filters);
9441
9442static struct idr pmu_idr;
9443
9444static ssize_t
9445type_show(struct device *dev, struct device_attribute *attr, char *page)
9446{
9447 struct pmu *pmu = dev_get_drvdata(dev);
9448
9449 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
9450}
9451static DEVICE_ATTR_RO(type);
9452
9453static ssize_t
9454perf_event_mux_interval_ms_show(struct device *dev,
9455 struct device_attribute *attr,
9456 char *page)
9457{
9458 struct pmu *pmu = dev_get_drvdata(dev);
9459
9460 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
9461}
9462
9463static DEFINE_MUTEX(mux_interval_mutex);
9464
9465static ssize_t
9466perf_event_mux_interval_ms_store(struct device *dev,
9467 struct device_attribute *attr,
9468 const char *buf, size_t count)
9469{
9470 struct pmu *pmu = dev_get_drvdata(dev);
9471 int timer, cpu, ret;
9472
9473 ret = kstrtoint(buf, 0, &timer);
9474 if (ret)
9475 return ret;
9476
9477 if (timer < 1)
9478 return -EINVAL;
9479
9480
9481 if (timer == pmu->hrtimer_interval_ms)
9482 return count;
9483
9484 mutex_lock(&mux_interval_mutex);
9485 pmu->hrtimer_interval_ms = timer;
9486
9487
9488 cpus_read_lock();
9489 for_each_online_cpu(cpu) {
9490 struct perf_cpu_context *cpuctx;
9491 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9492 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
9493
9494 cpu_function_call(cpu,
9495 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
9496 }
9497 cpus_read_unlock();
9498 mutex_unlock(&mux_interval_mutex);
9499
9500 return count;
9501}
9502static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
9503
9504static struct attribute *pmu_dev_attrs[] = {
9505 &dev_attr_type.attr,
9506 &dev_attr_perf_event_mux_interval_ms.attr,
9507 NULL,
9508};
9509ATTRIBUTE_GROUPS(pmu_dev);
9510
9511static int pmu_bus_running;
9512static struct bus_type pmu_bus = {
9513 .name = "event_source",
9514 .dev_groups = pmu_dev_groups,
9515};
9516
9517static void pmu_dev_release(struct device *dev)
9518{
9519 kfree(dev);
9520}
9521
9522static int pmu_dev_alloc(struct pmu *pmu)
9523{
9524 int ret = -ENOMEM;
9525
9526 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
9527 if (!pmu->dev)
9528 goto out;
9529
9530 pmu->dev->groups = pmu->attr_groups;
9531 device_initialize(pmu->dev);
9532 ret = dev_set_name(pmu->dev, "%s", pmu->name);
9533 if (ret)
9534 goto free_dev;
9535
9536 dev_set_drvdata(pmu->dev, pmu);
9537 pmu->dev->bus = &pmu_bus;
9538 pmu->dev->release = pmu_dev_release;
9539 ret = device_add(pmu->dev);
9540 if (ret)
9541 goto free_dev;
9542
9543
9544 if (pmu->nr_addr_filters)
9545 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
9546
9547 if (ret)
9548 goto del_dev;
9549
9550out:
9551 return ret;
9552
9553del_dev:
9554 device_del(pmu->dev);
9555
9556free_dev:
9557 put_device(pmu->dev);
9558 goto out;
9559}
9560
9561static struct lock_class_key cpuctx_mutex;
9562static struct lock_class_key cpuctx_lock;
9563
9564int perf_pmu_register(struct pmu *pmu, const char *name, int type)
9565{
9566 int cpu, ret;
9567
9568 mutex_lock(&pmus_lock);
9569 ret = -ENOMEM;
9570 pmu->pmu_disable_count = alloc_percpu(int);
9571 if (!pmu->pmu_disable_count)
9572 goto unlock;
9573
9574 pmu->type = -1;
9575 if (!name)
9576 goto skip_type;
9577 pmu->name = name;
9578
9579 if (type < 0) {
9580 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9581 if (type < 0) {
9582 ret = type;
9583 goto free_pdc;
9584 }
9585 }
9586 pmu->type = type;
9587
9588 if (pmu_bus_running) {
9589 ret = pmu_dev_alloc(pmu);
9590 if (ret)
9591 goto free_idr;
9592 }
9593
9594skip_type:
9595 if (pmu->task_ctx_nr == perf_hw_context) {
9596 static int hw_context_taken = 0;
9597
		/*
		 * Other than systems with heterogeneous CPUs, it never makes
		 * sense for two PMUs to share perf_hw_context. PMUs which are
		 * uncore must use perf_invalid_context.
		 */
9603 if (WARN_ON_ONCE(hw_context_taken &&
9604 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
9605 pmu->task_ctx_nr = perf_invalid_context;
9606
9607 hw_context_taken = 1;
9608 }
9609
9610 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
9611 if (pmu->pmu_cpu_context)
9612 goto got_cpu_context;
9613
9614 ret = -ENOMEM;
9615 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
9616 if (!pmu->pmu_cpu_context)
9617 goto free_dev;
9618
9619 for_each_possible_cpu(cpu) {
9620 struct perf_cpu_context *cpuctx;
9621
9622 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9623 __perf_event_init_context(&cpuctx->ctx);
9624 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
9625 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
9626 cpuctx->ctx.pmu = pmu;
9627 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
9628
9629 __perf_mux_hrtimer_init(cpuctx, cpu);
9630 }
9631
9632got_cpu_context:
9633 if (!pmu->start_txn) {
9634 if (pmu->pmu_enable) {
			/*
			 * If we have pmu_enable/pmu_disable calls, install
			 * transaction stubs that use that to try and batch
			 * hardware accesses.
			 */
9640 pmu->start_txn = perf_pmu_start_txn;
9641 pmu->commit_txn = perf_pmu_commit_txn;
9642 pmu->cancel_txn = perf_pmu_cancel_txn;
9643 } else {
9644 pmu->start_txn = perf_pmu_nop_txn;
9645 pmu->commit_txn = perf_pmu_nop_int;
9646 pmu->cancel_txn = perf_pmu_nop_void;
9647 }
9648 }
9649
9650 if (!pmu->pmu_enable) {
9651 pmu->pmu_enable = perf_pmu_nop_void;
9652 pmu->pmu_disable = perf_pmu_nop_void;
9653 }
9654
9655 if (!pmu->event_idx)
9656 pmu->event_idx = perf_event_idx_default;
9657
9658 list_add_rcu(&pmu->entry, &pmus);
9659 atomic_set(&pmu->exclusive_cnt, 0);
9660 ret = 0;
9661unlock:
9662 mutex_unlock(&pmus_lock);
9663
9664 return ret;
9665
9666free_dev:
9667 device_del(pmu->dev);
9668 put_device(pmu->dev);
9669
9670free_idr:
9671 if (pmu->type >= PERF_TYPE_MAX)
9672 idr_remove(&pmu_idr, pmu->type);
9673
9674free_pdc:
9675 free_percpu(pmu->pmu_disable_count);
9676 goto unlock;
9677}
9678EXPORT_SYMBOL_GPL(perf_pmu_register);
9679
9680void perf_pmu_unregister(struct pmu *pmu)
9681{
9682 int remove_device;
9683
9684 mutex_lock(&pmus_lock);
9685 remove_device = pmu_bus_running;
9686 list_del_rcu(&pmu->entry);
9687 mutex_unlock(&pmus_lock);
9688
	/*
	 * We dereference the pmu list under both SRCU and regular RCU, so
	 * synchronize against both of those.
	 */
9693 synchronize_srcu(&pmus_srcu);
9694 synchronize_rcu();
9695
9696 free_percpu(pmu->pmu_disable_count);
9697 if (pmu->type >= PERF_TYPE_MAX)
9698 idr_remove(&pmu_idr, pmu->type);
9699 if (remove_device) {
9700 if (pmu->nr_addr_filters)
9701 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
9702 device_del(pmu->dev);
9703 put_device(pmu->dev);
9704 }
9705 free_pmu_context(pmu);
9706}
9707EXPORT_SYMBOL_GPL(perf_pmu_unregister);
9708
9709static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
9710{
9711 struct perf_event_context *ctx = NULL;
9712 int ret;
9713
9714 if (!try_module_get(pmu->module))
9715 return -ENODEV;
9716
	/*
	 * A number of pmu->event_init() implementations iterate the sibling
	 * list to, for example, validate whether the group fits on the PMU.
	 * Therefore, if this is a sibling event, take the group leader's
	 * context lock to protect the sibling list.
	 */
9723 if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
		/*
		 * This ctx->mutex can nest when we're called through
		 * inheritance. See the perf_event_ctx_lock_nested() comment.
		 */
9728 ctx = perf_event_ctx_lock_nested(event->group_leader,
9729 SINGLE_DEPTH_NESTING);
9730 BUG_ON(!ctx);
9731 }
9732
9733 event->pmu = pmu;
9734 ret = pmu->event_init(event);
9735
9736 if (ctx)
9737 perf_event_ctx_unlock(event->group_leader, ctx);
9738
9739 if (ret)
9740 module_put(pmu->module);
9741
9742 return ret;
9743}
9744
9745static struct pmu *perf_init_event(struct perf_event *event)
9746{
9747 struct pmu *pmu;
9748 int idx;
9749 int ret;
9750
9751 idx = srcu_read_lock(&pmus_srcu);
9752
	/* Try parent's PMU first: */
9754 if (event->parent && event->parent->pmu) {
9755 pmu = event->parent->pmu;
9756 ret = perf_try_init_event(pmu, event);
9757 if (!ret)
9758 goto unlock;
9759 }
9760
9761 rcu_read_lock();
9762 pmu = idr_find(&pmu_idr, event->attr.type);
9763 rcu_read_unlock();
9764 if (pmu) {
9765 ret = perf_try_init_event(pmu, event);
9766 if (ret)
9767 pmu = ERR_PTR(ret);
9768 goto unlock;
9769 }
9770
9771 list_for_each_entry_rcu(pmu, &pmus, entry) {
9772 ret = perf_try_init_event(pmu, event);
9773 if (!ret)
9774 goto unlock;
9775
9776 if (ret != -ENOENT) {
9777 pmu = ERR_PTR(ret);
9778 goto unlock;
9779 }
9780 }
9781 pmu = ERR_PTR(-ENOENT);
9782unlock:
9783 srcu_read_unlock(&pmus_srcu, idx);
9784
9785 return pmu;
9786}
9787
9788static void attach_sb_event(struct perf_event *event)
9789{
9790 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
9791
9792 raw_spin_lock(&pel->lock);
9793 list_add_rcu(&event->sb_list, &pel->list);
9794 raw_spin_unlock(&pel->lock);
9795}
9796
/*
 * We keep a list of all !task (and therefore per-cpu) events
 * that need to cross package boundaries.
 *
 * This avoids having to scan all the various PMU per-cpu contexts
 * looking for them.
 */
9804static void account_pmu_sb_event(struct perf_event *event)
9805{
9806 if (is_sb_event(event))
9807 attach_sb_event(event);
9808}
9809
9810static void account_event_cpu(struct perf_event *event, int cpu)
9811{
9812 if (event->parent)
9813 return;
9814
9815 if (is_cgroup_event(event))
9816 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
9817}
9818
9819
9820static void account_freq_event_nohz(void)
9821{
9822#ifdef CONFIG_NO_HZ_FULL
9823
9824 spin_lock(&nr_freq_lock);
9825 if (atomic_inc_return(&nr_freq_events) == 1)
9826 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
9827 spin_unlock(&nr_freq_lock);
9828#endif
9829}
9830
9831static void account_freq_event(void)
9832{
9833 if (tick_nohz_full_enabled())
9834 account_freq_event_nohz();
9835 else
9836 atomic_inc(&nr_freq_events);
9837}
9838
9839
9840static void account_event(struct perf_event *event)
9841{
9842 bool inc = false;
9843
9844 if (event->parent)
9845 return;
9846
9847 if (event->attach_state & PERF_ATTACH_TASK)
9848 inc = true;
9849 if (event->attr.mmap || event->attr.mmap_data)
9850 atomic_inc(&nr_mmap_events);
9851 if (event->attr.comm)
9852 atomic_inc(&nr_comm_events);
9853 if (event->attr.namespaces)
9854 atomic_inc(&nr_namespaces_events);
9855 if (event->attr.task)
9856 atomic_inc(&nr_task_events);
9857 if (event->attr.freq)
9858 account_freq_event();
9859 if (event->attr.context_switch) {
9860 atomic_inc(&nr_switch_events);
9861 inc = true;
9862 }
9863 if (has_branch_stack(event))
9864 inc = true;
9865 if (is_cgroup_event(event))
9866 inc = true;
9867
9868 if (inc) {
		/*
		 * We need the mutex here because static_branch_enable()
		 * must complete *before* the perf_sched_count increment
		 * becomes visible.
		 */
9874 if (atomic_inc_not_zero(&perf_sched_count))
9875 goto enabled;
9876
9877 mutex_lock(&perf_sched_mutex);
9878 if (!atomic_read(&perf_sched_count)) {
9879 static_branch_enable(&perf_sched_events);
9880
9881
9882
9883
9884
9885 synchronize_sched();
9886 }
9887
9888
9889
9890
9891 atomic_inc(&perf_sched_count);
9892 mutex_unlock(&perf_sched_mutex);
9893 }
9894enabled:
9895
9896 account_event_cpu(event, event->cpu);
9897
9898 account_pmu_sb_event(event);
9899}
9900
/*
 * Allocate and initialize an event structure
 */
9904static struct perf_event *
9905perf_event_alloc(struct perf_event_attr *attr, int cpu,
9906 struct task_struct *task,
9907 struct perf_event *group_leader,
9908 struct perf_event *parent_event,
9909 perf_overflow_handler_t overflow_handler,
9910 void *context, int cgroup_fd)
9911{
9912 struct pmu *pmu;
9913 struct perf_event *event;
9914 struct hw_perf_event *hwc;
9915 long err = -EINVAL;
9916
9917 if ((unsigned)cpu >= nr_cpu_ids) {
9918 if (!task || cpu != -1)
9919 return ERR_PTR(-EINVAL);
9920 }
9921
9922 event = kzalloc(sizeof(*event), GFP_KERNEL);
9923 if (!event)
9924 return ERR_PTR(-ENOMEM);
9925
	/*
	 * Single events are their own group leaders, with an
	 * empty sibling list:
	 */
9930 if (!group_leader)
9931 group_leader = event;
9932
9933 mutex_init(&event->child_mutex);
9934 INIT_LIST_HEAD(&event->child_list);
9935
9936 INIT_LIST_HEAD(&event->event_entry);
9937 INIT_LIST_HEAD(&event->sibling_list);
9938 INIT_LIST_HEAD(&event->active_list);
9939 init_event_group(event);
9940 INIT_LIST_HEAD(&event->rb_entry);
9941 INIT_LIST_HEAD(&event->active_entry);
9942 INIT_LIST_HEAD(&event->addr_filters.list);
9943 INIT_HLIST_NODE(&event->hlist_entry);
9944
9945
9946 init_waitqueue_head(&event->waitq);
9947 init_irq_work(&event->pending, perf_pending_event);
9948
9949 mutex_init(&event->mmap_mutex);
9950 raw_spin_lock_init(&event->addr_filters.lock);
9951
9952 atomic_long_set(&event->refcount, 1);
9953 event->cpu = cpu;
9954 event->attr = *attr;
9955 event->group_leader = group_leader;
9956 event->pmu = NULL;
9957 event->oncpu = -1;
9958
9959 event->parent = parent_event;
9960
9961 event->ns = get_pid_ns(task_active_pid_ns(current));
9962 event->id = atomic64_inc_return(&perf_event_id);
9963
9964 event->state = PERF_EVENT_STATE_INACTIVE;
9965
9966 if (task) {
9967 event->attach_state = PERF_ATTACH_TASK;
9968
9969
9970
9971
9972
9973 get_task_struct(task);
9974 event->hw.target = task;
9975 }
9976
9977 event->clock = &local_clock;
9978 if (parent_event)
9979 event->clock = parent_event->clock;
9980
9981 if (!overflow_handler && parent_event) {
9982 overflow_handler = parent_event->overflow_handler;
9983 context = parent_event->overflow_handler_context;
9984#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
9985 if (overflow_handler == bpf_overflow_handler) {
9986 struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
9987
9988 if (IS_ERR(prog)) {
9989 err = PTR_ERR(prog);
9990 goto err_ns;
9991 }
9992 event->prog = prog;
9993 event->orig_overflow_handler =
9994 parent_event->orig_overflow_handler;
9995 }
9996#endif
9997 }
9998
9999 if (overflow_handler) {
10000 event->overflow_handler = overflow_handler;
10001 event->overflow_handler_context = context;
	} else if (is_write_backward(event)) {
10003 event->overflow_handler = perf_event_output_backward;
10004 event->overflow_handler_context = NULL;
10005 } else {
10006 event->overflow_handler = perf_event_output_forward;
10007 event->overflow_handler_context = NULL;
10008 }
10009
10010 perf_event__state_init(event);
10011
10012 pmu = NULL;
10013
10014 hwc = &event->hw;
10015 hwc->sample_period = attr->sample_period;
10016 if (attr->freq && attr->sample_freq)
10017 hwc->sample_period = 1;
10018 hwc->last_period = hwc->sample_period;
10019
10020 local64_set(&hwc->period_left, hwc->sample_period);
10021
	/*
	 * We currently do not support PERF_SAMPLE_READ on inherited events.
	 * See perf_output_read().
	 */
10026 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
10027 goto err_ns;
10028
10029 if (!has_branch_stack(event))
10030 event->attr.branch_sample_type = 0;
10031
10032 if (cgroup_fd != -1) {
10033 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
10034 if (err)
10035 goto err_ns;
10036 }
10037
10038 pmu = perf_init_event(event);
10039 if (IS_ERR(pmu)) {
10040 err = PTR_ERR(pmu);
10041 goto err_ns;
10042 }
10043
10044 err = exclusive_event_init(event);
10045 if (err)
10046 goto err_pmu;
10047
10048 if (has_addr_filter(event)) {
10049 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
10050 sizeof(unsigned long),
10051 GFP_KERNEL);
10052 if (!event->addr_filters_offs) {
10053 err = -ENOMEM;
10054 goto err_per_task;
10055 }
10056
		/* force hw sync on the address filters */
10058 event->addr_filters_gen = 1;
10059 }
10060
10061 if (!event->parent) {
10062 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
10063 err = get_callchain_buffers(attr->sample_max_stack);
10064 if (err)
10065 goto err_addr_filters;
10066 }
10067 }
10068
10069
10070 account_event(event);
10071
10072 return event;
10073
10074err_addr_filters:
10075 kfree(event->addr_filters_offs);
10076
10077err_per_task:
10078 exclusive_event_destroy(event);
10079
10080err_pmu:
10081 if (event->destroy)
10082 event->destroy(event);
10083 module_put(pmu->module);
10084err_ns:
10085 if (is_cgroup_event(event))
10086 perf_detach_cgroup(event);
10087 if (event->ns)
10088 put_pid_ns(event->ns);
10089 if (event->hw.target)
10090 put_task_struct(event->hw.target);
10091 kfree(event);
10092
10093 return ERR_PTR(err);
10094}
10095
10096static int perf_copy_attr(struct perf_event_attr __user *uattr,
10097 struct perf_event_attr *attr)
10098{
10099 u32 size;
10100 int ret;
10101
10102 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
10103 return -EFAULT;
10104
	/*
	 * zero the full structure, so that a short copy will be nice.
	 */
10108 memset(attr, 0, sizeof(*attr));
10109
10110 ret = get_user(size, &uattr->size);
10111 if (ret)
10112 return ret;
10113
10114 if (size > PAGE_SIZE)
10115 goto err_size;
10116
10117 if (!size)
10118 size = PERF_ATTR_SIZE_VER0;
10119
10120 if (size < PERF_ATTR_SIZE_VER0)
10121 goto err_size;
10122
	/*
	 * If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
10129 if (size > sizeof(*attr)) {
10130 unsigned char __user *addr;
10131 unsigned char __user *end;
10132 unsigned char val;
10133
10134 addr = (void __user *)uattr + sizeof(*attr);
10135 end = (void __user *)uattr + size;
10136
10137 for (; addr < end; addr++) {
10138 ret = get_user(val, addr);
10139 if (ret)
10140 return ret;
10141 if (val)
10142 goto err_size;
10143 }
10144 size = sizeof(*attr);
10145 }
10146
10147 ret = copy_from_user(attr, uattr, size);
10148 if (ret)
10149 return -EFAULT;
10150
10151 attr->size = size;
10152
10153 if (attr->__reserved_1)
10154 return -EINVAL;
10155
10156 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
10157 return -EINVAL;
10158
10159 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
10160 return -EINVAL;
10161
10162 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
10163 u64 mask = attr->branch_sample_type;
10164
10165
10166 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
10167 return -EINVAL;
10168
10169
10170 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
10171 return -EINVAL;
10172
10173
10174 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
10175
10176
10177 if (!attr->exclude_kernel)
10178 mask |= PERF_SAMPLE_BRANCH_KERNEL;
10179
10180 if (!attr->exclude_user)
10181 mask |= PERF_SAMPLE_BRANCH_USER;
10182
10183 if (!attr->exclude_hv)
10184 mask |= PERF_SAMPLE_BRANCH_HV;
10185
10186
10187
10188 attr->branch_sample_type = mask;
10189 }
10190
10191 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
10192 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10193 return -EACCES;
10194 }
10195
10196 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
10197 ret = perf_reg_validate(attr->sample_regs_user);
10198 if (ret)
10199 return ret;
10200 }
10201
10202 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
10203 if (!arch_perf_have_user_stack_dump())
10204 return -ENOSYS;
10205
10206
10207
10208
10209
10210
10211 if (attr->sample_stack_user >= USHRT_MAX)
10212 return -EINVAL;
10213 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
10214 return -EINVAL;
10215 }
10216
10217 if (!attr->sample_max_stack)
10218 attr->sample_max_stack = sysctl_perf_event_max_stack;
10219
10220 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
10221 ret = perf_reg_validate(attr->sample_regs_intr);
10222out:
10223 return ret;
10224
10225err_size:
10226 put_user(sizeof(*attr), &uattr->size);
10227 ret = -E2BIG;
10228 goto out;
10229}
10230
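/*
 * Redirect an event's output into @output_event's ring buffer, subject to
 * compatibility checks: no self reference, same cpu (or same ctx for task
 * events), same clock, same write direction and, for AUX-capable events,
 * the same PMU.
 */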
10231static int
10232perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
10233{
10234 struct ring_buffer *rb = NULL;
10235 int ret = -EINVAL;
10236
10237 if (!output_event)
10238 goto set;
10239
10240
10241 if (event == output_event)
10242 goto out;
10243
10244
10245
10246
10247 if (output_event->cpu != event->cpu)
10248 goto out;
10249
10250
10251
10252
10253 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
10254 goto out;
10255
10256
10257
10258
10259 if (output_event->clock != event->clock)
10260 goto out;
10261
10262
10263
10264
10265
10266 if (is_write_backward(output_event) != is_write_backward(event))
10267 goto out;
10268
10269
10270
10271
10272 if (has_aux(event) && has_aux(output_event) &&
10273 event->pmu != output_event->pmu)
10274 goto out;
10275
10276set:
10277 mutex_lock(&event->mmap_mutex);
10278
10279 if (atomic_read(&event->mmap_count))
10280 goto unlock;
10281
10282 if (output_event) {
10283
10284 rb = ring_buffer_get(output_event);
10285 if (!rb)
10286 goto unlock;
10287 }
10288
10289 ring_buffer_attach(event, rb);
10290
10291 ret = 0;
10292unlock:
10293 mutex_unlock(&event->mmap_mutex);
10294
10295out:
10296 return ret;
10297}
10298
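/*
 * Lock two context mutexes in address order to avoid ABBA deadlocks;
 * the second one is taken with SINGLE_DEPTH_NESTING for lockdep.
 */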
10299static void mutex_lock_double(struct mutex *a, struct mutex *b)
10300{
10301 if (b < a)
10302 swap(a, b);
10303
10304 mutex_lock(a);
10305 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
10306}
10307
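/*
 * Select the clock used for this event's timestamps. Clocks that are not
 * NMI-safe are only allowed on PMUs advertising PERF_PMU_CAP_NO_NMI.
 */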
10308static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
10309{
10310 bool nmi_safe = false;
10311
10312 switch (clk_id) {
10313 case CLOCK_MONOTONIC:
10314 event->clock = &ktime_get_mono_fast_ns;
10315 nmi_safe = true;
10316 break;
10317
10318 case CLOCK_MONOTONIC_RAW:
10319 event->clock = &ktime_get_raw_fast_ns;
10320 nmi_safe = true;
10321 break;
10322
10323 case CLOCK_REALTIME:
10324 event->clock = &ktime_get_real_ns;
10325 break;
10326
10327 case CLOCK_BOOTTIME:
10328 event->clock = &ktime_get_boot_ns;
10329 break;
10330
10331 case CLOCK_TAI:
10332 event->clock = &ktime_get_tai_ns;
10333 break;
10334
10335 default:
10336 return -EINVAL;
10337 }
10338
10339 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
10340 return -EINVAL;
10341
10342 return 0;
10343}
10344
10345/*
10346 * Variation on perf_event_ctx_lock_nested(), except we take two context
10347 * mutexes.
10348 */
10349static struct perf_event_context *
10350__perf_event_ctx_lock_double(struct perf_event *group_leader,
10351 struct perf_event_context *ctx)
10352{
10353 struct perf_event_context *gctx;
10354
10355again:
10356 rcu_read_lock();
10357 gctx = READ_ONCE(group_leader->ctx);
10358 if (!atomic_inc_not_zero(&gctx->refcount)) {
10359 rcu_read_unlock();
10360 goto again;
10361 }
10362 rcu_read_unlock();
10363
10364 mutex_lock_double(&gctx->mutex, &ctx->mutex);
10365
10366 if (group_leader->ctx != gctx) {
10367 mutex_unlock(&ctx->mutex);
10368 mutex_unlock(&gctx->mutex);
10369 put_ctx(gctx);
10370 goto again;
10371 }
10372
10373 return gctx;
10374}
10375
10376/**
10377 * sys_perf_event_open - open a performance event, associate it to a task/cpu
10378 *
10379 * @attr_uptr: event_type attributes for monitoring/sampling
10380 * @pid: target pid
10381 * @cpu: target cpu
10382 * @group_fd: group leader event fd
10383 */
10384SYSCALL_DEFINE5(perf_event_open,
10385 struct perf_event_attr __user *, attr_uptr,
10386 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
10387{
10388 struct perf_event *group_leader = NULL, *output_event = NULL;
10389 struct perf_event *event, *sibling;
10390 struct perf_event_attr attr;
10391 struct perf_event_context *ctx, *uninitialized_var(gctx);
10392 struct file *event_file = NULL;
10393 struct fd group = {NULL, 0};
10394 struct task_struct *task = NULL;
10395 struct pmu *pmu;
10396 int event_fd;
10397 int move_group = 0;
10398 int err;
10399 int f_flags = O_RDWR;
10400 int cgroup_fd = -1;
10401
10402 /* for future expandability... */
10403 if (flags & ~PERF_FLAG_ALL)
10404 return -EINVAL;
10405
10406 err = perf_copy_attr(attr_uptr, &attr);
10407 if (err)
10408 return err;
10409
10410 if (!attr.exclude_kernel) {
10411 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10412 return -EACCES;
10413 }
10414
10415 if (attr.namespaces) {
10416 if (!capable(CAP_SYS_ADMIN))
10417 return -EACCES;
10418 }
10419
10420 if (attr.freq) {
10421 if (attr.sample_freq > sysctl_perf_event_sample_rate)
10422 return -EINVAL;
10423 } else {
10424 if (attr.sample_period & (1ULL << 63))
10425 return -EINVAL;
10426 }
10427
10428 /* Only privileged users can get physical addresses */
10429 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
10430 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10431 return -EACCES;
10432
10433 /*
10434 * In cgroup mode, the pid argument is used to pass the fd
10435 * opened to the cgroup directory in cgroupfs. The cpu argument
10436 * designates the cpu on which to monitor threads from that
10437 * cgroup.
10438 */
10439 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
10440 return -EINVAL;
10441
10442 if (flags & PERF_FLAG_FD_CLOEXEC)
10443 f_flags |= O_CLOEXEC;
10444
10445 event_fd = get_unused_fd_flags(f_flags);
10446 if (event_fd < 0)
10447 return event_fd;
10448
10449 if (group_fd != -1) {
10450 err = perf_fget_light(group_fd, &group);
10451 if (err)
10452 goto err_fd;
10453 group_leader = group.file->private_data;
10454 if (flags & PERF_FLAG_FD_OUTPUT)
10455 output_event = group_leader;
10456 if (flags & PERF_FLAG_FD_NO_GROUP)
10457 group_leader = NULL;
10458 }
10459
10460 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
10461 task = find_lively_task_by_vpid(pid);
10462 if (IS_ERR(task)) {
10463 err = PTR_ERR(task);
10464 goto err_group_fd;
10465 }
10466 }
10467
10468 if (task && group_leader &&
10469 group_leader->attr.inherit != attr.inherit) {
10470 err = -EINVAL;
10471 goto err_task;
10472 }
10473
10474 if (task) {
10475 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10476 if (err)
10477 goto err_task;
10478
10479 /*
10480 * Reuse ptrace permission checks for now.
10481 *
10482 * We must hold cred_guard_mutex across this and any potential
10483 * perf_install_in_context() call for this new event to
10484 * serialize against exec() altering our credentials (and the
10485 * perf_event_exit_task() that could imply).
10486 */
10487 err = -EACCES;
10488 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
10489 goto err_cred;
10490 }
10491
10492 if (flags & PERF_FLAG_PID_CGROUP)
10493 cgroup_fd = pid;
10494
10495 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
10496 NULL, NULL, cgroup_fd);
10497 if (IS_ERR(event)) {
10498 err = PTR_ERR(event);
10499 goto err_cred;
10500 }
10501
10502 if (is_sampling_event(event)) {
10503 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
10504 err = -EOPNOTSUPP;
10505 goto err_alloc;
10506 }
10507 }
10508
10509 /*
10510 * Special case software events and allow them to be part of
10511 * any hardware group.
10512 */
10513 pmu = event->pmu;
10514
10515 if (attr.use_clockid) {
10516 err = perf_event_set_clock(event, attr.clockid);
10517 if (err)
10518 goto err_alloc;
10519 }
10520
10521 if (pmu->task_ctx_nr == perf_sw_context)
10522 event->event_caps |= PERF_EV_CAP_SOFTWARE;
10523
10524 if (group_leader &&
10525 (is_software_event(event) != is_software_event(group_leader))) {
10526 if (is_software_event(event)) {
10527 /*
10528 * If event and group_leader are not both a software
10529 * event, and event is, then group leader is not.
10530 *
10531 * Allow the addition of software events to hw
10532 * groups, this is safe because software events
10533 * never fail to schedule.
10534 */
10535 pmu = group_leader->pmu;
10536 } else if (is_software_event(group_leader) &&
10537 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10538 /*
10539 * In case the group is a pure software group, and we
10540 * try to add a hardware event, move the whole group to
10541 * the hardware context.
10542 */
10543 move_group = 1;
10544 }
10545 }
10546
10547 /*
10548 * Get the target context (task or percpu):
10549 */
10550 ctx = find_get_context(pmu, task, event);
10551 if (IS_ERR(ctx)) {
10552 err = PTR_ERR(ctx);
10553 goto err_alloc;
10554 }
10555
10556 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
10557 err = -EBUSY;
10558 goto err_context;
10559 }
10560
10561 /*
10562 * Look up the group leader (we will attach this event to it):
10563 */
10564 if (group_leader) {
10565 err = -EINVAL;
10566
10567 /*
10568 * Do not allow a recursive hierarchy (this new sibling
10569 * becoming part of another group-sibling):
10570 */
10571 if (group_leader->group_leader != group_leader)
10572 goto err_context;
10573
10574 /* All events in a group should have the same clock */
10575 if (group_leader->clock != event->clock)
10576 goto err_context;
10577
10578 /*
10579 * Make sure we're both events for the same CPU;
10580 * grouping events for different CPUs is broken; since
10581 * you can never concurrently schedule them anyhow.
10582 */
10583 if (group_leader->cpu != event->cpu)
10584 goto err_context;
10585
10586 /*
10587 * Make sure we're both on the same task, or both
10588 * per-CPU events.
10589 */
10590 if (group_leader->ctx->task != ctx->task)
10591 goto err_context;
10592
10593 /*
10594 * Do not allow to attach to a group in a different task
10595 * or CPU context. If we're moving SW events, we'll fix
10596 * this up later, so allow that.
10597 */
10598 if (!move_group && group_leader->ctx != ctx)
10599 goto err_context;
10600
10601 /*
10602 * Only a group leader can be exclusive or pinned
10603 */
10604 if (attr.exclusive || attr.pinned)
10605 goto err_context;
10606 }
10607
10608 if (output_event) {
10609 err = perf_event_set_output(event, output_event);
10610 if (err)
10611 goto err_context;
10612 }
10613
10614 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
10615 f_flags);
10616 if (IS_ERR(event_file)) {
10617 err = PTR_ERR(event_file);
10618 event_file = NULL;
10619 goto err_context;
10620 }
10621
10622 if (move_group) {
10623 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
10624
10625 if (gctx->task == TASK_TOMBSTONE) {
10626 err = -ESRCH;
10627 goto err_locked;
10628 }
10629
10630 /*
10631 * Check if we raced against another sys_perf_event_open() call
10632 * moving the software group underneath us.
10633 */
10634 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10635 /*
10636 * The group leader is no longer a pure software group; fall
10637 * back to the non-move case, but only if the new event ended
10638 * up on the same context, otherwise it is an error.
10639 */
10640 if (gctx != ctx) {
10641 err = -EINVAL;
10642 goto err_locked;
10643 } else {
10644 perf_event_ctx_unlock(group_leader, gctx);
10645 move_group = 0;
10646 }
10647 }
10648 } else {
10649 mutex_lock(&ctx->mutex);
10650 }
10651
10652 if (ctx->task == TASK_TOMBSTONE) {
10653 err = -ESRCH;
10654 goto err_locked;
10655 }
10656
10657 if (!perf_event_validate_size(event)) {
10658 err = -E2BIG;
10659 goto err_locked;
10660 }
10661
10662 if (!task) {
10663 /*
10664 * Check if the @cpu we're creating an event for is online.
10665 *
10666 * We use the perf_cpu_context::ctx::mutex to serialize against
10667 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
10668 */
10669 struct perf_cpu_context *cpuctx =
10670 container_of(ctx, struct perf_cpu_context, ctx);
10671
10672 if (!cpuctx->online) {
10673 err = -ENODEV;
10674 goto err_locked;
10675 }
10676 }
10677
10678
10679 /*
10680 * Must be under the same ctx::mutex as perf_install_in_context(),
10681 * because we need to serialize with concurrent event creation.
10682 */
10683 if (!exclusive_event_installable(event, ctx)) {
10684 /* exclusive and group stuff are assumed mutually exclusive */
10685 WARN_ON_ONCE(move_group);
10686
10687 err = -EBUSY;
10688 goto err_locked;
10689 }
10690
10691 WARN_ON_ONCE(ctx->parent_ctx);
10692
10693 /*
10694 * This is the point on no return; we cannot fail hereafter. This is
10695 * where we start modifying current state.
10696 */
10697
10698 if (move_group) {
10699 /*
10700 * See perf_event_ctx_lock() for comments on the details
10701 * of swizzling perf_event::ctx.
10702 */
10703 perf_remove_from_context(group_leader, 0);
10704 put_ctx(gctx);
10705
10706 for_each_sibling_event(sibling, group_leader) {
10707 perf_remove_from_context(sibling, 0);
10708 put_ctx(gctx);
10709 }
10710
10711 /*
10712 * Wait for everybody to stop referencing the events through
10713 * the old lists, before installing it on new lists.
10714 */
10715 synchronize_rcu();
10716
10717 /*
10718 * Install the group siblings before the group leader.
10719 *
10720 * Because a group leader will try and install the entire group
10721 * (through the sibling list, which is still intact), we can
10722 * end up with siblings installed in the wrong context.
10723 *
10724 * By installing siblings first we NO-OP that, because they are
10725 * not yet reachable through the group leader.
10726 */
10727 for_each_sibling_event(sibling, group_leader) {
10728 perf_event__state_init(sibling);
10729 perf_install_in_context(ctx, sibling, sibling->cpu);
10730 get_ctx(ctx);
10731 }
10732
10733 /*
10734 * Removing from the context ends up with disabled
10735 * event. What we want here is event in the initial
10736 * startup state, ready to be add into new context.
10737 */
10738 perf_event__state_init(group_leader);
10739 perf_install_in_context(ctx, group_leader, group_leader->cpu);
10740 get_ctx(ctx);
10741 }
10742
10743 /*
10744 * Precalculate sample_data sizes; do while holding ctx::mutex such
10745 * that we're serialized against further additions and before
10746 * perf_install_in_context() which is the point the event is active and
10747 * can use these values.
10748 */
10749 perf_event__header_size(event);
10750 perf_event__id_header_size(event);
10751
10752 event->owner = current;
10753
10754 perf_install_in_context(ctx, event, event->cpu);
10755 perf_unpin_context(ctx);
10756
10757 if (move_group)
10758 perf_event_ctx_unlock(group_leader, gctx);
10759 mutex_unlock(&ctx->mutex);
10760
10761 if (task) {
10762 mutex_unlock(&task->signal->cred_guard_mutex);
10763 put_task_struct(task);
10764 }
10765
10766 mutex_lock(&current->perf_event_mutex);
10767 list_add_tail(&event->owner_entry, &current->perf_event_list);
10768 mutex_unlock(&current->perf_event_mutex);
10769
10770 /*
10771 * Drop the reference on the group_event after placing the
10772 * new event on the sibling_list. This ensures destruction
10773 * of the group leader will find the pointer to itself in
10774 * perf_group_detach().
10775 */
10776 fdput(group);
10777 fd_install(event_fd, event_file);
10778 return event_fd;
10779
10780err_locked:
10781 if (move_group)
10782 perf_event_ctx_unlock(group_leader, gctx);
10783 mutex_unlock(&ctx->mutex);
10784
10785 fput(event_file);
10786err_context:
10787 perf_unpin_context(ctx);
10788 put_ctx(ctx);
10789err_alloc:
10790 /*
10791 * If event_file is set, the fput() above will have called ->release()
10792 * and that will take care of freeing the event.
10793 */
10794 if (!event_file)
10795 free_event(event);
10796err_cred:
10797 if (task)
10798 mutex_unlock(&task->signal->cred_guard_mutex);
10799err_task:
10800 if (task)
10801 put_task_struct(task);
10802err_group_fd:
10803 fdput(group);
10804err_fd:
10805 put_unused_fd(event_fd);
10806 return err;
10807}
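
/*
 * Userspace usage sketch (illustrative only, not part of this file):
 * there is no glibc wrapper for this syscall, so callers typically
 * invoke it directly, e.g.:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.size		= sizeof(attr),
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.disabled	= 1,
 *		.exclude_kernel	= 1,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1,
 *			 PERF_FLAG_FD_CLOEXEC);
 *
 * followed by ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) to start counting and
 * read(fd, &count, sizeof(count)) to fetch the value.
 */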
10808
10809/**
10810 * perf_event_create_kernel_counter
10811 *
10812 * @attr: attributes of the counter to create
10813 * @cpu: cpu in which the counter is bound
10814 * @task: task to profile (NULL for percpu)
10815 */
10816struct perf_event *
10817perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
10818 struct task_struct *task,
10819 perf_overflow_handler_t overflow_handler,
10820 void *context)
10821{
10822 struct perf_event_context *ctx;
10823 struct perf_event *event;
10824 int err;
10825
10826
10827
10828
10829
10830 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
10831 overflow_handler, context, -1);
10832 if (IS_ERR(event)) {
10833 err = PTR_ERR(event);
10834 goto err;
10835 }
10836
10837 /* Mark owner so we could distinguish it from user events. */
10838 event->owner = TASK_TOMBSTONE;
10839
10840 ctx = find_get_context(event->pmu, task, event);
10841 if (IS_ERR(ctx)) {
10842 err = PTR_ERR(ctx);
10843 goto err_free;
10844 }
10845
10846 WARN_ON_ONCE(ctx->parent_ctx);
10847 mutex_lock(&ctx->mutex);
10848 if (ctx->task == TASK_TOMBSTONE) {
10849 err = -ESRCH;
10850 goto err_unlock;
10851 }
10852
10853 if (!task) {
10854 /*
10855 * Check if the @cpu we're creating an event for is online.
10856 *
10857 * We use the perf_cpu_context::ctx::mutex to serialize against
10858 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
10859 */
10860 struct perf_cpu_context *cpuctx =
10861 container_of(ctx, struct perf_cpu_context, ctx);
10862 if (!cpuctx->online) {
10863 err = -ENODEV;
10864 goto err_unlock;
10865 }
10866 }
10867
10868 if (!exclusive_event_installable(event, ctx)) {
10869 err = -EBUSY;
10870 goto err_unlock;
10871 }
10872
10873 perf_install_in_context(ctx, event, cpu);
10874 perf_unpin_context(ctx);
10875 mutex_unlock(&ctx->mutex);
10876
10877 return event;
10878
10879err_unlock:
10880 mutex_unlock(&ctx->mutex);
10881 perf_unpin_context(ctx);
10882 put_ctx(ctx);
10883err_free:
10884 free_event(event);
10885err:
10886 return ERR_PTR(err);
10887}
10888EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
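
/*
 * In-kernel usage sketch (illustrative only): a caller could create a
 * per-task software counter like so:
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_SOFTWARE,
 *		.config	= PERF_COUNT_SW_TASK_CLOCK,
 *		.size	= sizeof(attr),
 *	};
 *	struct perf_event *event;
 *
 *	event = perf_event_create_kernel_counter(&attr, -1, task,
 *						 NULL, NULL);
 *	if (IS_ERR(event))
 *		return PTR_ERR(event);
 *
 * The value is read with perf_event_read_value() and the event released
 * with perf_event_release_kernel().
 */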
10889
10890void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
10891{
10892 struct perf_event_context *src_ctx;
10893 struct perf_event_context *dst_ctx;
10894 struct perf_event *event, *tmp;
10895 LIST_HEAD(events);
10896
10897 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
10898 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
10899
10900 /*
10901 * See perf_event_ctx_lock() for comments on the details
10902 * of swizzling perf_event::ctx.
10903 */
10904 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
10905 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
10906 event_entry) {
10907 perf_remove_from_context(event, 0);
10908 unaccount_event_cpu(event, src_cpu);
10909 put_ctx(src_ctx);
10910 list_add(&event->migrate_entry, &events);
10911 }
10912
10913 /*
10914 * Wait for the events to quiesce before re-instating them.
10915 */
10916 synchronize_rcu();
10917
10918 /*
10919 * Re-instate events in 2 passes.
10920 *
10921 * Skip over group leaders and only install siblings on this first
10922 * pass, siblings will not get enabled without a leader, however a
10923 * leader will enable its siblings, even if those are still on the old
10924 * dst_ctx.
10925 */
10926 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10927 if (event->group_leader == event)
10928 continue;
10929
10930 list_del(&event->migrate_entry);
10931 if (event->state >= PERF_EVENT_STATE_OFF)
10932 event->state = PERF_EVENT_STATE_INACTIVE;
10933 account_event_cpu(event, dst_cpu);
10934 perf_install_in_context(dst_ctx, event, dst_cpu);
10935 get_ctx(dst_ctx);
10936 }
10937
10938 /*
10939 * Once all the siblings are setup properly, install the group leaders
10940 * to make it go.
10941 */
10942 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10943 list_del(&event->migrate_entry);
10944 if (event->state >= PERF_EVENT_STATE_OFF)
10945 event->state = PERF_EVENT_STATE_INACTIVE;
10946 account_event_cpu(event, dst_cpu);
10947 perf_install_in_context(dst_ctx, event, dst_cpu);
10948 get_ctx(dst_ctx);
10949 }
10950 mutex_unlock(&dst_ctx->mutex);
10951 mutex_unlock(&src_ctx->mutex);
10952}
10953EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
10954
10955static void sync_child_event(struct perf_event *child_event,
10956 struct task_struct *child)
10957{
10958 struct perf_event *parent_event = child_event->parent;
10959 u64 child_val;
10960
10961 if (child_event->attr.inherit_stat)
10962 perf_event_read_event(child_event, child);
10963
10964 child_val = perf_event_count(child_event);
10965
10966 /*
10967 * Add back the child's count to the parent's count:
10968 */
10969 atomic64_add(child_val, &parent_event->child_count);
10970 atomic64_add(child_event->total_time_enabled,
10971 &parent_event->child_total_time_enabled);
10972 atomic64_add(child_event->total_time_running,
10973 &parent_event->child_total_time_running);
10974}
10975
10976static void
10977perf_event_exit_event(struct perf_event *child_event,
10978 struct perf_event_context *child_ctx,
10979 struct task_struct *child)
10980{
10981 struct perf_event *parent_event = child_event->parent;
10982
10983 /*
10984 * Do not destroy the 'original' grouping; because of the context
10985 * switch optimization the original events could've ended up in a
10986 * random child task.
10987 *
10988 * If we were to destroy the original group, all group related
10989 * operations would cease to function properly after this random
10990 * child dies.
10991 *
10992 * Do destroy all inherited groups, we don't care about those
10993 * and being thorough is better.
10994 */
10995 raw_spin_lock_irq(&child_ctx->lock);
10996 WARN_ON_ONCE(child_ctx->is_active);
10997
10998 if (parent_event)
10999 perf_group_detach(child_event);
11000 list_del_event(child_event, child_ctx);
11001 perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT);
11002 raw_spin_unlock_irq(&child_ctx->lock);
11003
11004 /*
11005 * Parent events are governed by their filedesc, retain them.
11006 */
11007 if (!parent_event) {
11008 perf_event_wakeup(child_event);
11009 return;
11010 }
11011
11012 /*
11013 * Child events can be cleaned up.
11014 */
11015 sync_child_event(child_event, child);
11016
11017 /*
11018 * Remove this event from the parent's list
11019 */
11020 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
11021 mutex_lock(&parent_event->child_mutex);
11022 list_del_init(&child_event->child_list);
11023 mutex_unlock(&parent_event->child_mutex);
11024
11025 /*
11026 * Kick perf_poll() for is_event_hup().
11027 */
11028 perf_event_wakeup(parent_event);
11029 free_event(child_event);
11030 put_event(parent_event);
11031}
11032
11033static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
11034{
11035 struct perf_event_context *child_ctx, *clone_ctx = NULL;
11036 struct perf_event *child_event, *next;
11037
11038 WARN_ON_ONCE(child != current);
11039
11040 child_ctx = perf_pin_task_context(child, ctxn);
11041 if (!child_ctx)
11042 return;
11043
11044 /*
11045 * In order to reduce the amount of tricky in ctx tear-down, we hold
11046 * ctx::mutex over the entire thing. This serializes against almost
11047 * everything that wants to access the ctx.
11048 *
11049 * The exception is sys_perf_event_open() /
11050 * perf_event_create_kernel_counter() which does find_get_context()
11051 * without ctx::mutex (it cannot because of the move_group double
11052 * mutex lock).
11053 */
11054 mutex_lock(&child_ctx->mutex);
11055
11056 /*
11057 * In a single ctx::lock section, de-schedule the events and detach the
11058 * context from the task such that we cannot ever get it scheduled back
11059 * in.
11060 */
11061 raw_spin_lock_irq(&child_ctx->lock);
11062 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
11063
11064 /*
11065 * Now that the context is inactive, destroy the task <-> ctx relation
11066 * and mark the context dead.
11067 */
11068 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
11069 put_ctx(child_ctx);
11070 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
11071 put_task_struct(current);
11072
11073 clone_ctx = unclone_ctx(child_ctx);
11074 raw_spin_unlock_irq(&child_ctx->lock);
11075
11076 if (clone_ctx)
11077 put_ctx(clone_ctx);
11078
11079 /*
11080 * Report the task dead after unscheduling the events so that we
11081 * won't get any samples after PERF_RECORD_EXIT. We can however
11082 * still get a few samples from before we remove the events.
11083 */
11084 perf_event_task(child, child_ctx, 0);
11085
11086 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
11087 perf_event_exit_event(child_event, child_ctx, child);
11088
11089 mutex_unlock(&child_ctx->mutex);
11090
11091 put_ctx(child_ctx);
11092}
11093
11094/*
11095 * When a child task exits, feed back event values to parent events.
11096 *
11097 * Can be called with cred_guard_mutex held when called from
11098 * install_exec_creds().
11099 */
11100void perf_event_exit_task(struct task_struct *child)
11101{
11102 struct perf_event *event, *tmp;
11103 int ctxn;
11104
11105 mutex_lock(&child->perf_event_mutex);
11106 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
11107 owner_entry) {
11108 list_del_init(&event->owner_entry);
11109
11110 /*
11111 * Ensure the list deletion is visible before we clear
11112 * the owner, closes a race against perf_release() where
11113 * we need to serialize on the owner->perf_event_mutex.
11114 */
11115 smp_store_release(&event->owner, NULL);
11116 }
11117 mutex_unlock(&child->perf_event_mutex);
11118
11119 for_each_task_context_nr(ctxn)
11120 perf_event_exit_task_context(child, ctxn);
11121
11122 /*
11123 * The perf_event_exit_task_context calls perf_event_task
11124 * with child's task_ctx, which generates EXIT events for
11125 * child contexts and sets child->perf_event_ctxp[] to NULL.
11126 * At this point we need to send EXIT events to cpu contexts.
11127 */
11128 perf_event_task(child, NULL, 0);
11129}
11130
11131static void perf_free_event(struct perf_event *event,
11132 struct perf_event_context *ctx)
11133{
11134 struct perf_event *parent = event->parent;
11135
11136 if (WARN_ON_ONCE(!parent))
11137 return;
11138
11139 mutex_lock(&parent->child_mutex);
11140 list_del_init(&event->child_list);
11141 mutex_unlock(&parent->child_mutex);
11142
11143 put_event(parent);
11144
11145 raw_spin_lock_irq(&ctx->lock);
11146 perf_group_detach(event);
11147 list_del_event(event, ctx);
11148 raw_spin_unlock_irq(&ctx->lock);
11149 free_event(event);
11150}
11151
11152
11153/*
11154 * Free a context as created by inheritance by perf_event_init_task()
11155 * below, used by fork() in case of fail.
11156 * Even though the task has never lived, the context and events have been
11157 * exposed through the child_list, so we must take care tearing it all down.
11158 */
11159void perf_event_free_task(struct task_struct *task)
11160{
11161 struct perf_event_context *ctx;
11162 struct perf_event *event, *tmp;
11163 int ctxn;
11164
11165 for_each_task_context_nr(ctxn) {
11166 ctx = task->perf_event_ctxp[ctxn];
11167 if (!ctx)
11168 continue;
11169
11170 mutex_lock(&ctx->mutex);
11171 raw_spin_lock_irq(&ctx->lock);
11172
11173 /*
11174 * Destroy the task <-> ctx relation and mark the context dead.
11175 * This is important because even though the task hasn't been
11176 * exposed yet the context has been (through child_list).
11177 */
11178 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
11179 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
11180 put_task_struct(task);
11181 raw_spin_unlock_irq(&ctx->lock);
11182
11183 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
11184 perf_free_event(event, ctx);
11185
11186 mutex_unlock(&ctx->mutex);
11187 put_ctx(ctx);
11188 }
11189}
11190
11191void perf_event_delayed_put(struct task_struct *task)
11192{
11193 int ctxn;
11194
11195 for_each_task_context_nr(ctxn)
11196 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
11197}
11198
11199struct file *perf_event_get(unsigned int fd)
11200{
11201 struct file *file;
11202
11203 file = fget_raw(fd);
11204 if (!file)
11205 return ERR_PTR(-EBADF);
11206
11207 if (file->f_op != &perf_fops) {
11208 fput(file);
11209 return ERR_PTR(-EBADF);
11210 }
11211
11212 return file;
11213}
11214
11215const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
11216{
11217 if (!event)
11218 return ERR_PTR(-EINVAL);
11219
11220 return &event->attr;
11221}
11222
11223
11224/*
11225 * Inherit an event from parent task to child task.
11226 *
11227 * Returns:
11228 *  - valid pointer on success
11229 *  - NULL for orphaned events, IS_ERR() on error
11230 */
11231static struct perf_event *
11232inherit_event(struct perf_event *parent_event,
11233 struct task_struct *parent,
11234 struct perf_event_context *parent_ctx,
11235 struct task_struct *child,
11236 struct perf_event *group_leader,
11237 struct perf_event_context *child_ctx)
11238{
11239 enum perf_event_state parent_state = parent_event->state;
11240 struct perf_event *child_event;
11241 unsigned long flags;
11242
11243 /*
11244 * Instead of creating recursive hierarchies of events,
11245 * we link inherited events back to the original parent,
11246 * which has a filp for sure, which we use as the reference
11247 * count:
11248 */
11249 if (parent_event->parent)
11250 parent_event = parent_event->parent;
11251
11252 child_event = perf_event_alloc(&parent_event->attr,
11253 parent_event->cpu,
11254 child,
11255 group_leader, parent_event,
11256 NULL, NULL, -1);
11257 if (IS_ERR(child_event))
11258 return child_event;
11259
11260
11261 if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
11262 !child_ctx->task_ctx_data) {
11263 struct pmu *pmu = child_event->pmu;
11264
11265 child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
11266 GFP_KERNEL);
11267 if (!child_ctx->task_ctx_data) {
11268 free_event(child_event);
11269 return NULL;
11270 }
11271 }
11272
11273 /*
11274 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
11275 * must both be under the same lock, to serialize against
11276 * perf_event_release_kernel(), such that either we must observe
11277 * is_orphaned_event() or they will observe us on the child_list.
11278 */
11279 mutex_lock(&parent_event->child_mutex);
11280 if (is_orphaned_event(parent_event) ||
11281 !atomic_long_inc_not_zero(&parent_event->refcount)) {
11282 mutex_unlock(&parent_event->child_mutex);
11283 /* task_ctx_data is freed with child_ctx */
11284 free_event(child_event);
11285 return NULL;
11286 }
11287
11288 get_ctx(child_ctx);
11289
11290 /*
11291 * Make the child state follow the state of the parent event,
11292 * not its attr.disabled bit. We hold the parent's mutex,
11293 * so we won't race with perf_event_{en, dis}able_family.
11294 */
11295 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
11296 child_event->state = PERF_EVENT_STATE_INACTIVE;
11297 else
11298 child_event->state = PERF_EVENT_STATE_OFF;
11299
11300 if (parent_event->attr.freq) {
11301 u64 sample_period = parent_event->hw.sample_period;
11302 struct hw_perf_event *hwc = &child_event->hw;
11303
11304 hwc->sample_period = sample_period;
11305 hwc->last_period = sample_period;
11306
11307 local64_set(&hwc->period_left, sample_period);
11308 }
11309
11310 child_event->ctx = child_ctx;
11311 child_event->overflow_handler = parent_event->overflow_handler;
11312 child_event->overflow_handler_context
11313 = parent_event->overflow_handler_context;
11314
11315
11316
11317 /* Precalculate sample_data sizes */
11318 perf_event__header_size(child_event);
11319 perf_event__id_header_size(child_event);
11320
11321 /*
11322 * Link it up in the child's context:
11323 */
11324 raw_spin_lock_irqsave(&child_ctx->lock, flags);
11325 add_event_to_ctx(child_event, child_ctx);
11326 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
11327
11328 /*
11329 * Link this into the parent event's child list
11330 */
11331 list_add_tail(&child_event->child_list, &parent_event->child_list);
11332 mutex_unlock(&parent_event->child_mutex);
11333
11334 return child_event;
11335}
11336
11337
11338/*
11339 * Inherits an event group.
11340 *
11341 * This will quietly suppress orphaned events; !inherit_event() is not
11342 * an error, it simply means the parent event has already gone away.
11343 *
11344 * Returns:
11345 *  - 0 on success, <0 on error
11346 */
11347static int inherit_group(struct perf_event *parent_event,
11348 struct task_struct *parent,
11349 struct perf_event_context *parent_ctx,
11350 struct task_struct *child,
11351 struct perf_event_context *child_ctx)
11352{
11353 struct perf_event *leader;
11354 struct perf_event *sub;
11355 struct perf_event *child_ctr;
11356
11357 leader = inherit_event(parent_event, parent, parent_ctx,
11358 child, NULL, child_ctx);
11359 if (IS_ERR(leader))
11360 return PTR_ERR(leader);
11361
11362
11363
11364
11365
11366 for_each_sibling_event(sub, parent_event) {
11367 child_ctr = inherit_event(sub, parent, parent_ctx,
11368 child, leader, child_ctx);
11369 if (IS_ERR(child_ctr))
11370 return PTR_ERR(child_ctr);
11371 }
11372 return 0;
11373}
11374
11375
11376/*
11377 * Creates the child task context (if needed) and tries to inherit the
11378 * event group.
11379 *
11380 * Clears @inherited_all on !attr.inherit or error. Note that we'll
11381 * leave @inherited_all untouched on a successful !attr.inherit event,
11382 * so the child context is then not marked as a clone of the parent.
11383 *
11384 * Returns 0 on success, <0 on error.
11385 */
11386static int
11387inherit_task_group(struct perf_event *event, struct task_struct *parent,
11388 struct perf_event_context *parent_ctx,
11389 struct task_struct *child, int ctxn,
11390 int *inherited_all)
11391{
11392 int ret;
11393 struct perf_event_context *child_ctx;
11394
11395 if (!event->attr.inherit) {
11396 *inherited_all = 0;
11397 return 0;
11398 }
11399
11400 child_ctx = child->perf_event_ctxp[ctxn];
11401 if (!child_ctx) {
11402 /*
11403 * This is executed from the parent task context, so
11404 * inherit events that have been marked for cloning.
11405 * First allocate and initialize a context for the
11406 * child.
11407 */
11408 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
11409 if (!child_ctx)
11410 return -ENOMEM;
11411
11412 child->perf_event_ctxp[ctxn] = child_ctx;
11413 }
11414
11415 ret = inherit_group(event, parent, parent_ctx,
11416 child, child_ctx);
11417
11418 if (ret)
11419 *inherited_all = 0;
11420
11421 return ret;
11422}
11423
11424/*
11425 * Initialize the perf_event context of one context type (ctxn) in task_struct
11426 */
11427static int perf_event_init_context(struct task_struct *child, int ctxn)
11428{
11429 struct perf_event_context *child_ctx, *parent_ctx;
11430 struct perf_event_context *cloned_ctx;
11431 struct perf_event *event;
11432 struct task_struct *parent = current;
11433 int inherited_all = 1;
11434 unsigned long flags;
11435 int ret = 0;
11436
11437 if (likely(!parent->perf_event_ctxp[ctxn]))
11438 return 0;
11439
11440 /*
11441 * If the parent's context is a clone, pin it so it won't get
11442 * swapped under us.
11443 */
11444 parent_ctx = perf_pin_task_context(parent, ctxn);
11445 if (!parent_ctx)
11446 return 0;
11447
11448 /*
11449 * No need to check if parent_ctx != NULL here; since we saw
11450 * it non-NULL earlier, the only reason for it to become NULL
11451 * is if we exit, and since we're currently in the middle of
11452 * a fork we can't be exiting at the same time.
11453 */
11454
11455 /*
11456 * Lock the parent list. No need to lock the child - not PID
11457 * hashed yet and not running, so nobody can access it.
11458 */
11459 mutex_lock(&parent_ctx->mutex);
11460
11461 /*
11462 * We don't have to disable NMIs - we are only looking at
11463 * the list, not manipulating it:
11464 */
11465 perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
11466 ret = inherit_task_group(event, parent, parent_ctx,
11467 child, ctxn, &inherited_all);
11468 if (ret)
11469 goto out_unlock;
11470 }
11471
11472 /*
11473 * We can't hold ctx->lock when iterating the ->flexible_groups list due
11474 * to allocations, but we need to prevent rotation because
11475 * rotate_ctx() will change the list from interrupt context.
11476 */
11477 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11478 parent_ctx->rotate_disable = 1;
11479 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11480
11481 perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
11482 ret = inherit_task_group(event, parent, parent_ctx,
11483 child, ctxn, &inherited_all);
11484 if (ret)
11485 goto out_unlock;
11486 }
11487
11488 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11489 parent_ctx->rotate_disable = 0;
11490
11491 child_ctx = child->perf_event_ctxp[ctxn];
11492
11493 if (child_ctx && inherited_all) {
11494
11495 /*
11496 * Mark the child context as a clone of the parent
11497 * context, or of whatever the parent is a clone of.
11498 * Note that if the parent is a clone, the holding of
11499 * parent_ctx->lock avoids it from being uncloned.
11500 */
11501 cloned_ctx = parent_ctx->parent_ctx;
11502 if (cloned_ctx) {
11503 child_ctx->parent_ctx = cloned_ctx;
11504 child_ctx->parent_gen = parent_ctx->parent_gen;
11505 } else {
11506 child_ctx->parent_ctx = parent_ctx;
11507 child_ctx->parent_gen = parent_ctx->generation;
11508 }
11509 get_ctx(child_ctx->parent_ctx);
11510 }
11511
11512 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11513out_unlock:
11514 mutex_unlock(&parent_ctx->mutex);
11515
11516 perf_unpin_context(parent_ctx);
11517 put_ctx(parent_ctx);
11518
11519 return ret;
11520}
11521
11522/*
11523 * Initialize the perf_event context in task_struct
11524 */
11525int perf_event_init_task(struct task_struct *child)
11526{
11527 int ctxn, ret;
11528
11529 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
11530 mutex_init(&child->perf_event_mutex);
11531 INIT_LIST_HEAD(&child->perf_event_list);
11532
11533 for_each_task_context_nr(ctxn) {
11534 ret = perf_event_init_context(child, ctxn);
11535 if (ret) {
11536 perf_event_free_task(child);
11537 return ret;
11538 }
11539 }
11540
11541 return 0;
11542}
11543
11544static void __init perf_event_init_all_cpus(void)
11545{
11546 struct swevent_htable *swhash;
11547 int cpu;
11548
11549 zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
11550
11551 for_each_possible_cpu(cpu) {
11552 swhash = &per_cpu(swevent_htable, cpu);
11553 mutex_init(&swhash->hlist_mutex);
11554 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
11555
11556 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
11557 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
11558
11559#ifdef CONFIG_CGROUP_PERF
11560 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
11561#endif
11562 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
11563 }
11564}
11565
11566void perf_swevent_init_cpu(unsigned int cpu)
11567{
11568 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
11569
11570 mutex_lock(&swhash->hlist_mutex);
11571 if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
11572 struct swevent_hlist *hlist;
11573
11574 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
11575 WARN_ON(!hlist);
11576 rcu_assign_pointer(swhash->swevent_hlist, hlist);
11577 }
11578 mutex_unlock(&swhash->hlist_mutex);
11579}
11580
11581#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
11582static void __perf_event_exit_context(void *__info)
11583{
11584 struct perf_event_context *ctx = __info;
11585 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
11586 struct perf_event *event;
11587
11588 raw_spin_lock(&ctx->lock);
11589 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
11590 list_for_each_entry(event, &ctx->event_list, event_entry)
11591 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
11592 raw_spin_unlock(&ctx->lock);
11593}
11594
11595static void perf_event_exit_cpu_context(int cpu)
11596{
11597 struct perf_cpu_context *cpuctx;
11598 struct perf_event_context *ctx;
11599 struct pmu *pmu;
11600
11601 mutex_lock(&pmus_lock);
11602 list_for_each_entry(pmu, &pmus, entry) {
11603 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11604 ctx = &cpuctx->ctx;
11605
11606 mutex_lock(&ctx->mutex);
11607 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
11608 cpuctx->online = 0;
11609 mutex_unlock(&ctx->mutex);
11610 }
11611 cpumask_clear_cpu(cpu, perf_online_mask);
11612 mutex_unlock(&pmus_lock);
11613}
11614#else
11615
11616static void perf_event_exit_cpu_context(int cpu) { }
11617
11618#endif
11619
11620int perf_event_init_cpu(unsigned int cpu)
11621{
11622 struct perf_cpu_context *cpuctx;
11623 struct perf_event_context *ctx;
11624 struct pmu *pmu;
11625
11626 perf_swevent_init_cpu(cpu);
11627
11628 mutex_lock(&pmus_lock);
11629 cpumask_set_cpu(cpu, perf_online_mask);
11630 list_for_each_entry(pmu, &pmus, entry) {
11631 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11632 ctx = &cpuctx->ctx;
11633
11634 mutex_lock(&ctx->mutex);
11635 cpuctx->online = 1;
11636 mutex_unlock(&ctx->mutex);
11637 }
11638 mutex_unlock(&pmus_lock);
11639
11640 return 0;
11641}
11642
11643int perf_event_exit_cpu(unsigned int cpu)
11644{
11645 perf_event_exit_cpu_context(cpu);
11646 return 0;
11647}
11648
11649static int
11650perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
11651{
11652 int cpu;
11653
11654 for_each_online_cpu(cpu)
11655 perf_event_exit_cpu(cpu);
11656
11657 return NOTIFY_OK;
11658}
11659
11660/*
11661 * Run the perf reboot notifier at the very last possible moment so that
11662 * the generic watchdog code runs as long as possible.
11663 */
11664static struct notifier_block perf_reboot_notifier = {
11665 .notifier_call = perf_reboot,
11666 .priority = INT_MIN,
11667};
11668
11669void __init perf_event_init(void)
11670{
11671 int ret;
11672
11673 idr_init(&pmu_idr);
11674
11675 perf_event_init_all_cpus();
11676 init_srcu_struct(&pmus_srcu);
11677 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
11678 perf_pmu_register(&perf_cpu_clock, NULL, -1);
11679 perf_pmu_register(&perf_task_clock, NULL, -1);
11680 perf_tp_register();
11681 perf_event_init_cpu(smp_processor_id());
11682 register_reboot_notifier(&perf_reboot_notifier);
11683
11684 ret = init_hw_breakpoint();
11685 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
11686
11687 /*
11688 * Build time assertion that we keep the data_head at the intended
11689 * location. IOW, validation we got the __reserved[] size right.
11690 */
11691 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
11692 != 1024);
11693}
11694
11695ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
11696 char *page)
11697{
11698 struct perf_pmu_events_attr *pmu_attr =
11699 container_of(attr, struct perf_pmu_events_attr, attr);
11700
11701 if (pmu_attr->event_str)
11702 return sprintf(page, "%s\n", pmu_attr->event_str);
11703
11704 return 0;
11705}
11706EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
11707
11708static int __init perf_event_sysfs_init(void)
11709{
11710 struct pmu *pmu;
11711 int ret;
11712
11713 mutex_lock(&pmus_lock);
11714
11715 ret = bus_register(&pmu_bus);
11716 if (ret)
11717 goto unlock;
11718
11719 list_for_each_entry(pmu, &pmus, entry) {
11720 if (!pmu->name || pmu->type < 0)
11721 continue;
11722
11723 ret = pmu_dev_alloc(pmu);
11724 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
11725 }
11726 pmu_bus_running = 1;
11727 ret = 0;
11728
11729unlock:
11730 mutex_unlock(&pmus_lock);
11731
11732 return ret;
11733}
11734device_initcall(perf_event_sysfs_init);
11735
11736#ifdef CONFIG_CGROUP_PERF
11737static struct cgroup_subsys_state *
11738perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
11739{
11740 struct perf_cgroup *jc;
11741
11742 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
11743 if (!jc)
11744 return ERR_PTR(-ENOMEM);
11745
11746 jc->info = alloc_percpu(struct perf_cgroup_info);
11747 if (!jc->info) {
11748 kfree(jc);
11749 return ERR_PTR(-ENOMEM);
11750 }
11751
11752 return &jc->css;
11753}
11754
11755static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
11756{
11757 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
11758
11759 free_percpu(jc->info);
11760 kfree(jc);
11761}
11762
11763static int __perf_cgroup_move(void *info)
11764{
11765 struct task_struct *task = info;
11766 rcu_read_lock();
11767 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
11768 rcu_read_unlock();
11769 return 0;
11770}
11771
11772static void perf_cgroup_attach(struct cgroup_taskset *tset)
11773{
11774 struct task_struct *task;
11775 struct cgroup_subsys_state *css;
11776
11777 cgroup_taskset_for_each(task, css, tset)
11778 task_function_call(task, __perf_cgroup_move, task);
11779}
11780
11781struct cgroup_subsys perf_event_cgrp_subsys = {
11782 .css_alloc = perf_cgroup_css_alloc,
11783 .css_free = perf_cgroup_css_free,
11784 .attach = perf_cgroup_attach,
11785
11786 /*
11787 * Implicitly enable on the default hierarchy so that perf events
11788 * can always be filtered by cgroup2 path.
11789 */
11790 .implicit_on_dfl = true,
11791 .threaded = true,
11792};
11793#endif
11794