/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>

#include "internal.h"

#include <asm/irq_regs.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

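/*
 * IPI callback: run tfc->func on the current CPU, but only when the
 * target task (if any) is in fact running here as current.
 */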
static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		/* -EAGAIN */
		if (task_cpu(p) != smp_processor_id())
			return;

		/*
		 * Now that we're on right CPU with IRQs disabled, we can test
		 * if we hit the right task without races.
		 */

		tfc->ret = -ESRCH; /* No such (running) process */
		if (p != current)
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.
 *
 * returns: @func return value, or
 *	    -ESRCH  - when the process isn't running
 *	    -EAGAIN - when the process moved away
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,
	};
	int ret;

	do {
		ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
		if (!ret)
			ret = data.ret;
	} while (ret == -EAGAIN);

	return ret;
}

/**
 * cpu_function_call - call a function on a specific cpu
 * @cpu:	target cpu to queue this function on
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

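/*
 * Lock the CPU context and, when given, the task context; the cpuctx
 * lock always nests outside the task ctx lock.
 */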
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx)
		raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		raw_spin_unlock(&ctx->lock);
	raw_spin_unlock(&cpuctx->ctx.lock);
}

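/*
 * ctx->task is set to TASK_TOMBSTONE once the owning task has exited;
 * events owned by the kernel mark event->owner the same way.
 */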
#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */

typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
			struct perf_event_context *, void *);

struct event_function_struct {
	struct perf_event *event;
	event_f func;
	void *data;
};

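/*
 * Cross-CPU call target: with IRQs disabled and both contexts locked,
 * check that the event's context is (still) active on this CPU and, if
 * so, run efs->func on it.
 */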
static int event_function(void *info)
{
	struct event_function_struct *efs = info;
	struct perf_event *event = efs->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	int ret = 0;

	WARN_ON_ONCE(!irqs_disabled());

	perf_ctx_lock(cpuctx, task_ctx);
	/*
	 * Since we do the IPI call without holding ctx->lock things can have
	 * changed, double check we hit the task we set out to hit.
	 */
	if (ctx->task) {
		if (ctx->task != current) {
			ret = -ESRCH;
			goto unlock;
		}

		/*
		 * We only use event_function_call() on established contexts,
		 * and event_function() is only ever called when active (or
		 * rather, we'll have bailed in task_function_call() or the
		 * above ctx->task != current test), therefore we must have
		 * ctx->is_active here.
		 */
		WARN_ON_ONCE(!ctx->is_active);
		/*
		 * And since we have ctx->is_active, cpuctx->task_ctx must
		 * match.
		 */
		WARN_ON_ONCE(task_ctx != ctx);
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	efs->func(event, cpuctx, ctx, efs->data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}

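/*
 * Like event_function_call(), but for callers that are already on the
 * event's CPU with IRQs disabled: invoke event_function() directly.
 */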
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
	struct event_function_struct efs = {
		.event = event,
		.func = func,
		.data = data,
	};

	int ret = event_function(&efs);
	WARN_ON_ONCE(ret);
}

static void event_function_call(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
	struct event_function_struct efs = {
		.event = event,
		.func = func,
		.data = data,
	};

	if (!event->parent) {
		/*
		 * If this is a !child event, we must hold ctx::mutex to
		 * stabilize the event->ctx relation. See
		 * perf_event_ctx_lock().
		 */
		lockdep_assert_held(&ctx->mutex);
	}

	if (!task) {
		cpu_function_call(event->cpu, event_function, &efs);
		return;
	}

	if (task == TASK_TOMBSTONE)
		return;

again:
	if (!task_function_call(task, event_function, &efs))
		return;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * Reload the task pointer, it might have been changed by
	 * a concurrent perf_event_context_sched_out().
	 */
	task = ctx->task;
	if (task == TASK_TOMBSTONE) {
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	if (ctx->is_active) {
		raw_spin_unlock_irq(&ctx->lock);
		goto again;
	}
	func(event, NULL, ctx, data);
	raw_spin_unlock_irq(&ctx->lock);
}

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
		       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
	(PERF_SAMPLE_BRANCH_KERNEL |\
	 PERF_SAMPLE_BRANCH_HV)

enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_TIME = 0x4,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */

static void perf_sched_delayed(struct work_struct *work);
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;

static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;

static int perf_sample_allowed_ns __read_mostly =
	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

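/*
 * Recompute perf_sample_allowed_ns as sysctl_perf_cpu_time_max_percent
 * percent of the current sample period.
 */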
static void update_perf_cpu_limits(void)
{
	u64 tmp = perf_sample_period_ns;

	tmp *= sysctl_perf_cpu_time_max_percent;
	tmp = div_u64(tmp, 100);
	if (!tmp)
		tmp = 1;

	WRITE_ONCE(perf_sample_allowed_ns, tmp);
}

static int perf_rotate_context(struct perf_cpu_context *cpuctx);

int perf_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
	update_perf_cpu_limits();

	return 0;
}

int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp,
				loff_t *ppos)
{
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	if (sysctl_perf_cpu_time_max_percent == 100 ||
	    sysctl_perf_cpu_time_max_percent == 0) {
		printk(KERN_WARNING
		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
		WRITE_ONCE(perf_sample_allowed_ns, 0);
	} else {
		update_perf_cpu_limits();
	}

	return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);

static u64 __report_avg;
static u64 __report_allowed;

static void perf_duration_warn(struct irq_work *w)
{
	printk_ratelimited(KERN_WARNING
		"perf: interrupt took too long (%lld > %lld), lowering "
		"kernel.perf_event_max_sample_rate to %d\n",
		__report_avg, __report_allowed,
		sysctl_perf_event_sample_rate);
}

static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

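/*
 * Fold this sample's duration into a per-CPU decaying average; once the
 * average exceeds the allowed budget, lower the maximum sample rate.
 */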
void perf_sample_event_took(u64 sample_len_ns)
{
	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
	u64 running_len;
	u64 avg_len;
	u32 max;

	if (max_len == 0)
		return;

	/* Decay the counter by 1 average sample. */
	running_len = __this_cpu_read(running_sample_length);
	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
	running_len += sample_len_ns;
	__this_cpu_write(running_sample_length, running_len);

	/*
	 * Note: the running average is biased low until we have seen
	 * NR_ACCUMULATED_SAMPLES samples; the racy per-CPU read/modify/
	 * write is fine since this runs with IRQs disabled.
	 */
	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
	if (avg_len <= max_len)
		return;

	__report_avg = avg_len;
	__report_allowed = max_len;

	/*
	 * Compute a throttle threshold 25% below the current duration.
	 */
	avg_len += avg_len / 4;
	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
	if (avg_len < max)
		max /= (u32)avg_len;
	else
		max = 1;

	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
	WRITE_ONCE(max_samples_per_tick, max);

	sysctl_perf_event_sample_rate = max * HZ;
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

	if (!irq_work_queue(&perf_duration_work)) {
		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
			     "kernel.perf_event_max_sample_rate to %d\n",
			     __report_avg, __report_allowed,
			     sysctl_perf_event_sample_rate);
	}
}

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type,
			     struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void) { }

extern __weak const char *perf_pmu_name(void)
{
	return "pmu";
}

static inline u64 perf_clock(void)
{
	return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
	return event->clock();
}

#ifdef CONFIG_CGROUP_PERF

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/* @event doesn't care about cgroup */
	if (!event->cgrp)
		return true;

	/* wants specific cgroup scope but @cpuctx isn't associated with any */
	if (!cpuctx->cgrp)
		return false;

	/*
	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
	 * also enabled for all its descendant cgroups.  If @cpuctx's
	 * cgroup is a descendant of @event's (the test covers identity
	 * case), it's a match.
	 */
	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
				    event->cgrp->css.cgroup);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
	css_put(&event->cgrp->css);
	event->cgrp = NULL;
}

static inline int is_cgroup_event(struct perf_event *event)
{
	return event->cgrp != NULL;
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	struct perf_cgroup_info *t;

	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	return t->time;
}

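/*
 * Fold the time elapsed since the last update into the cgroup's
 * accumulated time and reset its timestamp to "now".
 */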
static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
	struct perf_cgroup_info *info;
	u64 now;

	now = perf_clock();

	info = this_cpu_ptr(cgrp->info);

	info->time += now - info->timestamp;
	info->timestamp = now;
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
	if (cgrp_out)
		__update_cgrp_time(cgrp_out);
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
	struct perf_cgroup *cgrp;

	/*
	 * ensure we access cgroup data only when needed and
	 * when we know the cgroup is pinned (css_get)
	 */
	if (!is_cgroup_event(event))
		return;

	cgrp = perf_cgroup_from_task(current, event->ctx);
	/*
	 * Do not update time when cgroup is not active
	 */
	if (cgrp == event->cgrp)
		__update_cgrp_time(event->cgrp);
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
	struct perf_cgroup *cgrp;
	struct perf_cgroup_info *info;

	/*
	 * ctx->lock held by caller
	 * ensure we do not access cgroup data
	 * unless we have the cgroup pinned (css_get)
	 */
	if (!task || !ctx->nr_cgroups)
		return;

	cgrp = perf_cgroup_from_task(task, ctx);
	info = this_cpu_ptr(cgrp->info);
	info->timestamp = ctx->timestamp;
}

#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN : schedule in based on cgroup for next
 */
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
	struct perf_cpu_context *cpuctx;
	struct pmu *pmu;
	unsigned long flags;

	/*
	 * disable interrupts to avoid getting nr_cgroup
	 * changes via __perf_event_task_sched_out()
	 */
	local_irq_save(flags);

	/*
	 * we reschedule only in the presence of cgroup
	 * constrained events.
	 */

	list_for_each_entry_rcu(pmu, &pmus, entry) {
		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
		if (cpuctx->unique_pmu != pmu)
			continue; /* ensure we process each cpuctx once */

		/*
		 * perf_cgroup_events says at least one
		 * context on this CPU has cgroup events.
		 *
		 * ctx->nr_cgroups reports the number of cgroup
		 * events for a context.
		 */
		if (cpuctx->ctx.nr_cgroups > 0) {
			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
			perf_pmu_disable(cpuctx->ctx.pmu);

			if (mode & PERF_CGROUP_SWOUT) {
				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
				/*
				 * must not be done before ctxswout due
				 * to event_filter_match() in event_sched_out()
				 */
				cpuctx->cgrp = NULL;
			}

			if (mode & PERF_CGROUP_SWIN) {
				WARN_ON_ONCE(cpuctx->cgrp);
				/*
				 * set cgrp before ctxsw in to allow
				 * event_filter_match() to not have to pass
				 * task around.
				 * we pass the cpuctx->ctx to perf_cgroup_from_task()
				 * because cgroup events are CPU bound only
				 */
				cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
			}
			perf_pmu_enable(cpuctx->ctx.pmu);
			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
		}
	}

	local_irq_restore(flags);
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);
	cgrp2 = perf_cgroup_from_task(next, NULL);

	/*
	 * only schedule out current cgroup events if we know
	 * that we are switching to a different cgroup. Otherwise,
	 * do not touch the cgroup events.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);

	rcu_read_unlock();
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);
	cgrp2 = perf_cgroup_from_task(prev, NULL);

	/*
	 * only need to schedule in cgroup events if we are changing
	 * the cgroup during ctxsw. Cgroup events were not scheduled
	 * out of ctxsw out if that was not the case.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWIN);

	rcu_read_unlock();
}

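/*
 * Resolve @fd to a cgroup css and attach it to @event; the reference
 * taken here is dropped again via perf_detach_cgroup().
 */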
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	struct perf_cgroup *cgrp;
	struct cgroup_subsys_state *css;
	struct fd f = fdget(fd);
	int ret = 0;

	if (!f.file)
		return -EBADF;

	css = css_tryget_online_from_dir(f.file->f_path.dentry,
					 &perf_event_cgrp_subsys);
	if (IS_ERR(css)) {
		ret = PTR_ERR(css);
		goto out;
	}

	cgrp = container_of(css, struct perf_cgroup, css);
	event->cgrp = cgrp;

	/*
	 * all events in a group must monitor
	 * the same cgroup because a task belongs
	 * to only one perf cgroup at a time
	 */
	if (group_leader && group_leader->cgrp != cgrp) {
		perf_detach_cgroup(event);
		ret = -EINVAL;
	}
out:
	fdput(f);
	return ret;
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
	struct perf_cgroup_info *t;
	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	event->shadow_ctx_time = now - t->timestamp;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
	/*
	 * when the current task's perf cgroup does not match
	 * the event's, we need to remember to call the
	 * perf_mark_enable() function the first time a task with
	 * a matching perf cgroup is scheduled in.
	 */
	if (is_cgroup_event(event) && !perf_cgroup_match(event))
		event->cgrp_defer_enabled = 1;
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
			 struct perf_event_context *ctx)
{
	struct perf_event *sub;
	u64 tstamp = perf_event_time(event);

	if (!event->cgrp_defer_enabled)
		return;

	event->cgrp_defer_enabled = 0;

	event->tstamp_enabled = tstamp - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry) {
		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
			sub->cgrp_defer_enabled = 0;
		}
	}
}
#else

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

static inline int is_cgroup_event(struct perf_event *event)
{
	return 0;
}

static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
{
	return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
}

static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
}

void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	return 0;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
			 struct perf_event_context *ctx)
{
}
#endif

/*
 * set default to be dependent on timer tick just
 * like original code
 */
#define PERF_CPU_HRTIMER (1000 / HZ)

/*
 * function must be called with interrupts disabled
 */
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
	struct perf_cpu_context *cpuctx;
	int rotations = 0;

	WARN_ON(!irqs_disabled());

	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
	rotations = perf_rotate_context(cpuctx);

	raw_spin_lock(&cpuctx->hrtimer_lock);
	if (rotations)
		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
	else
		cpuctx->hrtimer_active = 0;
	raw_spin_unlock(&cpuctx->hrtimer_lock);

	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}

static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	u64 interval;

	/* no multiplexing needed for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return;

	/*
	 * check default is sane, if not set then force to
	 * default interval (1/tick)
	 */
	interval = pmu->hrtimer_interval_ms;
	if (interval < 1)
		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;

	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);

	raw_spin_lock_init(&cpuctx->hrtimer_lock);
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	timer->function = perf_mux_hrtimer_handler;
}

static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	unsigned long flags;

	/* not for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return 0;

	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
	if (!cpuctx->hrtimer_active) {
		cpuctx->hrtimer_active = 1;
		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
	}
	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);

	return 0;
}

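/*
 * perf_pmu_disable()/perf_pmu_enable() nest: the PMU is disabled on the
 * 0->1 transition of the per-CPU count and re-enabled on the 1->0 one.
 */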
void perf_pmu_disable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!(*count)++)
		pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!--(*count))
		pmu->pmu_enable(pmu);
}

static DEFINE_PER_CPU(struct list_head, active_ctx_list);

/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 * disabled, while perf_event_task_tick is called from IRQ context.
 */
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
	struct list_head *head = this_cpu_ptr(&active_ctx_list);

	WARN_ON(!irqs_disabled());

	WARN_ON(!list_empty(&ctx->active_ctx_list));

	list_add(&ctx->active_ctx_list, head);
}

static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
	WARN_ON(!irqs_disabled());

	WARN_ON(list_empty(&ctx->active_ctx_list));

	list_del_init(&ctx->active_ctx_list);
}

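/* Grab a reference; the context must still be alive (refcount != 0). */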
static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx->task_ctx_data);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task && ctx->task != TASK_TOMBSTONE)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group
 * and perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address; the sites that nest
 * perf_event_context::mutex take the parent and child mutexes at
 * distinct nesting levels.
 *
 * The places that change perf_event::ctx will issue:
 *
 *   perf_remove_from_context();
 *   synchronize_rcu();
 *   perf_install_in_context();
 *
 * to affect the change. The remove_from_context() + synchronize_rcu()
 * should quiesce the event, after which we can install it in the new
 * location.
 *
 * However; we use event->ctx based locking and therefore must hold the
 * mutex locked in order not to see the event move; hence the re-check
 * and retry loop below.
 */
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
	struct perf_event_context *ctx;

again:
	rcu_read_lock();
	ctx = ACCESS_ONCE(event->ctx);
	if (!atomic_inc_not_zero(&ctx->refcount)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	mutex_lock_nested(&ctx->mutex, nesting);
	if (event->ctx != ctx) {
		mutex_unlock(&ctx->mutex);
		put_ctx(ctx);
		goto again;
	}

	return ctx;
}

static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
	return perf_event_ctx_lock_nested(event, 0);
}

static void perf_event_ctx_unlock(struct perf_event *event,
				  struct perf_event_context *ctx)
{
	mutex_unlock(&ctx->mutex);
	put_ctx(ctx);
}

/*
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(), therefore we cannot call put_ctx() since that might
 * end up calling scheduler related locks and ctx->lock nests inside
 * those.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
	struct perf_event_context *parent_ctx = ctx->parent_ctx;

	lockdep_assert_held(&ctx->lock);

	if (parent_ctx)
		ctx->parent_ctx = NULL;
	ctx->generation++;

	return parent_ctx;
}

static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	return task_tgid_nr_ns(p, event->ns);
}

static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	return task_pid_nr_ns(p, event->ns);
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
	struct perf_event_context *ctx;

retry:
	/*
	 * One of the few rules of preemptible RCU is that one cannot do
	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
	 * part of the read side critical section was irqs-enabled -- see
	 * rcu_read_unlock_special().
	 *
	 * Since ctx->lock nests under rq->lock we must ensure the entire read
	 * side critical section must be irqs-disabled.
	 */
	local_irq_save(*flags);
	rcu_read_lock();
	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		raw_spin_lock(&ctx->lock);
		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
			raw_spin_unlock(&ctx->lock);
			rcu_read_unlock();
			local_irq_restore(*flags);
			goto retry;
		}

		if (ctx->task == TASK_TOMBSTONE ||
		    !atomic_inc_not_zero(&ctx->refcount)) {
			raw_spin_unlock(&ctx->lock);
			ctx = NULL;
		} else {
			WARN_ON_ONCE(ctx->task != task);
		}
	}
	rcu_read_unlock();
	if (!ctx)
		local_irq_restore(*flags);
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * ref count.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, ctxn, &flags);
	if (ctx) {
		++ctx->pin_count;
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

static u64 perf_event_time(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	if (is_cgroup_event(event))
		return perf_cgroup_event_time(event);

	return ctx ? ctx->time : 0;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 */
static void update_event_times(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	u64 run_end;

	lockdep_assert_held(&ctx->lock);

	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
		return;

	/*
	 * in cgroup mode, time_enabled represents
	 * the time the event was enabled AND active
	 * on a given cpu. It is the equivalent of
	 * wallclock time and NOT the sum of all enabled
	 * events.
	 *
	 * Instead of computing avg times per cpu, we
	 * compute total times across all cpus for the
	 * event to provide a uniform scaling.
	 */
	if (is_cgroup_event(event))
		run_end = perf_cgroup_event_time(event);
	else if (ctx->is_active)
		run_end = ctx->time;
	else
		run_end = event->tstamp_stopped;

	event->total_time_enabled = run_end - event->tstamp_enabled;

	if (event->state == PERF_EVENT_STATE_INACTIVE)
		run_end = event->tstamp_stopped;
	else
		run_end = perf_event_time(event);

	event->total_time_running = run_end - event->tstamp_running;

}

/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
	struct perf_event *event;

	update_event_times(leader);
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		update_event_times(event);
}

static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
	if (event->attr.pinned)
		return &ctx->pinned_groups;
	else
		return &ctx->flexible_groups;
}

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	lockdep_assert_held(&ctx->lock);

	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
	event->attach_state |= PERF_ATTACH_CONTEXT;

	/*
	 * If we're a stand alone event or group leader, we go to the context
	 * list, group events are kept attached to the group so that
	 * perf_group_detach can, at all times, locate all siblings.
	 */
	if (event->group_leader == event) {
		struct list_head *list;

		if (is_software_event(event))
			event->group_flags |= PERF_GROUP_SOFTWARE;

		list = ctx_group_list(event, ctx);
		list_add_tail(&event->group_entry, list);
	}

	if (is_cgroup_event(event))
		ctx->nr_cgroups++;

	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;

	ctx->generation++;
}

/*
 * Initialize event state based on the perf_event_attr::disabled.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
					      PERF_EVENT_STATE_INACTIVE;
}

static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_ID)
		entry += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_GROUP) {
		nr += nr_siblings;
		size += sizeof(u64);
	}

	size += entry * nr;
	event->read_size = size;
}

static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
	struct perf_sample_data *data;
	u16 size = 0;

	if (sample_type & PERF_SAMPLE_IP)
		size += sizeof(data->ip);

	if (sample_type & PERF_SAMPLE_ADDR)
		size += sizeof(data->addr);

	if (sample_type & PERF_SAMPLE_PERIOD)
		size += sizeof(data->period);

	if (sample_type & PERF_SAMPLE_WEIGHT)
		size += sizeof(data->weight);

	if (sample_type & PERF_SAMPLE_READ)
		size += event->read_size;

	if (sample_type & PERF_SAMPLE_DATA_SRC)
		size += sizeof(data->data_src.val);

	if (sample_type & PERF_SAMPLE_TRANSACTION)
		size += sizeof(data->txn);

	event->header_size = size;
}

/*
 * Called at perf_event creation and when events are attached/detached from a
 * group.
 */
static void perf_event__header_size(struct perf_event *event)
{
	__perf_event_read_size(event,
			       event->group_leader->nr_siblings);
	__perf_event_header_size(event, event->attr.sample_type);
}

static void perf_event__id_header_size(struct perf_event *event)
{
	struct perf_sample_data *data;
	u64 sample_type = event->attr.sample_type;
	u16 size = 0;

	if (sample_type & PERF_SAMPLE_TID)
		size += sizeof(data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		size += sizeof(data->time);

	if (sample_type & PERF_SAMPLE_IDENTIFIER)
		size += sizeof(data->id);

	if (sample_type & PERF_SAMPLE_ID)
		size += sizeof(data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		size += sizeof(data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		size += sizeof(data->cpu_entry);

	event->id_header_size = size;
}

static bool perf_event_validate_size(struct perf_event *event)
{
	/*
	 * The values computed below will be over-written when we actually
	 * attach the event.
	 */
	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
	perf_event__id_header_size(event);

	/*
	 * Sum the lot; should not exceed the 64k limit we have on records.
	 * Conservative limit to allow for callchains and other variable fields.
	 */
	if (event->read_size + event->header_size +
	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
		return false;

	return true;
}

static void perf_group_attach(struct perf_event *event)
{
	struct perf_event *group_leader = event->group_leader, *pos;

	/*
	 * We can have double attach due to group movement in perf_event_open.
	 */
	if (event->attach_state & PERF_ATTACH_GROUP)
		return;

	event->attach_state |= PERF_ATTACH_GROUP;

	if (group_leader == event)
		return;

	WARN_ON_ONCE(group_leader->ctx != event->ctx);

	if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
	    !is_software_event(event))
		group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;

	list_add_tail(&event->group_entry, &group_leader->sibling_list);
	group_leader->nr_siblings++;

	perf_event__header_size(group_leader);

	list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
		perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_cpu_context *cpuctx;

	WARN_ON_ONCE(event->ctx != ctx);
	lockdep_assert_held(&ctx->lock);

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
		return;

	event->attach_state &= ~PERF_ATTACH_CONTEXT;

	if (is_cgroup_event(event)) {
		ctx->nr_cgroups--;
		/*
		 * Because cgroup events are always per-cpu events, this will
		 * always be called from the right CPU.
		 */
		cpuctx = __get_cpu_context(ctx);
		/*
		 * If there are no more cgroup events then clear cgrp to avoid
		 * stale pointer in update_cgrp_time_from_cpuctx().
		 */
		if (!ctx->nr_cgroups)
			cpuctx->cgrp = NULL;
	}

	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_rcu(&event->event_entry);

	if (event->group_leader == event)
		list_del_init(&event->group_entry);

	update_group_times(event);

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF)
		event->state = PERF_EVENT_STATE_OFF;

	ctx->generation++;
}

static void perf_group_detach(struct perf_event *event)
{
	struct perf_event *sibling, *tmp;
	struct list_head *list = NULL;

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_GROUP))
		return;

	event->attach_state &= ~PERF_ATTACH_GROUP;

	/*
	 * If this is a sibling, remove it from its group.
	 */
	if (event->group_leader != event) {
		list_del_init(&event->group_entry);
		event->group_leader->nr_siblings--;
		goto out;
	}

	if (!list_empty(&event->group_entry))
		list = &event->group_entry;

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to whatever list we were on before the event was promoted
	 * to leader.
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
		if (list)
			list_move_tail(&sibling->group_entry, list);
		sibling->group_leader = sibling;

		/* Inherit group flags from the previous leader */
		sibling->group_flags = event->group_flags;

		WARN_ON_ONCE(sibling->ctx != event->ctx);
	}

out:
	perf_event__header_size(event->group_leader);

	list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
		perf_event__header_size(tmp);
}

static bool is_orphaned_event(struct perf_event *event)
{
	return event->state == PERF_EVENT_STATE_DEAD;
}

static inline int pmu_filter_match(struct perf_event *event)
{
	struct pmu *pmu = event->pmu;
	return pmu->filter_match ? pmu->filter_match(event) : 1;
}

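/*
 * An event is eligible to run here when it targets this CPU (or any
 * CPU), its cgroup matches the current one, and the PMU's own filter
 * accepts it.
 */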
static inline int
event_filter_match(struct perf_event *event)
{
	return (event->cpu == -1 || event->cpu == smp_processor_id())
	    && perf_cgroup_match(event) && pmu_filter_match(event);
}

static void
event_sched_out(struct perf_event *event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	u64 tstamp = perf_event_time(event);
	u64 delta;

	WARN_ON_ONCE(event->ctx != ctx);
	lockdep_assert_held(&ctx->lock);

	/*
	 * An event which could not be activated because of
	 * filter mismatch still needs to have its timings
	 * maintained, otherwise bogus information is returned
	 * via read() for time_enabled, time_running:
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE
	    && !event_filter_match(event)) {
		delta = tstamp - event->tstamp_stopped;
		event->tstamp_running += delta;
		event->tstamp_stopped = tstamp;
	}

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	perf_pmu_disable(event->pmu);

	event->tstamp_stopped = tstamp;
	event->pmu->del(event, 0);
	event->oncpu = -1;
	event->state = PERF_EVENT_STATE_INACTIVE;
	if (event->pending_disable) {
		event->pending_disable = 0;
		event->state = PERF_EVENT_STATE_OFF;
	}

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	if (!--ctx->nr_active)
		perf_event_ctx_deactivate(ctx);
	if (event->attr.freq && event->attr.sample_freq)
		ctx->nr_freq--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;

	perf_pmu_enable(event->pmu);
}

static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;
	int state = group_event->state;

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);

	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
		cpuctx->exclusive = 0;
}

#define DETACH_GROUP	0x01UL

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static void
__perf_remove_from_context(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   struct perf_event_context *ctx,
			   void *info)
{
	unsigned long flags = (unsigned long)info;

	event_sched_out(event, cpuctx, ctx);
	if (flags & DETACH_GROUP)
		perf_group_detach(event);
	list_del_event(event, ctx);

	if (!ctx->nr_events && ctx->is_active) {
		ctx->is_active = 0;
		if (ctx->task) {
			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
			cpuctx->task_ctx = NULL;
		}
	}
}

/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
	lockdep_assert_held(&event->ctx->mutex);

	event_function_call(event, __perf_remove_from_context, (void *)flags);
}

/*
 * Cross CPU call to disable a performance event
 */
static void __perf_event_disable(struct perf_event *event,
				 struct perf_cpu_context *cpuctx,
				 struct perf_event_context *ctx,
				 void *info)
{
	if (event->state < PERF_EVENT_STATE_INACTIVE)
		return;

	update_context_time(ctx);
	update_cgrp_time_from_event(event);
	update_group_times(event);
	if (event == event->group_leader)
		group_sched_out(event, cpuctx, ctx);
	else
		event_sched_out(event, cpuctx, ctx);
	event->state = PERF_EVENT_STATE_OFF;
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block there.
 *
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
static void _perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	raw_spin_lock_irq(&ctx->lock);
	if (event->state <= PERF_EVENT_STATE_OFF) {
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	raw_spin_unlock_irq(&ctx->lock);

	event_function_call(event, __perf_event_disable, NULL);
}

void perf_event_disable_local(struct perf_event *event)
{
	event_function_local(event, __perf_event_disable, NULL);
}

/*
 * Strictly speaking kernel users cannot create groups and therefore this
 * interface does not need the perf_event_ctx_lock() magic.
 */
void perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx;

	ctx = perf_event_ctx_lock(event);
	_perf_event_disable(event);
	perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);

static void perf_set_shadow_time(struct perf_event *event,
				 struct perf_event_context *ctx,
				 u64 tstamp)
{
	/*
	 * use the correct time source for the time snapshot
	 *
	 * We could get by without this by leveraging the
	 * fact that to get to this function, the caller
	 * has most likely locked the context. However, there
	 * are contexts where the lock is not held.
	 *
	 * cgroup events are pinned to a CPU and only accumulate
	 * time while their cgroup is active on that CPU, so their
	 * shadow time must come from the per-cpu cgroup clock;
	 * everything else is measured against ctx->timestamp.
	 */
	if (is_cgroup_event(event))
		perf_cgroup_set_shadow_time(event, tstamp);
	else
		event->shadow_ctx_time = tstamp - ctx->timestamp;
}

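/*
 * hw.interrupts is set to MAX_INTERRUPTS once an event has been
 * throttled; event_sched_in() then logs an unthrottle and clears it.
 */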
#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);

static int
event_sched_in(struct perf_event *event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	u64 tstamp = perf_event_time(event);
	int ret = 0;

	lockdep_assert_held(&ctx->lock);

	if (event->state <= PERF_EVENT_STATE_OFF)
		return 0;

	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = smp_processor_id();

	/*
	 * Unthrottle events, since we scheduled we might have missed several
	 * ticks. A missed tick here would leave the event throttled
	 * indefinitely.
	 */
	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
		perf_log_throttle(event, 1);
		event->hw.interrupts = 0;
	}

	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

	perf_pmu_disable(event->pmu);

	perf_set_shadow_time(event, ctx, tstamp);

	perf_log_itrace_start(event);

	if (event->pmu->add(event, PERF_EF_START)) {
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
		ret = -EAGAIN;
		goto out;
	}

	event->tstamp_running += tstamp - event->tstamp_stopped;

	if (!is_software_event(event))
		cpuctx->active_oncpu++;
	if (!ctx->nr_active++)
		perf_event_ctx_activate(ctx);
	if (event->attr.freq && event->attr.sample_freq)
		ctx->nr_freq++;

	if (event->attr.exclusive)
		cpuctx->exclusive = 1;

out:
	perf_pmu_enable(event->pmu);

	return ret;
}

static int
group_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct perf_event *event, *partial_group = NULL;
	struct pmu *pmu = ctx->pmu;
	u64 now = ctx->time;
	bool simulate = false;

	if (group_event->state == PERF_EVENT_STATE_OFF)
		return 0;

	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);

	if (event_sched_in(group_event, cpuctx, ctx)) {
		pmu->cancel_txn(pmu);
		perf_mux_hrtimer_restart(cpuctx);
		return -EAGAIN;
	}

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event_sched_in(event, cpuctx, ctx)) {
			partial_group = event;
			goto group_error;
		}
	}

	if (!pmu->commit_txn(pmu))
		return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 * The events up to the failed event are scheduled out normally,
	 * tstamp_stopped will be updated.
	 *
	 * The failed events and the remaining siblings need to have
	 * their timings updated as if they had gone thru event_sched_in()
	 * and event_sched_out(). This is required to get consistent timings
	 * across the group. This also takes care of the case where the group
	 * could never be scheduled by ensuring tstamp_stopped is set to mark
	 * the time the event was actually stopped, such that time delta
	 * calculation in update_event_times() is correct.
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
			simulate = true;

		if (simulate) {
			event->tstamp_running += now - event->tstamp_stopped;
			event->tstamp_stopped = now;
		} else {
			event_sched_out(event, cpuctx, ctx);
		}
	}
	event_sched_out(group_event, cpuctx, ctx);

	pmu->cancel_txn(pmu);

	perf_mux_hrtimer_restart(cpuctx);

	return -EAGAIN;
}

/*
 * Work out whether we can put this event group on the CPU now.
 */
static int group_can_go_on(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
	 * Groups consisting entirely of software events can always go on.
	 */
	if (event->group_flags & PERF_GROUP_SOFTWARE)
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * events can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * events on the CPU, it can't go on.
	 */
	if (event->attr.exclusive && cpuctx->active_oncpu)
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}

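/*
 * Attach the event to its context's lists and stamp all three time
 * fields with the current context time.
 */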
static void add_event_to_ctx(struct perf_event *event,
			     struct perf_event_context *ctx)
{
	u64 tstamp = perf_event_time(event);

	list_add_event(event, ctx);
	perf_group_attach(event);
	event->tstamp_enabled = tstamp;
	event->tstamp_running = tstamp;
	event->tstamp_stopped = tstamp;
}

static void ctx_sched_out(struct perf_event_context *ctx,
			  struct perf_cpu_context *cpuctx,
			  enum event_type_t event_type);
static void
ctx_sched_in(struct perf_event_context *ctx,
	     struct perf_cpu_context *cpuctx,
	     enum event_type_t event_type,
	     struct task_struct *task);

static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
			       struct perf_event_context *ctx)
{
	if (!cpuctx->task_ctx)
		return;

	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
		return;

	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
}

static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
				struct perf_event_context *ctx,
				struct task_struct *task)
{
	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
	if (ctx)
		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
	if (ctx)
		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}

static void ctx_resched(struct perf_cpu_context *cpuctx,
			struct perf_event_context *task_ctx)
{
	perf_pmu_disable(cpuctx->ctx.pmu);
	if (task_ctx)
		task_ctx_sched_out(cpuctx, task_ctx);
	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
	perf_event_sched_in(cpuctx, task_ctx, current);
	perf_pmu_enable(cpuctx->ctx.pmu);
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Very similar to remote_function() + event_function() but cannot assume that
 * things like ctx->is_active and cpuctx->task_ctx are set.
 */
static int __perf_install_in_context(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	bool activate = true;
	int ret = 0;

	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx->task) {
		raw_spin_lock(&ctx->lock);
		task_ctx = ctx;

		/* If we're on the wrong CPU, try again */
		if (task_cpu(ctx->task) != smp_processor_id()) {
			ret = -ESRCH;
			goto unlock;
		}

		/*
		 * If we're on the right CPU, see if the task we target is
		 * current, if not we don't have to activate the ctx, a future
		 * context switch will do that for us.
		 */
		if (ctx->task != current)
			activate = false;
		else
			WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);

	} else if (task_ctx) {
		raw_spin_lock(&task_ctx->lock);
	}

	if (activate) {
		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
		add_event_to_ctx(event, ctx);
		ctx_resched(cpuctx, task_ctx);
	} else {
		add_event_to_ctx(event, ctx);
	}

unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}

/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call, see comment there.
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
			int cpu)
{
	struct task_struct *task = READ_ONCE(ctx->task);

	lockdep_assert_held(&ctx->mutex);

	event->ctx = ctx;
	if (event->cpu != -1)
		event->cpu = cpu;

	if (!task) {
		cpu_function_call(cpu, __perf_install_in_context, event);
		return;
	}

	/*
	 * Should not happen, we validate the ctx is still alive before calling.
	 */
	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
		return;

	/*
	 * Installing events is tricky because we cannot rely on ctx->is_active
	 * to be set in case this is the nr_events 0 -> 1 transition.
	 */
again:
	/*
	 * Cannot use task_function_call() because we need to run on the task's
	 * CPU regardless of whether its current or not.
	 */
	if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
		return;

	raw_spin_lock_irq(&ctx->lock);
	task = ctx->task;
	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
		/*
		 * Cannot happen because we already checked above (which also
		 * cannot happen), and we hold ctx->mutex, which serializes us
		 * against perf_event_exit_task_context().
		 */
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	raw_spin_unlock_irq(&ctx->lock);
	/*
	 * Since !ctx->is_active doesn't mean anything, we must IPI
	 * unconditionally.
	 */
	goto again;
}


/*
 * Put an event into inactive state and update time fields.
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
static void __perf_event_mark_enabled(struct perf_event *event)
{
	struct perf_event *sub;
	u64 tstamp = perf_event_time(event);

	event->state = PERF_EVENT_STATE_INACTIVE;
	event->tstamp_enabled = tstamp - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry) {
		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
	}
}

/*
 * Cross CPU call to enable a performance event
 */
static void __perf_event_enable(struct perf_event *event,
				struct perf_cpu_context *cpuctx,
				struct perf_event_context *ctx,
				void *info)
{
	struct perf_event *leader = event->group_leader;
	struct perf_event_context *task_ctx;

	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
	    event->state <= PERF_EVENT_STATE_ERROR)
		return;

	if (ctx->is_active)
		ctx_sched_out(ctx, cpuctx, EVENT_TIME);

	__perf_event_mark_enabled(event);

	if (!ctx->is_active)
		return;

	if (!event_filter_match(event)) {
		if (is_cgroup_event(event))
			perf_cgroup_defer_enabled(event);
		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
		return;
	}

	/*
	 * If the event is in a group and isn't the group leader,
	 * then don't put it on unless the group is on.
	 */
	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
		return;
	}

	task_ctx = cpuctx->task_ctx;
	if (ctx->task)
		WARN_ON_ONCE(task_ctx != ctx);

	ctx_resched(cpuctx, task_ctx);
}

/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
static void _perf_event_enable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	raw_spin_lock_irq(&ctx->lock);
	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
	    event->state <  PERF_EVENT_STATE_ERROR) {
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}

	/*
	 * If the event is in error state, clear that first.
	 *
	 * That way, if we see the event in error state below, we know that it
	 * has gone back into error state, as distinct from the task having
	 * been scheduled away before the cross-call arrived.
	 */
	if (event->state == PERF_EVENT_STATE_ERROR)
		event->state = PERF_EVENT_STATE_OFF;
	raw_spin_unlock_irq(&ctx->lock);

	event_function_call(event, __perf_event_enable, NULL);
}

/*
 * See perf_event_disable().
 */
void perf_event_enable(struct perf_event *event)
{
	struct perf_event_context *ctx;

	ctx = perf_event_ctx_lock(event);
	_perf_event_enable(event);
	perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_enable);

static int _perf_event_refresh(struct perf_event *event, int refresh)
{
	/*
	 * not supported on inherited events
	 */
	if (event->attr.inherit || !is_sampling_event(event))
		return -EINVAL;

	atomic_add(refresh, &event->event_limit);
	_perf_event_enable(event);

	return 0;
}

/*
 * See perf_event_disable().
 */
int perf_event_refresh(struct perf_event *event, int refresh)
{
	struct perf_event_context *ctx;
	int ret;

	ctx = perf_event_ctx_lock(event);
	ret = _perf_event_refresh(event, refresh);
	perf_event_ctx_unlock(event, ctx);

	return ret;
}
EXPORT_SYMBOL_GPL(perf_event_refresh);

static void ctx_sched_out(struct perf_event_context *ctx,
			  struct perf_cpu_context *cpuctx,
			  enum event_type_t event_type)
{
	int is_active = ctx->is_active;
	struct perf_event *event;

	lockdep_assert_held(&ctx->lock);

	if (likely(!ctx->nr_events)) {
		/*
		 * See __perf_remove_from_context().
		 */
		WARN_ON_ONCE(ctx->is_active);
		if (ctx->task)
			WARN_ON_ONCE(cpuctx->task_ctx);
		return;
	}

	ctx->is_active &= ~event_type;
	if (!(ctx->is_active & EVENT_ALL))
		ctx->is_active = 0;

	if (ctx->task) {
		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
		if (!ctx->is_active)
			cpuctx->task_ctx = NULL;
	}

	/*
	 * Always update time if it was set; not only when it changes.
	 * Otherwise we can 'forget' to update time for any but the last
	 * context we sched out. For example:
	 *
	 *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
	 *   ctx_sched_out(.event_type = EVENT_PINNED)
	 *
	 * would only update time for the pinned events.
	 */
	if (is_active & EVENT_TIME) {
		/* update (and stop) ctx time */
		update_context_time(ctx);
		update_cgrp_time_from_cpuctx(cpuctx);
	}

	is_active ^= ctx->is_active; /* changed bits */

	if (!ctx->nr_active || !(is_active & EVENT_ALL))
		return;

	perf_pmu_disable(ctx->pmu);
	if (is_active & EVENT_PINNED) {
		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
			group_sched_out(event, cpuctx, ctx);
	}

	if (is_active & EVENT_FLEXIBLE) {
		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
			group_sched_out(event, cpuctx, ctx);
	}
	perf_pmu_enable(ctx->pmu);
}

/*
 * Test whether two contexts are equivalent, i.e. whether they have both
 * been cloned from the same version of the same context.
 *
 * Equivalence is measured using a generation number in the context that is
 * incremented on each modification to it; see unclone_ctx(), list_add_event()
 * and list_del_event().
 */
static int context_equiv(struct perf_event_context *ctx1,
			 struct perf_event_context *ctx2)
{
	lockdep_assert_held(&ctx1->lock);
	lockdep_assert_held(&ctx2->lock);

	/* Pinning disables the swap optimization */
	if (ctx1->pin_count || ctx2->pin_count)
		return 0;

	/* If ctx1 is the parent of ctx2 */
	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
		return 1;

	/* If ctx2 is the parent of ctx1 */
	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
		return 1;

	/*
	 * If ctx1 and ctx2 have the same parent and neither has been
	 * modified since it was cloned, they are still equivalent.
	 */
	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
	    ctx1->parent_gen == ctx2->parent_gen)
		return 1;

	/* Unmatched */
	return 0;
}

static void __perf_event_sync_stat(struct perf_event *event,
				   struct perf_event *next_event)
{
	u64 value;

	if (!event->attr.inherit_stat)
		return;

	/*
	 * Update the event value, we cannot use perf_event_read()
	 * because we're in the middle of a context switch and have IRQs
	 * disabled, which upsets smp_call_function_single(), however
	 * we know the event must be on the current CPU, therefore we
	 * don't need to use it.
	 */
	switch (event->state) {
	case PERF_EVENT_STATE_ACTIVE:
		event->pmu->read(event);
		/* fall-through */

	case PERF_EVENT_STATE_INACTIVE:
		update_event_times(event);
		break;

	default:
		break;
	}

	/*
	 * In order to keep per-task stats reliable we need to flip the event
	 * values when we flip the contexts.
	 */
	value = local64_read(&next_event->count);
	value = local64_xchg(&event->count, value);
	local64_set(&next_event->count, value);

	swap(event->total_time_enabled, next_event->total_time_enabled);
	swap(event->total_time_running, next_event->total_time_running);

	/*
	 * Since we swizzled the values, update the user visible data too.
	 */
	perf_event_update_userpage(event);
	perf_event_update_userpage(next_event);
}

static void perf_event_sync_stat(struct perf_event_context *ctx,
				 struct perf_event_context *next_ctx)
{
	struct perf_event *event, *next_event;

	if (!ctx->nr_stat)
		return;

	update_context_time(ctx);

	event = list_first_entry(&ctx->event_list,
				 struct perf_event, event_entry);

	next_event = list_first_entry(&next_ctx->event_list,
				      struct perf_event, event_entry);

	while (&event->event_entry != &ctx->event_list &&
	       &next_event->event_entry != &next_ctx->event_list) {

		__perf_event_sync_stat(event, next_event);

		event = list_next_entry(event, event_entry);
		next_event = list_next_entry(next_event, event_entry);
	}
}

static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
					 struct task_struct *next)
{
	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
	struct perf_event_context *next_ctx;
	struct perf_event_context *parent, *next_parent;
	struct perf_cpu_context *cpuctx;
	int do_switch = 1;

	if (likely(!ctx))
		return;

	cpuctx = __get_cpu_context(ctx);
	if (!cpuctx->task_ctx)
		return;

	rcu_read_lock();
	next_ctx = next->perf_event_ctxp[ctxn];
	if (!next_ctx)
		goto unlock;

	parent = rcu_dereference(ctx->parent_ctx);
	next_parent = rcu_dereference(next_ctx->parent_ctx);

	/* If neither context have a parent context; they cannot be clones. */
	if (!parent && !next_parent)
		goto unlock;

	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
		/*
		 * Looks like the two contexts are clones, so we might be
		 * able to optimize the context switch.  We lock both
		 * contexts and check that they are clones of one another;
		 * if so we can swap them instead of scheduling one out
		 * and the other in.
		 */
		raw_spin_lock(&ctx->lock);
		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
		if (context_equiv(ctx, next_ctx)) {
			WRITE_ONCE(ctx->task, next);
			WRITE_ONCE(next_ctx->task, task);

			swap(ctx->task_ctx_data, next_ctx->task_ctx_data);

			/*
			 * RCU_INIT_POINTER here is safe because we've not
			 * modified the ctx and the above modification of
			 * ctx->task and ctx->task_ctx_data are both done
			 * first in this function, so readers cannot observe
			 * a half-updated context.
			 */
			RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
			RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);

			do_switch = 0;

			perf_event_sync_stat(ctx, next_ctx);
		}
		raw_spin_unlock(&next_ctx->lock);
		raw_spin_unlock(&ctx->lock);
	}
unlock:
	rcu_read_unlock();

	if (do_switch) {
		raw_spin_lock(&ctx->lock);
		task_ctx_sched_out(cpuctx, ctx);
		raw_spin_unlock(&ctx->lock);
	}
}

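/*
 * PMUs needing a context-switch callback bump this per-CPU counter via
 * perf_sched_cb_inc()/perf_sched_cb_dec(); the scheduler hooks below
 * only walk the PMU list when it is non-zero.
 */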
2639void perf_sched_cb_dec(struct pmu *pmu)
2640{
2641 this_cpu_dec(perf_sched_cb_usages);
2642}
2643
2644void perf_sched_cb_inc(struct pmu *pmu)
2645{
2646 this_cpu_inc(perf_sched_cb_usages);
2647}
2648
2649
2650
2651
2652
2653static void perf_pmu_sched_task(struct task_struct *prev,
2654 struct task_struct *next,
2655 bool sched_in)
2656{
2657 struct perf_cpu_context *cpuctx;
2658 struct pmu *pmu;
2659 unsigned long flags;
2660
2661 if (prev == next)
2662 return;
2663
2664 local_irq_save(flags);
2665
2666 rcu_read_lock();
2667
2668 list_for_each_entry_rcu(pmu, &pmus, entry) {
2669 if (pmu->sched_task) {
2670 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2671
2672 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2673
2674 perf_pmu_disable(pmu);
2675
2676 pmu->sched_task(cpuctx->task_ctx, sched_in);
2677
2678 perf_pmu_enable(pmu);
2679
2680 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2681 }
2682 }
2683
2684 rcu_read_unlock();
2685
2686 local_irq_restore(flags);
2687}
2688
2689static void perf_event_switch(struct task_struct *task,
2690 struct task_struct *next_prev, bool sched_in);
2691
2692#define for_each_task_context_nr(ctxn) \
2693 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706void __perf_event_task_sched_out(struct task_struct *task,
2707 struct task_struct *next)
2708{
2709 int ctxn;
2710
2711 if (__this_cpu_read(perf_sched_cb_usages))
2712 perf_pmu_sched_task(task, next, false);
2713
2714 if (atomic_read(&nr_switch_events))
2715 perf_event_switch(task, next, false);
2716
2717 for_each_task_context_nr(ctxn)
2718 perf_event_context_sched_out(task, ctxn, next);
2719
2720
2721
2722
2723
2724
2725 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2726 perf_cgroup_sched_out(task, next);
2727}
2728
2729
2730
2731
2732static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2733 enum event_type_t event_type)
2734{
2735 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2736}
2737
2738static void
2739ctx_pinned_sched_in(struct perf_event_context *ctx,
2740 struct perf_cpu_context *cpuctx)
2741{
2742 struct perf_event *event;
2743
2744 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2745 if (event->state <= PERF_EVENT_STATE_OFF)
2746 continue;
2747 if (!event_filter_match(event))
2748 continue;
2749
2750
2751 if (is_cgroup_event(event))
2752 perf_cgroup_mark_enabled(event, ctx);
2753
2754 if (group_can_go_on(event, cpuctx, 1))
2755 group_sched_in(event, cpuctx, ctx);
2756
2757 /*
2758  * If this pinned group hasn't been scheduled,
2759  * put it in error state.
2760  */
2761 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2762 update_group_times(event);
2763 event->state = PERF_EVENT_STATE_ERROR;
2764 }
2765 }
2766}
2767
2768static void
2769ctx_flexible_sched_in(struct perf_event_context *ctx,
2770 struct perf_cpu_context *cpuctx)
2771{
2772 struct perf_event *event;
2773 int can_add_hw = 1;
2774
2775 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2776
2777 if (event->state <= PERF_EVENT_STATE_OFF)
2778 continue;
2779
2780
2781
2782
2783 if (!event_filter_match(event))
2784 continue;
2785
2786
2787 if (is_cgroup_event(event))
2788 perf_cgroup_mark_enabled(event, ctx);
2789
2790 if (group_can_go_on(event, cpuctx, can_add_hw)) {
2791 if (group_sched_in(event, cpuctx, ctx))
2792 can_add_hw = 0;
2793 }
2794 }
2795}
2796
2797static void
2798ctx_sched_in(struct perf_event_context *ctx,
2799 struct perf_cpu_context *cpuctx,
2800 enum event_type_t event_type,
2801 struct task_struct *task)
2802{
2803 int is_active = ctx->is_active;
2804 u64 now;
2805
2806 lockdep_assert_held(&ctx->lock);
2807
2808 if (likely(!ctx->nr_events))
2809 return;
2810
2811 ctx->is_active |= (event_type | EVENT_TIME);
2812 if (ctx->task) {
2813 if (!is_active)
2814 cpuctx->task_ctx = ctx;
2815 else
2816 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2817 }
2818
2819 is_active ^= ctx->is_active;
2820
2821 if (is_active & EVENT_TIME) {
2822
2823 now = perf_clock();
2824 ctx->timestamp = now;
2825 perf_cgroup_set_timestamp(task, ctx);
2826 }
2827
2828 /*
2829  * First go through the list and put on any pinned groups
2830  * in order to give them the best chance of going on.
2831  */
2832 if (is_active & EVENT_PINNED)
2833 ctx_pinned_sched_in(ctx, cpuctx);
2834
2835 /* Then walk through the lower-priority flexible groups. */
2836 if (is_active & EVENT_FLEXIBLE)
2837 ctx_flexible_sched_in(ctx, cpuctx);
2838}
2839
2840static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2841 enum event_type_t event_type,
2842 struct task_struct *task)
2843{
2844 struct perf_event_context *ctx = &cpuctx->ctx;
2845
2846 ctx_sched_in(ctx, cpuctx, event_type, task);
2847}
2848
2849static void perf_event_context_sched_in(struct perf_event_context *ctx,
2850 struct task_struct *task)
2851{
2852 struct perf_cpu_context *cpuctx;
2853
2854 cpuctx = __get_cpu_context(ctx);
2855 if (cpuctx->task_ctx == ctx)
2856 return;
2857
2858 perf_ctx_lock(cpuctx, ctx);
2859 perf_pmu_disable(ctx->pmu);
2860
2861 /*
2862  * We want to keep the following priority order:
2863  * cpu pinned, task pinned, cpu flexible, task flexible.
2864  */
2865 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2866 perf_event_sched_in(cpuctx, ctx, task);
2867 perf_pmu_enable(ctx->pmu);
2868 perf_ctx_unlock(cpuctx, ctx);
2869}
2870
2871/*
2872 * Called from the scheduler to add the events of the current task
2873 * with interrupts disabled.
2874 *
2875 * We schedule in each event and update the event value in event->count.
2876 *
2877 * This does not protect us against NMI, but enable()
2878 * sets the enabled bit in the control field of the event _before_
2879 * we add the event to the context list.
2880 */
2881
2882void __perf_event_task_sched_in(struct task_struct *prev,
2883 struct task_struct *task)
2884{
2885 struct perf_event_context *ctx;
2886 int ctxn;
2887
2888 /*
2889  * If cgroup events exist on this CPU, we have to switch in PMU
2890  * state; cgroup events are system-wide mode only.
2891  *
2892  * Since cgroup events are CPU events, we must schedule these in
2893  * before we schedule in the task events.
2894  */
2895 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2896 perf_cgroup_sched_in(prev, task);
2897
2898 for_each_task_context_nr(ctxn) {
2899 ctx = task->perf_event_ctxp[ctxn];
2900 if (likely(!ctx))
2901 continue;
2902
2903 perf_event_context_sched_in(ctx, task);
2904 }
2905
2906 if (atomic_read(&nr_switch_events))
2907 perf_event_switch(task, prev, true);
2908
2909 if (__this_cpu_read(perf_sched_cb_usages))
2910 perf_pmu_sched_task(prev, task, true);
2911}
2912
2913static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2914{
2915 u64 frequency = event->attr.sample_freq;
2916 u64 sec = NSEC_PER_SEC;
2917 u64 divisor, dividend;
2918
2919 int count_fls, nsec_fls, frequency_fls, sec_fls;
2920
2921 count_fls = fls64(count);
2922 nsec_fls = fls64(nsec);
2923 frequency_fls = fls64(frequency);
2924 sec_fls = 30;
2925
2926 /*
2927  * We got @count in @nsec, with a target of sample_freq HZ;
2928  * the target period becomes:
2929  *
2930  *             @count * 10^9
2931  * period = -------------------
2932  *          @nsec * sample_freq
2933  *
2934  * To avoid 64-bit overflow we reduce the magnitude of the
2935  * operands first: REDUCE_FLS() below shifts whichever of its
2936  * two arguments currently has more significant bits down by
2937  * one, until the products above fit in a u64.
2938  */
2939
2940#define REDUCE_FLS(a, b) \
2941do { \
2942 if (a##_fls > b##_fls) { \
2943 a >>= 1; \
2944 a##_fls--; \
2945 } else { \
2946 b >>= 1; \
2947 b##_fls--; \
2948 } \
2949} while (0)
2950
2951
2952 /*
2953  * Shrink the operands until both products fit in 64 bits.
2954  */
2955 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2956 REDUCE_FLS(nsec, frequency);
2957 REDUCE_FLS(sec, count);
2958 }
2959
2960 if (count_fls + sec_fls > 64) {
2961 divisor = nsec * frequency;
2962
2963 while (count_fls + sec_fls > 64) {
2964 REDUCE_FLS(count, sec);
2965 divisor >>= 1;
2966 }
2967
2968 dividend = count * sec;
2969 } else {
2970 dividend = count * sec;
2971
2972 while (nsec_fls + frequency_fls > 64) {
2973 REDUCE_FLS(nsec, frequency);
2974 dividend >>= 1;
2975 }
2976
2977 divisor = nsec * frequency;
2978 }
2979
2980 if (!divisor)
2981 return dividend;
2982
2983 return div64_u64(dividend, divisor);
2984}
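
/*
 * Worked example of the formula above (illustrative numbers only):
 * with sample_freq = 1000 Hz and count = 2,000,000 events observed
 * over nsec = 10,000,000 ns, the target period is
 *
 *	2e6 * 1e9 / (1e7 * 1e3) = 200,000 events/sample,
 *
 * i.e. sampling every 200k events yields ~1000 samples/sec at the
 * observed event rate.
 */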
2985
2986static DEFINE_PER_CPU(int, perf_throttled_count);
2987static DEFINE_PER_CPU(u64, perf_throttled_seq);
2988
2989static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2990{
2991 struct hw_perf_event *hwc = &event->hw;
2992 s64 period, sample_period;
2993 s64 delta;
2994
2995 period = perf_calculate_period(event, nsec, count);
2996
2997 delta = (s64)(period - hwc->sample_period);
2998 delta = (delta + 7) / 8;
2999
3000 sample_period = hwc->sample_period + delta;
3001
3002 if (!sample_period)
3003 sample_period = 1;
3004
3005 hwc->sample_period = sample_period;
3006
3007 if (local64_read(&hwc->period_left) > 8*sample_period) {
3008 if (disable)
3009 event->pmu->stop(event, PERF_EF_UPDATE);
3010
3011 local64_set(&hwc->period_left, 0);
3012
3013 if (disable)
3014 event->pmu->start(event, PERF_EF_RELOAD);
3015 }
3016}
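
/*
 * The (delta + 7) / 8 term above is deliberate damping: only about
 * 1/8th of the distance to the newly computed period is applied per
 * adjustment, so noisy intervals don't make the period oscillate.
 * E.g. (illustrative): sample_period = 100,000 and a newly computed
 * period of 180,000 gives delta = 80,000, so the period moves to
 * 110,000 rather than jumping straight to 180,000.
 */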
3017
3018
3019/*
3020 * Combine frequency adjustment with unthrottling to avoid two passes
3021 * over the events; having freq events must not bias the unthrottle rate.
3022 */
3023static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3024 int needs_unthr)
3025{
3026 struct perf_event *event;
3027 struct hw_perf_event *hwc;
3028 u64 now, period = TICK_NSEC;
3029 s64 delta;
3030
3031 /*
3032  * Only need to iterate over all events if:
3033  * - the context has events in frequency mode (needs freq adjust), or
3034  * - there are events to unthrottle on this CPU.
3035  */
3036 if (!(ctx->nr_freq || needs_unthr))
3037 return;
3038
3039 raw_spin_lock(&ctx->lock);
3040 perf_pmu_disable(ctx->pmu);
3041
3042 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3043 if (event->state != PERF_EVENT_STATE_ACTIVE)
3044 continue;
3045
3046 if (!event_filter_match(event))
3047 continue;
3048
3049 perf_pmu_disable(event->pmu);
3050
3051 hwc = &event->hw;
3052
3053 if (hwc->interrupts == MAX_INTERRUPTS) {
3054 hwc->interrupts = 0;
3055 perf_log_throttle(event, 1);
3056 event->pmu->start(event, 0);
3057 }
3058
3059 if (!event->attr.freq || !event->attr.sample_freq)
3060 goto next;
3061
3062
3063
3064
3065 event->pmu->stop(event, PERF_EF_UPDATE);
3066
3067 now = local64_read(&event->count);
3068 delta = now - hwc->freq_count_stamp;
3069 hwc->freq_count_stamp = now;
3070
3071 /*
3072  * Restart the event: reload only if the value has changed.
3073  * We already stopped the event ourselves, so pass
3074  * disable == false to perf_adjust_period() to avoid a
3075  * redundant stop/start cycle.
3076  */
3077
3078 if (delta > 0)
3079 perf_adjust_period(event, period, delta, false);
3080
3081 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3082 next:
3083 perf_pmu_enable(event->pmu);
3084 }
3085
3086 perf_pmu_enable(ctx->pmu);
3087 raw_spin_unlock(&ctx->lock);
3088}
3089
3090
3091
3092
3093static void rotate_ctx(struct perf_event_context *ctx)
3094{
3095 /*
3096  * Rotate the first entry last of non-pinned groups. Rotation might be
3097  * disabled by the inheritance code.
3098  */
3099 if (!ctx->rotate_disable)
3100 list_rotate_left(&ctx->flexible_groups);
3101}
3102
3103static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3104{
3105 struct perf_event_context *ctx = NULL;
3106 int rotate = 0;
3107
3108 if (cpuctx->ctx.nr_events) {
3109 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3110 rotate = 1;
3111 }
3112
3113 ctx = cpuctx->task_ctx;
3114 if (ctx && ctx->nr_events) {
3115 if (ctx->nr_events != ctx->nr_active)
3116 rotate = 1;
3117 }
3118
3119 if (!rotate)
3120 goto done;
3121
3122 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3123 perf_pmu_disable(cpuctx->ctx.pmu);
3124
3125 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3126 if (ctx)
3127 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3128
3129 rotate_ctx(&cpuctx->ctx);
3130 if (ctx)
3131 rotate_ctx(ctx);
3132
3133 perf_event_sched_in(cpuctx, ctx, current);
3134
3135 perf_pmu_enable(cpuctx->ctx.pmu);
3136 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3137done:
3138
3139 return rotate;
3140}
3141
3142void perf_event_task_tick(void)
3143{
3144 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3145 struct perf_event_context *ctx, *tmp;
3146 int throttled;
3147
3148 WARN_ON(!irqs_disabled());
3149
3150 __this_cpu_inc(perf_throttled_seq);
3151 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3152 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3153
3154 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3155 perf_adjust_freq_unthr_context(ctx, throttled);
3156}
3157
3158static int event_enable_on_exec(struct perf_event *event,
3159 struct perf_event_context *ctx)
3160{
3161 if (!event->attr.enable_on_exec)
3162 return 0;
3163
3164 event->attr.enable_on_exec = 0;
3165 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3166 return 0;
3167
3168 __perf_event_mark_enabled(event);
3169
3170 return 1;
3171}
3172
3173
3174/*
3175 * Enable all of a task's events that have been marked enable-on-exec.
3176 */
3177static void perf_event_enable_on_exec(int ctxn)
3178{
3179 struct perf_event_context *ctx, *clone_ctx = NULL;
3180 struct perf_cpu_context *cpuctx;
3181 struct perf_event *event;
3182 unsigned long flags;
3183 int enabled = 0;
3184
3185 local_irq_save(flags);
3186 ctx = current->perf_event_ctxp[ctxn];
3187 if (!ctx || !ctx->nr_events)
3188 goto out;
3189
3190 cpuctx = __get_cpu_context(ctx);
3191 perf_ctx_lock(cpuctx, ctx);
3192 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3193 list_for_each_entry(event, &ctx->event_list, event_entry)
3194 enabled |= event_enable_on_exec(event, ctx);
3195
3196 /*
3197  * Unclone and reschedule this context if we enabled any event.
3198  */
3199 if (enabled) {
3200 clone_ctx = unclone_ctx(ctx);
3201 ctx_resched(cpuctx, ctx);
3202 }
3203 perf_ctx_unlock(cpuctx, ctx);
3204
3205out:
3206 local_irq_restore(flags);
3207
3208 if (clone_ctx)
3209 put_ctx(clone_ctx);
3210}
3211
3212void perf_event_exec(void)
3213{
3214 int ctxn;
3215
3216 rcu_read_lock();
3217 for_each_task_context_nr(ctxn)
3218 perf_event_enable_on_exec(ctxn);
3219 rcu_read_unlock();
3220}
3221
3222struct perf_read_data {
3223 struct perf_event *event;
3224 bool group;
3225 int ret;
3226};
3227
3228
3229/* Cross-CPU call to read the hardware event. */
3230
3231static void __perf_event_read(void *info)
3232{
3233 struct perf_read_data *data = info;
3234 struct perf_event *sub, *event = data->event;
3235 struct perf_event_context *ctx = event->ctx;
3236 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3237 struct pmu *pmu = event->pmu;
3238
3239 /*
3240  * If this is a task context, we need to check whether it is
3241  * the current task context of this CPU. If not, it has been
3242  * scheduled out before the smp call arrived. In that case
3243  * event->count would have been updated to a recent sample
3244  * when the event was scheduled out.
3245  */
3246 if (ctx->task && cpuctx->task_ctx != ctx)
3247 return;
3248
3249 raw_spin_lock(&ctx->lock);
3250 if (ctx->is_active) {
3251 update_context_time(ctx);
3252 update_cgrp_time_from_event(event);
3253 }
3254
3255 update_event_times(event);
3256 if (event->state != PERF_EVENT_STATE_ACTIVE)
3257 goto unlock;
3258
3259 if (!data->group) {
3260 pmu->read(event);
3261 data->ret = 0;
3262 goto unlock;
3263 }
3264
3265 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3266
3267 pmu->read(event);
3268
3269 list_for_each_entry(sub, &event->sibling_list, group_entry) {
3270 update_event_times(sub);
3271 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3272
3273
3274
3275
3276 sub->pmu->read(sub);
3277 }
3278 }
3279
3280 data->ret = pmu->commit_txn(pmu);
3281
3282unlock:
3283 raw_spin_unlock(&ctx->lock);
3284}
3285
3286static inline u64 perf_event_count(struct perf_event *event)
3287{
3288 if (event->pmu->count)
3289 return event->pmu->count(event);
3290
3291 return __perf_event_count(event);
3292}
3293
3294/*
3295 * NMI-safe method to read a local event, that is, an event that:
3296 *   - is for the current task, or for this CPU;
3297 *   - does not have inherit set (inherited task events
3298 *     cannot be read atomically);
3299 *   - has no pmu::count method.
3300 */
3301
3302u64 perf_event_read_local(struct perf_event *event)
3303{
3304 unsigned long flags;
3305 u64 val;
3306
3307 /*
3308  * Disabling interrupts avoids all counter scheduling (context
3309  * switches, timer-based rotation and IPIs).
3310  */
3311 local_irq_save(flags);
3312
3313
3314 WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3315 event->hw.target != current);
3316
3317
3318 WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3319 event->cpu != smp_processor_id());
3320
3321 /*
3322  * It must not be an event with inherit set; we cannot read
3323  * all child counters from atomic context.
3324  */
3325 WARN_ON_ONCE(event->attr.inherit);
3326
3327 /*
3328  * It must not have a pmu::count method; those are not
3329  * NMI safe.
3330  */
3331 WARN_ON_ONCE(event->pmu->count);
3332
3333 /*
3334  * If the event is currently on this CPU, it is either a per-task
3335  * event or local to this CPU, and it is ACTIVE (otherwise
3336  * event->oncpu would be -1), so pmu->read() is safe to call.
3337  */
3338 if (event->oncpu == smp_processor_id())
3339 event->pmu->read(event);
3340
3341 val = local64_read(&event->count);
3342 local_irq_restore(flags);
3343
3344 return val;
3345}
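
/*
 * Intended-use sketch (hedged; exact call sites vary by kernel
 * version): a caller that already runs on the right task/CPU, such
 * as the bpf_perf_event_read() helper, can do
 *
 *	u64 val = perf_event_read_local(event);
 *
 * and get an up-to-date count with no cross-CPU work and no
 * sleeping locks, subject to the WARN_ON_ONCE() constraints above.
 */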
3346
3347static int perf_event_read(struct perf_event *event, bool group)
3348{
3349 int ret = 0;
3350
3351 /*
3352  * If the event is active, read its most recent value from
3353  * the CPU it is running on via a cross-call.
3354  */
3355 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3356 struct perf_read_data data = {
3357 .event = event,
3358 .group = group,
3359 .ret = 0,
3360 };
3361 smp_call_function_single(event->oncpu,
3362 __perf_event_read, &data, 1);
3363 ret = data.ret;
3364 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3365 struct perf_event_context *ctx = event->ctx;
3366 unsigned long flags;
3367
3368 raw_spin_lock_irqsave(&ctx->lock, flags);
3369 /*
3370  * May read while context is not active (e.g., the thread is
3371  * blocked); in that case we cannot update context time.
3372  */
3373
3374 if (ctx->is_active) {
3375 update_context_time(ctx);
3376 update_cgrp_time_from_event(event);
3377 }
3378 if (group)
3379 update_group_times(event);
3380 else
3381 update_event_times(event);
3382 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3383 }
3384
3385 return ret;
3386}
3387
3388
3389
3390
3391static void __perf_event_init_context(struct perf_event_context *ctx)
3392{
3393 raw_spin_lock_init(&ctx->lock);
3394 mutex_init(&ctx->mutex);
3395 INIT_LIST_HEAD(&ctx->active_ctx_list);
3396 INIT_LIST_HEAD(&ctx->pinned_groups);
3397 INIT_LIST_HEAD(&ctx->flexible_groups);
3398 INIT_LIST_HEAD(&ctx->event_list);
3399 atomic_set(&ctx->refcount, 1);
3400}
3401
3402static struct perf_event_context *
3403alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3404{
3405 struct perf_event_context *ctx;
3406
3407 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3408 if (!ctx)
3409 return NULL;
3410
3411 __perf_event_init_context(ctx);
3412 if (task) {
3413 ctx->task = task;
3414 get_task_struct(task);
3415 }
3416 ctx->pmu = pmu;
3417
3418 return ctx;
3419}
3420
3421static struct task_struct *
3422find_lively_task_by_vpid(pid_t vpid)
3423{
3424 struct task_struct *task;
3425
3426 rcu_read_lock();
3427 if (!vpid)
3428 task = current;
3429 else
3430 task = find_task_by_vpid(vpid);
3431 if (task)
3432 get_task_struct(task);
3433 rcu_read_unlock();
3434
3435 if (!task)
3436 return ERR_PTR(-ESRCH);
3437
3438 return task;
3439}
3440
3441
3442/* Returns a matching context with refcount and pincount taken. */
3443
3444static struct perf_event_context *
3445find_get_context(struct pmu *pmu, struct task_struct *task,
3446 struct perf_event *event)
3447{
3448 struct perf_event_context *ctx, *clone_ctx = NULL;
3449 struct perf_cpu_context *cpuctx;
3450 void *task_ctx_data = NULL;
3451 unsigned long flags;
3452 int ctxn, err;
3453 int cpu = event->cpu;
3454
3455 if (!task) {
3456
3457 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3458 return ERR_PTR(-EACCES);
3459
3460 /*
3461  * We could be clever and allow attaching an event to an
3462  * offline CPU and activating it when the CPU comes up, but
3463  * that's for later.
3464  */
3465 if (!cpu_online(cpu))
3466 return ERR_PTR(-ENODEV);
3467
3468 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3469 ctx = &cpuctx->ctx;
3470 get_ctx(ctx);
3471 ++ctx->pin_count;
3472
3473 return ctx;
3474 }
3475
3476 err = -EINVAL;
3477 ctxn = pmu->task_ctx_nr;
3478 if (ctxn < 0)
3479 goto errout;
3480
3481 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3482 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3483 if (!task_ctx_data) {
3484 err = -ENOMEM;
3485 goto errout;
3486 }
3487 }
3488
3489retry:
3490 ctx = perf_lock_task_context(task, ctxn, &flags);
3491 if (ctx) {
3492 clone_ctx = unclone_ctx(ctx);
3493 ++ctx->pin_count;
3494
3495 if (task_ctx_data && !ctx->task_ctx_data) {
3496 ctx->task_ctx_data = task_ctx_data;
3497 task_ctx_data = NULL;
3498 }
3499 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3500
3501 if (clone_ctx)
3502 put_ctx(clone_ctx);
3503 } else {
3504 ctx = alloc_perf_context(pmu, task);
3505 err = -ENOMEM;
3506 if (!ctx)
3507 goto errout;
3508
3509 if (task_ctx_data) {
3510 ctx->task_ctx_data = task_ctx_data;
3511 task_ctx_data = NULL;
3512 }
3513
3514 err = 0;
3515 mutex_lock(&task->perf_event_mutex);
3516 /*
3517  * If the task has already passed perf_event_exit_task() we
3518  * must see PF_EXITING; installing a new context is pointless.
3519  */
3520 if (task->flags & PF_EXITING)
3521 err = -ESRCH;
3522 else if (task->perf_event_ctxp[ctxn])
3523 err = -EAGAIN;
3524 else {
3525 get_ctx(ctx);
3526 ++ctx->pin_count;
3527 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3528 }
3529 mutex_unlock(&task->perf_event_mutex);
3530
3531 if (unlikely(err)) {
3532 put_ctx(ctx);
3533
3534 if (err == -EAGAIN)
3535 goto retry;
3536 goto errout;
3537 }
3538 }
3539
3540 kfree(task_ctx_data);
3541 return ctx;
3542
3543errout:
3544 kfree(task_ctx_data);
3545 return ERR_PTR(err);
3546}
3547
3548static void perf_event_free_filter(struct perf_event *event);
3549static void perf_event_free_bpf_prog(struct perf_event *event);
3550
3551static void free_event_rcu(struct rcu_head *head)
3552{
3553 struct perf_event *event;
3554
3555 event = container_of(head, struct perf_event, rcu_head);
3556 if (event->ns)
3557 put_pid_ns(event->ns);
3558 perf_event_free_filter(event);
3559 kfree(event);
3560}
3561
3562static void ring_buffer_attach(struct perf_event *event,
3563 struct ring_buffer *rb);
3564
3565static void unaccount_event_cpu(struct perf_event *event, int cpu)
3566{
3567 if (event->parent)
3568 return;
3569
3570 if (is_cgroup_event(event))
3571 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3572}
3573
3574#ifdef CONFIG_NO_HZ_FULL
3575static DEFINE_SPINLOCK(nr_freq_lock);
3576#endif
3577
3578static void unaccount_freq_event_nohz(void)
3579{
3580#ifdef CONFIG_NO_HZ_FULL
3581 spin_lock(&nr_freq_lock);
3582 if (atomic_dec_and_test(&nr_freq_events))
3583 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
3584 spin_unlock(&nr_freq_lock);
3585#endif
3586}
3587
3588static void unaccount_freq_event(void)
3589{
3590 if (tick_nohz_full_enabled())
3591 unaccount_freq_event_nohz();
3592 else
3593 atomic_dec(&nr_freq_events);
3594}
3595
3596static void unaccount_event(struct perf_event *event)
3597{
3598 bool dec = false;
3599
3600 if (event->parent)
3601 return;
3602
3603 if (event->attach_state & PERF_ATTACH_TASK)
3604 dec = true;
3605 if (event->attr.mmap || event->attr.mmap_data)
3606 atomic_dec(&nr_mmap_events);
3607 if (event->attr.comm)
3608 atomic_dec(&nr_comm_events);
3609 if (event->attr.task)
3610 atomic_dec(&nr_task_events);
3611 if (event->attr.freq)
3612 unaccount_freq_event();
3613 if (event->attr.context_switch) {
3614 dec = true;
3615 atomic_dec(&nr_switch_events);
3616 }
3617 if (is_cgroup_event(event))
3618 dec = true;
3619 if (has_branch_stack(event))
3620 dec = true;
3621
3622 if (dec) {
3623 if (!atomic_add_unless(&perf_sched_count, -1, 1))
3624 schedule_delayed_work(&perf_sched_work, HZ);
3625 }
3626
3627 unaccount_event_cpu(event, event->cpu);
3628}
3629
3630static void perf_sched_delayed(struct work_struct *work)
3631{
3632 mutex_lock(&perf_sched_mutex);
3633 if (atomic_dec_and_test(&perf_sched_count))
3634 static_branch_disable(&perf_sched_events);
3635 mutex_unlock(&perf_sched_mutex);
3636}
3637
3638
3639/*
3640 * The following implements PERF_PMU_CAP_EXCLUSIVE: such a PMU can
3641 * carry either per-task events or CPU-wide events, but never both
3642 * kinds at the same time.
3643 *
3644 * pmu::exclusive_cnt encodes this: a positive count means per-task
3645 * events exist, a negative count means CPU-wide events exist. The
3646 * atomic inc-unless-negative / dec-unless-positive pair below
3647 * enforces the exclusion without a lock.
3648 */
3649
3650static int exclusive_event_init(struct perf_event *event)
3651{
3652 struct pmu *pmu = event->pmu;
3653
3654 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3655 return 0;
3656
3657 /*
3658  * Prevent co-existence of per-task and CPU-wide events on the
3659  * same exclusive PMU.
3660  *
3661  * Negative pmu::exclusive_cnt means there are CPU-wide
3662  * events on this "exclusive" PMU; positive means there are
3663  * per-task events.
3664  *
3665  * Since this is called in the perf_event_alloc() path, event::ctx
3666  * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3667  * to mean "per-task event", because unlike other attach states it
3668  * never gets cleared.
3669  */
3670 if (event->attach_state & PERF_ATTACH_TASK) {
3671 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3672 return -EBUSY;
3673 } else {
3674 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3675 return -EBUSY;
3676 }
3677
3678 return 0;
3679}
3680
3681static void exclusive_event_destroy(struct perf_event *event)
3682{
3683 struct pmu *pmu = event->pmu;
3684
3685 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3686 return;
3687
3688
3689 if (event->attach_state & PERF_ATTACH_TASK)
3690 atomic_dec(&pmu->exclusive_cnt);
3691 else
3692 atomic_inc(&pmu->exclusive_cnt);
3693}
3694
3695static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3696{
3697 if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3698 (e1->cpu == e2->cpu ||
3699 e1->cpu == -1 ||
3700 e2->cpu == -1))
3701 return true;
3702 return false;
3703}
3704
3705
3706static bool exclusive_event_installable(struct perf_event *event,
3707 struct perf_event_context *ctx)
3708{
3709 struct perf_event *iter_event;
3710 struct pmu *pmu = event->pmu;
3711
3712 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3713 return true;
3714
3715 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3716 if (exclusive_event_match(iter_event, event))
3717 return false;
3718 }
3719
3720 return true;
3721}
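
/*
 * Illustration of the exclusive-PMU rules above (hypothetical PMU):
 * with one per-task event open, pmu->exclusive_cnt == 1, so a
 * subsequent CPU-wide event fails exclusive_event_init() with -EBUSY
 * (atomic_dec_unless_positive() refuses), and a second event in the
 * same context with an overlapping CPU is rejected here via
 * exclusive_event_match().
 */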
3722
3723static void _free_event(struct perf_event *event)
3724{
3725 irq_work_sync(&event->pending);
3726
3727 unaccount_event(event);
3728
3729 if (event->rb) {
3730 /*
3731  * Can happen when we close an event with re-directed output.
3732  *
3733  * Since we have a 0 refcount, perf_mmap_close() will skip
3734  * over us; possibly making our ring_buffer_put() the last.
3735  */
3736 mutex_lock(&event->mmap_mutex);
3737 ring_buffer_attach(event, NULL);
3738 mutex_unlock(&event->mmap_mutex);
3739 }
3740
3741 if (is_cgroup_event(event))
3742 perf_detach_cgroup(event);
3743
3744 if (!event->parent) {
3745 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3746 put_callchain_buffers();
3747 }
3748
3749 perf_event_free_bpf_prog(event);
3750
3751 if (event->destroy)
3752 event->destroy(event);
3753
3754 if (event->ctx)
3755 put_ctx(event->ctx);
3756
3757 if (event->pmu) {
3758 exclusive_event_destroy(event);
3759 module_put(event->pmu->module);
3760 }
3761
3762 call_rcu(&event->rcu_head, free_event_rcu);
3763}
3764
3765/*
3766 * Used to free events which have a known refcount of 1, such as in error
3767 * paths where the event isn't exposed yet and inherited events.
3768 */
3769static void free_event(struct perf_event *event)
3770{
3771 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3772 "unexpected event refcount: %ld; ptr=%p\n",
3773 atomic_long_read(&event->refcount), event)) {
3774 /* Leak, to avoid use-after-free. */
3775 return;
3776 }
3777
3778 _free_event(event);
3779}
3780
3781
3782/* Remove a user event from the owner task. */
3783
3784static void perf_remove_from_owner(struct perf_event *event)
3785{
3786 struct task_struct *owner;
3787
3788 rcu_read_lock();
3789 /*
3790  * Matches the smp_store_release() in perf_event_exit_task(). If we
3791  * observe !owner it means the list deletion is complete and we can
3792  * indeed free this event; otherwise we need to serialize on
3793  * owner->perf_event_mutex.
3794  */
3795 owner = lockless_dereference(event->owner);
3796 if (owner) {
3797 /*
3798  * Since delayed_put_task_struct() also drops the last
3799  * task reference, we can safely take a new reference
3800  * while holding rcu_read_lock().
3801  */
3802 get_task_struct(owner);
3803 }
3804 rcu_read_unlock();
3805
3806 if (owner) {
3807 /*
3808  * If we're here through perf_event_exit_task() we're already
3809  * holding ctx->mutex, which would be an inversion wrt. the
3810  * normal lock order.
3811  *
3812  * However we can safely take this lock because it is the child
3813  * ctx->mutex.
3814  */
3815 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3816
3817 /*
3818  * We have to re-check event->owner under the mutex: if it is
3819  * cleared we raced with perf_event_exit_task() and the list
3820  * removal already happened there.
3821  */
3822
3823 if (event->owner) {
3824 list_del_init(&event->owner_entry);
3825 smp_store_release(&event->owner, NULL);
3826 }
3827 mutex_unlock(&owner->perf_event_mutex);
3828 put_task_struct(owner);
3829 }
3830}
3831
3832static void put_event(struct perf_event *event)
3833{
3834 if (!atomic_long_dec_and_test(&event->refcount))
3835 return;
3836
3837 _free_event(event);
3838}
3839
3840/*
3841 * Kill an event dead; while event:refcount will preserve the event
3842 * object, it will not preserve its functionality. Once the last 'user'
3843 * gives up the object, we'll destroy the thing.
3844 */
3845int perf_event_release_kernel(struct perf_event *event)
3846{
3847 struct perf_event_context *ctx = event->ctx;
3848 struct perf_event *child, *tmp;
3849
3850 /*
3851  * If we got here through err_file: fput(event_file); we will not
3852  * have attached to a context yet.
3853  */
3854 if (!ctx) {
3855 WARN_ON_ONCE(event->attach_state &
3856 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
3857 goto no_ctx;
3858 }
3859
3860 if (!is_kernel_event(event))
3861 perf_remove_from_owner(event);
3862
3863 ctx = perf_event_ctx_lock(event);
3864 WARN_ON_ONCE(ctx->parent_ctx);
3865 perf_remove_from_context(event, DETACH_GROUP);
3866
3867 raw_spin_lock_irq(&ctx->lock);
3868 /*
3869  * Mark this event as STATE_DEAD; there is no external reference
3870  * to it anymore.
3871  *
3872  * Anybody acquiring event->child_mutex after the below loop _must_
3873  * also see this, most importantly inherit_event() which will avoid
3874  * placing more children on the list.
3875  *
3876  * Thus this guarantees that we will in fact observe and kill _ALL_
3877  * child events.
3878  */
3879 event->state = PERF_EVENT_STATE_DEAD;
3880 raw_spin_unlock_irq(&ctx->lock);
3881
3882 perf_event_ctx_unlock(event, ctx);
3883
3884again:
3885 mutex_lock(&event->child_mutex);
3886 list_for_each_entry(child, &event->child_list, child_list) {
3887 /*
3888  * Cannot change: child events are not migrated, see the
3889  * comment with perf_event_ctx_lock_nested().
3890  */
3891
3892 ctx = lockless_dereference(child->ctx);
3893 /*
3894  * Since child_mutex nests inside ctx::mutex, we must jump
3895  * through hoops. We start by grabbing a reference on the ctx.
3896  *
3897  * Since the event cannot get freed while we hold the
3898  * child_mutex, the context must also exist and have a !0
3899  * reference count.
3900  */
3901 get_ctx(ctx);
3902
3903 /*
3904  * Now that we hold a ctx reference, drop child_mutex and
3905  * acquire ctx::mutex first to restore the proper lock order,
3906  * then re-take child_mutex.
3907  */
3908 mutex_unlock(&event->child_mutex);
3909 mutex_lock(&ctx->mutex);
3910 mutex_lock(&event->child_mutex);
3911
3912 /*
3913  * Re-check that @child is still first on the list; if so, it
3914  * could not have been freed concurrently and we own it now.
3915  */
3916
3917 tmp = list_first_entry_or_null(&event->child_list,
3918 struct perf_event, child_list);
3919 if (tmp == child) {
3920 perf_remove_from_context(child, DETACH_GROUP);
3921 list_del(&child->child_list);
3922 free_event(child);
3923 /*
3924  * This matches the refcount bump in inherit_event();
3925  * this can't be the last reference.
3926  */
3927 put_event(event);
3928 }
3929
3930 mutex_unlock(&event->child_mutex);
3931 mutex_unlock(&ctx->mutex);
3932 put_ctx(ctx);
3933 goto again;
3934 }
3935 mutex_unlock(&event->child_mutex);
3936
3937no_ctx:
3938 put_event(event);
3939 return 0;
3940}
3941EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3942
3943
3944/* Called when the last reference to the file is gone. */
3945
3946static int perf_release(struct inode *inode, struct file *file)
3947{
3948 perf_event_release_kernel(file->private_data);
3949 return 0;
3950}
3951
3952u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3953{
3954 struct perf_event *child;
3955 u64 total = 0;
3956
3957 *enabled = 0;
3958 *running = 0;
3959
3960 mutex_lock(&event->child_mutex);
3961
3962 (void)perf_event_read(event, false);
3963 total += perf_event_count(event);
3964
3965 *enabled += event->total_time_enabled +
3966 atomic64_read(&event->child_total_time_enabled);
3967 *running += event->total_time_running +
3968 atomic64_read(&event->child_total_time_running);
3969
3970 list_for_each_entry(child, &event->child_list, child_list) {
3971 (void)perf_event_read(child, false);
3972 total += perf_event_count(child);
3973 *enabled += child->total_time_enabled;
3974 *running += child->total_time_running;
3975 }
3976 mutex_unlock(&event->child_mutex);
3977
3978 return total;
3979}
3980EXPORT_SYMBOL_GPL(perf_event_read_value);
3981
3982static int __perf_read_group_add(struct perf_event *leader,
3983 u64 read_format, u64 *values)
3984{
3985 struct perf_event *sub;
3986 int n = 1;
3987 int ret;
3988
3989 ret = perf_event_read(leader, true);
3990 if (ret)
3991 return ret;
3992
3993 /*
3994  * Since we co-schedule groups, {enabled,running} times of siblings
3995  * will be identical to those of the group leader, so we only need
3996  * to accumulate the leader's (summed over all child events).
3997  */
3998 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3999 values[n++] += leader->total_time_enabled +
4000 atomic64_read(&leader->child_total_time_enabled);
4001 }
4002
4003 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4004 values[n++] += leader->total_time_running +
4005 atomic64_read(&leader->child_total_time_running);
4006 }
4007
4008 /*
4009  * Write {count,id} tuples for every sibling.
4010  */
4011 values[n++] += perf_event_count(leader);
4012 if (read_format & PERF_FORMAT_ID)
4013 values[n++] = primary_event_id(leader);
4014
4015 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4016 values[n++] += perf_event_count(sub);
4017 if (read_format & PERF_FORMAT_ID)
4018 values[n++] = primary_event_id(sub);
4019 }
4020
4021 return 0;
4022}
4023
4024static int perf_read_group(struct perf_event *event,
4025 u64 read_format, char __user *buf)
4026{
4027 struct perf_event *leader = event->group_leader, *child;
4028 struct perf_event_context *ctx = leader->ctx;
4029 int ret;
4030 u64 *values;
4031
4032 lockdep_assert_held(&ctx->mutex);
4033
4034 values = kzalloc(event->read_size, GFP_KERNEL);
4035 if (!values)
4036 return -ENOMEM;
4037
4038 values[0] = 1 + leader->nr_siblings;
4039
4040 /*
4041  * By locking the child_mutex of the leader we effectively
4042  * lock the child list of all siblings while we sum below.
4043  */
4044 mutex_lock(&leader->child_mutex);
4045
4046 ret = __perf_read_group_add(leader, read_format, values);
4047 if (ret)
4048 goto unlock;
4049
4050 list_for_each_entry(child, &leader->child_list, child_list) {
4051 ret = __perf_read_group_add(child, read_format, values);
4052 if (ret)
4053 goto unlock;
4054 }
4055
4056 mutex_unlock(&leader->child_mutex);
4057
4058 ret = event->read_size;
4059 if (copy_to_user(buf, values, event->read_size))
4060 ret = -EFAULT;
4061 goto out;
4062
4063unlock:
4064 mutex_unlock(&leader->child_mutex);
4065out:
4066 kfree(values);
4067 return ret;
4068}
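
/*
 * Resulting read(2) buffer layout for PERF_FORMAT_GROUP, with values
 * summed over all inherited children by the loop above (sketch,
 * assuming PERF_FORMAT_ID is also set):
 *
 *	{ u64 nr;
 *	  u64 time_enabled;	// if PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  u64 time_running;	// if PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  struct { u64 value, id; } cntr[nr];	// leader first
 *	}
 */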
4069
4070static int perf_read_one(struct perf_event *event,
4071 u64 read_format, char __user *buf)
4072{
4073 u64 enabled, running;
4074 u64 values[4];
4075 int n = 0;
4076
4077 values[n++] = perf_event_read_value(event, &enabled, &running);
4078 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4079 values[n++] = enabled;
4080 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4081 values[n++] = running;
4082 if (read_format & PERF_FORMAT_ID)
4083 values[n++] = primary_event_id(event);
4084
4085 if (copy_to_user(buf, values, n * sizeof(u64)))
4086 return -EFAULT;
4087
4088 return n * sizeof(u64);
4089}
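
/*
 * Userspace sketch of the corresponding read(2) for, e.g.,
 * read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 *               PERF_FORMAT_TOTAL_TIME_RUNNING:
 *
 *	u64 buf[3];
 *	read(fd, buf, sizeof(buf));
 *	// buf = { value, time_enabled, time_running }
 *
 * where 'fd' is assumed to come from perf_event_open().
 */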
4090
4091static bool is_event_hup(struct perf_event *event)
4092{
4093 bool no_children;
4094
4095 if (event->state > PERF_EVENT_STATE_EXIT)
4096 return false;
4097
4098 mutex_lock(&event->child_mutex);
4099 no_children = list_empty(&event->child_list);
4100 mutex_unlock(&event->child_mutex);
4101 return no_children;
4102}
4103
4104
4105/* Read the performance event - simple non-blocking version for now. */
4106
4107static ssize_t
4108__perf_read(struct perf_event *event, char __user *buf, size_t count)
4109{
4110 u64 read_format = event->attr.read_format;
4111 int ret;
4112
4113 /*
4114  * Return end-of-file for a read on an event that is in
4115  * error state (i.e. because it was pinned but it couldn't be
4116  * scheduled on to the CPU at some point).
4117  */
4118 if (event->state == PERF_EVENT_STATE_ERROR)
4119 return 0;
4120
4121 if (count < event->read_size)
4122 return -ENOSPC;
4123
4124 WARN_ON_ONCE(event->ctx->parent_ctx);
4125 if (read_format & PERF_FORMAT_GROUP)
4126 ret = perf_read_group(event, read_format, buf);
4127 else
4128 ret = perf_read_one(event, read_format, buf);
4129
4130 return ret;
4131}
4132
4133static ssize_t
4134perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4135{
4136 struct perf_event *event = file->private_data;
4137 struct perf_event_context *ctx;
4138 int ret;
4139
4140 ctx = perf_event_ctx_lock(event);
4141 ret = __perf_read(event, buf, count);
4142 perf_event_ctx_unlock(event, ctx);
4143
4144 return ret;
4145}
4146
4147static unsigned int perf_poll(struct file *file, poll_table *wait)
4148{
4149 struct perf_event *event = file->private_data;
4150 struct ring_buffer *rb;
4151 unsigned int events = POLLHUP;
4152
4153 poll_wait(file, &event->waitq, wait);
4154
4155 if (is_event_hup(event))
4156 return events;
4157
4158 /*
4159  * Pin the event->rb by taking event->mmap_mutex; otherwise
4160  * perf_event_set_output() can swizzle our rb and make us miss wakeups.
4161  */
4162 mutex_lock(&event->mmap_mutex);
4163 rb = event->rb;
4164 if (rb)
4165 events = atomic_xchg(&rb->poll, 0);
4166 mutex_unlock(&event->mmap_mutex);
4167 return events;
4168}
4169
4170static void _perf_event_reset(struct perf_event *event)
4171{
4172 (void)perf_event_read(event, false);
4173 local64_set(&event->count, 0);
4174 perf_event_update_userpage(event);
4175}
4176
4177/*
4178 * Holding the top-level event's child_mutex means that any
4179 * descendant process that has inherited this event will block in
4180 * perf_event_exit_event() on exit, satisfying the task-existence
4181 * requirements of perf_event_enable/disable.
4182 */
4183static void perf_event_for_each_child(struct perf_event *event,
4184 void (*func)(struct perf_event *))
4185{
4186 struct perf_event *child;
4187
4188 WARN_ON_ONCE(event->ctx->parent_ctx);
4189
4190 mutex_lock(&event->child_mutex);
4191 func(event);
4192 list_for_each_entry(child, &event->child_list, child_list)
4193 func(child);
4194 mutex_unlock(&event->child_mutex);
4195}
4196
4197static void perf_event_for_each(struct perf_event *event,
4198 void (*func)(struct perf_event *))
4199{
4200 struct perf_event_context *ctx = event->ctx;
4201 struct perf_event *sibling;
4202
4203 lockdep_assert_held(&ctx->mutex);
4204
4205 event = event->group_leader;
4206
4207 perf_event_for_each_child(event, func);
4208 list_for_each_entry(sibling, &event->sibling_list, group_entry)
4209 perf_event_for_each_child(sibling, func);
4210}
4211
4212static void __perf_event_period(struct perf_event *event,
4213 struct perf_cpu_context *cpuctx,
4214 struct perf_event_context *ctx,
4215 void *info)
4216{
4217 u64 value = *((u64 *)info);
4218 bool active;
4219
4220 if (event->attr.freq) {
4221 event->attr.sample_freq = value;
4222 } else {
4223 event->attr.sample_period = value;
4224 event->hw.sample_period = value;
4225 }
4226
4227 active = (event->state == PERF_EVENT_STATE_ACTIVE);
4228 if (active) {
4229 perf_pmu_disable(ctx->pmu);
4230 /*
4231  * We could be throttled; unthrottle now to avoid the tick
4232  * trying to unthrottle while we already re-started the event.
4233  */
4234 if (event->hw.interrupts == MAX_INTERRUPTS) {
4235 event->hw.interrupts = 0;
4236 perf_log_throttle(event, 1);
4237 }
4238 event->pmu->stop(event, PERF_EF_UPDATE);
4239 }
4240
4241 local64_set(&event->hw.period_left, 0);
4242
4243 if (active) {
4244 event->pmu->start(event, PERF_EF_RELOAD);
4245 perf_pmu_enable(ctx->pmu);
4246 }
4247}
4248
4249static int perf_event_period(struct perf_event *event, u64 __user *arg)
4250{
4251 u64 value;
4252
4253 if (!is_sampling_event(event))
4254 return -EINVAL;
4255
4256 if (copy_from_user(&value, arg, sizeof(value)))
4257 return -EFAULT;
4258
4259 if (!value)
4260 return -EINVAL;
4261
4262 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4263 return -EINVAL;
4264
4265 event_function_call(event, __perf_event_period, &value);
4266
4267 return 0;
4268}
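
/*
 * Userspace reaches this through the PERF_EVENT_IOC_PERIOD ioctl,
 * e.g. (sketch; 'fd' is a perf_event_open() sampling event):
 *
 *	u64 new_period = 4096;
 *	ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period);
 *
 * The update runs via event_function_call() so it executes on the
 * CPU that owns the event's context.
 */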
4269
4270static const struct file_operations perf_fops;
4271
4272static inline int perf_fget_light(int fd, struct fd *p)
4273{
4274 struct fd f = fdget(fd);
4275 if (!f.file)
4276 return -EBADF;
4277
4278 if (f.file->f_op != &perf_fops) {
4279 fdput(f);
4280 return -EBADF;
4281 }
4282 *p = f;
4283 return 0;
4284}
4285
4286static int perf_event_set_output(struct perf_event *event,
4287 struct perf_event *output_event);
4288static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4289static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4290
4291static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4292{
4293 void (*func)(struct perf_event *);
4294 u32 flags = arg;
4295
4296 switch (cmd) {
4297 case PERF_EVENT_IOC_ENABLE:
4298 func = _perf_event_enable;
4299 break;
4300 case PERF_EVENT_IOC_DISABLE:
4301 func = _perf_event_disable;
4302 break;
4303 case PERF_EVENT_IOC_RESET:
4304 func = _perf_event_reset;
4305 break;
4306
4307 case PERF_EVENT_IOC_REFRESH:
4308 return _perf_event_refresh(event, arg);
4309
4310 case PERF_EVENT_IOC_PERIOD:
4311 return perf_event_period(event, (u64 __user *)arg);
4312
4313 case PERF_EVENT_IOC_ID:
4314 {
4315 u64 id = primary_event_id(event);
4316
4317 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4318 return -EFAULT;
4319 return 0;
4320 }
4321
4322 case PERF_EVENT_IOC_SET_OUTPUT:
4323 {
4324 int ret;
4325 if (arg != -1) {
4326 struct perf_event *output_event;
4327 struct fd output;
4328 ret = perf_fget_light(arg, &output);
4329 if (ret)
4330 return ret;
4331 output_event = output.file->private_data;
4332 ret = perf_event_set_output(event, output_event);
4333 fdput(output);
4334 } else {
4335 ret = perf_event_set_output(event, NULL);
4336 }
4337 return ret;
4338 }
4339
4340 case PERF_EVENT_IOC_SET_FILTER:
4341 return perf_event_set_filter(event, (void __user *)arg);
4342
4343 case PERF_EVENT_IOC_SET_BPF:
4344 return perf_event_set_bpf_prog(event, arg);
4345
4346 default:
4347 return -ENOTTY;
4348 }
4349
4350 if (flags & PERF_IOC_FLAG_GROUP)
4351 perf_event_for_each(event, func);
4352 else
4353 perf_event_for_each_child(event, func);
4354
4355 return 0;
4356}
4357
4358static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4359{
4360 struct perf_event *event = file->private_data;
4361 struct perf_event_context *ctx;
4362 long ret;
4363
4364 ctx = perf_event_ctx_lock(event);
4365 ret = _perf_ioctl(event, cmd, arg);
4366 perf_event_ctx_unlock(event, ctx);
4367
4368 return ret;
4369}
4370
4371#ifdef CONFIG_COMPAT
4372static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4373 unsigned long arg)
4374{
4375 switch (_IOC_NR(cmd)) {
4376 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4377 case _IOC_NR(PERF_EVENT_IOC_ID):
4378 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case). */
4379 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4380 cmd &= ~IOCSIZE_MASK;
4381 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4382 }
4383 break;
4384 }
4385 return perf_ioctl(file, cmd, arg);
4386}
4387#else
4388# define perf_compat_ioctl NULL
4389#endif
4390
4391int perf_event_task_enable(void)
4392{
4393 struct perf_event_context *ctx;
4394 struct perf_event *event;
4395
4396 mutex_lock(&current->perf_event_mutex);
4397 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4398 ctx = perf_event_ctx_lock(event);
4399 perf_event_for_each_child(event, _perf_event_enable);
4400 perf_event_ctx_unlock(event, ctx);
4401 }
4402 mutex_unlock(&current->perf_event_mutex);
4403
4404 return 0;
4405}
4406
4407int perf_event_task_disable(void)
4408{
4409 struct perf_event_context *ctx;
4410 struct perf_event *event;
4411
4412 mutex_lock(&current->perf_event_mutex);
4413 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4414 ctx = perf_event_ctx_lock(event);
4415 perf_event_for_each_child(event, _perf_event_disable);
4416 perf_event_ctx_unlock(event, ctx);
4417 }
4418 mutex_unlock(&current->perf_event_mutex);
4419
4420 return 0;
4421}
4422
4423static int perf_event_index(struct perf_event *event)
4424{
4425 if (event->hw.state & PERF_HES_STOPPED)
4426 return 0;
4427
4428 if (event->state != PERF_EVENT_STATE_ACTIVE)
4429 return 0;
4430
4431 return event->pmu->event_idx(event);
4432}
4433
4434static void calc_timer_values(struct perf_event *event,
4435 u64 *now,
4436 u64 *enabled,
4437 u64 *running)
4438{
4439 u64 ctx_time;
4440
4441 *now = perf_clock();
4442 ctx_time = event->shadow_ctx_time + *now;
4443 *enabled = ctx_time - event->tstamp_enabled;
4444 *running = ctx_time - event->tstamp_running;
4445}
4446
4447static void perf_event_init_userpage(struct perf_event *event)
4448{
4449 struct perf_event_mmap_page *userpg;
4450 struct ring_buffer *rb;
4451
4452 rcu_read_lock();
4453 rb = rcu_dereference(event->rb);
4454 if (!rb)
4455 goto unlock;
4456
4457 userpg = rb->user_page;
4458
4459 /* Allow new userspace to detect that bit 0 is deprecated: */
4460 userpg->cap_bit0_is_deprecated = 1;
4461 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4462 userpg->data_offset = PAGE_SIZE;
4463 userpg->data_size = perf_data_size(rb);
4464
4465unlock:
4466 rcu_read_unlock();
4467}
4468
4469void __weak arch_perf_update_userpage(
4470 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4471{
4472}
4473
4474/*
4475 * Callers need to ensure there can be no nesting of this function, otherwise
4476 * the seqlock logic goes bad. We can not serialize this because the arch
4477 * code calls this from NMI context.
4478 */
4479void perf_event_update_userpage(struct perf_event *event)
4480{
4481 struct perf_event_mmap_page *userpg;
4482 struct ring_buffer *rb;
4483 u64 enabled, running, now;
4484
4485 rcu_read_lock();
4486 rb = rcu_dereference(event->rb);
4487 if (!rb)
4488 goto unlock;
4489
4490 /*
4491  * Compute total_time_enabled, total_time_running
4492  * based on snapshot values taken when the event
4493  * was last scheduled in.
4494  *
4495  * We cannot simply call update_context_time()
4496  * because of locking issues, as we can be called in
4497  * NMI context.
4498  */
4499 calc_timer_values(event, &now, &enabled, &running);
4500
4501 userpg = rb->user_page;
4502
4503
4504
4505
4506 preempt_disable();
4507 ++userpg->lock;
4508 barrier();
4509 userpg->index = perf_event_index(event);
4510 userpg->offset = perf_event_count(event);
4511 if (userpg->index)
4512 userpg->offset -= local64_read(&event->hw.prev_count);
4513
4514 userpg->time_enabled = enabled +
4515 atomic64_read(&event->child_total_time_enabled);
4516
4517 userpg->time_running = running +
4518 atomic64_read(&event->child_total_time_running);
4519
4520 arch_perf_update_userpage(event, userpg, now);
4521
4522 barrier();
4523 ++userpg->lock;
4524 preempt_enable();
4525unlock:
4526 rcu_read_unlock();
4527}
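
/*
 * The ++userpg->lock / barrier() pairs above form a seqcount. A
 * user-space reader of the mmap()ed page is expected to do roughly
 * (sketch; see the perf_event_mmap_page documentation):
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		... read pc->index, pc->offset, pc->time_* ...
 *		barrier();
 *	} while (pc->lock != seq || (seq & 1));
 *
 * retrying while the page is concurrently being updated.
 */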
4528
4529static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4530{
4531 struct perf_event *event = vma->vm_file->private_data;
4532 struct ring_buffer *rb;
4533 int ret = VM_FAULT_SIGBUS;
4534
4535 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4536 if (vmf->pgoff == 0)
4537 ret = 0;
4538 return ret;
4539 }
4540
4541 rcu_read_lock();
4542 rb = rcu_dereference(event->rb);
4543 if (!rb)
4544 goto unlock;
4545
4546 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4547 goto unlock;
4548
4549 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4550 if (!vmf->page)
4551 goto unlock;
4552
4553 get_page(vmf->page);
4554 vmf->page->mapping = vma->vm_file->f_mapping;
4555 vmf->page->index = vmf->pgoff;
4556
4557 ret = 0;
4558unlock:
4559 rcu_read_unlock();
4560
4561 return ret;
4562}
4563
4564static void ring_buffer_attach(struct perf_event *event,
4565 struct ring_buffer *rb)
4566{
4567 struct ring_buffer *old_rb = NULL;
4568 unsigned long flags;
4569
4570 if (event->rb) {
4571
4572
4573
4574
4575 WARN_ON_ONCE(event->rcu_pending);
4576
4577 old_rb = event->rb;
4578 spin_lock_irqsave(&old_rb->event_lock, flags);
4579 list_del_rcu(&event->rb_entry);
4580 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4581
4582 event->rcu_batches = get_state_synchronize_rcu();
4583 event->rcu_pending = 1;
4584 }
4585
4586 if (rb) {
4587 if (event->rcu_pending) {
4588 cond_synchronize_rcu(event->rcu_batches);
4589 event->rcu_pending = 0;
4590 }
4591
4592 spin_lock_irqsave(&rb->event_lock, flags);
4593 list_add_rcu(&event->rb_entry, &rb->event_list);
4594 spin_unlock_irqrestore(&rb->event_lock, flags);
4595 }
4596
4597 rcu_assign_pointer(event->rb, rb);
4598
4599 if (old_rb) {
4600 ring_buffer_put(old_rb);
4601 /*
4602  * Since we detached before setting the new rb, so that we
4603  * could attach the new rb, we could have missed a wakeup.
4604  * Provide it now.
4605  */
4606 wake_up_all(&event->waitq);
4607 }
4608}
4609
4610static void ring_buffer_wakeup(struct perf_event *event)
4611{
4612 struct ring_buffer *rb;
4613
4614 rcu_read_lock();
4615 rb = rcu_dereference(event->rb);
4616 if (rb) {
4617 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4618 wake_up_all(&event->waitq);
4619 }
4620 rcu_read_unlock();
4621}
4622
4623struct ring_buffer *ring_buffer_get(struct perf_event *event)
4624{
4625 struct ring_buffer *rb;
4626
4627 rcu_read_lock();
4628 rb = rcu_dereference(event->rb);
4629 if (rb) {
4630 if (!atomic_inc_not_zero(&rb->refcount))
4631 rb = NULL;
4632 }
4633 rcu_read_unlock();
4634
4635 return rb;
4636}
4637
4638void ring_buffer_put(struct ring_buffer *rb)
4639{
4640 if (!atomic_dec_and_test(&rb->refcount))
4641 return;
4642
4643 WARN_ON_ONCE(!list_empty(&rb->event_list));
4644
4645 call_rcu(&rb->rcu_head, rb_free_rcu);
4646}
4647
4648static void perf_mmap_open(struct vm_area_struct *vma)
4649{
4650 struct perf_event *event = vma->vm_file->private_data;
4651
4652 atomic_inc(&event->mmap_count);
4653 atomic_inc(&event->rb->mmap_count);
4654
4655 if (vma->vm_pgoff)
4656 atomic_inc(&event->rb->aux_mmap_count);
4657
4658 if (event->pmu->event_mapped)
4659 event->pmu->event_mapped(event);
4660}
4661
4662/*
4663 * A buffer can be mmap()ed multiple times; either directly through the same
4664 * event, or through other events by use of perf_event_set_output().
4665 *
4666 * In order to undo the VM accounting done by perf_mmap() we need to detach
4667 * every event redirecting into the buffer once the last mmap() goes away.
4668 */
4669
4670static void perf_mmap_close(struct vm_area_struct *vma)
4671{
4672 struct perf_event *event = vma->vm_file->private_data;
4673
4674 struct ring_buffer *rb = ring_buffer_get(event);
4675 struct user_struct *mmap_user = rb->mmap_user;
4676 int mmap_locked = rb->mmap_locked;
4677 unsigned long size = perf_data_size(rb);
4678
4679 if (event->pmu->event_unmapped)
4680 event->pmu->event_unmapped(event);
4681
4682 /*
4683  * rb->aux_mmap_count will always drop before rb->mmap_count and
4684  * event->mmap_count, so it is ok to use event->mmap_mutex to
4685  * serialize with perf_mmap here.
4686  */
4687 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4688 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4689 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4690 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4691
4692 rb_free_aux(rb);
4693 mutex_unlock(&event->mmap_mutex);
4694 }
4695
4696 atomic_dec(&rb->mmap_count);
4697
4698 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
4699 goto out_put;
4700
4701 ring_buffer_attach(event, NULL);
4702 mutex_unlock(&event->mmap_mutex);
4703
4704 /* If there are still other mmap()s of this buffer, we're done. */
4705 if (atomic_read(&rb->mmap_count))
4706 goto out_put;
4707
4708 /*
4709  * No other mmap()s, detach from all other events that might redirect
4710  * into the now unreachable buffer. Somewhat complicated by the
4711  * fact that rb::event_lock otherwise nests inside mmap_mutex.
4712  */
4713again:
4714 rcu_read_lock();
4715 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4716 if (!atomic_long_inc_not_zero(&event->refcount)) {
4717
4718
4719
4720
4721 continue;
4722 }
4723 rcu_read_unlock();
4724
4725 mutex_lock(&event->mmap_mutex);
4726 /*
4727  * Check we didn't race with perf_event_set_output() which can
4728  * swizzle the rb from under us while we were waiting to
4729  * acquire mmap_mutex.
4730  *
4731  * If we find a different rb; ignore this event, a next
4732  * iteration will no longer find it on the list. We have to
4733  * still restart the iteration to make sure we're not now
4734  * iterating the wrong list.
4735  */
4736 if (event->rb == rb)
4737 ring_buffer_attach(event, NULL);
4738
4739 mutex_unlock(&event->mmap_mutex);
4740 put_event(event);
4741
4742
4743
4744
4745
4746 goto again;
4747 }
4748 rcu_read_unlock();
4749
4750 /*
4751  * It could be there's still a few 0-ref events on the list; they'll
4752  * get cleaned up by free_event() -- they'll also still have their
4753  * ref on the rb and will free it whenever they are done with it.
4754  *
4755  * Aside from that, this buffer is 'fully' detached and unmapped,
4756  * undo the VM accounting.
4757  */
4758
4759 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4760 vma->vm_mm->pinned_vm -= mmap_locked;
4761 free_uid(mmap_user);
4762
4763out_put:
4764 ring_buffer_put(rb);
4765}
4766
4767static const struct vm_operations_struct perf_mmap_vmops = {
4768 .open = perf_mmap_open,
4769 .close = perf_mmap_close,
4770 .fault = perf_mmap_fault,
4771 .page_mkwrite = perf_mmap_fault,
4772};
4773
4774static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4775{
4776 struct perf_event *event = file->private_data;
4777 unsigned long user_locked, user_lock_limit;
4778 struct user_struct *user = current_user();
4779 unsigned long locked, lock_limit;
4780 struct ring_buffer *rb = NULL;
4781 unsigned long vma_size;
4782 unsigned long nr_pages;
4783 long user_extra = 0, extra = 0;
4784 int ret = 0, flags = 0;
4785
4786 /*
4787  * Don't allow mmap() of inherited per-task counters. This would
4788  * create a performance issue due to all children writing to the
4789  * same rb.
4790  */
4791 if (event->cpu == -1 && event->attr.inherit)
4792 return -EINVAL;
4793
4794 if (!(vma->vm_flags & VM_SHARED))
4795 return -EINVAL;
4796
4797 vma_size = vma->vm_end - vma->vm_start;
4798
4799 if (vma->vm_pgoff == 0) {
4800 nr_pages = (vma_size / PAGE_SIZE) - 1;
4801 } else {
4802 /*
4803  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4804  * mapped, all subsequent mappings should have the same size
4805  * and offset. Must be above the normal perf buffer.
4806  */
4807 u64 aux_offset, aux_size;
4808
4809 if (!event->rb)
4810 return -EINVAL;
4811
4812 nr_pages = vma_size / PAGE_SIZE;
4813
4814 mutex_lock(&event->mmap_mutex);
4815 ret = -EINVAL;
4816
4817 rb = event->rb;
4818 if (!rb)
4819 goto aux_unlock;
4820
4821 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4822 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4823
4824 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4825 goto aux_unlock;
4826
4827 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4828 goto aux_unlock;
4829
4830 /* already mapped with a different offset */
4831 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4832 goto aux_unlock;
4833
4834 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4835 goto aux_unlock;
4836
4837 /* already mapped with a different size */
4838 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4839 goto aux_unlock;
4840
4841 if (!is_power_of_2(nr_pages))
4842 goto aux_unlock;
4843
4844 if (!atomic_inc_not_zero(&rb->mmap_count))
4845 goto aux_unlock;
4846
4847 if (rb_has_aux(rb)) {
4848 atomic_inc(&rb->aux_mmap_count);
4849 ret = 0;
4850 goto unlock;
4851 }
4852
4853 atomic_set(&rb->aux_mmap_count, 1);
4854 user_extra = nr_pages;
4855
4856 goto accounting;
4857 }
4858
4859 /*
4860  * If we have rb pages ensure they're a power-of-two number, so we
4861  * can do bitmasks instead of modulo.
4862  */
4863 if (nr_pages != 0 && !is_power_of_2(nr_pages))
4864 return -EINVAL;
4865
4866 if (vma_size != PAGE_SIZE * (1 + nr_pages))
4867 return -EINVAL;
4868
4869 WARN_ON_ONCE(event->ctx->parent_ctx);
4870again:
4871 mutex_lock(&event->mmap_mutex);
4872 if (event->rb) {
4873 if (event->rb->nr_pages != nr_pages) {
4874 ret = -EINVAL;
4875 goto unlock;
4876 }
4877
4878 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4879
4880
4881
4882
4883
4884 mutex_unlock(&event->mmap_mutex);
4885 goto again;
4886 }
4887
4888 goto unlock;
4889 }
4890
4891 user_extra = nr_pages + 1;
4892
4893accounting:
4894 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4895
4896 /*
4897  * Increase the limit linearly with more CPUs:
4898  */
4899 user_lock_limit *= num_online_cpus();
4900
4901 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4902
4903 if (user_locked > user_lock_limit)
4904 extra = user_locked - user_lock_limit;
4905
4906 lock_limit = rlimit(RLIMIT_MEMLOCK);
4907 lock_limit >>= PAGE_SHIFT;
4908 locked = vma->vm_mm->pinned_vm + extra;
4909
4910 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4911 !capable(CAP_IPC_LOCK)) {
4912 ret = -EPERM;
4913 goto unlock;
4914 }
4915
4916 WARN_ON(!rb && event->rb);
4917
4918 if (vma->vm_flags & VM_WRITE)
4919 flags |= RING_BUFFER_WRITABLE;
4920
4921 if (!rb) {
4922 rb = rb_alloc(nr_pages,
4923 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4924 event->cpu, flags);
4925
4926 if (!rb) {
4927 ret = -ENOMEM;
4928 goto unlock;
4929 }
4930
4931 atomic_set(&rb->mmap_count, 1);
4932 rb->mmap_user = get_current_user();
4933 rb->mmap_locked = extra;
4934
4935 ring_buffer_attach(event, rb);
4936
4937 perf_event_init_userpage(event);
4938 perf_event_update_userpage(event);
4939 } else {
4940 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4941 event->attr.aux_watermark, flags);
4942 if (!ret)
4943 rb->aux_mmap_locked = extra;
4944 }
4945
4946unlock:
4947 if (!ret) {
4948 atomic_long_add(user_extra, &user->locked_vm);
4949 vma->vm_mm->pinned_vm += extra;
4950
4951 atomic_inc(&event->mmap_count);
4952 } else if (rb) {
4953 atomic_dec(&rb->mmap_count);
4954 }
4955aux_unlock:
4956 mutex_unlock(&event->mmap_mutex);
4957
4958 /*
4959  * Since pinned accounting is per vm we cannot allow fork() to copy our
4960  * vma.
4961  */
4962 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4963 vma->vm_ops = &perf_mmap_vmops;
4964
4965 if (event->pmu->event_mapped)
4966 event->pmu->event_mapped(event);
4967
4968 return ret;
4969}
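
/*
 * Userspace sketch of the mapping rules enforced above: the main
 * buffer is one control page plus a power-of-two number of data
 * pages,
 *
 *	len = (1 + (1 << n)) * page_size;
 *	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * and the optional AUX area is a second mmap() at the aux_offset /
 * aux_size the tool previously wrote into the control page.
 */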
4970
4971static int perf_fasync(int fd, struct file *filp, int on)
4972{
4973 struct inode *inode = file_inode(filp);
4974 struct perf_event *event = filp->private_data;
4975 int retval;
4976
4977 inode_lock(inode);
4978 retval = fasync_helper(fd, filp, on, &event->fasync);
4979 inode_unlock(inode);
4980
4981 if (retval < 0)
4982 return retval;
4983
4984 return 0;
4985}
4986
4987static const struct file_operations perf_fops = {
4988 .llseek = no_llseek,
4989 .release = perf_release,
4990 .read = perf_read,
4991 .poll = perf_poll,
4992 .unlocked_ioctl = perf_ioctl,
4993 .compat_ioctl = perf_compat_ioctl,
4994 .mmap = perf_mmap,
4995 .fasync = perf_fasync,
4996};
4997
4998/*
4999 * Perf event wakeup
5000 *
5001 * If there's data, ensure we set the poll() state and publish everything
5002 * to user-space before waking everybody up.
5003 */
5004
5005static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5006{
5007 /* Only the parent has fasync state. */
5008 if (event->parent)
5009 event = event->parent;
5010 return &event->fasync;
5011}
5012
5013void perf_event_wakeup(struct perf_event *event)
5014{
5015 ring_buffer_wakeup(event);
5016
5017 if (event->pending_kill) {
5018 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5019 event->pending_kill = 0;
5020 }
5021}
5022
5023static void perf_pending_event(struct irq_work *entry)
5024{
5025 struct perf_event *event = container_of(entry,
5026 struct perf_event, pending);
5027 int rctx;
5028
5029 rctx = perf_swevent_get_recursion_context();
5030 /*
5031  * If we 'fail' here, that's OK, it means recursion is already
5032  * disabled and we won't recurse 'further'.
5033  */
5034
5035 if (event->pending_disable) {
5036 event->pending_disable = 0;
5037 perf_event_disable_local(event);
5038 }
5039
5040 if (event->pending_wakeup) {
5041 event->pending_wakeup = 0;
5042 perf_event_wakeup(event);
5043 }
5044
5045 if (rctx >= 0)
5046 perf_swevent_put_recursion_context(rctx);
5047}
5048
5049/*
5050 * We assume there is only KVM supporting the callbacks.
5051 * Later on, we might change it to a list if there is
5052 * another virtualization implementation supporting the callbacks.
5053 */
5054struct perf_guest_info_callbacks *perf_guest_cbs;
5055
5056int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5057{
5058 perf_guest_cbs = cbs;
5059 return 0;
5060}
5061EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5062
5063int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5064{
5065 perf_guest_cbs = NULL;
5066 return 0;
5067}
5068EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5069
5070static void
5071perf_output_sample_regs(struct perf_output_handle *handle,
5072 struct pt_regs *regs, u64 mask)
5073{
5074 int bit;
5075
5076 for_each_set_bit(bit, (const unsigned long *) &mask,
5077 sizeof(mask) * BITS_PER_BYTE) {
5078 u64 val;
5079
5080 val = perf_reg_value(regs, bit);
5081 perf_output_put(handle, val);
5082 }
5083}
5084
5085static void perf_sample_regs_user(struct perf_regs *regs_user,
5086 struct pt_regs *regs,
5087 struct pt_regs *regs_user_copy)
5088{
5089 if (user_mode(regs)) {
5090 regs_user->abi = perf_reg_abi(current);
5091 regs_user->regs = regs;
5092 } else if (current->mm) {
5093 perf_get_regs_user(regs_user, regs, regs_user_copy);
5094 } else {
5095 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5096 regs_user->regs = NULL;
5097 }
5098}
5099
5100static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5101 struct pt_regs *regs)
5102{
5103 regs_intr->regs = regs;
5104 regs_intr->abi = perf_reg_abi(current);
5105}
5106
5107/*
5108 * Get remaining task size from user stack pointer.
5109 *
5110 * It'd be better to take the stack vma map and limit this more
5111 * precisely, but there's no way to get it safely under interrupt,
5112 * so using TASK_SIZE as the limit.
5113 */
5114
5115static u64 perf_ustack_task_size(struct pt_regs *regs)
5116{
5117 unsigned long addr = perf_user_stack_pointer(regs);
5118
5119 if (!addr || addr >= TASK_SIZE)
5120 return 0;
5121
5122 return TASK_SIZE - addr;
5123}
5124
5125static u16
5126perf_sample_ustack_size(u16 stack_size, u16 header_size,
5127 struct pt_regs *regs)
5128{
5129 u64 task_size;
5130
5131 /* No regs, no stack pointer, no dump. */
5132 if (!regs)
5133 return 0;
5134
5135 /*
5136  * Check if we fit in with the requested stack size into:
5137  * - TASK_SIZE
5138  *   If we don't, we limit the size to the TASK_SIZE.
5139  *
5140  * - remaining sample size
5141  *   If we don't, we customize the stack size to
5142  *   fit in to the remaining sample size.
5143  */
5144
5145 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5146 stack_size = min(stack_size, (u16) task_size);
5147
5148 /* Current header size plus static size and dynamic size. */
5149 header_size += 2 * sizeof(u64);
5150
5151 /* Do we fit in with the current stack dump size? */
5152 if ((u16) (header_size + stack_size) < header_size) {
5153 /*
5154  * If we overflow the maximum size for the sample,
5155  * we customize the stack dump size to fit in.
5156  */
5157 stack_size = USHRT_MAX - header_size - sizeof(u64);
5158 stack_size = round_up(stack_size, sizeof(u64));
5159 }
5160
5161 return stack_size;
5162}
5163
5164static void
5165perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5166 struct pt_regs *regs)
5167{
5168 /* Case of a kernel thread; nothing to dump. */
5169 if (!regs) {
5170 u64 size = 0;
5171 perf_output_put(handle, size);
5172 } else {
5173 unsigned long sp;
5174 unsigned int rem;
5175 u64 dyn_size;
5176
5177 /*
5178  * We dump:
5179  *
5180  * static size
5181  *   - the size requested by user or the best one we can fit
5182  *     in to the sample max size
5183  * data
5184  *   - user stack dump data
5185  * dynamic size
5186  *   - the actual dumped size
5187  */
5188 /* Static size. */
5189 perf_output_put(handle, dump_size);
5190
5191 /* Data. */
5192 sp = perf_user_stack_pointer(regs);
5193 rem = __output_copy_user(handle, (void *) sp, dump_size);
5194 dyn_size = dump_size - rem;
5195
5196 perf_output_skip(handle, rem);
5197
5198 /* Dynamic size. */
5199 perf_output_put(handle, dyn_size);
5200 }
5201}
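
/*
 * So a user-stack dump inside a sample record is laid out as:
 *
 *	u64  size;	// requested/clamped dump size (0 for kernel threads)
 *	char data[size];	// stack bytes; any un-copied tail is skipped
 *	u64  dyn_size;	// bytes actually copied (absent when size == 0)
 */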
5202
5203static void __perf_event_header__init_id(struct perf_event_header *header,
5204 struct perf_sample_data *data,
5205 struct perf_event *event)
5206{
5207 u64 sample_type = event->attr.sample_type;
5208
5209 data->type = sample_type;
5210 header->size += event->id_header_size;
5211
5212 if (sample_type & PERF_SAMPLE_TID) {
5213
5214 data->tid_entry.pid = perf_event_pid(event, current);
5215 data->tid_entry.tid = perf_event_tid(event, current);
5216 }
5217
5218 if (sample_type & PERF_SAMPLE_TIME)
5219 data->time = perf_event_clock(event);
5220
5221 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5222 data->id = primary_event_id(event);
5223
5224 if (sample_type & PERF_SAMPLE_STREAM_ID)
5225 data->stream_id = event->id;
5226
5227 if (sample_type & PERF_SAMPLE_CPU) {
5228 data->cpu_entry.cpu = raw_smp_processor_id();
5229 data->cpu_entry.reserved = 0;
5230 }
5231}
5232
5233void perf_event_header__init_id(struct perf_event_header *header,
5234 struct perf_sample_data *data,
5235 struct perf_event *event)
5236{
5237 if (event->attr.sample_id_all)
5238 __perf_event_header__init_id(header, data, event);
5239}
5240
5241static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5242 struct perf_sample_data *data)
5243{
5244 u64 sample_type = data->type;
5245
5246 if (sample_type & PERF_SAMPLE_TID)
5247 perf_output_put(handle, data->tid_entry);
5248
5249 if (sample_type & PERF_SAMPLE_TIME)
5250 perf_output_put(handle, data->time);
5251
5252 if (sample_type & PERF_SAMPLE_ID)
5253 perf_output_put(handle, data->id);
5254
5255 if (sample_type & PERF_SAMPLE_STREAM_ID)
5256 perf_output_put(handle, data->stream_id);
5257
5258 if (sample_type & PERF_SAMPLE_CPU)
5259 perf_output_put(handle, data->cpu_entry);
5260
5261 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5262 perf_output_put(handle, data->id);
5263}
5264
5265void perf_event__output_id_sample(struct perf_event *event,
5266 struct perf_output_handle *handle,
5267 struct perf_sample_data *sample)
5268{
5269 if (event->attr.sample_id_all)
5270 __perf_event__output_id_sample(handle, sample);
5271}
5272
5273static void perf_output_read_one(struct perf_output_handle *handle,
5274 struct perf_event *event,
5275 u64 enabled, u64 running)
5276{
5277 u64 read_format = event->attr.read_format;
5278 u64 values[4];
5279 int n = 0;
5280
5281 values[n++] = perf_event_count(event);
5282 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5283 values[n++] = enabled +
5284 atomic64_read(&event->child_total_time_enabled);
5285 }
5286 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5287 values[n++] = running +
5288 atomic64_read(&event->child_total_time_running);
5289 }
5290 if (read_format & PERF_FORMAT_ID)
5291 values[n++] = primary_event_id(event);
5292
5293 __output_copy(handle, values, n * sizeof(u64));
5294}
5295
5296/*
5297 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
5298 */
5299static void perf_output_read_group(struct perf_output_handle *handle,
5300 struct perf_event *event,
5301 u64 enabled, u64 running)
5302{
5303 struct perf_event *leader = event->group_leader, *sub;
5304 u64 read_format = event->attr.read_format;
5305 u64 values[5];
5306 int n = 0;
5307
5308 values[n++] = 1 + leader->nr_siblings;
5309
5310 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5311 values[n++] = enabled;
5312
5313 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5314 values[n++] = running;
5315
5316 if (leader != event)
5317 leader->pmu->read(leader);
5318
5319 values[n++] = perf_event_count(leader);
5320 if (read_format & PERF_FORMAT_ID)
5321 values[n++] = primary_event_id(leader);
5322
5323 __output_copy(handle, values, n * sizeof(u64));
5324
5325 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5326 n = 0;
5327
5328 if ((sub != event) &&
5329 (sub->state == PERF_EVENT_STATE_ACTIVE))
5330 sub->pmu->read(sub);
5331
5332 values[n++] = perf_event_count(sub);
5333 if (read_format & PERF_FORMAT_ID)
5334 values[n++] = primary_event_id(sub);
5335
5336 __output_copy(handle, values, n * sizeof(u64));
5337 }
5338}
5339
5340#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5341 PERF_FORMAT_TOTAL_TIME_RUNNING)
5342
5343static void perf_output_read(struct perf_output_handle *handle,
5344 struct perf_event *event)
5345{
5346 u64 enabled = 0, running = 0, now;
5347 u64 read_format = event->attr.read_format;
5348
	/*
	 * Compute total_time_enabled, total_time_running
	 * based on snapshot values taken when the event
	 * was last scheduled in.
	 *
	 * We cannot simply call update_context_time()
	 * because of locking issues: we can be called in
	 * NMI context.
	 */
5358 if (read_format & PERF_FORMAT_TOTAL_TIMES)
5359 calc_timer_values(event, &now, &enabled, &running);
5360
	if (read_format & PERF_FORMAT_GROUP)
5362 perf_output_read_group(handle, event, enabled, running);
5363 else
5364 perf_output_read_one(handle, event, enabled, running);
5365}
5366
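/*
 * Write the record body in the fixed PERF_RECORD_SAMPLE field order.
 * The size of each field must agree with what perf_prepare_sample()
 * accounted for, since ring-buffer space was reserved based on
 * header->size.
 */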
5367void perf_output_sample(struct perf_output_handle *handle,
5368 struct perf_event_header *header,
5369 struct perf_sample_data *data,
5370 struct perf_event *event)
5371{
5372 u64 sample_type = data->type;
5373
5374 perf_output_put(handle, *header);
5375
5376 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5377 perf_output_put(handle, data->id);
5378
5379 if (sample_type & PERF_SAMPLE_IP)
5380 perf_output_put(handle, data->ip);
5381
5382 if (sample_type & PERF_SAMPLE_TID)
5383 perf_output_put(handle, data->tid_entry);
5384
5385 if (sample_type & PERF_SAMPLE_TIME)
5386 perf_output_put(handle, data->time);
5387
5388 if (sample_type & PERF_SAMPLE_ADDR)
5389 perf_output_put(handle, data->addr);
5390
5391 if (sample_type & PERF_SAMPLE_ID)
5392 perf_output_put(handle, data->id);
5393
5394 if (sample_type & PERF_SAMPLE_STREAM_ID)
5395 perf_output_put(handle, data->stream_id);
5396
5397 if (sample_type & PERF_SAMPLE_CPU)
5398 perf_output_put(handle, data->cpu_entry);
5399
5400 if (sample_type & PERF_SAMPLE_PERIOD)
5401 perf_output_put(handle, data->period);
5402
5403 if (sample_type & PERF_SAMPLE_READ)
5404 perf_output_read(handle, event);
5405
	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		if (data->callchain) {
			/* the leading u64 is the entry count (nr) */
			int size = 1 + data->callchain->nr;

			__output_copy(handle, data->callchain,
				      size * sizeof(u64));
		} else {
			u64 nr = 0;
			perf_output_put(handle, nr);
		}
	}
5421
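	/*
	 * Raw data is preceded by a u32 size and zero-padded up to the
	 * next u64 boundary; an event without raw data still emits an
	 * empty { .size = sizeof(u32), .data = 0 } placeholder.
	 */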
5422 if (sample_type & PERF_SAMPLE_RAW) {
5423 if (data->raw) {
5424 u32 raw_size = data->raw->size;
5425 u32 real_size = round_up(raw_size + sizeof(u32),
5426 sizeof(u64)) - sizeof(u32);
5427 u64 zero = 0;
5428
5429 perf_output_put(handle, real_size);
5430 __output_copy(handle, data->raw->data, raw_size);
5431 if (real_size - raw_size)
5432 __output_copy(handle, &zero, real_size - raw_size);
5433 } else {
5434 struct {
5435 u32 size;
5436 u32 data;
5437 } raw = {
5438 .size = sizeof(u32),
5439 .data = 0,
5440 };
5441 perf_output_put(handle, raw);
5442 }
5443 }
5444
5445 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5446 if (data->br_stack) {
5447 size_t size;
5448
5449 size = data->br_stack->nr
5450 * sizeof(struct perf_branch_entry);
5451
5452 perf_output_put(handle, data->br_stack->nr);
5453 perf_output_copy(handle, data->br_stack->entries, size);
5454 } else {
			/*
			 * we always store at least the value of nr
			 */
5458 u64 nr = 0;
5459 perf_output_put(handle, nr);
5460 }
5461 }
5462
5463 if (sample_type & PERF_SAMPLE_REGS_USER) {
5464 u64 abi = data->regs_user.abi;

		/*
		 * If there are no regs to dump, notice it through
		 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
		 */
5470 perf_output_put(handle, abi);
5471
5472 if (abi) {
5473 u64 mask = event->attr.sample_regs_user;
5474 perf_output_sample_regs(handle,
5475 data->regs_user.regs,
5476 mask);
5477 }
5478 }
5479
5480 if (sample_type & PERF_SAMPLE_STACK_USER) {
5481 perf_output_sample_ustack(handle,
5482 data->stack_user_size,
5483 data->regs_user.regs);
5484 }
5485
5486 if (sample_type & PERF_SAMPLE_WEIGHT)
5487 perf_output_put(handle, data->weight);
5488
5489 if (sample_type & PERF_SAMPLE_DATA_SRC)
5490 perf_output_put(handle, data->data_src.val);
5491
5492 if (sample_type & PERF_SAMPLE_TRANSACTION)
5493 perf_output_put(handle, data->txn);
5494
5495 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5496 u64 abi = data->regs_intr.abi;

		/*
		 * If there are no regs to dump, notice it through
		 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
		 */
5501 perf_output_put(handle, abi);
5502
5503 if (abi) {
5504 u64 mask = event->attr.sample_regs_intr;
5505
5506 perf_output_sample_regs(handle,
5507 data->regs_intr.regs,
5508 mask);
5509 }
5510 }
5511
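	/*
	 * In count-based wakeup mode (!attr.watermark), account this
	 * sample and request a wakeup every attr.wakeup_events samples;
	 * the wakeup itself is issued when the output handle is released.
	 */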
5512 if (!event->attr.watermark) {
5513 int wakeup_events = event->attr.wakeup_events;
5514
5515 if (wakeup_events) {
5516 struct ring_buffer *rb = handle->rb;
5517 int events = local_inc_return(&rb->events);
5518
5519 if (events >= wakeup_events) {
5520 local_sub(wakeup_events, &rb->events);
5521 local_inc(&rb->wakeup);
5522 }
5523 }
5524 }
5525}
5526
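/*
 * Size and partially fill @header and @data for a PERF_RECORD_SAMPLE
 * before any ring-buffer space is reserved.  Every variable-size field
 * (callchain, raw, branch stack, user regs/stack, intr regs) must be
 * accounted for here so that perf_output_begin() can reserve the exact
 * record length.
 */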
5527void perf_prepare_sample(struct perf_event_header *header,
5528 struct perf_sample_data *data,
5529 struct perf_event *event,
5530 struct pt_regs *regs)
5531{
5532 u64 sample_type = event->attr.sample_type;
5533
5534 header->type = PERF_RECORD_SAMPLE;
5535 header->size = sizeof(*header) + event->header_size;
5536
5537 header->misc = 0;
5538 header->misc |= perf_misc_flags(regs);
5539
5540 __perf_event_header__init_id(header, data, event);
5541
5542 if (sample_type & PERF_SAMPLE_IP)
5543 data->ip = perf_instruction_pointer(regs);
5544
5545 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5546 int size = 1;
5547
5548 data->callchain = perf_callchain(event, regs);
5549
5550 if (data->callchain)
5551 size += data->callchain->nr;
5552
5553 header->size += size * sizeof(u64);
5554 }
5555
5556 if (sample_type & PERF_SAMPLE_RAW) {
5557 int size = sizeof(u32);
5558
5559 if (data->raw)
5560 size += data->raw->size;
5561 else
5562 size += sizeof(u32);
5563
5564 header->size += round_up(size, sizeof(u64));
5565 }
5566
5567 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5568 int size = sizeof(u64);
5569 if (data->br_stack) {
5570 size += data->br_stack->nr
5571 * sizeof(struct perf_branch_entry);
5572 }
5573 header->size += size;
5574 }
5575
5576 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5577 perf_sample_regs_user(&data->regs_user, regs,
5578 &data->regs_user_copy);
5579
5580 if (sample_type & PERF_SAMPLE_REGS_USER) {
		/* regs dump ABI info */
5582 int size = sizeof(u64);
5583
5584 if (data->regs_user.regs) {
5585 u64 mask = event->attr.sample_regs_user;
5586 size += hweight64(mask) * sizeof(u64);
5587 }
5588
5589 header->size += size;
5590 }
5591
5592 if (sample_type & PERF_SAMPLE_STACK_USER) {
		/*
		 * Either we need PERF_SAMPLE_STACK_USER bit to be always
		 * processed as the last one or have additional check added
		 * in case new sample type is added, because we could eat
		 * up the rest of the sample size.
		 */
5599 u16 stack_size = event->attr.sample_stack_user;
5600 u16 size = sizeof(u64);
5601
5602 stack_size = perf_sample_ustack_size(stack_size, header->size,
5603 data->regs_user.regs);
5604
		/*
		 * If there is something to dump, add space for the dump
		 * itself and for the field that tells the dynamic size,
		 * which is how many have been actually dumped.
		 */
5610 if (stack_size)
5611 size += sizeof(u64) + stack_size;
5612
5613 data->stack_user_size = stack_size;
5614 header->size += size;
5615 }
5616
5617 if (sample_type & PERF_SAMPLE_REGS_INTR) {
		/* regs dump ABI info */
5619 int size = sizeof(u64);
5620
5621 perf_sample_regs_intr(&data->regs_intr, regs);
5622
5623 if (data->regs_intr.regs) {
5624 u64 mask = event->attr.sample_regs_intr;
5625
5626 size += hweight64(mask) * sizeof(u64);
5627 }
5628
5629 header->size += size;
5630 }
5631}
5632
5633void perf_event_output(struct perf_event *event,
5634 struct perf_sample_data *data,
5635 struct pt_regs *regs)
5636{
5637 struct perf_output_handle handle;
5638 struct perf_event_header header;
5639
	/* protect the callchain buffers */
5641 rcu_read_lock();
5642
5643 perf_prepare_sample(&header, data, event, regs);
5644
5645 if (perf_output_begin(&handle, event, header.size))
5646 goto exit;
5647
5648 perf_output_sample(&handle, &header, data, event);
5649
5650 perf_output_end(&handle);
5651
5652exit:
5653 rcu_read_unlock();
5654}
5655
/*
 * read event notification
 */
5660struct perf_read_event {
5661 struct perf_event_header header;
5662
5663 u32 pid;
5664 u32 tid;
5665};
5666
5667static void
5668perf_event_read_event(struct perf_event *event,
5669 struct task_struct *task)
5670{
5671 struct perf_output_handle handle;
5672 struct perf_sample_data sample;
5673 struct perf_read_event read_event = {
5674 .header = {
5675 .type = PERF_RECORD_READ,
5676 .misc = 0,
5677 .size = sizeof(read_event) + event->read_size,
5678 },
5679 .pid = perf_event_pid(event, task),
5680 .tid = perf_event_tid(event, task),
5681 };
5682 int ret;
5683
5684 perf_event_header__init_id(&read_event.header, &sample, event);
5685 ret = perf_output_begin(&handle, event, read_event.header.size);
5686 if (ret)
5687 return;
5688
5689 perf_output_put(&handle, read_event);
5690 perf_output_read(&handle, event);
5691 perf_event__output_id_sample(event, &handle, &sample);
5692
5693 perf_output_end(&handle);
5694}
5695
5696typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5697
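/*
 * Iterate all events on a context and hand those that are live and
 * pass event_filter_match() to @output.  The caller must hold
 * rcu_read_lock(), as ctx->event_list is RCU-managed.
 */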
5698static void
5699perf_event_aux_ctx(struct perf_event_context *ctx,
5700 perf_event_aux_output_cb output,
5701 void *data)
5702{
5703 struct perf_event *event;
5704
5705 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5706 if (event->state < PERF_EVENT_STATE_INACTIVE)
5707 continue;
5708 if (!event_filter_match(event))
5709 continue;
5710 output(event, data);
5711 }
5712}
5713
5714static void
5715perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
5716 struct perf_event_context *task_ctx)
5717{
5718 rcu_read_lock();
5719 preempt_disable();
5720 perf_event_aux_ctx(task_ctx, output, data);
5721 preempt_enable();
5722 rcu_read_unlock();
5723}
5724
5725static void
5726perf_event_aux(perf_event_aux_output_cb output, void *data,
5727 struct perf_event_context *task_ctx)
5728{
5729 struct perf_cpu_context *cpuctx;
5730 struct perf_event_context *ctx;
5731 struct pmu *pmu;
5732 int ctxn;
5733
	/*
	 * If we have task_ctx != NULL we only notify
	 * the task context itself. The task_ctx is set
	 * only for EXIT events before releasing task
	 * context.
	 */
5740 if (task_ctx) {
5741 perf_event_aux_task_ctx(output, data, task_ctx);
5742 return;
5743 }
5744
5745 rcu_read_lock();
5746 list_for_each_entry_rcu(pmu, &pmus, entry) {
5747 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5748 if (cpuctx->unique_pmu != pmu)
5749 goto next;
5750 perf_event_aux_ctx(&cpuctx->ctx, output, data);
5751 ctxn = pmu->task_ctx_nr;
5752 if (ctxn < 0)
5753 goto next;
5754 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5755 if (ctx)
5756 perf_event_aux_ctx(ctx, output, data);
5757next:
5758 put_cpu_ptr(pmu->pmu_cpu_context);
5759 }
5760 rcu_read_unlock();
5761}
5762
/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
 */
5769struct perf_task_event {
5770 struct task_struct *task;
5771 struct perf_event_context *task_ctx;
5772
5773 struct {
5774 struct perf_event_header header;
5775
5776 u32 pid;
5777 u32 ppid;
5778 u32 tid;
5779 u32 ptid;
5780 u64 time;
5781 } event_id;
5782};
5783
5784static int perf_event_task_match(struct perf_event *event)
5785{
5786 return event->attr.comm || event->attr.mmap ||
5787 event->attr.mmap2 || event->attr.mmap_data ||
5788 event->attr.task;
5789}
5790
5791static void perf_event_task_output(struct perf_event *event,
5792 void *data)
5793{
5794 struct perf_task_event *task_event = data;
5795 struct perf_output_handle handle;
5796 struct perf_sample_data sample;
5797 struct task_struct *task = task_event->task;
5798 int ret, size = task_event->event_id.header.size;
5799
5800 if (!perf_event_task_match(event))
5801 return;
5802
5803 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
5804
5805 ret = perf_output_begin(&handle, event,
5806 task_event->event_id.header.size);
5807 if (ret)
5808 goto out;
5809
5810 task_event->event_id.pid = perf_event_pid(event, task);
5811 task_event->event_id.ppid = perf_event_pid(event, current);
5812
5813 task_event->event_id.tid = perf_event_tid(event, task);
5814 task_event->event_id.ptid = perf_event_tid(event, current);
5815
5816 task_event->event_id.time = perf_event_clock(event);
5817
5818 perf_output_put(&handle, task_event->event_id);
5819
5820 perf_event__output_id_sample(event, &handle, &sample);
5821
5822 perf_output_end(&handle);
5823out:
5824 task_event->event_id.header.size = size;
5825}
5826
5827static void perf_event_task(struct task_struct *task,
5828 struct perf_event_context *task_ctx,
5829 int new)
5830{
5831 struct perf_task_event task_event;
5832
5833 if (!atomic_read(&nr_comm_events) &&
5834 !atomic_read(&nr_mmap_events) &&
5835 !atomic_read(&nr_task_events))
5836 return;
5837
5838 task_event = (struct perf_task_event){
5839 .task = task,
5840 .task_ctx = task_ctx,
5841 .event_id = {
5842 .header = {
5843 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
5844 .misc = 0,
5845 .size = sizeof(task_event.event_id),
5846 },
			/* .pid  */
			/* .ppid */
			/* .tid  */
			/* .ptid */
			/* .time */
5852 },
5853 };
5854
5855 perf_event_aux(perf_event_task_output,
5856 &task_event,
5857 task_ctx);
5858}
5859
5860void perf_event_fork(struct task_struct *task)
5861{
5862 perf_event_task(task, NULL, 1);
5863}
5864
/*
 * comm tracking
 */
5869struct perf_comm_event {
5870 struct task_struct *task;
5871 char *comm;
5872 int comm_size;
5873
5874 struct {
5875 struct perf_event_header header;
5876
5877 u32 pid;
5878 u32 tid;
5879 } event_id;
5880};
5881
5882static int perf_event_comm_match(struct perf_event *event)
5883{
5884 return event->attr.comm;
5885}
5886
5887static void perf_event_comm_output(struct perf_event *event,
5888 void *data)
5889{
5890 struct perf_comm_event *comm_event = data;
5891 struct perf_output_handle handle;
5892 struct perf_sample_data sample;
5893 int size = comm_event->event_id.header.size;
5894 int ret;
5895
5896 if (!perf_event_comm_match(event))
5897 return;
5898
5899 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5900 ret = perf_output_begin(&handle, event,
5901 comm_event->event_id.header.size);
5902
5903 if (ret)
5904 goto out;
5905
5906 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5907 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5908
5909 perf_output_put(&handle, comm_event->event_id);
5910 __output_copy(&handle, comm_event->comm,
5911 comm_event->comm_size);
5912
5913 perf_event__output_id_sample(event, &handle, &sample);
5914
5915 perf_output_end(&handle);
5916out:
5917 comm_event->event_id.header.size = size;
5918}
5919
5920static void perf_event_comm_event(struct perf_comm_event *comm_event)
5921{
5922 char comm[TASK_COMM_LEN];
5923 unsigned int size;
5924
5925 memset(comm, 0, sizeof(comm));
5926 strlcpy(comm, comm_event->task->comm, sizeof(comm));
5927 size = ALIGN(strlen(comm)+1, sizeof(u64));
5928
5929 comm_event->comm = comm;
5930 comm_event->comm_size = size;
5931
5932 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
5933
5934 perf_event_aux(perf_event_comm_output,
5935 comm_event,
5936 NULL);
5937}
5938
5939void perf_event_comm(struct task_struct *task, bool exec)
5940{
5941 struct perf_comm_event comm_event;
5942
5943 if (!atomic_read(&nr_comm_events))
5944 return;
5945
5946 comm_event = (struct perf_comm_event){
5947 .task = task,
		/* .comm      */
		/* .comm_size */
5950 .event_id = {
5951 .header = {
5952 .type = PERF_RECORD_COMM,
5953 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
				/* .size */
5955 },
			/* .pid */
			/* .tid */
5958 },
5959 };
5960
5961 perf_event_comm_event(&comm_event);
5962}
5963
/*
 * mmap tracking
 */
5968struct perf_mmap_event {
5969 struct vm_area_struct *vma;
5970
5971 const char *file_name;
5972 int file_size;
5973 int maj, min;
5974 u64 ino;
5975 u64 ino_generation;
5976 u32 prot, flags;
5977
5978 struct {
5979 struct perf_event_header header;
5980
5981 u32 pid;
5982 u32 tid;
5983 u64 start;
5984 u64 len;
5985 u64 pgoff;
5986 } event_id;
5987};
5988
5989static int perf_event_mmap_match(struct perf_event *event,
5990 void *data)
5991{
5992 struct perf_mmap_event *mmap_event = data;
5993 struct vm_area_struct *vma = mmap_event->vma;
5994 int executable = vma->vm_flags & VM_EXEC;
5995
5996 return (!executable && event->attr.mmap_data) ||
5997 (executable && (event->attr.mmap || event->attr.mmap2));
5998}
5999
6000static void perf_event_mmap_output(struct perf_event *event,
6001 void *data)
6002{
6003 struct perf_mmap_event *mmap_event = data;
6004 struct perf_output_handle handle;
6005 struct perf_sample_data sample;
6006 int size = mmap_event->event_id.header.size;
6007 int ret;
6008
6009 if (!perf_event_mmap_match(event, data))
6010 return;
6011
6012 if (event->attr.mmap2) {
6013 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
6014 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
6015 mmap_event->event_id.header.size += sizeof(mmap_event->min);
6016 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
6017 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
6018 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
6019 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
6020 }
6021
6022 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
6023 ret = perf_output_begin(&handle, event,
6024 mmap_event->event_id.header.size);
6025 if (ret)
6026 goto out;
6027
6028 mmap_event->event_id.pid = perf_event_pid(event, current);
6029 mmap_event->event_id.tid = perf_event_tid(event, current);
6030
6031 perf_output_put(&handle, mmap_event->event_id);
6032
6033 if (event->attr.mmap2) {
6034 perf_output_put(&handle, mmap_event->maj);
6035 perf_output_put(&handle, mmap_event->min);
6036 perf_output_put(&handle, mmap_event->ino);
6037 perf_output_put(&handle, mmap_event->ino_generation);
6038 perf_output_put(&handle, mmap_event->prot);
6039 perf_output_put(&handle, mmap_event->flags);
6040 }
6041
6042 __output_copy(&handle, mmap_event->file_name,
6043 mmap_event->file_size);
6044
6045 perf_event__output_id_sample(event, &handle, &sample);
6046
6047 perf_output_end(&handle);
6048out:
6049 mmap_event->event_id.header.size = size;
6050}
6051
6052static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
6053{
6054 struct vm_area_struct *vma = mmap_event->vma;
6055 struct file *file = vma->vm_file;
6056 int maj = 0, min = 0;
6057 u64 ino = 0, gen = 0;
6058 u32 prot = 0, flags = 0;
6059 unsigned int size;
6060 char tmp[16];
6061 char *buf = NULL;
6062 char *name;
6063
6064 if (file) {
6065 struct inode *inode;
6066 dev_t dev;
6067
6068 buf = kmalloc(PATH_MAX, GFP_KERNEL);
6069 if (!buf) {
6070 name = "//enomem";
6071 goto cpy_name;
6072 }
6073
		/*
		 * file_path() fills the buffer from the end backwards, so
		 * leave room for the zero padding we add for u64 alignment
		 * further down.
		 */
6078 name = file_path(file, buf, PATH_MAX - sizeof(u64));
6079 if (IS_ERR(name)) {
6080 name = "//toolong";
6081 goto cpy_name;
6082 }
6083 inode = file_inode(vma->vm_file);
6084 dev = inode->i_sb->s_dev;
6085 ino = inode->i_ino;
6086 gen = inode->i_generation;
6087 maj = MAJOR(dev);
6088 min = MINOR(dev);
6089
6090 if (vma->vm_flags & VM_READ)
6091 prot |= PROT_READ;
6092 if (vma->vm_flags & VM_WRITE)
6093 prot |= PROT_WRITE;
6094 if (vma->vm_flags & VM_EXEC)
6095 prot |= PROT_EXEC;
6096
6097 if (vma->vm_flags & VM_MAYSHARE)
6098 flags = MAP_SHARED;
6099 else
6100 flags = MAP_PRIVATE;
6101
6102 if (vma->vm_flags & VM_DENYWRITE)
6103 flags |= MAP_DENYWRITE;
6104 if (vma->vm_flags & VM_MAYEXEC)
6105 flags |= MAP_EXECUTABLE;
6106 if (vma->vm_flags & VM_LOCKED)
6107 flags |= MAP_LOCKED;
6108 if (vma->vm_flags & VM_HUGETLB)
6109 flags |= MAP_HUGETLB;
6110
6111 goto got_name;
6112 } else {
6113 if (vma->vm_ops && vma->vm_ops->name) {
6114 name = (char *) vma->vm_ops->name(vma);
6115 if (name)
6116 goto cpy_name;
6117 }
6118
6119 name = (char *)arch_vma_name(vma);
6120 if (name)
6121 goto cpy_name;
6122
6123 if (vma->vm_start <= vma->vm_mm->start_brk &&
6124 vma->vm_end >= vma->vm_mm->brk) {
6125 name = "[heap]";
6126 goto cpy_name;
6127 }
6128 if (vma->vm_start <= vma->vm_mm->start_stack &&
6129 vma->vm_end >= vma->vm_mm->start_stack) {
6130 name = "[stack]";
6131 goto cpy_name;
6132 }
6133
6134 name = "//anon";
6135 goto cpy_name;
6136 }
6137
6138cpy_name:
6139 strlcpy(tmp, name, sizeof(tmp));
6140 name = tmp;
6141got_name:
	/*
	 * Since our buffer works in 8 byte units we need to align our string
	 * size to a multiple of 8. However, we must guarantee the tail end is
	 * zero'd out to avoid leaking random bits to userspace.
	 */
6147 size = strlen(name)+1;
6148 while (!IS_ALIGNED(size, sizeof(u64)))
6149 name[size++] = '\0';
6150
6151 mmap_event->file_name = name;
6152 mmap_event->file_size = size;
6153 mmap_event->maj = maj;
6154 mmap_event->min = min;
6155 mmap_event->ino = ino;
6156 mmap_event->ino_generation = gen;
6157 mmap_event->prot = prot;
6158 mmap_event->flags = flags;
6159
6160 if (!(vma->vm_flags & VM_EXEC))
6161 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
6162
6163 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
6164
6165 perf_event_aux(perf_event_mmap_output,
6166 mmap_event,
6167 NULL);
6168
6169 kfree(buf);
6170}
6171
6172void perf_event_mmap(struct vm_area_struct *vma)
6173{
6174 struct perf_mmap_event mmap_event;
6175
6176 if (!atomic_read(&nr_mmap_events))
6177 return;
6178
6179 mmap_event = (struct perf_mmap_event){
6180 .vma = vma,
		/* .file_name */
		/* .file_size */
6183 .event_id = {
6184 .header = {
6185 .type = PERF_RECORD_MMAP,
6186 .misc = PERF_RECORD_MISC_USER,
				/* .size */
6188 },
			/* .pid */
			/* .tid */
6191 .start = vma->vm_start,
6192 .len = vma->vm_end - vma->vm_start,
6193 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
6194 },
		/* .maj (attr_mmap2 only) */
		/* .min (attr_mmap2 only) */
		/* .ino (attr_mmap2 only) */
		/* .ino_generation (attr_mmap2 only) */
		/* .prot (attr_mmap2 only) */
		/* .flags (attr_mmap2 only) */
6201 };
6202
6203 perf_event_mmap_event(&mmap_event);
6204}
6205
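/*
 * Emit a PERF_RECORD_AUX record telling userspace that @size bytes of
 * AUX buffer data are available at offset @head.
 */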
6206void perf_event_aux_event(struct perf_event *event, unsigned long head,
6207 unsigned long size, u64 flags)
6208{
6209 struct perf_output_handle handle;
6210 struct perf_sample_data sample;
6211 struct perf_aux_event {
6212 struct perf_event_header header;
6213 u64 offset;
6214 u64 size;
6215 u64 flags;
6216 } rec = {
6217 .header = {
6218 .type = PERF_RECORD_AUX,
6219 .misc = 0,
6220 .size = sizeof(rec),
6221 },
6222 .offset = head,
6223 .size = size,
6224 .flags = flags,
6225 };
6226 int ret;
6227
6228 perf_event_header__init_id(&rec.header, &sample, event);
6229 ret = perf_output_begin(&handle, event, rec.header.size);
6230
6231 if (ret)
6232 return;
6233
6234 perf_output_put(&handle, rec);
6235 perf_event__output_id_sample(event, &handle, &sample);
6236
6237 perf_output_end(&handle);
6238}
6239
/*
 * Lost/dropped samples logging
 */
6243void perf_log_lost_samples(struct perf_event *event, u64 lost)
6244{
6245 struct perf_output_handle handle;
6246 struct perf_sample_data sample;
6247 int ret;
6248
6249 struct {
6250 struct perf_event_header header;
6251 u64 lost;
6252 } lost_samples_event = {
6253 .header = {
6254 .type = PERF_RECORD_LOST_SAMPLES,
6255 .misc = 0,
6256 .size = sizeof(lost_samples_event),
6257 },
6258 .lost = lost,
6259 };
6260
6261 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
6262
6263 ret = perf_output_begin(&handle, event,
6264 lost_samples_event.header.size);
6265 if (ret)
6266 return;
6267
6268 perf_output_put(&handle, lost_samples_event);
6269 perf_event__output_id_sample(event, &handle, &sample);
6270 perf_output_end(&handle);
6271}
6272
/*
 * context_switch tracking
 */
6277struct perf_switch_event {
6278 struct task_struct *task;
6279 struct task_struct *next_prev;
6280
6281 struct {
6282 struct perf_event_header header;
6283 u32 next_prev_pid;
6284 u32 next_prev_tid;
6285 } event_id;
6286};
6287
6288static int perf_event_switch_match(struct perf_event *event)
6289{
6290 return event->attr.context_switch;
6291}
6292
6293static void perf_event_switch_output(struct perf_event *event, void *data)
6294{
6295 struct perf_switch_event *se = data;
6296 struct perf_output_handle handle;
6297 struct perf_sample_data sample;
6298 int ret;
6299
6300 if (!perf_event_switch_match(event))
6301 return;
6302
	/* Only CPU-wide events are allowed to see next/prev pid/tid */
6304 if (event->ctx->task) {
6305 se->event_id.header.type = PERF_RECORD_SWITCH;
6306 se->event_id.header.size = sizeof(se->event_id.header);
6307 } else {
6308 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
6309 se->event_id.header.size = sizeof(se->event_id);
6310 se->event_id.next_prev_pid =
6311 perf_event_pid(event, se->next_prev);
6312 se->event_id.next_prev_tid =
6313 perf_event_tid(event, se->next_prev);
6314 }
6315
6316 perf_event_header__init_id(&se->event_id.header, &sample, event);
6317
6318 ret = perf_output_begin(&handle, event, se->event_id.header.size);
6319 if (ret)
6320 return;
6321
6322 if (event->ctx->task)
6323 perf_output_put(&handle, se->event_id.header);
6324 else
6325 perf_output_put(&handle, se->event_id);
6326
6327 perf_event__output_id_sample(event, &handle, &sample);
6328
6329 perf_output_end(&handle);
6330}
6331
6332static void perf_event_switch(struct task_struct *task,
6333 struct task_struct *next_prev, bool sched_in)
6334{
6335 struct perf_switch_event switch_event;
6336
	/* N.B. caller checks nr_switch_events != 0 */

6339 switch_event = (struct perf_switch_event){
6340 .task = task,
6341 .next_prev = next_prev,
6342 .event_id = {
6343 .header = {
				/* .type */
6345 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
				/* .size */
6347 },
			/* .next_prev_pid */
			/* .next_prev_tid */
6350 },
6351 };
6352
6353 perf_event_aux(perf_event_switch_output,
6354 &switch_event,
6355 NULL);
6356}
6357
/*
 * IRQ throttle logging
 */
6362static void perf_log_throttle(struct perf_event *event, int enable)
6363{
6364 struct perf_output_handle handle;
6365 struct perf_sample_data sample;
6366 int ret;
6367
6368 struct {
6369 struct perf_event_header header;
6370 u64 time;
6371 u64 id;
6372 u64 stream_id;
6373 } throttle_event = {
6374 .header = {
6375 .type = PERF_RECORD_THROTTLE,
6376 .misc = 0,
6377 .size = sizeof(throttle_event),
6378 },
6379 .time = perf_event_clock(event),
6380 .id = primary_event_id(event),
6381 .stream_id = event->id,
6382 };
6383
6384 if (enable)
6385 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
6386
6387 perf_event_header__init_id(&throttle_event.header, &sample, event);
6388
6389 ret = perf_output_begin(&handle, event,
6390 throttle_event.header.size);
6391 if (ret)
6392 return;
6393
6394 perf_output_put(&handle, throttle_event);
6395 perf_event__output_id_sample(event, &handle, &sample);
6396 perf_output_end(&handle);
6397}
6398
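/*
 * Emit a PERF_RECORD_ITRACE_START record the first time instruction
 * trace data is generated for an event, so that tools know which
 * pid/tid the trace initially belongs to.
 */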
6399static void perf_log_itrace_start(struct perf_event *event)
6400{
6401 struct perf_output_handle handle;
6402 struct perf_sample_data sample;
6403 struct perf_aux_event {
6404 struct perf_event_header header;
6405 u32 pid;
6406 u32 tid;
6407 } rec;
6408 int ret;
6409
6410 if (event->parent)
6411 event = event->parent;
6412
6413 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
6414 event->hw.itrace_started)
6415 return;
6416
6417 rec.header.type = PERF_RECORD_ITRACE_START;
6418 rec.header.misc = 0;
6419 rec.header.size = sizeof(rec);
6420 rec.pid = perf_event_pid(event, current);
6421 rec.tid = perf_event_tid(event, current);
6422
6423 perf_event_header__init_id(&rec.header, &sample, event);
6424 ret = perf_output_begin(&handle, event, rec.header.size);
6425
6426 if (ret)
6427 return;
6428
6429 perf_output_put(&handle, rec);
6430 perf_event__output_id_sample(event, &handle, &sample);
6431
6432 perf_output_end(&handle);
6433}
6434
/*
 * Generic event overflow handling, sampling.
 */
6439static int __perf_event_overflow(struct perf_event *event,
6440 int throttle, struct perf_sample_data *data,
6441 struct pt_regs *regs)
6442{
6443 int events = atomic_read(&event->event_limit);
6444 struct hw_perf_event *hwc = &event->hw;
6445 u64 seq;
6446 int ret = 0;
6447
	/*
	 * Non-sampling counters might still use the PMI to fold short
	 * hardware counters, ignore those and keep counting.
	 */
6452 if (unlikely(!is_sampling_event(event)))
6453 return 0;
6454
6455 seq = __this_cpu_read(perf_throttled_seq);
6456 if (seq != hwc->interrupts_seq) {
6457 hwc->interrupts_seq = seq;
6458 hwc->interrupts = 1;
6459 } else {
6460 hwc->interrupts++;
6461 if (unlikely(throttle
6462 && hwc->interrupts >= max_samples_per_tick)) {
6463 __this_cpu_inc(perf_throttled_count);
6464 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
6465 hwc->interrupts = MAX_INTERRUPTS;
6466 perf_log_throttle(event, 0);
6467 ret = 1;
6468 }
6469 }
6470
6471 if (event->attr.freq) {
6472 u64 now = perf_clock();
6473 s64 delta = now - hwc->freq_time_stamp;
6474
6475 hwc->freq_time_stamp = now;
6476
6477 if (delta > 0 && delta < 2*TICK_NSEC)
6478 perf_adjust_period(event, delta, hwc->last_period, true);
6479 }
6480
	/*
	 * XXX event_limit might not quite work as expected on inherited
	 * events
	 */
6486 event->pending_kill = POLL_IN;
6487 if (events && atomic_dec_and_test(&event->event_limit)) {
6488 ret = 1;
6489 event->pending_kill = POLL_HUP;
6490 event->pending_disable = 1;
6491 irq_work_queue(&event->pending);
6492 }
6493
6494 if (event->overflow_handler)
6495 event->overflow_handler(event, data, regs);
6496 else
6497 perf_event_output(event, data, regs);
6498
6499 if (*perf_event_fasync(event) && event->pending_kill) {
6500 event->pending_wakeup = 1;
6501 irq_work_queue(&event->pending);
6502 }
6503
6504 return ret;
6505}
6506
6507int perf_event_overflow(struct perf_event *event,
6508 struct perf_sample_data *data,
6509 struct pt_regs *regs)
6510{
6511 return __perf_event_overflow(event, 1, data, regs);
6512}
6513
/*
 * Generic software event infrastructure
 */
6518struct swevent_htable {
6519 struct swevent_hlist *swevent_hlist;
6520 struct mutex hlist_mutex;
6521 int hlist_refcount;
6522
	/* Recursion avoidance in each context */
6524 int recursion[PERF_NR_CONTEXTS];
6525};
6526
6527static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
6528
/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */
6536u64 perf_swevent_set_period(struct perf_event *event)
6537{
6538 struct hw_perf_event *hwc = &event->hw;
6539 u64 period = hwc->last_period;
6540 u64 nr, offset;
6541 s64 old, val;
6542
6543 hwc->last_period = hwc->sample_period;
6544
6545again:
6546 old = val = local64_read(&hwc->period_left);
6547 if (val < 0)
6548 return 0;
6549
6550 nr = div64_u64(period + val, period);
6551 offset = nr * period;
6552 val -= offset;
6553 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
6554 goto again;
6555
6556 return nr;
6557}
6558
6559static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
6560 struct perf_sample_data *data,
6561 struct pt_regs *regs)
6562{
6563 struct hw_perf_event *hwc = &event->hw;
6564 int throttle = 0;
6565
6566 if (!overflow)
6567 overflow = perf_swevent_set_period(event);
6568
6569 if (hwc->interrupts == MAX_INTERRUPTS)
6570 return;
6571
6572 for (; overflow; overflow--) {
6573 if (__perf_event_overflow(event, throttle,
6574 data, regs)) {
			/*
			 * We inhibit the overflow from happening when
			 * hwc->interrupts == MAX_INTERRUPTS.
			 */
6579 break;
6580 }
6581 throttle = 1;
6582 }
6583}
6584
6585static void perf_swevent_event(struct perf_event *event, u64 nr,
6586 struct perf_sample_data *data,
6587 struct pt_regs *regs)
6588{
6589 struct hw_perf_event *hwc = &event->hw;
6590
6591 local64_add(nr, &event->count);
6592
6593 if (!regs)
6594 return;
6595
6596 if (!is_sampling_event(event))
6597 return;
6598
6599 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
6600 data->period = nr;
6601 return perf_swevent_overflow(event, 1, data, regs);
6602 } else
6603 data->period = event->hw.last_period;
6604
6605 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
6606 return perf_swevent_overflow(event, 1, data, regs);
6607
6608 if (local64_add_negative(nr, &hwc->period_left))
6609 return;
6610
6611 perf_swevent_overflow(event, 0, data, regs);
6612}
6613
6614static int perf_exclude_event(struct perf_event *event,
6615 struct pt_regs *regs)
6616{
6617 if (event->hw.state & PERF_HES_STOPPED)
6618 return 1;
6619
6620 if (regs) {
6621 if (event->attr.exclude_user && user_mode(regs))
6622 return 1;
6623
6624 if (event->attr.exclude_kernel && !user_mode(regs))
6625 return 1;
6626 }
6627
6628 return 0;
6629}
6630
6631static int perf_swevent_match(struct perf_event *event,
6632 enum perf_type_id type,
6633 u32 event_id,
6634 struct perf_sample_data *data,
6635 struct pt_regs *regs)
6636{
6637 if (event->attr.type != type)
6638 return 0;
6639
6640 if (event->attr.config != event_id)
6641 return 0;
6642
6643 if (perf_exclude_event(event, regs))
6644 return 0;
6645
6646 return 1;
6647}
6648
6649static inline u64 swevent_hash(u64 type, u32 event_id)
6650{
6651 u64 val = event_id | (type << 32);
6652
6653 return hash_64(val, SWEVENT_HLIST_BITS);
6654}
6655
6656static inline struct hlist_head *
6657__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
6658{
6659 u64 hash = swevent_hash(type, event_id);
6660
6661 return &hlist->heads[hash];
6662}
6663
/* For the read side: events when they trigger */
6665static inline struct hlist_head *
6666find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
6667{
6668 struct swevent_hlist *hlist;
6669
6670 hlist = rcu_dereference(swhash->swevent_hlist);
6671 if (!hlist)
6672 return NULL;
6673
6674 return __find_swevent_head(hlist, type, event_id);
6675}
6676
/* For the event head insertion and removal in the hlist */
6678static inline struct hlist_head *
6679find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
6680{
6681 struct swevent_hlist *hlist;
6682 u32 event_id = event->attr.config;
6683 u64 type = event->attr.type;
6684
	/*
	 * Event scheduling is always serialized against hlist allocation
	 * and release. Which makes the protected version suitable here.
	 * The context lock guarantees that.
	 */
6690 hlist = rcu_dereference_protected(swhash->swevent_hlist,
6691 lockdep_is_held(&event->ctx->lock));
6692 if (!hlist)
6693 return NULL;
6694
6695 return __find_swevent_head(hlist, type, event_id);
6696}
6697
6698static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
6699 u64 nr,
6700 struct perf_sample_data *data,
6701 struct pt_regs *regs)
6702{
6703 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6704 struct perf_event *event;
6705 struct hlist_head *head;
6706
6707 rcu_read_lock();
6708 head = find_swevent_head_rcu(swhash, type, event_id);
6709 if (!head)
6710 goto end;
6711
6712 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6713 if (perf_swevent_match(event, type, event_id, data, regs))
6714 perf_swevent_event(event, nr, data, regs);
6715 }
6716end:
6717 rcu_read_unlock();
6718}
6719
6720DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6721
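/*
 * Software events can fire from NMI, hardirq, softirq and task context.
 * One recursion counter per context keeps a software event that fires
 * while another is being handled at the same level from recursing.
 */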
6722int perf_swevent_get_recursion_context(void)
6723{
6724 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6725
6726 return get_recursion_context(swhash->recursion);
6727}
6728EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
6729
6730inline void perf_swevent_put_recursion_context(int rctx)
6731{
6732 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6733
6734 put_recursion_context(swhash->recursion, rctx);
6735}
6736
6737void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6738{
6739 struct perf_sample_data data;
6740
6741 if (WARN_ON_ONCE(!regs))
6742 return;
6743
6744 perf_sample_data_init(&data, addr, 0);
6745 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6746}
6747
6748void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6749{
6750 int rctx;
6751
6752 preempt_disable_notrace();
6753 rctx = perf_swevent_get_recursion_context();
6754 if (unlikely(rctx < 0))
6755 goto fail;
6756
6757 ___perf_sw_event(event_id, nr, regs, addr);
6758
6759 perf_swevent_put_recursion_context(rctx);
6760fail:
6761 preempt_enable_notrace();
6762}
6763
6764static void perf_swevent_read(struct perf_event *event)
6765{
6766}
6767
6768static int perf_swevent_add(struct perf_event *event, int flags)
6769{
6770 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6771 struct hw_perf_event *hwc = &event->hw;
6772 struct hlist_head *head;
6773
6774 if (is_sampling_event(event)) {
6775 hwc->last_period = hwc->sample_period;
6776 perf_swevent_set_period(event);
6777 }
6778
6779 hwc->state = !(flags & PERF_EF_START);
6780
6781 head = find_swevent_head(swhash, event);
6782 if (WARN_ON_ONCE(!head))
6783 return -EINVAL;
6784
6785 hlist_add_head_rcu(&event->hlist_entry, head);
6786 perf_event_update_userpage(event);
6787
6788 return 0;
6789}
6790
6791static void perf_swevent_del(struct perf_event *event, int flags)
6792{
6793 hlist_del_rcu(&event->hlist_entry);
6794}
6795
6796static void perf_swevent_start(struct perf_event *event, int flags)
6797{
6798 event->hw.state = 0;
6799}
6800
6801static void perf_swevent_stop(struct perf_event *event, int flags)
6802{
6803 event->hw.state = PERF_HES_STOPPED;
6804}
6805
6806
6807static inline struct swevent_hlist *
6808swevent_hlist_deref(struct swevent_htable *swhash)
6809{
6810 return rcu_dereference_protected(swhash->swevent_hlist,
6811 lockdep_is_held(&swhash->hlist_mutex));
6812}
6813
6814static void swevent_hlist_release(struct swevent_htable *swhash)
6815{
6816 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
6817
6818 if (!hlist)
6819 return;
6820
6821 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
6822 kfree_rcu(hlist, rcu_head);
6823}
6824
6825static void swevent_hlist_put_cpu(int cpu)
6826{
6827 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6828
6829 mutex_lock(&swhash->hlist_mutex);
6830
6831 if (!--swhash->hlist_refcount)
6832 swevent_hlist_release(swhash);
6833
6834 mutex_unlock(&swhash->hlist_mutex);
6835}
6836
6837static void swevent_hlist_put(void)
6838{
6839 int cpu;
6840
6841 for_each_possible_cpu(cpu)
6842 swevent_hlist_put_cpu(cpu);
6843}
6844
6845static int swevent_hlist_get_cpu(int cpu)
6846{
6847 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6848 int err = 0;
6849
6850 mutex_lock(&swhash->hlist_mutex);
6851 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
6852 struct swevent_hlist *hlist;
6853
6854 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
6855 if (!hlist) {
6856 err = -ENOMEM;
6857 goto exit;
6858 }
6859 rcu_assign_pointer(swhash->swevent_hlist, hlist);
6860 }
6861 swhash->hlist_refcount++;
6862exit:
6863 mutex_unlock(&swhash->hlist_mutex);
6864
6865 return err;
6866}
6867
6868static int swevent_hlist_get(void)
6869{
6870 int err, cpu, failed_cpu;
6871
6872 get_online_cpus();
6873 for_each_possible_cpu(cpu) {
6874 err = swevent_hlist_get_cpu(cpu);
6875 if (err) {
6876 failed_cpu = cpu;
6877 goto fail;
6878 }
6879 }
6880 put_online_cpus();
6881
6882 return 0;
6883fail:
6884 for_each_possible_cpu(cpu) {
6885 if (cpu == failed_cpu)
6886 break;
6887 swevent_hlist_put_cpu(cpu);
6888 }
6889
6890 put_online_cpus();
6891 return err;
6892}
6893
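/*
 * One static key per software event id: enabled on the first user in
 * perf_swevent_init() and disabled again in sw_perf_event_destroy(),
 * so the hooks on the callers' side stay patched out while no such
 * event exists.
 */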
6894struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
6895
6896static void sw_perf_event_destroy(struct perf_event *event)
6897{
6898 u64 event_id = event->attr.config;
6899
6900 WARN_ON(event->parent);
6901
6902 static_key_slow_dec(&perf_swevent_enabled[event_id]);
6903 swevent_hlist_put();
6904}
6905
6906static int perf_swevent_init(struct perf_event *event)
6907{
6908 u64 event_id = event->attr.config;
6909
6910 if (event->attr.type != PERF_TYPE_SOFTWARE)
6911 return -ENOENT;
6912
	/*
	 * no branch sampling for software events
	 */
6916 if (has_branch_stack(event))
6917 return -EOPNOTSUPP;
6918
6919 switch (event_id) {
6920 case PERF_COUNT_SW_CPU_CLOCK:
6921 case PERF_COUNT_SW_TASK_CLOCK:
6922 return -ENOENT;
6923
6924 default:
6925 break;
6926 }
6927
6928 if (event_id >= PERF_COUNT_SW_MAX)
6929 return -ENOENT;
6930
6931 if (!event->parent) {
6932 int err;
6933
6934 err = swevent_hlist_get();
6935 if (err)
6936 return err;
6937
6938 static_key_slow_inc(&perf_swevent_enabled[event_id]);
6939 event->destroy = sw_perf_event_destroy;
6940 }
6941
6942 return 0;
6943}
6944
6945static struct pmu perf_swevent = {
6946 .task_ctx_nr = perf_sw_context,
6947
6948 .capabilities = PERF_PMU_CAP_NO_NMI,
6949
6950 .event_init = perf_swevent_init,
6951 .add = perf_swevent_add,
6952 .del = perf_swevent_del,
6953 .start = perf_swevent_start,
6954 .stop = perf_swevent_stop,
6955 .read = perf_swevent_read,
6956};
6957
6958#ifdef CONFIG_EVENT_TRACING
6959
6960static int perf_tp_filter_match(struct perf_event *event,
6961 struct perf_sample_data *data)
6962{
6963 void *record = data->raw->data;
6964
	/* only top level events have filters set */
6966 if (event->parent)
6967 event = event->parent;
6968
6969 if (likely(!event->filter) || filter_match_preds(event->filter, record))
6970 return 1;
6971 return 0;
6972}
6973
6974static int perf_tp_event_match(struct perf_event *event,
6975 struct perf_sample_data *data,
6976 struct pt_regs *regs)
6977{
6978 if (event->hw.state & PERF_HES_STOPPED)
6979 return 0;
6980
	/*
	 * All tracepoints are from kernel-space.
	 */
6983 if (event->attr.exclude_kernel)
6984 return 0;
6985
6986 if (!perf_tp_filter_match(event, data))
6987 return 0;
6988
6989 return 1;
6990}
6991
6992void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
6993 struct pt_regs *regs, struct hlist_head *head, int rctx,
6994 struct task_struct *task)
6995{
6996 struct perf_sample_data data;
6997 struct perf_event *event;
6998
6999 struct perf_raw_record raw = {
7000 .size = entry_size,
7001 .data = record,
7002 };
7003
7004 perf_sample_data_init(&data, addr, 0);
7005 data.raw = &raw;
7006
7007 hlist_for_each_entry_rcu(event, head, hlist_entry) {
7008 if (perf_tp_event_match(event, &data, regs))
7009 perf_swevent_event(event, count, &data, regs);
7010 }
7011
	/*
	 * If we got specified a target task, also iterate its context and
	 * deliver this event there too.
	 */
7016 if (task && task != current) {
7017 struct perf_event_context *ctx;
7018 struct trace_entry *entry = record;
7019
7020 rcu_read_lock();
7021 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
7022 if (!ctx)
7023 goto unlock;
7024
7025 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7026 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7027 continue;
7028 if (event->attr.config != entry->type)
7029 continue;
7030 if (perf_tp_event_match(event, &data, regs))
7031 perf_swevent_event(event, count, &data, regs);
7032 }
7033unlock:
7034 rcu_read_unlock();
7035 }
7036
7037 perf_swevent_put_recursion_context(rctx);
7038}
7039EXPORT_SYMBOL_GPL(perf_tp_event);
7040
7041static void tp_perf_event_destroy(struct perf_event *event)
7042{
7043 perf_trace_destroy(event);
7044}
7045
7046static int perf_tp_event_init(struct perf_event *event)
7047{
7048 int err;
7049
7050 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7051 return -ENOENT;
7052
	/*
	 * no branch sampling for tracepoint events
	 */
7056 if (has_branch_stack(event))
7057 return -EOPNOTSUPP;
7058
7059 err = perf_trace_init(event);
7060 if (err)
7061 return err;
7062
7063 event->destroy = tp_perf_event_destroy;
7064
7065 return 0;
7066}
7067
7068static struct pmu perf_tracepoint = {
7069 .task_ctx_nr = perf_sw_context,
7070
7071 .event_init = perf_tp_event_init,
7072 .add = perf_trace_add,
7073 .del = perf_trace_del,
7074 .start = perf_swevent_start,
7075 .stop = perf_swevent_stop,
7076 .read = perf_swevent_read,
7077};
7078
7079static inline void perf_tp_register(void)
7080{
7081 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
7082}
7083
7084static int perf_event_set_filter(struct perf_event *event, void __user *arg)
7085{
7086 char *filter_str;
7087 int ret;
7088
7089 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7090 return -EINVAL;
7091
7092 filter_str = strndup_user(arg, PAGE_SIZE);
7093 if (IS_ERR(filter_str))
7094 return PTR_ERR(filter_str);
7095
7096 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
7097
7098 kfree(filter_str);
7099 return ret;
7100}
7101
7102static void perf_event_free_filter(struct perf_event *event)
7103{
7104 ftrace_profile_free_filter(event);
7105}
7106
7107static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7108{
7109 struct bpf_prog *prog;
7110
7111 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7112 return -EINVAL;
7113
7114 if (event->tp_event->prog)
7115 return -EEXIST;
7116
7117 if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
		/* bpf programs can only be attached to u/kprobes */
7119 return -EINVAL;
7120
7121 prog = bpf_prog_get(prog_fd);
7122 if (IS_ERR(prog))
7123 return PTR_ERR(prog);
7124
7125 if (prog->type != BPF_PROG_TYPE_KPROBE) {
		/* valid fd, but invalid bpf program type */
7127 bpf_prog_put(prog);
7128 return -EINVAL;
7129 }
7130
7131 event->tp_event->prog = prog;
7132
7133 return 0;
7134}
7135
7136static void perf_event_free_bpf_prog(struct perf_event *event)
7137{
7138 struct bpf_prog *prog;
7139
7140 if (!event->tp_event)
7141 return;
7142
7143 prog = event->tp_event->prog;
7144 if (prog) {
7145 event->tp_event->prog = NULL;
7146 bpf_prog_put(prog);
7147 }
7148}
7149
7150#else
7151
7152static inline void perf_tp_register(void)
7153{
7154}
7155
7156static int perf_event_set_filter(struct perf_event *event, void __user *arg)
7157{
7158 return -ENOENT;
7159}
7160
7161static void perf_event_free_filter(struct perf_event *event)
7162{
7163}
7164
7165static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7166{
7167 return -ENOENT;
7168}
7169
7170static void perf_event_free_bpf_prog(struct perf_event *event)
7171{
7172}
7173#endif
7174
7175#ifdef CONFIG_HAVE_HW_BREAKPOINT
7176void perf_bp_event(struct perf_event *bp, void *data)
7177{
7178 struct perf_sample_data sample;
7179 struct pt_regs *regs = data;
7180
7181 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
7182
7183 if (!bp->hw.state && !perf_exclude_event(bp, regs))
7184 perf_swevent_event(bp, 1, &sample, regs);
7185}
7186#endif
7187
/*
 * hrtimer based swevent callback
 */
7192static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
7193{
7194 enum hrtimer_restart ret = HRTIMER_RESTART;
7195 struct perf_sample_data data;
7196 struct pt_regs *regs;
7197 struct perf_event *event;
7198 u64 period;
7199
7200 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
7201
7202 if (event->state != PERF_EVENT_STATE_ACTIVE)
7203 return HRTIMER_NORESTART;
7204
7205 event->pmu->read(event);
7206
7207 perf_sample_data_init(&data, 0, event->hw.last_period);
7208 regs = get_irq_regs();
7209
7210 if (regs && !perf_exclude_event(event, regs)) {
7211 if (!(event->attr.exclude_idle && is_idle_task(current)))
7212 if (__perf_event_overflow(event, 1, &data, regs))
7213 ret = HRTIMER_NORESTART;
7214 }
7215
7216 period = max_t(u64, 10000, event->hw.sample_period);
7217 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
7218
7219 return ret;
7220}
7221
7222static void perf_swevent_start_hrtimer(struct perf_event *event)
7223{
7224 struct hw_perf_event *hwc = &event->hw;
7225 s64 period;
7226
7227 if (!is_sampling_event(event))
7228 return;
7229
7230 period = local64_read(&hwc->period_left);
7231 if (period) {
7232 if (period < 0)
7233 period = 10000;
7234
7235 local64_set(&hwc->period_left, 0);
7236 } else {
7237 period = max_t(u64, 10000, hwc->sample_period);
7238 }
7239 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
7240 HRTIMER_MODE_REL_PINNED);
7241}
7242
7243static void perf_swevent_cancel_hrtimer(struct perf_event *event)
7244{
7245 struct hw_perf_event *hwc = &event->hw;
7246
7247 if (is_sampling_event(event)) {
7248 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
7249 local64_set(&hwc->period_left, ktime_to_ns(remaining));
7250
7251 hrtimer_cancel(&hwc->hrtimer);
7252 }
7253}
7254
7255static void perf_swevent_init_hrtimer(struct perf_event *event)
7256{
7257 struct hw_perf_event *hwc = &event->hw;
7258
7259 if (!is_sampling_event(event))
7260 return;
7261
7262 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
7263 hwc->hrtimer.function = perf_swevent_hrtimer;

	/*
	 * Since hrtimers have a fixed rate, we can do a static freq->period
	 * mapping and avoid the whole period adjust feedback stuff.
	 */
7269 if (event->attr.freq) {
7270 long freq = event->attr.sample_freq;
7271
7272 event->attr.sample_period = NSEC_PER_SEC / freq;
7273 hwc->sample_period = event->attr.sample_period;
7274 local64_set(&hwc->period_left, hwc->sample_period);
7275 hwc->last_period = hwc->sample_period;
7276 event->attr.freq = 0;
7277 }
7278}
7279
/*
 * Software event: cpu wall time clock
 */
7284static void cpu_clock_event_update(struct perf_event *event)
7285{
7286 s64 prev;
7287 u64 now;
7288
7289 now = local_clock();
7290 prev = local64_xchg(&event->hw.prev_count, now);
7291 local64_add(now - prev, &event->count);
7292}
7293
7294static void cpu_clock_event_start(struct perf_event *event, int flags)
7295{
7296 local64_set(&event->hw.prev_count, local_clock());
7297 perf_swevent_start_hrtimer(event);
7298}
7299
7300static void cpu_clock_event_stop(struct perf_event *event, int flags)
7301{
7302 perf_swevent_cancel_hrtimer(event);
7303 cpu_clock_event_update(event);
7304}
7305
7306static int cpu_clock_event_add(struct perf_event *event, int flags)
7307{
7308 if (flags & PERF_EF_START)
7309 cpu_clock_event_start(event, flags);
7310 perf_event_update_userpage(event);
7311
7312 return 0;
7313}
7314
7315static void cpu_clock_event_del(struct perf_event *event, int flags)
7316{
7317 cpu_clock_event_stop(event, flags);
7318}
7319
7320static void cpu_clock_event_read(struct perf_event *event)
7321{
7322 cpu_clock_event_update(event);
7323}
7324
7325static int cpu_clock_event_init(struct perf_event *event)
7326{
7327 if (event->attr.type != PERF_TYPE_SOFTWARE)
7328 return -ENOENT;
7329
7330 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
7331 return -ENOENT;
7332
	/*
	 * no branch sampling for software events
	 */
7336 if (has_branch_stack(event))
7337 return -EOPNOTSUPP;
7338
7339 perf_swevent_init_hrtimer(event);
7340
7341 return 0;
7342}
7343
7344static struct pmu perf_cpu_clock = {
7345 .task_ctx_nr = perf_sw_context,
7346
7347 .capabilities = PERF_PMU_CAP_NO_NMI,
7348
7349 .event_init = cpu_clock_event_init,
7350 .add = cpu_clock_event_add,
7351 .del = cpu_clock_event_del,
7352 .start = cpu_clock_event_start,
7353 .stop = cpu_clock_event_stop,
7354 .read = cpu_clock_event_read,
7355};
7356
/*
 * Software event: task time clock
 */
7361static void task_clock_event_update(struct perf_event *event, u64 now)
7362{
7363 u64 prev;
7364 s64 delta;
7365
7366 prev = local64_xchg(&event->hw.prev_count, now);
7367 delta = now - prev;
7368 local64_add(delta, &event->count);
7369}
7370
7371static void task_clock_event_start(struct perf_event *event, int flags)
7372{
7373 local64_set(&event->hw.prev_count, event->ctx->time);
7374 perf_swevent_start_hrtimer(event);
7375}
7376
7377static void task_clock_event_stop(struct perf_event *event, int flags)
7378{
7379 perf_swevent_cancel_hrtimer(event);
7380 task_clock_event_update(event, event->ctx->time);
7381}
7382
7383static int task_clock_event_add(struct perf_event *event, int flags)
7384{
7385 if (flags & PERF_EF_START)
7386 task_clock_event_start(event, flags);
7387 perf_event_update_userpage(event);
7388
7389 return 0;
7390}
7391
7392static void task_clock_event_del(struct perf_event *event, int flags)
7393{
7394 task_clock_event_stop(event, PERF_EF_UPDATE);
7395}
7396
7397static void task_clock_event_read(struct perf_event *event)
7398{
7399 u64 now = perf_clock();
7400 u64 delta = now - event->ctx->timestamp;
7401 u64 time = event->ctx->time + delta;
7402
7403 task_clock_event_update(event, time);
7404}
7405
7406static int task_clock_event_init(struct perf_event *event)
7407{
7408 if (event->attr.type != PERF_TYPE_SOFTWARE)
7409 return -ENOENT;
7410
7411 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
7412 return -ENOENT;
7413
	/*
	 * no branch sampling for software events
	 */
7417 if (has_branch_stack(event))
7418 return -EOPNOTSUPP;
7419
7420 perf_swevent_init_hrtimer(event);
7421
7422 return 0;
7423}
7424
7425static struct pmu perf_task_clock = {
7426 .task_ctx_nr = perf_sw_context,
7427
7428 .capabilities = PERF_PMU_CAP_NO_NMI,
7429
7430 .event_init = task_clock_event_init,
7431 .add = task_clock_event_add,
7432 .del = task_clock_event_del,
7433 .start = task_clock_event_start,
7434 .stop = task_clock_event_stop,
7435 .read = task_clock_event_read,
7436};
7437
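/*
 * Default callbacks, installed by perf_pmu_register() for PMUs that
 * leave the corresponding optional methods NULL.
 */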
7438static void perf_pmu_nop_void(struct pmu *pmu)
7439{
7440}
7441
7442static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
7443{
7444}
7445
7446static int perf_pmu_nop_int(struct pmu *pmu)
7447{
7448 return 0;
7449}
7450
7451static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
7452
7453static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
7454{
7455 __this_cpu_write(nop_txn_flags, flags);
7456
7457 if (flags & ~PERF_PMU_TXN_ADD)
7458 return;
7459
7460 perf_pmu_disable(pmu);
7461}
7462
7463static int perf_pmu_commit_txn(struct pmu *pmu)
7464{
7465 unsigned int flags = __this_cpu_read(nop_txn_flags);
7466
7467 __this_cpu_write(nop_txn_flags, 0);
7468
7469 if (flags & ~PERF_PMU_TXN_ADD)
7470 return 0;
7471
7472 perf_pmu_enable(pmu);
7473 return 0;
7474}
7475
7476static void perf_pmu_cancel_txn(struct pmu *pmu)
7477{
7478 unsigned int flags = __this_cpu_read(nop_txn_flags);
7479
7480 __this_cpu_write(nop_txn_flags, 0);
7481
7482 if (flags & ~PERF_PMU_TXN_ADD)
7483 return;
7484
7485 perf_pmu_enable(pmu);
7486}
7487
7488static int perf_event_idx_default(struct perf_event *event)
7489{
7490 return 0;
7491}
7492
/*
 * PMUs that share a task_ctx_nr also share their perf_cpu_context;
 * find an existing one to reuse.
 */
7497static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
7498{
7499 struct pmu *pmu;
7500
7501 if (ctxn < 0)
7502 return NULL;
7503
7504 list_for_each_entry(pmu, &pmus, entry) {
7505 if (pmu->task_ctx_nr == ctxn)
7506 return pmu->pmu_cpu_context;
7507 }
7508
7509 return NULL;
7510}
7511
7512static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
7513{
7514 int cpu;
7515
7516 for_each_possible_cpu(cpu) {
7517 struct perf_cpu_context *cpuctx;
7518
7519 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7520
7521 if (cpuctx->unique_pmu == old_pmu)
7522 cpuctx->unique_pmu = pmu;
7523 }
7524}
7525
7526static void free_pmu_context(struct pmu *pmu)
7527{
7528 struct pmu *i;
7529
7530 mutex_lock(&pmus_lock);
	/*
	 * Like a real lame refcount.
	 */
7534 list_for_each_entry(i, &pmus, entry) {
7535 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
7536 update_pmu_context(i, pmu);
7537 goto out;
7538 }
7539 }
7540
7541 free_percpu(pmu->pmu_cpu_context);
7542out:
7543 mutex_unlock(&pmus_lock);
7544}
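
/* Maps event attr.type numbers to their struct pmu, see perf_pmu_register(). */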
7545static struct idr pmu_idr;
7546
7547static ssize_t
7548type_show(struct device *dev, struct device_attribute *attr, char *page)
7549{
7550 struct pmu *pmu = dev_get_drvdata(dev);
7551
7552 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
7553}
7554static DEVICE_ATTR_RO(type);
7555
7556static ssize_t
7557perf_event_mux_interval_ms_show(struct device *dev,
7558 struct device_attribute *attr,
7559 char *page)
7560{
7561 struct pmu *pmu = dev_get_drvdata(dev);
7562
7563 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
7564}
7565
7566static DEFINE_MUTEX(mux_interval_mutex);
7567
7568static ssize_t
7569perf_event_mux_interval_ms_store(struct device *dev,
7570 struct device_attribute *attr,
7571 const char *buf, size_t count)
7572{
7573 struct pmu *pmu = dev_get_drvdata(dev);
7574 int timer, cpu, ret;
7575
7576 ret = kstrtoint(buf, 0, &timer);
7577 if (ret)
7578 return ret;
7579
7580 if (timer < 1)
7581 return -EINVAL;
7582
	/* same value, nothing to do */
7584 if (timer == pmu->hrtimer_interval_ms)
7585 return count;
7586
7587 mutex_lock(&mux_interval_mutex);
7588 pmu->hrtimer_interval_ms = timer;
7589
	/* update all cpuctx for this PMU */
7591 get_online_cpus();
7592 for_each_online_cpu(cpu) {
7593 struct perf_cpu_context *cpuctx;
7594 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7595 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
7596
7597 cpu_function_call(cpu,
7598 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
7599 }
7600 put_online_cpus();
7601 mutex_unlock(&mux_interval_mutex);
7602
7603 return count;
7604}
7605static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
7606
7607static struct attribute *pmu_dev_attrs[] = {
7608 &dev_attr_type.attr,
7609 &dev_attr_perf_event_mux_interval_ms.attr,
7610 NULL,
7611};
7612ATTRIBUTE_GROUPS(pmu_dev);
7613
7614static int pmu_bus_running;
7615static struct bus_type pmu_bus = {
7616 .name = "event_source",
7617 .dev_groups = pmu_dev_groups,
7618};
7619
7620static void pmu_dev_release(struct device *dev)
7621{
7622 kfree(dev);
7623}
7624
7625static int pmu_dev_alloc(struct pmu *pmu)
7626{
7627 int ret = -ENOMEM;
7628
7629 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
7630 if (!pmu->dev)
7631 goto out;
7632
7633 pmu->dev->groups = pmu->attr_groups;
7634 device_initialize(pmu->dev);
7635 ret = dev_set_name(pmu->dev, "%s", pmu->name);
7636 if (ret)
7637 goto free_dev;
7638
7639 dev_set_drvdata(pmu->dev, pmu);
7640 pmu->dev->bus = &pmu_bus;
7641 pmu->dev->release = pmu_dev_release;
7642 ret = device_add(pmu->dev);
7643 if (ret)
7644 goto free_dev;
7645
7646out:
7647 return ret;
7648
7649free_dev:
7650 put_device(pmu->dev);
7651 goto out;
7652}
7653
7654static struct lock_class_key cpuctx_mutex;
7655static struct lock_class_key cpuctx_lock;
7656
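/*
 * Register a new PMU.  A negative @type requests a dynamically
 * allocated type number; a non-NULL @name additionally exposes the
 * PMU under /sys/bus/event_source/devices/<name> once the bus is up.
 * A driver would typically call (sketch, "my_pmu" being hypothetical):
 *
 *	ret = perf_pmu_register(&my_pmu, "my_pmu", -1);
 */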
7657int perf_pmu_register(struct pmu *pmu, const char *name, int type)
7658{
7659 int cpu, ret;
7660
7661 mutex_lock(&pmus_lock);
7662 ret = -ENOMEM;
7663 pmu->pmu_disable_count = alloc_percpu(int);
7664 if (!pmu->pmu_disable_count)
7665 goto unlock;
7666
7667 pmu->type = -1;
7668 if (!name)
7669 goto skip_type;
7670 pmu->name = name;
7671
7672 if (type < 0) {
7673 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
7674 if (type < 0) {
7675 ret = type;
7676 goto free_pdc;
7677 }
7678 }
7679 pmu->type = type;
7680
7681 if (pmu_bus_running) {
7682 ret = pmu_dev_alloc(pmu);
7683 if (ret)
7684 goto free_idr;
7685 }
7686
7687skip_type:
7688 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
7689 if (pmu->pmu_cpu_context)
7690 goto got_cpu_context;
7691
7692 ret = -ENOMEM;
7693 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
7694 if (!pmu->pmu_cpu_context)
7695 goto free_dev;
7696
7697 for_each_possible_cpu(cpu) {
7698 struct perf_cpu_context *cpuctx;
7699
7700 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7701 __perf_event_init_context(&cpuctx->ctx);
7702 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
7703 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
7704 cpuctx->ctx.pmu = pmu;
7705
7706 __perf_mux_hrtimer_init(cpuctx, cpu);
7707
7708 cpuctx->unique_pmu = pmu;
7709 }
7710
7711got_cpu_context:
7712 if (!pmu->start_txn) {
7713 if (pmu->pmu_enable) {
			/*
			 * If we have pmu_enable/pmu_disable calls, install
			 * transaction stubs that use that to try and batch
			 * hardware accesses.
			 */
7719 pmu->start_txn = perf_pmu_start_txn;
7720 pmu->commit_txn = perf_pmu_commit_txn;
7721 pmu->cancel_txn = perf_pmu_cancel_txn;
7722 } else {
7723 pmu->start_txn = perf_pmu_nop_txn;
7724 pmu->commit_txn = perf_pmu_nop_int;
7725 pmu->cancel_txn = perf_pmu_nop_void;
7726 }
7727 }
7728
7729 if (!pmu->pmu_enable) {
7730 pmu->pmu_enable = perf_pmu_nop_void;
7731 pmu->pmu_disable = perf_pmu_nop_void;
7732 }
7733
7734 if (!pmu->event_idx)
7735 pmu->event_idx = perf_event_idx_default;
7736
7737 list_add_rcu(&pmu->entry, &pmus);
7738 atomic_set(&pmu->exclusive_cnt, 0);
7739 ret = 0;
7740unlock:
7741 mutex_unlock(&pmus_lock);
7742
7743 return ret;
7744
7745free_dev:
7746 device_del(pmu->dev);
7747 put_device(pmu->dev);
7748
7749free_idr:
7750 if (pmu->type >= PERF_TYPE_MAX)
7751 idr_remove(&pmu_idr, pmu->type);
7752
7753free_pdc:
7754 free_percpu(pmu->pmu_disable_count);
7755 goto unlock;
7756}
7757EXPORT_SYMBOL_GPL(perf_pmu_register);
7758
void perf_pmu_unregister(struct pmu *pmu)
{
	mutex_lock(&pmus_lock);
	list_del_rcu(&pmu->entry);
	mutex_unlock(&pmus_lock);

	/*
	 * We dereference the pmu list under both SRCU and regular RCU, so
	 * synchronize against both of those.
	 */
	synchronize_srcu(&pmus_srcu);
	synchronize_rcu();

	free_percpu(pmu->pmu_disable_count);
	if (pmu->type >= PERF_TYPE_MAX)
		idr_remove(&pmu_idr, pmu->type);
	device_del(pmu->dev);
	put_device(pmu->dev);
	free_pmu_context(pmu);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);

static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
	struct perf_event_context *ctx = NULL;
	int ret;

	if (!try_module_get(pmu->module))
		return -ENODEV;

	if (event->group_leader != event) {
		/*
		 * This ctx->mutex can nest when we're called through
		 * inheritance. See the perf_event_ctx_lock_nested() comment.
		 */
		ctx = perf_event_ctx_lock_nested(event->group_leader,
						 SINGLE_DEPTH_NESTING);
		BUG_ON(!ctx);
	}

	event->pmu = pmu;
	ret = pmu->event_init(event);

	if (ctx)
		perf_event_ctx_unlock(event->group_leader, ctx);

	if (ret)
		module_put(pmu->module);

	return ret;
}

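/*
 * Find a PMU willing to accept @event: first try a direct idr lookup on
 * attr.type, then fall back to iterating the registered PMUs until one
 * stops returning -ENOENT.
 */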
static struct pmu *perf_init_event(struct perf_event *event)
{
	struct pmu *pmu = NULL;
	int idx;
	int ret;

	idx = srcu_read_lock(&pmus_srcu);

	rcu_read_lock();
	pmu = idr_find(&pmu_idr, event->attr.type);
	rcu_read_unlock();
	if (pmu) {
		ret = perf_try_init_event(pmu, event);
		if (ret)
			pmu = ERR_PTR(ret);
		goto unlock;
	}

	list_for_each_entry_rcu(pmu, &pmus, entry) {
		ret = perf_try_init_event(pmu, event);
		if (!ret)
			goto unlock;

		if (ret != -ENOENT) {
			pmu = ERR_PTR(ret);
			goto unlock;
		}
	}
	pmu = ERR_PTR(-ENOENT);
unlock:
	srcu_read_unlock(&pmus_srcu, idx);

	return pmu;
}

static void account_event_cpu(struct perf_event *event, int cpu)
{
	if (event->parent)
		return;

	if (is_cgroup_event(event))
		atomic_inc(&per_cpu(perf_cgroup_events, cpu));
}

/* Freq events need the tick to stay alive (see perf_event_task_tick). */
static void account_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
	/* Lock so we don't race with concurrent enables */
	spin_lock(&nr_freq_lock);
	if (atomic_inc_return(&nr_freq_events) == 1)
		tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
	spin_unlock(&nr_freq_lock);
#endif
}

static void account_freq_event(void)
{
	if (tick_nohz_full_enabled())
		account_freq_event_nohz();
	else
		atomic_inc(&nr_freq_events);
}

static void account_event(struct perf_event *event)
{
	bool inc = false;

	if (event->parent)
		return;

	if (event->attach_state & PERF_ATTACH_TASK)
		inc = true;
	if (event->attr.mmap || event->attr.mmap_data)
		atomic_inc(&nr_mmap_events);
	if (event->attr.comm)
		atomic_inc(&nr_comm_events);
	if (event->attr.task)
		atomic_inc(&nr_task_events);
	if (event->attr.freq)
		account_freq_event();
	if (event->attr.context_switch) {
		atomic_inc(&nr_switch_events);
		inc = true;
	}
	if (has_branch_stack(event))
		inc = true;
	if (is_cgroup_event(event))
		inc = true;

	if (inc) {
		if (atomic_inc_not_zero(&perf_sched_count))
			goto enabled;

		mutex_lock(&perf_sched_mutex);
		if (!atomic_read(&perf_sched_count)) {
			static_branch_enable(&perf_sched_events);
			/*
			 * Guarantee that all CPUs observe the key change and
			 * call the perf scheduling hooks before proceeding to
			 * install events that need them.
			 */
			synchronize_sched();
		}
		/*
		 * Now that we have waited for the sync_sched(), allow further
		 * increments to by-pass the mutex.
		 */
		atomic_inc(&perf_sched_count);
		mutex_unlock(&perf_sched_mutex);
	}
enabled:

	account_event_cpu(event, event->cpu);
}

/*
 * Allocate and initialize an event structure.
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
		 struct task_struct *task,
		 struct perf_event *group_leader,
		 struct perf_event *parent_event,
		 perf_overflow_handler_t overflow_handler,
		 void *context, int cgroup_fd)
{
	struct pmu *pmu;
	struct perf_event *event;
	struct hw_perf_event *hwc;
	long err = -EINVAL;

	if ((unsigned)cpu >= nr_cpu_ids) {
		if (!task || cpu != -1)
			return ERR_PTR(-EINVAL);
	}

	event = kzalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return ERR_PTR(-ENOMEM);

	/*
	 * Single events are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = event;

	mutex_init(&event->child_mutex);
	INIT_LIST_HEAD(&event->child_list);

	INIT_LIST_HEAD(&event->group_entry);
	INIT_LIST_HEAD(&event->event_entry);
	INIT_LIST_HEAD(&event->sibling_list);
	INIT_LIST_HEAD(&event->rb_entry);
	INIT_LIST_HEAD(&event->active_entry);
	INIT_HLIST_NODE(&event->hlist_entry);

	init_waitqueue_head(&event->waitq);
	init_irq_work(&event->pending, perf_pending_event);

	mutex_init(&event->mmap_mutex);

	atomic_long_set(&event->refcount, 1);
	event->cpu = cpu;
	event->attr = *attr;
	event->group_leader = group_leader;
	event->pmu = NULL;
	event->oncpu = -1;

	event->parent = parent_event;

	event->ns = get_pid_ns(task_active_pid_ns(current));
	event->id = atomic64_inc_return(&perf_event_id);

	event->state = PERF_EVENT_STATE_INACTIVE;

	if (task) {
		event->attach_state = PERF_ATTACH_TASK;
		/*
		 * XXX pmu::event_init needs to know what task to account to
		 * and we cannot use the ctx information because we need the
		 * pmu before we get a ctx.
		 */
		event->hw.target = task;
	}

	event->clock = &local_clock;
	if (parent_event)
		event->clock = parent_event->clock;

	if (!overflow_handler && parent_event) {
		overflow_handler = parent_event->overflow_handler;
		context = parent_event->overflow_handler_context;
	}

	event->overflow_handler = overflow_handler;
	event->overflow_handler_context = context;

	perf_event__state_init(event);

	pmu = NULL;

	hwc = &event->hw;
	hwc->sample_period = attr->sample_period;
	if (attr->freq && attr->sample_freq)
		hwc->sample_period = 1;
	hwc->last_period = hwc->sample_period;

	local64_set(&hwc->period_left, hwc->sample_period);

	/*
	 * We currently do not support PERF_FORMAT_GROUP on inherited events.
	 */
	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
		goto err_ns;

	if (!has_branch_stack(event))
		event->attr.branch_sample_type = 0;

	if (cgroup_fd != -1) {
		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
		if (err)
			goto err_ns;
	}

	pmu = perf_init_event(event);
	if (!pmu)
		goto err_ns;
	else if (IS_ERR(pmu)) {
		err = PTR_ERR(pmu);
		goto err_ns;
	}

	err = exclusive_event_init(event);
	if (err)
		goto err_pmu;

	if (!event->parent) {
		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
			err = get_callchain_buffers();
			if (err)
				goto err_per_task;
		}
	}

	/* symmetric to unaccount_event() in _free_event() */
	account_event(event);

	return event;

err_per_task:
	exclusive_event_destroy(event);

err_pmu:
	if (event->destroy)
		event->destroy(event);
	module_put(pmu->module);
err_ns:
	if (is_cgroup_event(event))
		perf_detach_cgroup(event);
	if (event->ns)
		put_pid_ns(event->ns);
	kfree(event);

	return ERR_PTR(err);
}

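/*
 * Copy a perf_event_attr from userspace, dealing with ABI size differences:
 * a larger struct from newer userspace is accepted as long as all bytes the
 * kernel does not know about are zero.
 */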
static int perf_copy_attr(struct perf_event_attr __user *uattr,
			  struct perf_event_attr *attr)
{
	u32 size;
	int ret;

	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
		return -EFAULT;

	/*
	 * zero the full structure, so that a short copy will be nice.
	 */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	if (size > PAGE_SIZE)	/* silly large */
		goto err_size;

	if (!size)		/* abi compat */
		size = PERF_ATTR_SIZE_VER0;

	if (size < PERF_ATTR_SIZE_VER0)
		goto err_size;

	/*
	 * If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(*attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(*attr);
		end  = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			ret = get_user(val, addr);
			if (ret)
				return ret;
			if (val)
				goto err_size;
		}
		size = sizeof(*attr);
	}

	ret = copy_from_user(attr, uattr, size);
	if (ret)
		return -EFAULT;

	if (attr->__reserved_1)
		return -EINVAL;

	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
		return -EINVAL;

	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
		return -EINVAL;

	if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
		u64 mask = attr->branch_sample_type;

		/* only using defined bits */
		if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
			return -EINVAL;

		/* at least one branch bit must be set */
		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
			return -EINVAL;

		/* propagate priv level, when not set for branch */
		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {

			/* exclude_kernel checked on syscall entry */
			if (!attr->exclude_kernel)
				mask |= PERF_SAMPLE_BRANCH_KERNEL;

			if (!attr->exclude_user)
				mask |= PERF_SAMPLE_BRANCH_USER;

			if (!attr->exclude_hv)
				mask |= PERF_SAMPLE_BRANCH_HV;
			/*
			 * adjust user setting (for HW filter setup)
			 */
			attr->branch_sample_type = mask;
		}

		/* privilege levels cannot be higher than the event's */
		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
		ret = perf_reg_validate(attr->sample_regs_user);
		if (ret)
			return ret;
	}

	if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
		if (!arch_perf_have_user_stack_dump())
			return -ENOSYS;

		/*
		 * The dump size is __u32, but it must stay below
		 * USHRT_MAX and be a multiple of u64 alignment.
		 */
		if (attr->sample_stack_user >= USHRT_MAX)
			ret = -EINVAL;
		else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
			ret = -EINVAL;
	}

	if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
		ret = perf_reg_validate(attr->sample_regs_intr);
out:
	return ret;

err_size:
	put_user(sizeof(*attr), &uattr->size);
	ret = -E2BIG;
	goto out;
}

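/*
 * Redirect @event's output into @output_event's ring buffer, subject to
 * the compatibility checks below; a NULL @output_event detaches the buffer.
 */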
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
	struct ring_buffer *rb = NULL;
	int ret = -EINVAL;

	if (!output_event)
		goto set;

	/* don't allow circular references */
	if (event == output_event)
		goto out;

	/*
	 * Don't allow cross-cpu buffers
	 */
	if (output_event->cpu != event->cpu)
		goto out;

	/*
	 * If its not a per-cpu rb, it must be the same task.
	 */
	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
		goto out;

	/*
	 * Mixing clocks in the same buffer is trouble you don't need.
	 */
	if (output_event->clock != event->clock)
		goto out;

	/*
	 * If both events generate aux data, they must be on the same PMU
	 */
	if (has_aux(event) && has_aux(output_event) &&
	    event->pmu != output_event->pmu)
		goto out;

set:
	mutex_lock(&event->mmap_mutex);
	/* Can't redirect output if we've got an active mmap() */
	if (atomic_read(&event->mmap_count))
		goto unlock;

	if (output_event) {
		/* get the rb we want to redirect to */
		rb = ring_buffer_get(output_event);
		if (!rb)
			goto unlock;
	}

	ring_buffer_attach(event, rb);

	ret = 0;
unlock:
	mutex_unlock(&event->mmap_mutex);

out:
	return ret;
}

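/*
 * Lock two mutexes in address order so that every caller acquires them in
 * the same order, avoiding ABBA deadlocks.
 */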
static void mutex_lock_double(struct mutex *a, struct mutex *b)
{
	if (b < a)
		swap(a, b);

	mutex_lock(a);
	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}

static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
	bool nmi_safe = false;

	switch (clk_id) {
	case CLOCK_MONOTONIC:
		event->clock = &ktime_get_mono_fast_ns;
		nmi_safe = true;
		break;

	case CLOCK_MONOTONIC_RAW:
		event->clock = &ktime_get_raw_fast_ns;
		nmi_safe = true;
		break;

	case CLOCK_REALTIME:
		event->clock = &ktime_get_real_ns;
		break;

	case CLOCK_BOOTTIME:
		event->clock = &ktime_get_boot_ns;
		break;

	case CLOCK_TAI:
		event->clock = &ktime_get_tai_ns;
		break;

	default:
		return -EINVAL;
	}

	if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
		return -EINVAL;

	return 0;
}

/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:	target pid
 * @cpu:	target cpu
 * @group_fd:	group leader event fd
 */
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_event *group_leader = NULL, *output_event = NULL;
	struct perf_event *event, *sibling;
	struct perf_event_attr attr;
	struct perf_event_context *ctx, *uninitialized_var(gctx);
	struct file *event_file = NULL;
	struct fd group = {NULL, 0};
	struct task_struct *task = NULL;
	struct pmu *pmu;
	int event_fd;
	int move_group = 0;
	int err;
	int f_flags = O_RDWR;
	int cgroup_fd = -1;

	/* for future expandability... */
	if (flags & ~PERF_FLAG_ALL)
		return -EINVAL;

	err = perf_copy_attr(attr_uptr, &attr);
	if (err)
		return err;

	if (!attr.exclude_kernel) {
		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	if (attr.freq) {
		if (attr.sample_freq > sysctl_perf_event_sample_rate)
			return -EINVAL;
	} else {
		if (attr.sample_period & (1ULL << 63))
			return -EINVAL;
	}

	/*
	 * In cgroup mode, the pid argument is used to pass the fd
	 * opened to the cgroup directory in cgroupfs. The cpu argument
	 * designates the cpu on which to monitor threads from that
	 * cgroup.
	 */
	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
		return -EINVAL;

	if (flags & PERF_FLAG_FD_CLOEXEC)
		f_flags |= O_CLOEXEC;

	event_fd = get_unused_fd_flags(f_flags);
	if (event_fd < 0)
		return event_fd;

	if (group_fd != -1) {
		err = perf_fget_light(group_fd, &group);
		if (err)
			goto err_fd;
		group_leader = group.file->private_data;
		if (flags & PERF_FLAG_FD_OUTPUT)
			output_event = group_leader;
		if (flags & PERF_FLAG_FD_NO_GROUP)
			group_leader = NULL;
	}

	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
		task = find_lively_task_by_vpid(pid);
		if (IS_ERR(task)) {
			err = PTR_ERR(task);
			goto err_group_fd;
		}
	}

	if (task && group_leader &&
	    group_leader->attr.inherit != attr.inherit) {
		err = -EINVAL;
		goto err_task;
	}

	get_online_cpus();

	if (task) {
		err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
		if (err)
			goto err_cpus;

		/*
		 * Reuse ptrace permission checks for now.
		 *
		 * We must hold cred_guard_mutex across this and any potential
		 * perf_install_in_context() call for this new event to
		 * serialize against exec() altering our credentials (and the
		 * perf_event_exit_task() that could imply).
		 */
		err = -EACCES;
		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
			goto err_cred;
	}

	if (flags & PERF_FLAG_PID_CGROUP)
		cgroup_fd = pid;

	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
				 NULL, NULL, cgroup_fd);
	if (IS_ERR(event)) {
		err = PTR_ERR(event);
		goto err_cred;
	}

	if (is_sampling_event(event)) {
		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
			err = -ENOTSUPP;
			goto err_alloc;
		}
	}

	/*
	 * Special case software events and allow them to be part of
	 * any hardware group.
	 */
	pmu = event->pmu;

	if (attr.use_clockid) {
		err = perf_event_set_clock(event, attr.clockid);
		if (err)
			goto err_alloc;
	}

	if (group_leader &&
	    (is_software_event(event) != is_software_event(group_leader))) {
		if (is_software_event(event)) {
			/*
			 * If event and group_leader are not both a software
			 * event, and event is, then group leader is not.
			 *
			 * Allow the addition of software events to !software
			 * groups, this is safe because software events never
			 * fail to schedule.
			 */
			pmu = group_leader->pmu;
		} else if (is_software_event(group_leader) &&
			   (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
			/*
			 * In case the group is a pure software group, and we
			 * try to add a hardware event, move the whole group to
			 * the hardware context.
			 */
			move_group = 1;
		}
	}

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pmu, task, event);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto err_alloc;
	}

	if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
		err = -EBUSY;
		goto err_context;
	}

	/*
	 * Look up the group leader (we will attach this event to it):
	 */
	if (group_leader) {
		err = -EINVAL;

		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_context;

		/* All events in a group should have the same clock */
		if (group_leader->clock != event->clock)
			goto err_context;

		/*
		 * Do not allow to attach to a group in a different task
		 * or CPU context:
		 */
		if (move_group) {
			/*
			 * Make sure we're both on the same task, or both
			 * per-cpu events.
			 */
			if (group_leader->ctx->task != ctx->task)
				goto err_context;

			/*
			 * Make sure we're both events for the same CPU;
			 * grouping events for different CPUs is broken; since
			 * you can never concurrently schedule them anyhow.
			 */
			if (group_leader->cpu != event->cpu)
				goto err_context;
		} else {
			if (group_leader->ctx != ctx)
				goto err_context;
		}

		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (attr.exclusive || attr.pinned)
			goto err_context;
	}

	if (output_event) {
		err = perf_event_set_output(event, output_event);
		if (err)
			goto err_context;
	}

	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
					f_flags);
	if (IS_ERR(event_file)) {
		err = PTR_ERR(event_file);
		event_file = NULL;
		goto err_context;
	}

	if (move_group) {
		gctx = group_leader->ctx;
		mutex_lock_double(&gctx->mutex, &ctx->mutex);
		if (gctx->task == TASK_TOMBSTONE) {
			err = -ESRCH;
			goto err_locked;
		}
	} else {
		mutex_lock(&ctx->mutex);
	}

	if (ctx->task == TASK_TOMBSTONE) {
		err = -ESRCH;
		goto err_locked;
	}

	if (!perf_event_validate_size(event)) {
		err = -E2BIG;
		goto err_locked;
	}

	/*
	 * Must be under the same ctx::mutex as perf_install_in_context(),
	 * because we need to serialize with concurrent event creation.
	 */
	if (!exclusive_event_installable(event, ctx)) {
		/* exclusive and group stuff are assumed mutually exclusive */
		WARN_ON_ONCE(move_group);

		err = -EBUSY;
		goto err_locked;
	}

	WARN_ON_ONCE(ctx->parent_ctx);

	/*
	 * This is the point of no return; we cannot fail hereafter. This is
	 * where we start modifying current state.
	 */

	if (move_group) {
		/*
		 * See perf_event_ctx_lock() for comments on the details
		 * of swizzling perf_event::ctx.
		 */
		perf_remove_from_context(group_leader, 0);

		list_for_each_entry(sibling, &group_leader->sibling_list,
				    group_entry) {
			perf_remove_from_context(sibling, 0);
			put_ctx(gctx);
		}

		/*
		 * Wait for everybody to stop referencing the events through
		 * the old lists, before installing it on new lists.
		 */
		synchronize_rcu();

		/*
		 * Install the group siblings before the group leader.
		 *
		 * Because a group leader will try and install the entire group
		 * (through the sibling list, which is still in-tact), we can
		 * end up with siblings installed in the wrong context.
		 *
		 * By installing siblings first we NO-OP because they're not
		 * reachable through the group lists.
		 */
		list_for_each_entry(sibling, &group_leader->sibling_list,
				    group_entry) {
			perf_event__state_init(sibling);
			perf_install_in_context(ctx, sibling, sibling->cpu);
			get_ctx(ctx);
		}

		/*
		 * Removing from the context ends up with disabled
		 * event. What we want here is event in the initial
		 * startup state, ready to be added into new context.
		 */
		perf_event__state_init(group_leader);
		perf_install_in_context(ctx, group_leader, group_leader->cpu);
		get_ctx(ctx);

		/*
		 * Now that all events are installed in @ctx, nothing
		 * references @gctx anymore, so drop the last reference.
		 */
		put_ctx(gctx);
	}

	/*
	 * Precalculate sample_data sizes; do while holding ctx::mutex such
	 * that we're serialized against further additions and before
	 * perf_install_in_context() which is the point the event is active and
	 * can use these values.
	 */
	perf_event__header_size(event);
	perf_event__id_header_size(event);

	event->owner = current;

	perf_install_in_context(ctx, event, event->cpu);
	perf_unpin_context(ctx);

	if (move_group)
		mutex_unlock(&gctx->mutex);
	mutex_unlock(&ctx->mutex);

	if (task) {
		mutex_unlock(&task->signal->cred_guard_mutex);
		put_task_struct(task);
	}

	put_online_cpus();

	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

	/*
	 * Drop the reference on the group_event after placing the
	 * new event on the sibling_list. This ensures destruction
	 * of the group leader will find the pointer to itself in
	 * perf_group_attach().
	 */
	fdput(group);
	fd_install(event_fd, event_file);
	return event_fd;

err_locked:
	if (move_group)
		mutex_unlock(&gctx->mutex);
	mutex_unlock(&ctx->mutex);

	fput(event_file);
err_context:
	perf_unpin_context(ctx);
	put_ctx(ctx);
err_alloc:
	/*
	 * If event_file is set, the fput() above will have called ->release()
	 * and that will take care of freeing the event.
	 */
	if (!event_file)
		free_event(event);
err_cred:
	if (task)
		mutex_unlock(&task->signal->cred_guard_mutex);
err_cpus:
	put_online_cpus();
err_task:
	if (task)
		put_task_struct(task);
err_group_fd:
	fdput(group);
err_fd:
	put_unused_fd(event_fd);
	return err;
}

/**
 * perf_event_create_kernel_counter
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu in which the counter is bound
 * @task: task to profile (NULL for percpu)
 */
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
				 struct task_struct *task,
				 perf_overflow_handler_t overflow_handler,
				 void *context)
{
	struct perf_event_context *ctx;
	struct perf_event *event;
	int err;

	/*
	 * Get the target context (task or percpu):
	 */
	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
				 overflow_handler, context, -1);
	if (IS_ERR(event)) {
		err = PTR_ERR(event);
		goto err;
	}

	/* Mark owner so we could distinguish it from user events. */
	event->owner = TASK_TOMBSTONE;

	ctx = find_get_context(event->pmu, task, event);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto err_free;
	}

	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	if (ctx->task == TASK_TOMBSTONE) {
		err = -ESRCH;
		goto err_unlock;
	}

	if (!exclusive_event_installable(event, ctx)) {
		err = -EBUSY;
		goto err_unlock;
	}

	perf_install_in_context(ctx, event, cpu);
	perf_unpin_context(ctx);
	mutex_unlock(&ctx->mutex);

	return event;

err_unlock:
	mutex_unlock(&ctx->mutex);
	perf_unpin_context(ctx);
	put_ctx(ctx);
err_free:
	free_event(event);
err:
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);

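/*
 * Migrate all events on @pmu from @src_cpu's context over to @dst_cpu's
 * context, preserving group relationships.
 */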
void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
{
	struct perf_event_context *src_ctx;
	struct perf_event_context *dst_ctx;
	struct perf_event *event, *tmp;
	LIST_HEAD(events);

	src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
	dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;

	/*
	 * See perf_event_ctx_lock() for comments on the details
	 * of swizzling perf_event::ctx.
	 */
	mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
	list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
				 event_entry) {
		perf_remove_from_context(event, 0);
		unaccount_event_cpu(event, src_cpu);
		put_ctx(src_ctx);
		list_add(&event->migrate_entry, &events);
	}

	/*
	 * Wait for the events to quiesce before re-instating them.
	 */
	synchronize_rcu();

	/*
	 * Re-instate events in 2 passes.
	 *
	 * Skip over group leaders and only install siblings on this first
	 * pass, siblings will not get enabled without a leader, however a
	 * leader will enable its siblings, even if those are still on the old
	 * context.
	 */
	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
		if (event->group_leader == event)
			continue;

		list_del(&event->migrate_entry);
		if (event->state >= PERF_EVENT_STATE_OFF)
			event->state = PERF_EVENT_STATE_INACTIVE;
		account_event_cpu(event, dst_cpu);
		perf_install_in_context(dst_ctx, event, dst_cpu);
		get_ctx(dst_ctx);
	}

	/*
	 * Once all the siblings are setup properly, install the group leaders
	 * to make it go.
	 */
	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
		list_del(&event->migrate_entry);
		if (event->state >= PERF_EVENT_STATE_OFF)
			event->state = PERF_EVENT_STATE_INACTIVE;
		account_event_cpu(event, dst_cpu);
		perf_install_in_context(dst_ctx, event, dst_cpu);
		get_ctx(dst_ctx);
	}
	mutex_unlock(&dst_ctx->mutex);
	mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);

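/*
 * Fold the counts and times of an exiting child event back into its parent.
 */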
static void sync_child_event(struct perf_event *child_event,
			     struct task_struct *child)
{
	struct perf_event *parent_event = child_event->parent;
	u64 child_val;

	if (child_event->attr.inherit_stat)
		perf_event_read_event(child_event, child);

	child_val = perf_event_count(child_event);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_event->child_count);
	atomic64_add(child_event->total_time_enabled,
		     &parent_event->child_total_time_enabled);
	atomic64_add(child_event->total_time_running,
		     &parent_event->child_total_time_running);
}

static void
perf_event_exit_event(struct perf_event *child_event,
		      struct perf_event_context *child_ctx,
		      struct task_struct *child)
{
	struct perf_event *parent_event = child_event->parent;

	/*
	 * Do not destroy the 'original' grouping; because of the context
	 * switch optimization the original events could've ended up in a
	 * random child task.
	 *
	 * If we were to destroy the original group, all group related
	 * operations would cease to function properly after this random
	 * child dies.
	 *
	 * Do destroy all inherited groups, we don't care about those
	 * and being thorough is better.
	 */
	raw_spin_lock_irq(&child_ctx->lock);
	WARN_ON_ONCE(child_ctx->is_active);

	if (parent_event)
		perf_group_detach(child_event);
	list_del_event(child_event, child_ctx);
	child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
	raw_spin_unlock_irq(&child_ctx->lock);

	/*
	 * Parent events are governed by their filedesc, retain them.
	 */
	if (!parent_event) {
		perf_event_wakeup(child_event);
		return;
	}
	/*
	 * Child events can be cleaned up.
	 */

	sync_child_event(child_event, child);

	/*
	 * Remove this event from the parent's list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_del_init(&child_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	/*
	 * Kick perf_poll() for is_event_hup().
	 */
	perf_event_wakeup(parent_event);
	free_event(child_event);
	put_event(parent_event);
}

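/*
 * Tear down the @ctxn perf context of the exiting @child task.
 */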
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
	struct perf_event_context *child_ctx, *clone_ctx = NULL;
	struct perf_event *child_event, *next;

	WARN_ON_ONCE(child != current);

	child_ctx = perf_pin_task_context(child, ctxn);
	if (!child_ctx)
		return;

	/*
	 * In order to reduce the amount of trickiness in ctx tear-down, we
	 * hold ctx::mutex over the entire thing. This serializes against
	 * almost everything that wants to access the ctx.
	 *
	 * The exception is sys_perf_event_open() /
	 * perf_event_create_kernel_counter() which does find_get_context()
	 * without ctx::mutex (it cannot because of the move_group double
	 * mutex lock).
	 */
	mutex_lock(&child_ctx->mutex);

	/*
	 * In a single ctx::lock section, de-schedule the events and detach the
	 * context from the task such that we cannot ever get it scheduled back
	 * in.
	 */
	raw_spin_lock_irq(&child_ctx->lock);
	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);

	/*
	 * Now that the context is inactive, destroy the task <-> ctx relation
	 * and mark the context dead.
	 */
	RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
	put_ctx(child_ctx); /* cannot be last */
	WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
	put_task_struct(current); /* cannot be last */

	clone_ctx = unclone_ctx(child_ctx);
	raw_spin_unlock_irq(&child_ctx->lock);

	if (clone_ctx)
		put_ctx(clone_ctx);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few PERF_RECORD_READ events.
	 */
	perf_event_task(child, child_ctx, 0);

	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
		perf_event_exit_event(child_event, child_ctx, child);

	mutex_unlock(&child_ctx->mutex);

	put_ctx(child_ctx);
}

/*
 * When a child task exits, feed back event values to parent events.
 *
 * Can be called with cred_guard_mutex held when called from
 * install_exec_creds().
 */
void perf_event_exit_task(struct task_struct *child)
{
	struct perf_event *event, *tmp;
	int ctxn;

	mutex_lock(&child->perf_event_mutex);
	list_for_each_entry_safe(event, tmp, &child->perf_event_list,
				 owner_entry) {
		list_del_init(&event->owner_entry);

		/*
		 * Ensure the list deletion is visible before we clear
		 * the owner, closes a race against perf_release() where
		 * we need to serialize on the owner->perf_event_mutex.
		 */
		smp_store_release(&event->owner, NULL);
	}
	mutex_unlock(&child->perf_event_mutex);

	for_each_task_context_nr(ctxn)
		perf_event_exit_task_context(child, ctxn);

	/*
	 * The perf_event_exit_task_context calls perf_event_task
	 * with child's task_ctx, which generates EXIT events for
	 * child contexts and sets child->perf_event_ctxp[] to NULL.
	 * At this point we need to send EXIT events to cpu contexts.
	 */
	perf_event_task(child, NULL, 0);
}

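/*
 * Unlink an inherited event from its parent and its context and free it;
 * used when tearing down a context that was never exposed to userspace.
 */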
static void perf_free_event(struct perf_event *event,
			    struct perf_event_context *ctx)
{
	struct perf_event *parent = event->parent;

	if (WARN_ON_ONCE(!parent))
		return;

	mutex_lock(&parent->child_mutex);
	list_del_init(&event->child_list);
	mutex_unlock(&parent->child_mutex);

	put_event(parent);

	raw_spin_lock_irq(&ctx->lock);
	perf_group_detach(event);
	list_del_event(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
	free_event(event);
}

/*
 * Free an unexposed, unused context as created by inheritance by
 * perf_event_init_task below, used by fork() in case of fail.
 */
void perf_event_free_task(struct task_struct *task)
{
	struct perf_event_context *ctx;
	struct perf_event *event, *tmp;
	int ctxn;

	for_each_task_context_nr(ctxn) {
		ctx = task->perf_event_ctxp[ctxn];
		if (!ctx)
			continue;

		mutex_lock(&ctx->mutex);
again:
		list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
					 group_entry)
			perf_free_event(event, ctx);

		list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
					 group_entry)
			perf_free_event(event, ctx);

		if (!list_empty(&ctx->pinned_groups) ||
		    !list_empty(&ctx->flexible_groups))
			goto again;

		mutex_unlock(&ctx->mutex);

		put_ctx(ctx);
	}
}

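/*
 * Called late in task teardown, once the task struct is about to be freed;
 * by then all perf contexts must be gone, so just assert that they are.
 */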
void perf_event_delayed_put(struct task_struct *task)
{
	int ctxn;

	for_each_task_context_nr(ctxn)
		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}

struct file *perf_event_get(unsigned int fd)
{
	struct file *file;

	file = fget_raw(fd);
	if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_op != &perf_fops) {
		fput(file);
		return ERR_PTR(-EBADF);
	}

	return file;
}

const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
	if (!event)
		return ERR_PTR(-EINVAL);

	return &event->attr;
}

/*
 * Inherit an event from parent task to child task.
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL for orphaned events
 *  - IS_ERR() on error
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event *group_leader,
	      struct perf_event_context *child_ctx)
{
	enum perf_event_active_state parent_state = parent_event->state;
	struct perf_event *child_event;
	unsigned long flags;

	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_event->parent)
		parent_event = parent_event->parent;

	child_event = perf_event_alloc(&parent_event->attr,
				       parent_event->cpu,
				       child,
				       group_leader, parent_event,
				       NULL, NULL, -1);
	if (IS_ERR(child_event))
		return child_event;

	/*
	 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
	 * must be under the same lock in order to serialize against
	 * perf_event_release_kernel(), such that either we must observe
	 * is_orphaned_event() or they will observe us on the child_list.
	 */
	mutex_lock(&parent_event->child_mutex);
	if (is_orphaned_event(parent_event) ||
	    !atomic_long_inc_not_zero(&parent_event->refcount)) {
		mutex_unlock(&parent_event->child_mutex);
		free_event(child_event);
		return NULL;
	}

	get_ctx(child_ctx);

	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
	if (parent_state >= PERF_EVENT_STATE_INACTIVE)
		child_event->state = PERF_EVENT_STATE_INACTIVE;
	else
		child_event->state = PERF_EVENT_STATE_OFF;

	if (parent_event->attr.freq) {
		u64 sample_period = parent_event->hw.sample_period;
		struct hw_perf_event *hwc = &child_event->hw;

		hwc->sample_period = sample_period;
		hwc->last_period   = sample_period;

		local64_set(&hwc->period_left, sample_period);
	}

	child_event->ctx = child_ctx;
	child_event->overflow_handler = parent_event->overflow_handler;
	child_event->overflow_handler_context
		= parent_event->overflow_handler_context;

	/*
	 * Precalculate sample_data sizes
	 */
	perf_event__header_size(child_event);
	perf_event__id_header_size(child_event);

	/*
	 * Link it up in the child's context:
	 */
	raw_spin_lock_irqsave(&child_ctx->lock, flags);
	add_event_to_ctx(child_event, child_ctx);
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Link this into the parent event's child list
	 */
	list_add_tail(&child_event->child_list, &parent_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	return child_event;
}

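/*
 * Inherit a whole group: the leader first, then each sibling with the new
 * leader as its group leader.
 */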
static int inherit_group(struct perf_event *parent_event,
			 struct task_struct *parent,
			 struct perf_event_context *parent_ctx,
			 struct task_struct *child,
			 struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
			       child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					  child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);
	}
	return 0;
}

static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child, int ctxn,
		   int *inherited_all)
{
	int ret;
	struct perf_event_context *child_ctx;

	if (!event->attr.inherit) {
		*inherited_all = 0;
		return 0;
	}

	child_ctx = child->perf_event_ctxp[ctxn];
	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
		if (!child_ctx)
			return -ENOMEM;

		child->perf_event_ctxp[ctxn] = child_ctx;
	}

	ret = inherit_group(event, parent, parent_ctx,
			    child, child_ctx);

	if (ret)
		*inherited_all = 0;

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
static int perf_event_init_context(struct task_struct *child, int ctxn)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	unsigned long flags;
	int ret = 0;

	if (likely(!parent->perf_event_ctxp[ctxn]))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent, ctxn);
	if (!parent_ctx)
		return 0;

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			break;
	}

	/*
	 * We can't hold ctx->lock when iterating the ->flexible_group list due
	 * to allocations, but we need to prevent rotation because
	 * rotate_ctx() will change the list from interrupt context.
	 */
	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 1;
	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			break;
	}

	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 0;

	child_ctx = child->perf_event_ctxp[ctxn];

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 *
		 * Note that if the parent is a clone, the same context
		 * is cloned from the parent.
		 */
		cloned_ctx = parent_ctx->parent_ctx;
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);
	put_ctx(parent_ctx);

	return ret;
}

/*
 * Initialize the perf_event contexts in the task structure; called on fork.
 */
int perf_event_init_task(struct task_struct *child)
{
	int ctxn, ret;

	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	for_each_task_context_nr(ctxn) {
		ret = perf_event_init_context(child, ctxn);
		if (ret) {
			perf_event_free_task(child);
			return ret;
		}
	}

	return 0;
}

static void __init perf_event_init_all_cpus(void)
{
	struct swevent_htable *swhash;
	int cpu;

	for_each_possible_cpu(cpu) {
		swhash = &per_cpu(swevent_htable, cpu);
		mutex_init(&swhash->hlist_mutex);
		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
	}
}

static void perf_event_init_cpu(int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
		struct swevent_hlist *hlist;

		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
		WARN_ON(!hlist);
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
	}
	mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
	struct perf_event_context *ctx = __info;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event *event;

	raw_spin_lock(&ctx->lock);
	list_for_each_entry(event, &ctx->event_list, event_entry)
		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
	raw_spin_unlock(&ctx->lock);
}

static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_event_context *ctx;
	struct pmu *pmu;
	int idx;

	idx = srcu_read_lock(&pmus_srcu);
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;

		mutex_lock(&ctx->mutex);
		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
		mutex_unlock(&ctx->mutex);
	}
	srcu_read_unlock(&pmus_srcu, idx);
}

static void perf_event_exit_cpu(int cpu)
{
	perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
#endif

static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
	int cpu;

	for_each_online_cpu(cpu)
		perf_event_exit_cpu(cpu);

	return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
	.notifier_call = perf_reboot,
	.priority = INT_MIN,
};

static int
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {

	case CPU_UP_PREPARE:
		/*
		 * This must be done before the CPU comes alive, because the
		 * moment we can run tasks we can encounter (software) events.
		 *
		 * Specifically, someone can have inherited events on kthreadd
		 * or a pre-existing worker thread that gets re-bound.
		 */
		perf_event_init_cpu(cpu);
		break;

	case CPU_DOWN_PREPARE:
		/*
		 * This must be done before the CPU dies because after that an
		 * active event might want to IPI the CPU and that'll not work
		 * so great for dead CPUs.
		 *
		 * XXX smp_call_function_single() returns -ENXIO without a warn
		 * so we could possibly deal with this.
		 *
		 * This is safe against new events arriving because
		 * sys_perf_event_open() serializes against hotplug using
		 * get_online_cpus().
		 */
		perf_event_exit_cpu(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

void __init perf_event_init(void)
{
	int ret;

	idr_init(&pmu_idr);

	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
	perf_pmu_register(&perf_cpu_clock, NULL, -1);
	perf_pmu_register(&perf_task_clock, NULL, -1);
	perf_tp_register();
	perf_cpu_notifier(perf_cpu_notify);
	register_reboot_notifier(&perf_reboot_notifier);

	ret = init_hw_breakpoint();
	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

	/*
	 * Build time assertion that we keep the data_head at the intended
	 * location.  IOW, validation we got the __reserved[] size right.
	 */
	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
		     != 1024);
}

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s\n", pmu_attr->event_str);

	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);

static int __init perf_event_sysfs_init(void)
{
	struct pmu *pmu;
	int ret;

	mutex_lock(&pmus_lock);

	ret = bus_register(&pmu_bus);
	if (ret)
		goto unlock;

	list_for_each_entry(pmu, &pmus, entry) {
		if (!pmu->name || pmu->type < 0)
			continue;

		ret = pmu_dev_alloc(pmu);
		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
	}
	pmu_bus_running = 1;
	ret = 0;

unlock:
	mutex_unlock(&pmus_lock);

	return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct perf_cgroup *jc;

	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
	if (!jc)
		return ERR_PTR(-ENOMEM);

	jc->info = alloc_percpu(struct perf_cgroup_info);
	if (!jc->info) {
		kfree(jc);
		return ERR_PTR(-ENOMEM);
	}

	return &jc->css;
}

static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);

	free_percpu(jc->info);
	kfree(jc);
}

static int __perf_cgroup_move(void *info)
{
	struct task_struct *task = info;

	rcu_read_lock();
	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
	rcu_read_unlock();
	return 0;
}

static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		task_function_call(task, __perf_cgroup_move, task);
}

struct cgroup_subsys perf_event_cgrp_subsys = {
	.css_alloc	= perf_cgroup_css_alloc,
	.css_free	= perf_cgroup_css_free,
	.attach		= perf_cgroup_attach,
};
#endif /* CONFIG_CGROUP_PERF */