/*
 * Performance events core code:
 *
 * For licensing details see kernel-base/COPYING
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>

#include "internal.h"

#include <asm/irq_regs.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		/* @p has moved to another CPU; let the caller retry. */
		if (task_cpu(p) != smp_processor_id())
			return;

		/*
		 * Now that we're on the right CPU with IRQs disabled, check
		 * that we actually hit the intended task without racing
		 * against a concurrent migration.
		 */
		tfc->ret = -ESRCH;
		if (p != current)
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls @func on the CPU where @p is currently running, retrying for as
 * long as the IPI keeps landing after @p has moved away (-EAGAIN).
 *
 * returns: @func return value, or
 *	    -ESRCH  - when the task is not (or no longer) running there
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,
	};
	int ret;

	do {
		ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
		if (!ret)
			ret = data.ret;
	} while (ret == -EAGAIN);

	return ret;
}

/**
 * cpu_function_call - call a function on a given cpu
 * @cpu:	target cpu to queue this function
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx)
		raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		raw_spin_unlock(&ctx->lock);
	raw_spin_unlock(&cpuctx->ctx.lock);
}

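/*
 * ctx->task and event->owner are set to TASK_TOMBSTONE once the owning
 * task is gone (or is not a user task at all), so lockless readers can
 * tell "no task" apart from "task exited".
 */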
#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

/*
 * Event scheduling state (ctx->is_active, cpuctx->task_ctx, the event
 * lists) is protected by ctx::mutex and ctx::lock.  The event_f callbacks
 * below are invoked on the CPU that owns the context, with IRQs disabled
 * and with both cpuctx->ctx.lock and ctx->lock held, via the
 * task_function_call()/cpu_function_call() wrappers above.
 */
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
			struct perf_event_context *, void *);

struct event_function_struct {
	struct perf_event *event;
	event_f func;
	void *data;
};

static int event_function(void *info)
{
	struct event_function_struct *efs = info;
	struct perf_event *event = efs->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	int ret = 0;

	WARN_ON_ONCE(!irqs_disabled());

	perf_ctx_lock(cpuctx, task_ctx);
	/*
	 * Since we do the IPI call without holding ctx->lock things can have
	 * changed, double check we hit the task we set out to hit.
	 */
	if (ctx->task) {
		if (ctx->task != current) {
			ret = -ESRCH;
			goto unlock;
		}

		/*
		 * We only use event_function_call() on established contexts,
		 * and event_function() is only ever called when the context
		 * is active for the current task (otherwise we bailed above
		 * or in task_function_call()), therefore ctx->is_active must
		 * be set here.
		 */
		WARN_ON_ONCE(!ctx->is_active);
		/*
		 * And since we have ctx->is_active, cpuctx->task_ctx must
		 * match.
		 */
		WARN_ON_ONCE(task_ctx != ctx);
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	efs->func(event, cpuctx, ctx, efs->data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}

static void event_function_call(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = READ_ONCE(ctx->task);
	struct event_function_struct efs = {
		.event = event,
		.func = func,
		.data = data,
	};

	if (!event->parent) {
		/*
		 * If this is a !child event, we must hold ctx::mutex to
		 * stabilize the event->ctx relation. See
		 * perf_event_ctx_lock().
		 */
		lockdep_assert_held(&ctx->mutex);
	}

	if (!task) {
		cpu_function_call(event->cpu, event_function, &efs);
		return;
	}

	if (task == TASK_TOMBSTONE)
		return;

again:
	if (!task_function_call(task, event_function, &efs))
		return;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * Reload the task pointer, it might have changed under us due to a
	 * concurrent exit or context switch.
	 */
	task = ctx->task;
	if (task == TASK_TOMBSTONE) {
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	if (ctx->is_active) {
		raw_spin_unlock_irq(&ctx->lock);
		goto again;
	}
	func(event, NULL, ctx, data);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Similar to event_function_call() but already runs on the right CPU with
 * IRQs disabled, so it can take the locks and call @func directly.
 */
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct task_struct *task = READ_ONCE(ctx->task);
	struct perf_event_context *task_ctx = NULL;

	WARN_ON_ONCE(!irqs_disabled());

	if (task) {
		if (task == TASK_TOMBSTONE)
			return;

		task_ctx = ctx;
	}

	perf_ctx_lock(cpuctx, task_ctx);

	task = ctx->task;
	if (task == TASK_TOMBSTONE)
		goto unlock;

	if (task) {
		/*
		 * We must be either inactive or active and the right task,
		 * otherwise we're screwed, since we cannot IPI to somewhere
		 * else.
		 */
		if (ctx->is_active) {
			if (WARN_ON_ONCE(task != current))
				goto unlock;

			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
				goto unlock;
		}
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	func(event, cpuctx, ctx, data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);
}

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
		       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
	(PERF_SAMPLE_BRANCH_KERNEL |\
	 PERF_SAMPLE_BRANCH_HV)

enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_TIME = 0x4,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */
static void perf_sched_delayed(struct work_struct *work);
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 field */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;

static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;

static int perf_sample_allowed_ns __read_mostly =
	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

static void update_perf_cpu_limits(void)
{
	u64 tmp = perf_sample_period_ns;

	tmp *= sysctl_perf_cpu_time_max_percent;
	tmp = div_u64(tmp, 100);
	if (!tmp)
		tmp = 1;

	WRITE_ONCE(perf_sample_allowed_ns, tmp);
}

static int perf_rotate_context(struct perf_cpu_context *cpuctx);

int perf_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	/*
	 * If throttling is disabled don't allow the write:
	 */
	if (sysctl_perf_cpu_time_max_percent == 100 ||
	    sysctl_perf_cpu_time_max_percent == 0)
		return -EINVAL;

	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
	update_perf_cpu_limits();

	return 0;
}

int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp,
				loff_t *ppos)
{
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	if (sysctl_perf_cpu_time_max_percent == 100 ||
	    sysctl_perf_cpu_time_max_percent == 0) {
		printk(KERN_WARNING
		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
		WRITE_ONCE(perf_sample_allowed_ns, 0);
	} else {
		update_perf_cpu_limits();
	}

	return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);

static u64 __report_avg;
static u64 __report_allowed;

static void perf_duration_warn(struct irq_work *w)
{
	printk_ratelimited(KERN_INFO
		"perf: interrupt took too long (%lld > %lld), lowering "
		"kernel.perf_event_max_sample_rate to %d\n",
		__report_avg, __report_allowed,
		sysctl_perf_event_sample_rate);
}

static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

void perf_sample_event_took(u64 sample_len_ns)
{
	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
	u64 running_len;
	u64 avg_len;
	u32 max;

	if (max_len == 0)
		return;

	/* Decay the counter by 1 average sample. */
	running_len = __this_cpu_read(running_sample_length);
	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
	running_len += sample_len_ns;
	__this_cpu_write(running_sample_length, running_len);

	/*
	 * Note: this will be biased artificially low until we have
	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
	 * from having to maintain a count.
	 */
	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
	if (avg_len <= max_len)
		return;

	__report_avg = avg_len;
	__report_allowed = max_len;

	/*
	 * Compute a throttle threshold 25% below the current duration.
	 */
	avg_len += avg_len / 4;
	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
	if (avg_len < max)
		max /= (u32)avg_len;
	else
		max = 1;

	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
	WRITE_ONCE(max_samples_per_tick, max);

	sysctl_perf_event_sample_rate = max * HZ;
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

	if (!irq_work_queue(&perf_duration_work)) {
		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
			     "kernel.perf_event_max_sample_rate to %d\n",
			     __report_avg, __report_allowed,
			     sysctl_perf_event_sample_rate);
	}
}

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type,
			     struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void)	{ }

extern __weak const char *perf_pmu_name(void)
{
	return "pmu";
}

static inline u64 perf_clock(void)
{
	return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
	return event->clock();
}

#ifdef CONFIG_CGROUP_PERF

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/* @event doesn't care about cgroup */
	if (!event->cgrp)
		return true;

	/* wants specific cgroup scope but @cpuctx isn't associated with any */
	if (!cpuctx->cgrp)
		return false;

	/*
	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
	 * also enabled for all its descendant cgroups.  If @cpuctx's
	 * cgroup is a descendant of @event's (the test covers identity
	 * case), it's a match.
	 */
	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
				    event->cgrp->css.cgroup);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
	css_put(&event->cgrp->css);
	event->cgrp = NULL;
}

static inline int is_cgroup_event(struct perf_event *event)
{
	return event->cgrp != NULL;
}

614static inline u64 perf_cgroup_event_time(struct perf_event *event)
615{
616 struct perf_cgroup_info *t;
617
618 t = per_cpu_ptr(event->cgrp->info, event->cpu);
619 return t->time;
620}
621
622static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
623{
624 struct perf_cgroup_info *info;
625 u64 now;
626
627 now = perf_clock();
628
629 info = this_cpu_ptr(cgrp->info);
630
631 info->time += now - info->timestamp;
632 info->timestamp = now;
633}
634
635static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
636{
637 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
638 if (cgrp_out)
639 __update_cgrp_time(cgrp_out);
640}
641
642static inline void update_cgrp_time_from_event(struct perf_event *event)
643{
644 struct perf_cgroup *cgrp;
645
646
647
648
649
650 if (!is_cgroup_event(event))
651 return;
652
653 cgrp = perf_cgroup_from_task(current, event->ctx);
654
655
656
657 if (cgrp == event->cgrp)
658 __update_cgrp_time(event->cgrp);
659}
660
661static inline void
662perf_cgroup_set_timestamp(struct task_struct *task,
663 struct perf_event_context *ctx)
664{
665 struct perf_cgroup *cgrp;
666 struct perf_cgroup_info *info;
667
668
669
670
671
672
673 if (!task || !ctx->nr_cgroups)
674 return;
675
676 cgrp = perf_cgroup_from_task(task, ctx);
677 info = this_cpu_ptr(cgrp->info);
678 info->timestamp = ctx->timestamp;
679}
680
#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN  : schedule in based on cgroup for next
 */
690static void perf_cgroup_switch(struct task_struct *task, int mode)
691{
692 struct perf_cpu_context *cpuctx;
693 struct pmu *pmu;
694 unsigned long flags;
695
696
697
698
699
700
701 local_irq_save(flags);
702
703
704
705
706
707
708 list_for_each_entry_rcu(pmu, &pmus, entry) {
709 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
710 if (cpuctx->unique_pmu != pmu)
711 continue;
712
713
714
715
716
717
718
719
720 if (cpuctx->ctx.nr_cgroups > 0) {
721 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
722 perf_pmu_disable(cpuctx->ctx.pmu);
723
724 if (mode & PERF_CGROUP_SWOUT) {
725 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
726
727
728
729
730 cpuctx->cgrp = NULL;
731 }
732
733 if (mode & PERF_CGROUP_SWIN) {
734 WARN_ON_ONCE(cpuctx->cgrp);
735
736
737
738
739
740
741
742 cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
743 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
744 }
745 perf_pmu_enable(cpuctx->ctx.pmu);
746 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
747 }
748 }
749
750 local_irq_restore(flags);
751}
752
753static inline void perf_cgroup_sched_out(struct task_struct *task,
754 struct task_struct *next)
755{
756 struct perf_cgroup *cgrp1;
757 struct perf_cgroup *cgrp2 = NULL;
758
759 rcu_read_lock();
760
761
762
763
764
765 cgrp1 = perf_cgroup_from_task(task, NULL);
766 cgrp2 = perf_cgroup_from_task(next, NULL);
767
768
769
770
771
772
773 if (cgrp1 != cgrp2)
774 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
775
776 rcu_read_unlock();
777}
778
779static inline void perf_cgroup_sched_in(struct task_struct *prev,
780 struct task_struct *task)
781{
782 struct perf_cgroup *cgrp1;
783 struct perf_cgroup *cgrp2 = NULL;
784
785 rcu_read_lock();
786
787
788
789
790
791 cgrp1 = perf_cgroup_from_task(task, NULL);
792 cgrp2 = perf_cgroup_from_task(prev, NULL);
793
794
795
796
797
798
799 if (cgrp1 != cgrp2)
800 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
801
802 rcu_read_unlock();
803}
804
805static inline int perf_cgroup_connect(int fd, struct perf_event *event,
806 struct perf_event_attr *attr,
807 struct perf_event *group_leader)
808{
809 struct perf_cgroup *cgrp;
810 struct cgroup_subsys_state *css;
811 struct fd f = fdget(fd);
812 int ret = 0;
813
814 if (!f.file)
815 return -EBADF;
816
817 css = css_tryget_online_from_dir(f.file->f_path.dentry,
818 &perf_event_cgrp_subsys);
819 if (IS_ERR(css)) {
820 ret = PTR_ERR(css);
821 goto out;
822 }
823
824 cgrp = container_of(css, struct perf_cgroup, css);
825 event->cgrp = cgrp;
826
827
828
829
830
831
832 if (group_leader && group_leader->cgrp != cgrp) {
833 perf_detach_cgroup(event);
834 ret = -EINVAL;
835 }
836out:
837 fdput(f);
838 return ret;
839}
840
841static inline void
842perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
843{
844 struct perf_cgroup_info *t;
845 t = per_cpu_ptr(event->cgrp->info, event->cpu);
846 event->shadow_ctx_time = now - t->timestamp;
847}
848
849static inline void
850perf_cgroup_defer_enabled(struct perf_event *event)
851{
852
853
854
855
856
857
858 if (is_cgroup_event(event) && !perf_cgroup_match(event))
859 event->cgrp_defer_enabled = 1;
860}
861
862static inline void
863perf_cgroup_mark_enabled(struct perf_event *event,
864 struct perf_event_context *ctx)
865{
866 struct perf_event *sub;
867 u64 tstamp = perf_event_time(event);
868
869 if (!event->cgrp_defer_enabled)
870 return;
871
872 event->cgrp_defer_enabled = 0;
873
874 event->tstamp_enabled = tstamp - event->total_time_enabled;
875 list_for_each_entry(sub, &event->sibling_list, group_entry) {
876 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
877 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
878 sub->cgrp_defer_enabled = 0;
879 }
880 }
881}
882
883
884
885
886
887static inline void
888list_update_cgroup_event(struct perf_event *event,
889 struct perf_event_context *ctx, bool add)
890{
891 struct perf_cpu_context *cpuctx;
892
893 if (!is_cgroup_event(event))
894 return;
895
896 if (add && ctx->nr_cgroups++)
897 return;
898 else if (!add && --ctx->nr_cgroups)
899 return;
900
901
902
903
904 cpuctx = __get_cpu_context(ctx);
905
906
907
908
909
910 if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
911 cpuctx->cgrp = event->cgrp;
912 else if (!add)
913 cpuctx->cgrp = NULL;
914}
915
916#else
917
918static inline bool
919perf_cgroup_match(struct perf_event *event)
920{
921 return true;
922}
923
924static inline void perf_detach_cgroup(struct perf_event *event)
925{}
926
927static inline int is_cgroup_event(struct perf_event *event)
928{
929 return 0;
930}
931
932static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
933{
934 return 0;
935}
936
937static inline void update_cgrp_time_from_event(struct perf_event *event)
938{
939}
940
941static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
942{
943}
944
945static inline void perf_cgroup_sched_out(struct task_struct *task,
946 struct task_struct *next)
947{
948}
949
950static inline void perf_cgroup_sched_in(struct task_struct *prev,
951 struct task_struct *task)
952{
953}
954
955static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
956 struct perf_event_attr *attr,
957 struct perf_event *group_leader)
958{
959 return -EINVAL;
960}
961
962static inline void
963perf_cgroup_set_timestamp(struct task_struct *task,
964 struct perf_event_context *ctx)
965{
966}
967
968void
969perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
970{
971}
972
973static inline void
974perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
975{
976}
977
978static inline u64 perf_cgroup_event_time(struct perf_event *event)
979{
980 return 0;
981}
982
983static inline void
984perf_cgroup_defer_enabled(struct perf_event *event)
985{
986}
987
988static inline void
989perf_cgroup_mark_enabled(struct perf_event *event,
990 struct perf_event_context *ctx)
991{
992}
993
994static inline void
995list_update_cgroup_event(struct perf_event *event,
996 struct perf_event_context *ctx, bool add)
997{
998}
999
1000#endif
1001
1002
1003
1004
1005
1006#define PERF_CPU_HRTIMER (1000 / HZ)
1007
1008
1009
1010static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1011{
1012 struct perf_cpu_context *cpuctx;
1013 int rotations = 0;
1014
1015 WARN_ON(!irqs_disabled());
1016
1017 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1018 rotations = perf_rotate_context(cpuctx);
1019
1020 raw_spin_lock(&cpuctx->hrtimer_lock);
1021 if (rotations)
1022 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1023 else
1024 cpuctx->hrtimer_active = 0;
1025 raw_spin_unlock(&cpuctx->hrtimer_lock);
1026
1027 return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1028}
1029
1030static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1031{
1032 struct hrtimer *timer = &cpuctx->hrtimer;
1033 struct pmu *pmu = cpuctx->ctx.pmu;
1034 u64 interval;
1035
1036
1037 if (pmu->task_ctx_nr == perf_sw_context)
1038 return;
1039
1040
1041
1042
1043
1044 interval = pmu->hrtimer_interval_ms;
1045 if (interval < 1)
1046 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1047
1048 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1049
1050 raw_spin_lock_init(&cpuctx->hrtimer_lock);
1051 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1052 timer->function = perf_mux_hrtimer_handler;
1053}
1054
1055static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1056{
1057 struct hrtimer *timer = &cpuctx->hrtimer;
1058 struct pmu *pmu = cpuctx->ctx.pmu;
1059 unsigned long flags;
1060
1061
1062 if (pmu->task_ctx_nr == perf_sw_context)
1063 return 0;
1064
1065 raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1066 if (!cpuctx->hrtimer_active) {
1067 cpuctx->hrtimer_active = 1;
1068 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1069 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1070 }
1071 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1072
1073 return 0;
1074}
1075
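/*
 * pmu::pmu_disable_count is a per-cpu nesting counter: only the outermost
 * perf_pmu_disable() actually disables the PMU, and only the matching
 * outermost perf_pmu_enable() re-enables it.
 */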
1076void perf_pmu_disable(struct pmu *pmu)
1077{
1078 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1079 if (!(*count)++)
1080 pmu->pmu_disable(pmu);
1081}
1082
1083void perf_pmu_enable(struct pmu *pmu)
1084{
1085 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1086 if (!--(*count))
1087 pmu->pmu_enable(pmu);
1088}
1089
1090static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1091
1092
1093
1094
1095
1096
1097
1098static void perf_event_ctx_activate(struct perf_event_context *ctx)
1099{
1100 struct list_head *head = this_cpu_ptr(&active_ctx_list);
1101
1102 WARN_ON(!irqs_disabled());
1103
1104 WARN_ON(!list_empty(&ctx->active_ctx_list));
1105
1106 list_add(&ctx->active_ctx_list, head);
1107}
1108
1109static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1110{
1111 WARN_ON(!irqs_disabled());
1112
1113 WARN_ON(list_empty(&ctx->active_ctx_list));
1114
1115 list_del_init(&ctx->active_ctx_list);
1116}
1117
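/*
 * Context reference counting: a context is freed via RCU once its last
 * reference is dropped, so lockless walkers under rcu_read_lock() remain
 * safe against concurrent teardown.
 */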
1118static void get_ctx(struct perf_event_context *ctx)
1119{
1120 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1121}
1122
1123static void free_ctx(struct rcu_head *head)
1124{
1125 struct perf_event_context *ctx;
1126
1127 ctx = container_of(head, struct perf_event_context, rcu_head);
1128 kfree(ctx->task_ctx_data);
1129 kfree(ctx);
1130}
1131
1132static void put_ctx(struct perf_event_context *ctx)
1133{
1134 if (atomic_dec_and_test(&ctx->refcount)) {
1135 if (ctx->parent_ctx)
1136 put_ctx(ctx->parent_ctx);
1137 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1138 put_task_struct(ctx->task);
1139 call_rcu(&ctx->rcu_head, free_ctx);
1140 }
1141}
1142
/*
 * perf_event::ctx can change (sys_perf_event_open()'s move_group and
 * perf_pmu_migrate_context()), so it may only be dereferenced under RCU
 * or while holding a reference on the context.  See
 * perf_event_ctx_lock_nested() below, which takes a reference and then
 * re-checks that event->ctx did not change while acquiring ctx->mutex.
 */
1204static struct perf_event_context *
1205perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1206{
1207 struct perf_event_context *ctx;
1208
1209again:
1210 rcu_read_lock();
1211 ctx = ACCESS_ONCE(event->ctx);
1212 if (!atomic_inc_not_zero(&ctx->refcount)) {
1213 rcu_read_unlock();
1214 goto again;
1215 }
1216 rcu_read_unlock();
1217
1218 mutex_lock_nested(&ctx->mutex, nesting);
1219 if (event->ctx != ctx) {
1220 mutex_unlock(&ctx->mutex);
1221 put_ctx(ctx);
1222 goto again;
1223 }
1224
1225 return ctx;
1226}
1227
1228static inline struct perf_event_context *
1229perf_event_ctx_lock(struct perf_event *event)
1230{
1231 return perf_event_ctx_lock_nested(event, 0);
1232}
1233
1234static void perf_event_ctx_unlock(struct perf_event *event,
1235 struct perf_event_context *ctx)
1236{
1237 mutex_unlock(&ctx->mutex);
1238 put_ctx(ctx);
1239}
1240
1241
1242
1243
1244
1245
1246static __must_check struct perf_event_context *
1247unclone_ctx(struct perf_event_context *ctx)
1248{
1249 struct perf_event_context *parent_ctx = ctx->parent_ctx;
1250
1251 lockdep_assert_held(&ctx->lock);
1252
1253 if (parent_ctx)
1254 ctx->parent_ctx = NULL;
1255 ctx->generation++;
1256
1257 return parent_ctx;
1258}
1259
1260static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1261{
1262
1263
1264
1265 if (event->parent)
1266 event = event->parent;
1267
1268 return task_tgid_nr_ns(p, event->ns);
1269}
1270
1271static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1272{
1273
1274
1275
1276 if (event->parent)
1277 event = event->parent;
1278
1279 return task_pid_nr_ns(p, event->ns);
1280}
1281
1282
1283
1284
1285
1286static u64 primary_event_id(struct perf_event *event)
1287{
1288 u64 id = event->id;
1289
1290 if (event->parent)
1291 id = event->parent->id;
1292
1293 return id;
1294}
1295
1296
1297
1298
1299
1300
1301
1302static struct perf_event_context *
1303perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1304{
1305 struct perf_event_context *ctx;
1306
1307retry:
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317 local_irq_save(*flags);
1318 rcu_read_lock();
1319 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1320 if (ctx) {
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331 raw_spin_lock(&ctx->lock);
1332 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1333 raw_spin_unlock(&ctx->lock);
1334 rcu_read_unlock();
1335 local_irq_restore(*flags);
1336 goto retry;
1337 }
1338
1339 if (ctx->task == TASK_TOMBSTONE ||
1340 !atomic_inc_not_zero(&ctx->refcount)) {
1341 raw_spin_unlock(&ctx->lock);
1342 ctx = NULL;
1343 } else {
1344 WARN_ON_ONCE(ctx->task != task);
1345 }
1346 }
1347 rcu_read_unlock();
1348 if (!ctx)
1349 local_irq_restore(*flags);
1350 return ctx;
1351}
1352
1353
1354
1355
1356
1357
1358static struct perf_event_context *
1359perf_pin_task_context(struct task_struct *task, int ctxn)
1360{
1361 struct perf_event_context *ctx;
1362 unsigned long flags;
1363
1364 ctx = perf_lock_task_context(task, ctxn, &flags);
1365 if (ctx) {
1366 ++ctx->pin_count;
1367 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1368 }
1369 return ctx;
1370}
1371
1372static void perf_unpin_context(struct perf_event_context *ctx)
1373{
1374 unsigned long flags;
1375
1376 raw_spin_lock_irqsave(&ctx->lock, flags);
1377 --ctx->pin_count;
1378 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1379}
1380
1381
1382
1383
1384static void update_context_time(struct perf_event_context *ctx)
1385{
1386 u64 now = perf_clock();
1387
1388 ctx->time += now - ctx->timestamp;
1389 ctx->timestamp = now;
1390}
1391
1392static u64 perf_event_time(struct perf_event *event)
1393{
1394 struct perf_event_context *ctx = event->ctx;
1395
1396 if (is_cgroup_event(event))
1397 return perf_cgroup_event_time(event);
1398
1399 return ctx ? ctx->time : 0;
1400}
1401
1402
1403
1404
1405static void update_event_times(struct perf_event *event)
1406{
1407 struct perf_event_context *ctx = event->ctx;
1408 u64 run_end;
1409
1410 lockdep_assert_held(&ctx->lock);
1411
1412 if (event->state < PERF_EVENT_STATE_INACTIVE ||
1413 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1414 return;
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426 if (is_cgroup_event(event))
1427 run_end = perf_cgroup_event_time(event);
1428 else if (ctx->is_active)
1429 run_end = ctx->time;
1430 else
1431 run_end = event->tstamp_stopped;
1432
1433 event->total_time_enabled = run_end - event->tstamp_enabled;
1434
1435 if (event->state == PERF_EVENT_STATE_INACTIVE)
1436 run_end = event->tstamp_stopped;
1437 else
1438 run_end = perf_event_time(event);
1439
1440 event->total_time_running = run_end - event->tstamp_running;
1441
1442}
1443
1444
1445
1446
1447static void update_group_times(struct perf_event *leader)
1448{
1449 struct perf_event *event;
1450
1451 update_event_times(leader);
1452 list_for_each_entry(event, &leader->sibling_list, group_entry)
1453 update_event_times(event);
1454}
1455
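/*
 * Group leaders live on either ctx->pinned_groups or ctx->flexible_groups;
 * pinned events are always scheduled when possible, flexible ones may be
 * rotated out when the PMU is over-committed.
 */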
1456static struct list_head *
1457ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1458{
1459 if (event->attr.pinned)
1460 return &ctx->pinned_groups;
1461 else
1462 return &ctx->flexible_groups;
1463}
1464
1465
1466
1467
1468
1469static void
1470list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1471{
1472
1473 lockdep_assert_held(&ctx->lock);
1474
1475 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1476 event->attach_state |= PERF_ATTACH_CONTEXT;
1477
1478
1479
1480
1481
1482
1483 if (event->group_leader == event) {
1484 struct list_head *list;
1485
1486 event->group_caps = event->event_caps;
1487
1488 list = ctx_group_list(event, ctx);
1489 list_add_tail(&event->group_entry, list);
1490 }
1491
1492 list_update_cgroup_event(event, ctx, true);
1493
1494 list_add_rcu(&event->event_entry, &ctx->event_list);
1495 ctx->nr_events++;
1496 if (event->attr.inherit_stat)
1497 ctx->nr_stat++;
1498
1499 ctx->generation++;
1500}
1501
1502
1503
1504
1505static inline void perf_event__state_init(struct perf_event *event)
1506{
1507 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1508 PERF_EVENT_STATE_INACTIVE;
1509}
1510
1511static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1512{
1513 int entry = sizeof(u64);
1514 int size = 0;
1515 int nr = 1;
1516
1517 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1518 size += sizeof(u64);
1519
1520 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1521 size += sizeof(u64);
1522
1523 if (event->attr.read_format & PERF_FORMAT_ID)
1524 entry += sizeof(u64);
1525
1526 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1527 nr += nr_siblings;
1528 size += sizeof(u64);
1529 }
1530
1531 size += entry * nr;
1532 event->read_size = size;
1533}
1534
1535static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1536{
1537 struct perf_sample_data *data;
1538 u16 size = 0;
1539
1540 if (sample_type & PERF_SAMPLE_IP)
1541 size += sizeof(data->ip);
1542
1543 if (sample_type & PERF_SAMPLE_ADDR)
1544 size += sizeof(data->addr);
1545
1546 if (sample_type & PERF_SAMPLE_PERIOD)
1547 size += sizeof(data->period);
1548
1549 if (sample_type & PERF_SAMPLE_WEIGHT)
1550 size += sizeof(data->weight);
1551
1552 if (sample_type & PERF_SAMPLE_READ)
1553 size += event->read_size;
1554
1555 if (sample_type & PERF_SAMPLE_DATA_SRC)
1556 size += sizeof(data->data_src.val);
1557
1558 if (sample_type & PERF_SAMPLE_TRANSACTION)
1559 size += sizeof(data->txn);
1560
1561 event->header_size = size;
1562}
1563
1564
1565
1566
1567
1568static void perf_event__header_size(struct perf_event *event)
1569{
1570 __perf_event_read_size(event,
1571 event->group_leader->nr_siblings);
1572 __perf_event_header_size(event, event->attr.sample_type);
1573}
1574
1575static void perf_event__id_header_size(struct perf_event *event)
1576{
1577 struct perf_sample_data *data;
1578 u64 sample_type = event->attr.sample_type;
1579 u16 size = 0;
1580
1581 if (sample_type & PERF_SAMPLE_TID)
1582 size += sizeof(data->tid_entry);
1583
1584 if (sample_type & PERF_SAMPLE_TIME)
1585 size += sizeof(data->time);
1586
1587 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1588 size += sizeof(data->id);
1589
1590 if (sample_type & PERF_SAMPLE_ID)
1591 size += sizeof(data->id);
1592
1593 if (sample_type & PERF_SAMPLE_STREAM_ID)
1594 size += sizeof(data->stream_id);
1595
1596 if (sample_type & PERF_SAMPLE_CPU)
1597 size += sizeof(data->cpu_entry);
1598
1599 event->id_header_size = size;
1600}
1601
1602static bool perf_event_validate_size(struct perf_event *event)
1603{
1604
1605
1606
1607
1608 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1609 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1610 perf_event__id_header_size(event);
1611
1612
1613
1614
1615
1616 if (event->read_size + event->header_size +
1617 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1618 return false;
1619
1620 return true;
1621}
1622
1623static void perf_group_attach(struct perf_event *event)
1624{
1625 struct perf_event *group_leader = event->group_leader, *pos;
1626
1627
1628
1629
1630 if (event->attach_state & PERF_ATTACH_GROUP)
1631 return;
1632
1633 event->attach_state |= PERF_ATTACH_GROUP;
1634
1635 if (group_leader == event)
1636 return;
1637
1638 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1639
1640 group_leader->group_caps &= event->event_caps;
1641
1642 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1643 group_leader->nr_siblings++;
1644
1645 perf_event__header_size(group_leader);
1646
1647 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1648 perf_event__header_size(pos);
1649}
1650
1651
1652
1653
1654
1655static void
1656list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1657{
1658 WARN_ON_ONCE(event->ctx != ctx);
1659 lockdep_assert_held(&ctx->lock);
1660
1661
1662
1663
1664 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1665 return;
1666
1667 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1668
1669 list_update_cgroup_event(event, ctx, false);
1670
1671 ctx->nr_events--;
1672 if (event->attr.inherit_stat)
1673 ctx->nr_stat--;
1674
1675 list_del_rcu(&event->event_entry);
1676
1677 if (event->group_leader == event)
1678 list_del_init(&event->group_entry);
1679
1680 update_group_times(event);
1681
1682
1683
1684
1685
1686
1687
1688
1689 if (event->state > PERF_EVENT_STATE_OFF)
1690 event->state = PERF_EVENT_STATE_OFF;
1691
1692 ctx->generation++;
1693}
1694
1695static void perf_group_detach(struct perf_event *event)
1696{
1697 struct perf_event *sibling, *tmp;
1698 struct list_head *list = NULL;
1699
1700
1701
1702
1703 if (!(event->attach_state & PERF_ATTACH_GROUP))
1704 return;
1705
1706 event->attach_state &= ~PERF_ATTACH_GROUP;
1707
1708
1709
1710
1711 if (event->group_leader != event) {
1712 list_del_init(&event->group_entry);
1713 event->group_leader->nr_siblings--;
1714 goto out;
1715 }
1716
1717 if (!list_empty(&event->group_entry))
1718 list = &event->group_entry;
1719
1720
1721
1722
1723
1724
1725 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1726 if (list)
1727 list_move_tail(&sibling->group_entry, list);
1728 sibling->group_leader = sibling;
1729
1730
1731 sibling->group_caps = event->group_caps;
1732
1733 WARN_ON_ONCE(sibling->ctx != event->ctx);
1734 }
1735
1736out:
1737 perf_event__header_size(event->group_leader);
1738
1739 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1740 perf_event__header_size(tmp);
1741}
1742
1743static bool is_orphaned_event(struct perf_event *event)
1744{
1745 return event->state == PERF_EVENT_STATE_DEAD;
1746}
1747
1748static inline int __pmu_filter_match(struct perf_event *event)
1749{
1750 struct pmu *pmu = event->pmu;
1751 return pmu->filter_match ? pmu->filter_match(event) : 1;
1752}
1753
1754
1755
1756
1757
1758
1759
1760static inline int pmu_filter_match(struct perf_event *event)
1761{
1762 struct perf_event *child;
1763
1764 if (!__pmu_filter_match(event))
1765 return 0;
1766
1767 list_for_each_entry(child, &event->sibling_list, group_entry) {
1768 if (!__pmu_filter_match(child))
1769 return 0;
1770 }
1771
1772 return 1;
1773}
1774
1775static inline int
1776event_filter_match(struct perf_event *event)
1777{
1778 return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1779 perf_cgroup_match(event) && pmu_filter_match(event);
1780}
1781
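/*
 * Stop a single event: update its timestamps, call pmu->del() and drop
 * the context's active counts.  Called with ctx->lock held.
 */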
1782static void
1783event_sched_out(struct perf_event *event,
1784 struct perf_cpu_context *cpuctx,
1785 struct perf_event_context *ctx)
1786{
1787 u64 tstamp = perf_event_time(event);
1788 u64 delta;
1789
1790 WARN_ON_ONCE(event->ctx != ctx);
1791 lockdep_assert_held(&ctx->lock);
1792
1793
1794
1795
1796
1797
1798
1799 if (event->state == PERF_EVENT_STATE_INACTIVE &&
1800 !event_filter_match(event)) {
1801 delta = tstamp - event->tstamp_stopped;
1802 event->tstamp_running += delta;
1803 event->tstamp_stopped = tstamp;
1804 }
1805
1806 if (event->state != PERF_EVENT_STATE_ACTIVE)
1807 return;
1808
1809 perf_pmu_disable(event->pmu);
1810
1811 event->tstamp_stopped = tstamp;
1812 event->pmu->del(event, 0);
1813 event->oncpu = -1;
1814 event->state = PERF_EVENT_STATE_INACTIVE;
1815 if (event->pending_disable) {
1816 event->pending_disable = 0;
1817 event->state = PERF_EVENT_STATE_OFF;
1818 }
1819
1820 if (!is_software_event(event))
1821 cpuctx->active_oncpu--;
1822 if (!--ctx->nr_active)
1823 perf_event_ctx_deactivate(ctx);
1824 if (event->attr.freq && event->attr.sample_freq)
1825 ctx->nr_freq--;
1826 if (event->attr.exclusive || !cpuctx->active_oncpu)
1827 cpuctx->exclusive = 0;
1828
1829 perf_pmu_enable(event->pmu);
1830}
1831
1832static void
1833group_sched_out(struct perf_event *group_event,
1834 struct perf_cpu_context *cpuctx,
1835 struct perf_event_context *ctx)
1836{
1837 struct perf_event *event;
1838 int state = group_event->state;
1839
1840 perf_pmu_disable(ctx->pmu);
1841
1842 event_sched_out(group_event, cpuctx, ctx);
1843
1844
1845
1846
1847 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1848 event_sched_out(event, cpuctx, ctx);
1849
1850 perf_pmu_enable(ctx->pmu);
1851
1852 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1853 cpuctx->exclusive = 0;
1854}
1855
1856#define DETACH_GROUP 0x01UL
1857
1858
1859
1860
1861
1862
1863
1864static void
1865__perf_remove_from_context(struct perf_event *event,
1866 struct perf_cpu_context *cpuctx,
1867 struct perf_event_context *ctx,
1868 void *info)
1869{
1870 unsigned long flags = (unsigned long)info;
1871
1872 event_sched_out(event, cpuctx, ctx);
1873 if (flags & DETACH_GROUP)
1874 perf_group_detach(event);
1875 list_del_event(event, ctx);
1876
1877 if (!ctx->nr_events && ctx->is_active) {
1878 ctx->is_active = 0;
1879 if (ctx->task) {
1880 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
1881 cpuctx->task_ctx = NULL;
1882 }
1883 }
1884}
1885
/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
1896static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
1897{
1898 lockdep_assert_held(&event->ctx->mutex);
1899
1900 event_function_call(event, __perf_remove_from_context, (void *)flags);
1901}
1902
/*
 * Cross CPU call to disable a performance event
 */
1906static void __perf_event_disable(struct perf_event *event,
1907 struct perf_cpu_context *cpuctx,
1908 struct perf_event_context *ctx,
1909 void *info)
1910{
1911 if (event->state < PERF_EVENT_STATE_INACTIVE)
1912 return;
1913
1914 update_context_time(ctx);
1915 update_cgrp_time_from_event(event);
1916 update_group_times(event);
1917 if (event == event->group_leader)
1918 group_sched_out(event, cpuctx, ctx);
1919 else
1920 event_sched_out(event, cpuctx, ctx);
1921 event->state = PERF_EVENT_STATE_OFF;
1922}
1923
/*
 * Strategy: cross CPU call, disable the event on the call.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_pending_event, since
 * that runs on the current context with preemption disabled, and when
 * called through perf_event_for_each_child(), which holds the top-level
 * event's child_mutex.
 */
1938static void _perf_event_disable(struct perf_event *event)
1939{
1940 struct perf_event_context *ctx = event->ctx;
1941
1942 raw_spin_lock_irq(&ctx->lock);
1943 if (event->state <= PERF_EVENT_STATE_OFF) {
1944 raw_spin_unlock_irq(&ctx->lock);
1945 return;
1946 }
1947 raw_spin_unlock_irq(&ctx->lock);
1948
1949 event_function_call(event, __perf_event_disable, NULL);
1950}
1951
1952void perf_event_disable_local(struct perf_event *event)
1953{
1954 event_function_local(event, __perf_event_disable, NULL);
1955}
1956
1957
1958
1959
1960
1961void perf_event_disable(struct perf_event *event)
1962{
1963 struct perf_event_context *ctx;
1964
1965 ctx = perf_event_ctx_lock(event);
1966 _perf_event_disable(event);
1967 perf_event_ctx_unlock(event, ctx);
1968}
1969EXPORT_SYMBOL_GPL(perf_event_disable);
1970
1971void perf_event_disable_inatomic(struct perf_event *event)
1972{
1973 event->pending_disable = 1;
1974 irq_work_queue(&event->pending);
1975}
1976
1977static void perf_set_shadow_time(struct perf_event *event,
1978 struct perf_event_context *ctx,
1979 u64 tstamp)
1980{
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006 if (is_cgroup_event(event))
2007 perf_cgroup_set_shadow_time(event, tstamp);
2008 else
2009 event->shadow_ctx_time = tstamp - ctx->timestamp;
2010}
2011
2012#define MAX_INTERRUPTS (~0ULL)
2013
2014static void perf_log_throttle(struct perf_event *event, int enable);
2015static void perf_log_itrace_start(struct perf_event *event);
2016
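/*
 * Start a single event on this CPU: mark it ACTIVE, call pmu->add() and
 * bump the context's active counts; returns -EAGAIN if the PMU refuses.
 */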
2017static int
2018event_sched_in(struct perf_event *event,
2019 struct perf_cpu_context *cpuctx,
2020 struct perf_event_context *ctx)
2021{
2022 u64 tstamp = perf_event_time(event);
2023 int ret = 0;
2024
2025 lockdep_assert_held(&ctx->lock);
2026
2027 if (event->state <= PERF_EVENT_STATE_OFF)
2028 return 0;
2029
2030 WRITE_ONCE(event->oncpu, smp_processor_id());
2031
2032
2033
2034
2035 smp_wmb();
2036 WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
2037
2038
2039
2040
2041
2042
2043 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2044 perf_log_throttle(event, 1);
2045 event->hw.interrupts = 0;
2046 }
2047
2048
2049
2050
2051 smp_wmb();
2052
2053 perf_pmu_disable(event->pmu);
2054
2055 perf_set_shadow_time(event, ctx, tstamp);
2056
2057 perf_log_itrace_start(event);
2058
2059 if (event->pmu->add(event, PERF_EF_START)) {
2060 event->state = PERF_EVENT_STATE_INACTIVE;
2061 event->oncpu = -1;
2062 ret = -EAGAIN;
2063 goto out;
2064 }
2065
2066 event->tstamp_running += tstamp - event->tstamp_stopped;
2067
2068 if (!is_software_event(event))
2069 cpuctx->active_oncpu++;
2070 if (!ctx->nr_active++)
2071 perf_event_ctx_activate(ctx);
2072 if (event->attr.freq && event->attr.sample_freq)
2073 ctx->nr_freq++;
2074
2075 if (event->attr.exclusive)
2076 cpuctx->exclusive = 1;
2077
2078out:
2079 perf_pmu_enable(event->pmu);
2080
2081 return ret;
2082}
2083
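/*
 * Schedule a whole group atomically using the PMU transaction interface;
 * if any sibling fails to schedule, undo the ones already added and
 * return -EAGAIN.
 */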
2084static int
2085group_sched_in(struct perf_event *group_event,
2086 struct perf_cpu_context *cpuctx,
2087 struct perf_event_context *ctx)
2088{
2089 struct perf_event *event, *partial_group = NULL;
2090 struct pmu *pmu = ctx->pmu;
2091 u64 now = ctx->time;
2092 bool simulate = false;
2093
2094 if (group_event->state == PERF_EVENT_STATE_OFF)
2095 return 0;
2096
2097 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2098
2099 if (event_sched_in(group_event, cpuctx, ctx)) {
2100 pmu->cancel_txn(pmu);
2101 perf_mux_hrtimer_restart(cpuctx);
2102 return -EAGAIN;
2103 }
2104
2105
2106
2107
2108 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2109 if (event_sched_in(event, cpuctx, ctx)) {
2110 partial_group = event;
2111 goto group_error;
2112 }
2113 }
2114
2115 if (!pmu->commit_txn(pmu))
2116 return 0;
2117
2118group_error:
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2134 if (event == partial_group)
2135 simulate = true;
2136
2137 if (simulate) {
2138 event->tstamp_running += now - event->tstamp_stopped;
2139 event->tstamp_stopped = now;
2140 } else {
2141 event_sched_out(event, cpuctx, ctx);
2142 }
2143 }
2144 event_sched_out(group_event, cpuctx, ctx);
2145
2146 pmu->cancel_txn(pmu);
2147
2148 perf_mux_hrtimer_restart(cpuctx);
2149
2150 return -EAGAIN;
2151}
/*
 * Can this group go on the CPU right now?  Software-only groups always
 * can; otherwise exclusive events and an exclusive cpuctx restrict it.
 */
2156static int group_can_go_on(struct perf_event *event,
2157 struct perf_cpu_context *cpuctx,
2158 int can_add_hw)
2159{
2160
2161
2162
2163 if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2164 return 1;
2165
2166
2167
2168
2169 if (cpuctx->exclusive)
2170 return 0;
2171
2172
2173
2174
2175 if (event->attr.exclusive && cpuctx->active_oncpu)
2176 return 0;
2177
2178
2179
2180
2181 return can_add_hw;
2182}
2183
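/*
 * Add the event to the context's lists and initialise its timestamps to
 * the current context time.
 */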
2184static void add_event_to_ctx(struct perf_event *event,
2185 struct perf_event_context *ctx)
2186{
2187 u64 tstamp = perf_event_time(event);
2188
2189 list_add_event(event, ctx);
2190 perf_group_attach(event);
2191 event->tstamp_enabled = tstamp;
2192 event->tstamp_running = tstamp;
2193 event->tstamp_stopped = tstamp;
2194}
2195
2196static void ctx_sched_out(struct perf_event_context *ctx,
2197 struct perf_cpu_context *cpuctx,
2198 enum event_type_t event_type);
2199static void
2200ctx_sched_in(struct perf_event_context *ctx,
2201 struct perf_cpu_context *cpuctx,
2202 enum event_type_t event_type,
2203 struct task_struct *task);
2204
2205static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2206 struct perf_event_context *ctx)
2207{
2208 if (!cpuctx->task_ctx)
2209 return;
2210
2211 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2212 return;
2213
2214 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2215}
2216
2217static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2218 struct perf_event_context *ctx,
2219 struct task_struct *task)
2220{
2221 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2222 if (ctx)
2223 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2224 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2225 if (ctx)
2226 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2227}
2228
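/*
 * Reschedule everything on this CPU: schedule out both the CPU and task
 * contexts and bring them back in, so the pinned/flexible ordering of
 * perf_event_sched_in() is re-established and new events get considered.
 */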
2229static void ctx_resched(struct perf_cpu_context *cpuctx,
2230 struct perf_event_context *task_ctx)
2231{
2232 perf_pmu_disable(cpuctx->ctx.pmu);
2233 if (task_ctx)
2234 task_ctx_sched_out(cpuctx, task_ctx);
2235 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2236 perf_event_sched_in(cpuctx, task_ctx, current);
2237 perf_pmu_enable(cpuctx->ctx.pmu);
2238}
/*
 * Cross CPU call to install and enable a performance event.
 *
 * Very similar to remote_function() + event_function() but cannot assume
 * that things like ctx->is_active and cpuctx->task_ctx are set.
 */
2246static int __perf_install_in_context(void *info)
2247{
2248 struct perf_event *event = info;
2249 struct perf_event_context *ctx = event->ctx;
2250 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2251 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2252 bool activate = true;
2253 int ret = 0;
2254
2255 raw_spin_lock(&cpuctx->ctx.lock);
2256 if (ctx->task) {
2257 raw_spin_lock(&ctx->lock);
2258 task_ctx = ctx;
2259
2260
2261 if (task_cpu(ctx->task) != smp_processor_id()) {
2262 ret = -ESRCH;
2263 goto unlock;
2264 }
2265
2266
2267
2268
2269
2270
2271 if (ctx->task != current)
2272 activate = false;
2273 else
2274 WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2275
2276 } else if (task_ctx) {
2277 raw_spin_lock(&task_ctx->lock);
2278 }
2279
2280 if (activate) {
2281 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2282 add_event_to_ctx(event, ctx);
2283 ctx_resched(cpuctx, task_ctx);
2284 } else {
2285 add_event_to_ctx(event, ctx);
2286 }
2287
2288unlock:
2289 perf_ctx_unlock(cpuctx, task_ctx);
2290
2291 return ret;
2292}
2293
/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call(); see the comment there.
 */
2299static void
2300perf_install_in_context(struct perf_event_context *ctx,
2301 struct perf_event *event,
2302 int cpu)
2303{
2304 struct task_struct *task = READ_ONCE(ctx->task);
2305
2306 lockdep_assert_held(&ctx->mutex);
2307
2308 if (event->cpu != -1)
2309 event->cpu = cpu;
2310
2311
2312
2313
2314
2315 smp_store_release(&event->ctx, ctx);
2316
2317 if (!task) {
2318 cpu_function_call(cpu, __perf_install_in_context, event);
2319 return;
2320 }
2321
2322
2323
2324
2325 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2326 return;
2327
2328
2329
2330
2331
2332again:
2333
2334
2335
2336
2337 if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
2338 return;
2339
2340 raw_spin_lock_irq(&ctx->lock);
2341 task = ctx->task;
2342 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2343
2344
2345
2346
2347
2348 raw_spin_unlock_irq(&ctx->lock);
2349 return;
2350 }
2351 raw_spin_unlock_irq(&ctx->lock);
2352
2353
2354
2355
2356 goto again;
2357}
2358
/*
 * Put an event into inactive state and update time fields.
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
2367static void __perf_event_mark_enabled(struct perf_event *event)
2368{
2369 struct perf_event *sub;
2370 u64 tstamp = perf_event_time(event);
2371
2372 event->state = PERF_EVENT_STATE_INACTIVE;
2373 event->tstamp_enabled = tstamp - event->total_time_enabled;
2374 list_for_each_entry(sub, &event->sibling_list, group_entry) {
2375 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2376 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2377 }
2378}
2379
2380
2381
2382
2383static void __perf_event_enable(struct perf_event *event,
2384 struct perf_cpu_context *cpuctx,
2385 struct perf_event_context *ctx,
2386 void *info)
2387{
2388 struct perf_event *leader = event->group_leader;
2389 struct perf_event_context *task_ctx;
2390
2391 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2392 event->state <= PERF_EVENT_STATE_ERROR)
2393 return;
2394
2395 if (ctx->is_active)
2396 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2397
2398 __perf_event_mark_enabled(event);
2399
2400 if (!ctx->is_active)
2401 return;
2402
2403 if (!event_filter_match(event)) {
2404 if (is_cgroup_event(event))
2405 perf_cgroup_defer_enabled(event);
2406 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2407 return;
2408 }
2409
2410
2411
2412
2413
2414 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2415 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2416 return;
2417 }
2418
2419 task_ctx = cpuctx->task_ctx;
2420 if (ctx->task)
2421 WARN_ON_ONCE(task_ctx != ctx);
2422
2423 ctx_resched(cpuctx, task_ctx);
2424}
2425
/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable().
 */
2435static void _perf_event_enable(struct perf_event *event)
2436{
2437 struct perf_event_context *ctx = event->ctx;
2438
2439 raw_spin_lock_irq(&ctx->lock);
2440 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2441 event->state < PERF_EVENT_STATE_ERROR) {
2442 raw_spin_unlock_irq(&ctx->lock);
2443 return;
2444 }
2445
2446
2447
2448
2449
2450
2451
2452
2453 if (event->state == PERF_EVENT_STATE_ERROR)
2454 event->state = PERF_EVENT_STATE_OFF;
2455 raw_spin_unlock_irq(&ctx->lock);
2456
2457 event_function_call(event, __perf_event_enable, NULL);
2458}
2459
2460
2461
2462
2463void perf_event_enable(struct perf_event *event)
2464{
2465 struct perf_event_context *ctx;
2466
2467 ctx = perf_event_ctx_lock(event);
2468 _perf_event_enable(event);
2469 perf_event_ctx_unlock(event, ctx);
2470}
2471EXPORT_SYMBOL_GPL(perf_event_enable);
2472
2473struct stop_event_data {
2474 struct perf_event *event;
2475 unsigned int restart;
2476};
2477
2478static int __perf_event_stop(void *info)
2479{
2480 struct stop_event_data *sd = info;
2481 struct perf_event *event = sd->event;
2482
2483
2484 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2485 return 0;
2486
2487
2488 smp_rmb();
2489
2490
2491
2492
2493
2494 if (READ_ONCE(event->oncpu) != smp_processor_id())
2495 return -EAGAIN;
2496
2497 event->pmu->stop(event, PERF_EF_UPDATE);
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508 if (sd->restart)
2509 event->pmu->start(event, 0);
2510
2511 return 0;
2512}
2513
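/*
 * Stop (and optionally restart) @event on whichever CPU it is running on,
 * retrying if the event migrates between reading event->oncpu and the
 * cross-call landing there.
 */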
2514static int perf_event_stop(struct perf_event *event, int restart)
2515{
2516 struct stop_event_data sd = {
2517 .event = event,
2518 .restart = restart,
2519 };
2520 int ret = 0;
2521
2522 do {
2523 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2524 return 0;
2525
2526
2527 smp_rmb();
2528
2529
2530
2531
2532
2533
2534 ret = cpu_function_call(READ_ONCE(event->oncpu),
2535 __perf_event_stop, &sd);
2536 } while (ret == -EAGAIN);
2537
2538 return ret;
2539}
/*
 * Address range filtering: limiting sampled data to certain instruction
 * address ranges.  Filter updates from userspace bump
 * event::addr_filters_gen; PMU drivers call
 * perf_event_addr_filters_sync() while scheduling the event in, so the
 * hardware state (event::hw.addr_filters_gen) catches up with the latest
 * filter generation under ifh->lock.
 */
2563void perf_event_addr_filters_sync(struct perf_event *event)
2564{
2565 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2566
2567 if (!has_addr_filter(event))
2568 return;
2569
2570 raw_spin_lock(&ifh->lock);
2571 if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2572 event->pmu->addr_filters_sync(event);
2573 event->hw.addr_filters_gen = event->addr_filters_gen;
2574 }
2575 raw_spin_unlock(&ifh->lock);
2576}
2577EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2578
2579static int _perf_event_refresh(struct perf_event *event, int refresh)
2580{
2581
2582
2583
2584 if (event->attr.inherit || !is_sampling_event(event))
2585 return -EINVAL;
2586
2587 atomic_add(refresh, &event->event_limit);
2588 _perf_event_enable(event);
2589
2590 return 0;
2591}
2592
2593
2594
2595
2596int perf_event_refresh(struct perf_event *event, int refresh)
2597{
2598 struct perf_event_context *ctx;
2599 int ret;
2600
2601 ctx = perf_event_ctx_lock(event);
2602 ret = _perf_event_refresh(event, refresh);
2603 perf_event_ctx_unlock(event, ctx);
2604
2605 return ret;
2606}
2607EXPORT_SYMBOL_GPL(perf_event_refresh);
2608
2609static void ctx_sched_out(struct perf_event_context *ctx,
2610 struct perf_cpu_context *cpuctx,
2611 enum event_type_t event_type)
2612{
2613 int is_active = ctx->is_active;
2614 struct perf_event *event;
2615
2616 lockdep_assert_held(&ctx->lock);
2617
2618 if (likely(!ctx->nr_events)) {
2619
2620
2621
2622 WARN_ON_ONCE(ctx->is_active);
2623 if (ctx->task)
2624 WARN_ON_ONCE(cpuctx->task_ctx);
2625 return;
2626 }
2627
2628 ctx->is_active &= ~event_type;
2629 if (!(ctx->is_active & EVENT_ALL))
2630 ctx->is_active = 0;
2631
2632 if (ctx->task) {
2633 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2634 if (!ctx->is_active)
2635 cpuctx->task_ctx = NULL;
2636 }
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648 if (is_active & EVENT_TIME) {
2649
2650 update_context_time(ctx);
2651 update_cgrp_time_from_cpuctx(cpuctx);
2652 }
2653
2654 is_active ^= ctx->is_active;
2655
2656 if (!ctx->nr_active || !(is_active & EVENT_ALL))
2657 return;
2658
2659 perf_pmu_disable(ctx->pmu);
2660 if (is_active & EVENT_PINNED) {
2661 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2662 group_sched_out(event, cpuctx, ctx);
2663 }
2664
2665 if (is_active & EVENT_FLEXIBLE) {
2666 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2667 group_sched_out(event, cpuctx, ctx);
2668 }
2669 perf_pmu_enable(ctx->pmu);
2670}
/*
 * Test whether two contexts are equivalent, i.e. whether they have both
 * been cloned from the same version of the same context.
 *
 * Equivalence is measured using a generation number in the context that
 * is incremented on each modification to it; see unclone_ctx(),
 * list_add_event() and list_del_event().
 */
2680static int context_equiv(struct perf_event_context *ctx1,
2681 struct perf_event_context *ctx2)
2682{
2683 lockdep_assert_held(&ctx1->lock);
2684 lockdep_assert_held(&ctx2->lock);
2685
2686
2687 if (ctx1->pin_count || ctx2->pin_count)
2688 return 0;
2689
2690
2691 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2692 return 1;
2693
2694
2695 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2696 return 1;
2697
2698
2699
2700
2701
2702 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2703 ctx1->parent_gen == ctx2->parent_gen)
2704 return 1;
2705
2706
2707 return 0;
2708}
2709
2710static void __perf_event_sync_stat(struct perf_event *event,
2711 struct perf_event *next_event)
2712{
2713 u64 value;
2714
2715 if (!event->attr.inherit_stat)
2716 return;
2717
2718
2719
2720
2721
2722
2723
2724
2725 switch (event->state) {
2726 case PERF_EVENT_STATE_ACTIVE:
2727 event->pmu->read(event);
2728
2729
2730 case PERF_EVENT_STATE_INACTIVE:
2731 update_event_times(event);
2732 break;
2733
2734 default:
2735 break;
2736 }
2737
2738
2739
2740
2741
2742 value = local64_read(&next_event->count);
2743 value = local64_xchg(&event->count, value);
2744 local64_set(&next_event->count, value);
2745
2746 swap(event->total_time_enabled, next_event->total_time_enabled);
2747 swap(event->total_time_running, next_event->total_time_running);
2748
2749
2750
2751
2752 perf_event_update_userpage(event);
2753 perf_event_update_userpage(next_event);
2754}
2755
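/*
 * Walk both contexts' event lists in lockstep and exchange the counts of
 * inherit_stat events, so the values follow the task across the
 * context-switch optimization.
 */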
2756static void perf_event_sync_stat(struct perf_event_context *ctx,
2757 struct perf_event_context *next_ctx)
2758{
2759 struct perf_event *event, *next_event;
2760
2761 if (!ctx->nr_stat)
2762 return;
2763
2764 update_context_time(ctx);
2765
2766 event = list_first_entry(&ctx->event_list,
2767 struct perf_event, event_entry);
2768
2769 next_event = list_first_entry(&next_ctx->event_list,
2770 struct perf_event, event_entry);
2771
2772 while (&event->event_entry != &ctx->event_list &&
2773 &next_event->event_entry != &next_ctx->event_list) {
2774
2775 __perf_event_sync_stat(event, next_event);
2776
2777 event = list_next_entry(event, event_entry);
2778 next_event = list_next_entry(next_event, event_entry);
2779 }
2780}
2781
2782static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2783 struct task_struct *next)
2784{
2785 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2786 struct perf_event_context *next_ctx;
2787 struct perf_event_context *parent, *next_parent;
2788 struct perf_cpu_context *cpuctx;
2789 int do_switch = 1;
2790
2791 if (likely(!ctx))
2792 return;
2793
2794 cpuctx = __get_cpu_context(ctx);
2795 if (!cpuctx->task_ctx)
2796 return;
2797
2798 rcu_read_lock();
2799 next_ctx = next->perf_event_ctxp[ctxn];
2800 if (!next_ctx)
2801 goto unlock;
2802
2803 parent = rcu_dereference(ctx->parent_ctx);
2804 next_parent = rcu_dereference(next_ctx->parent_ctx);
2805
2806
2807 if (!parent && !next_parent)
2808 goto unlock;
2809
2810 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820 raw_spin_lock(&ctx->lock);
2821 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2822 if (context_equiv(ctx, next_ctx)) {
2823 WRITE_ONCE(ctx->task, next);
2824 WRITE_ONCE(next_ctx->task, task);
2825
2826 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2827
2828
2829
2830
2831
2832
2833
2834
2835 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
2836 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
2837
2838 do_switch = 0;
2839
2840 perf_event_sync_stat(ctx, next_ctx);
2841 }
2842 raw_spin_unlock(&next_ctx->lock);
2843 raw_spin_unlock(&ctx->lock);
2844 }
2845unlock:
2846 rcu_read_unlock();
2847
2848 if (do_switch) {
2849 raw_spin_lock(&ctx->lock);
2850 task_ctx_sched_out(cpuctx, ctx);
2851 raw_spin_unlock(&ctx->lock);
2852 }
2853}
2854
2855static DEFINE_PER_CPU(struct list_head, sched_cb_list);
2856
2857void perf_sched_cb_dec(struct pmu *pmu)
2858{
2859 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2860
2861 this_cpu_dec(perf_sched_cb_usages);
2862
2863 if (!--cpuctx->sched_cb_usage)
2864 list_del(&cpuctx->sched_cb_entry);
2865}
2866
2867
2868void perf_sched_cb_inc(struct pmu *pmu)
2869{
2870 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2871
2872 if (!cpuctx->sched_cb_usage++)
2873 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
2874
2875 this_cpu_inc(perf_sched_cb_usages);
2876}
2877
2878
2879
2880
2881
2882
2883
2884
2885
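/*
 * Invoke the context-switch callback of every PMU that registered one via
 * perf_sched_cb_inc(), with the PMU disabled and the context locked, so
 * drivers can save/restore per-task hardware state across the switch.
 */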
2886static void perf_pmu_sched_task(struct task_struct *prev,
2887 struct task_struct *next,
2888 bool sched_in)
2889{
2890 struct perf_cpu_context *cpuctx;
2891 struct pmu *pmu;
2892
2893 if (prev == next)
2894 return;
2895
2896 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
2897 pmu = cpuctx->unique_pmu;
2898
2899 if (WARN_ON_ONCE(!pmu->sched_task))
2900 continue;
2901
2902 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2903 perf_pmu_disable(pmu);
2904
2905 pmu->sched_task(cpuctx->task_ctx, sched_in);
2906
2907 perf_pmu_enable(pmu);
2908 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2909 }
2910}
2911
2912static void perf_event_switch(struct task_struct *task,
2913 struct task_struct *next_prev, bool sched_in);
2914
2915#define for_each_task_context_nr(ctxn) \
2916 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
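/*
 * Called from the scheduler, with interrupts disabled, to remove the
 * outgoing task's events: run any PMU switch callbacks, emit context
 * switch records, schedule out each task context and, if needed, switch
 * cgroup events over to the incoming task.
 */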
2929void __perf_event_task_sched_out(struct task_struct *task,
2930 struct task_struct *next)
2931{
2932 int ctxn;
2933
2934 if (__this_cpu_read(perf_sched_cb_usages))
2935 perf_pmu_sched_task(task, next, false);
2936
2937 if (atomic_read(&nr_switch_events))
2938 perf_event_switch(task, next, false);
2939
2940 for_each_task_context_nr(ctxn)
2941 perf_event_context_sched_out(task, ctxn, next);
2942
2943
2944
2945
2946
2947
2948 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2949 perf_cgroup_sched_out(task, next);
2950}
2951
2952
2953
2954
2955static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2956 enum event_type_t event_type)
2957{
2958 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2959}
2960
2961static void
2962ctx_pinned_sched_in(struct perf_event_context *ctx,
2963 struct perf_cpu_context *cpuctx)
2964{
2965 struct perf_event *event;
2966
2967 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2968 if (event->state <= PERF_EVENT_STATE_OFF)
2969 continue;
2970 if (!event_filter_match(event))
2971 continue;
2972
2973
2974 if (is_cgroup_event(event))
2975 perf_cgroup_mark_enabled(event, ctx);
2976
2977 if (group_can_go_on(event, cpuctx, 1))
2978 group_sched_in(event, cpuctx, ctx);
2979
2980
2981
2982
2983
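 /*
  * A pinned group that could not be (re)scheduled is moved to the
  * error state so we do not keep retrying it.
  */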
2984 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2985 update_group_times(event);
2986 event->state = PERF_EVENT_STATE_ERROR;
2987 }
2988 }
2989}
2990
2991static void
2992ctx_flexible_sched_in(struct perf_event_context *ctx,
2993 struct perf_cpu_context *cpuctx)
2994{
2995 struct perf_event *event;
2996 int can_add_hw = 1;
2997
2998 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2999
3000 if (event->state <= PERF_EVENT_STATE_OFF)
3001 continue;
3002
3003
3004
3005
3006 if (!event_filter_match(event))
3007 continue;
3008
3009
3010 if (is_cgroup_event(event))
3011 perf_cgroup_mark_enabled(event, ctx);
3012
3013 if (group_can_go_on(event, cpuctx, can_add_hw)) {
3014 if (group_sched_in(event, cpuctx, ctx))
3015 can_add_hw = 0;
3016 }
3017 }
3018}
3019
3020static void
3021ctx_sched_in(struct perf_event_context *ctx,
3022 struct perf_cpu_context *cpuctx,
3023 enum event_type_t event_type,
3024 struct task_struct *task)
3025{
3026 int is_active = ctx->is_active;
3027 u64 now;
3028
3029 lockdep_assert_held(&ctx->lock);
3030
3031 if (likely(!ctx->nr_events))
3032 return;
3033
3034 ctx->is_active |= (event_type | EVENT_TIME);
3035 if (ctx->task) {
3036 if (!is_active)
3037 cpuctx->task_ctx = ctx;
3038 else
3039 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3040 }
3041
3042 is_active ^= ctx->is_active;
3043
3044 if (is_active & EVENT_TIME) {
3045
3046 now = perf_clock();
3047 ctx->timestamp = now;
3048 perf_cgroup_set_timestamp(task, ctx);
3049 }
3050
3051
3052
3053
3054
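 /*
  * Schedule pinned groups first so they get the best chance of going
  * on; flexible groups then fill whatever capacity is left.
  */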
3055 if (is_active & EVENT_PINNED)
3056 ctx_pinned_sched_in(ctx, cpuctx);
3057
3058
3059 if (is_active & EVENT_FLEXIBLE)
3060 ctx_flexible_sched_in(ctx, cpuctx);
3061}
3062
3063static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3064 enum event_type_t event_type,
3065 struct task_struct *task)
3066{
3067 struct perf_event_context *ctx = &cpuctx->ctx;
3068
3069 ctx_sched_in(ctx, cpuctx, event_type, task);
3070}
3071
3072static void perf_event_context_sched_in(struct perf_event_context *ctx,
3073 struct task_struct *task)
3074{
3075 struct perf_cpu_context *cpuctx;
3076
3077 cpuctx = __get_cpu_context(ctx);
3078 if (cpuctx->task_ctx == ctx)
3079 return;
3080
3081 perf_ctx_lock(cpuctx, ctx);
3082 perf_pmu_disable(ctx->pmu);
3083
3084
3085
3086
3087
3088 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3089 perf_event_sched_in(cpuctx, ctx, task);
3090 perf_pmu_enable(ctx->pmu);
3091 perf_ctx_unlock(cpuctx, ctx);
3092}
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
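/*
 * Called from the scheduler, with interrupts disabled, to install the
 * incoming task's events: switch cgroup events first, schedule in each
 * task context, then emit switch records and run PMU switch callbacks.
 */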
3105void __perf_event_task_sched_in(struct task_struct *prev,
3106 struct task_struct *task)
3107{
3108 struct perf_event_context *ctx;
3109 int ctxn;
3110
3111
3112
3113
3114
3115
3116
3117
3118 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3119 perf_cgroup_sched_in(prev, task);
3120
3121 for_each_task_context_nr(ctxn) {
3122 ctx = task->perf_event_ctxp[ctxn];
3123 if (likely(!ctx))
3124 continue;
3125
3126 perf_event_context_sched_in(ctx, task);
3127 }
3128
3129 if (atomic_read(&nr_switch_events))
3130 perf_event_switch(task, prev, true);
3131
3132 if (__this_cpu_read(perf_sched_cb_usages))
3133 perf_pmu_sched_task(prev, task, true);
3134}
3135
3136static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3137{
3138 u64 frequency = event->attr.sample_freq;
3139 u64 sec = NSEC_PER_SEC;
3140 u64 divisor, dividend;
3141
3142 int count_fls, nsec_fls, frequency_fls, sec_fls;
3143
3144 count_fls = fls64(count);
3145 nsec_fls = fls64(nsec);
3146 frequency_fls = fls64(frequency);
3147 sec_fls = 30;
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
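 /*
  * The target period is roughly:
  *
  *            count * NSEC_PER_SEC
  *   period = --------------------
  *             nsec * sample_freq
  *
  * REDUCE_FLS() below trades precision for headroom so that neither
  * the dividend nor the divisor overflows 64 bits.
  */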
3163#define REDUCE_FLS(a, b) \
3164do { \
3165 if (a##_fls > b##_fls) { \
3166 a >>= 1; \
3167 a##_fls--; \
3168 } else { \
3169 b >>= 1; \
3170 b##_fls--; \
3171 } \
3172} while (0)
3173
3174
3175
3176
3177
3178 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3179 REDUCE_FLS(nsec, frequency);
3180 REDUCE_FLS(sec, count);
3181 }
3182
3183 if (count_fls + sec_fls > 64) {
3184 divisor = nsec * frequency;
3185
3186 while (count_fls + sec_fls > 64) {
3187 REDUCE_FLS(count, sec);
3188 divisor >>= 1;
3189 }
3190
3191 dividend = count * sec;
3192 } else {
3193 dividend = count * sec;
3194
3195 while (nsec_fls + frequency_fls > 64) {
3196 REDUCE_FLS(nsec, frequency);
3197 dividend >>= 1;
3198 }
3199
3200 divisor = nsec * frequency;
3201 }
3202
3203 if (!divisor)
3204 return dividend;
3205
3206 return div64_u64(dividend, divisor);
3207}
3208
3209static DEFINE_PER_CPU(int, perf_throttled_count);
3210static DEFINE_PER_CPU(u64, perf_throttled_seq);
3211
3212static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3213{
3214 struct hw_perf_event *hwc = &event->hw;
3215 s64 period, sample_period;
3216 s64 delta;
3217
3218 period = perf_calculate_period(event, nsec, count);
3219
3220 delta = (s64)(period - hwc->sample_period);
3221 delta = (delta + 7) / 8;
3222
3223 sample_period = hwc->sample_period + delta;
3224
3225 if (!sample_period)
3226 sample_period = 1;
3227
3228 hwc->sample_period = sample_period;
3229
3230 if (local64_read(&hwc->period_left) > 8*sample_period) {
3231 if (disable)
3232 event->pmu->stop(event, PERF_EF_UPDATE);
3233
3234 local64_set(&hwc->period_left, 0);
3235
3236 if (disable)
3237 event->pmu->start(event, PERF_EF_RELOAD);
3238 }
3239}
3240
3241
3242
3243
3244
3245
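/*
 * Tick-driven pass over a context's active events: unthrottle anything
 * that hit MAX_INTERRUPTS and re-estimate the sample period of
 * frequency-based events from the counts seen since the last tick.
 */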
3246static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3247 int needs_unthr)
3248{
3249 struct perf_event *event;
3250 struct hw_perf_event *hwc;
3251 u64 now, period = TICK_NSEC;
3252 s64 delta;
3253
3254
3255
3256
3257
3258
3259 if (!(ctx->nr_freq || needs_unthr))
3260 return;
3261
3262 raw_spin_lock(&ctx->lock);
3263 perf_pmu_disable(ctx->pmu);
3264
3265 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3266 if (event->state != PERF_EVENT_STATE_ACTIVE)
3267 continue;
3268
3269 if (!event_filter_match(event))
3270 continue;
3271
3272 perf_pmu_disable(event->pmu);
3273
3274 hwc = &event->hw;
3275
3276 if (hwc->interrupts == MAX_INTERRUPTS) {
3277 hwc->interrupts = 0;
3278 perf_log_throttle(event, 1);
3279 event->pmu->start(event, 0);
3280 }
3281
3282 if (!event->attr.freq || !event->attr.sample_freq)
3283 goto next;
3284
3285
3286
3287
3288 event->pmu->stop(event, PERF_EF_UPDATE);
3289
3290 now = local64_read(&event->count);
3291 delta = now - hwc->freq_count_stamp;
3292 hwc->freq_count_stamp = now;
3293
3294
3295
3296
3297
3298
3299
3300
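 /*
  * The event was already stopped above, so tell
  * perf_adjust_period() not to stop/start it again.
  */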
3301 if (delta > 0)
3302 perf_adjust_period(event, period, delta, false);
3303
3304 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3305 next:
3306 perf_pmu_enable(event->pmu);
3307 }
3308
3309 perf_pmu_enable(ctx->pmu);
3310 raw_spin_unlock(&ctx->lock);
3311}
3312
3313
3314
3315
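/*
 * Round-robin the flexible groups so that, over time, every group that
 * cannot be co-scheduled gets its share of the PMU.
 */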
3316static void rotate_ctx(struct perf_event_context *ctx)
3317{
3318
3319
3320
3321
3322 if (!ctx->rotate_disable)
3323 list_rotate_left(&ctx->flexible_groups);
3324}
3325
3326static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3327{
3328 struct perf_event_context *ctx = NULL;
3329 int rotate = 0;
3330
3331 if (cpuctx->ctx.nr_events) {
3332 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3333 rotate = 1;
3334 }
3335
3336 ctx = cpuctx->task_ctx;
3337 if (ctx && ctx->nr_events) {
3338 if (ctx->nr_events != ctx->nr_active)
3339 rotate = 1;
3340 }
3341
3342 if (!rotate)
3343 goto done;
3344
3345 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3346 perf_pmu_disable(cpuctx->ctx.pmu);
3347
3348 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3349 if (ctx)
3350 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3351
3352 rotate_ctx(&cpuctx->ctx);
3353 if (ctx)
3354 rotate_ctx(ctx);
3355
3356 perf_event_sched_in(cpuctx, ctx, current);
3357
3358 perf_pmu_enable(cpuctx->ctx.pmu);
3359 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3360done:
3361
3362 return rotate;
3363}
3364
3365void perf_event_task_tick(void)
3366{
3367 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3368 struct perf_event_context *ctx, *tmp;
3369 int throttled;
3370
3371 WARN_ON(!irqs_disabled());
3372
3373 __this_cpu_inc(perf_throttled_seq);
3374 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3375 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3376
3377 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3378 perf_adjust_freq_unthr_context(ctx, throttled);
3379}
3380
3381static int event_enable_on_exec(struct perf_event *event,
3382 struct perf_event_context *ctx)
3383{
3384 if (!event->attr.enable_on_exec)
3385 return 0;
3386
3387 event->attr.enable_on_exec = 0;
3388 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3389 return 0;
3390
3391 __perf_event_mark_enabled(event);
3392
3393 return 1;
3394}
3395
3396
3397
3398
3399
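/*
 * Enable all of current's events that were marked enable_on_exec, and
 * reschedule the context if any of them were actually enabled.
 */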
3400static void perf_event_enable_on_exec(int ctxn)
3401{
3402 struct perf_event_context *ctx, *clone_ctx = NULL;
3403 struct perf_cpu_context *cpuctx;
3404 struct perf_event *event;
3405 unsigned long flags;
3406 int enabled = 0;
3407
3408 local_irq_save(flags);
3409 ctx = current->perf_event_ctxp[ctxn];
3410 if (!ctx || !ctx->nr_events)
3411 goto out;
3412
3413 cpuctx = __get_cpu_context(ctx);
3414 perf_ctx_lock(cpuctx, ctx);
3415 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3416 list_for_each_entry(event, &ctx->event_list, event_entry)
3417 enabled |= event_enable_on_exec(event, ctx);
3418
3419
3420
3421
3422 if (enabled) {
3423 clone_ctx = unclone_ctx(ctx);
3424 ctx_resched(cpuctx, ctx);
3425 }
3426 perf_ctx_unlock(cpuctx, ctx);
3427
3428out:
3429 local_irq_restore(flags);
3430
3431 if (clone_ctx)
3432 put_ctx(clone_ctx);
3433}
3434
3435struct perf_read_data {
3436 struct perf_event *event;
3437 bool group;
3438 int ret;
3439};
3440
3441static int find_cpu_to_read(struct perf_event *event, int local_cpu)
3442{
3443 int event_cpu = event->oncpu;
3444 u16 local_pkg, event_pkg;
3445
3446 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3447 event_pkg = topology_physical_package_id(event_cpu);
3448 local_pkg = topology_physical_package_id(local_cpu);
3449
3450 if (event_pkg == local_pkg)
3451 return local_cpu;
3452 }
3453
3454 return event_cpu;
3455}
3456
3457
3458
3459
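/*
 * Cross-CPU call to read an event on the CPU where it is (or was) active.
 */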
3460static void __perf_event_read(void *info)
3461{
3462 struct perf_read_data *data = info;
3463 struct perf_event *sub, *event = data->event;
3464 struct perf_event_context *ctx = event->ctx;
3465 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3466 struct pmu *pmu = event->pmu;
3467
3468
3469
3470
3471
3472
3473
3474
3475 if (ctx->task && cpuctx->task_ctx != ctx)
3476 return;
3477
3478 raw_spin_lock(&ctx->lock);
3479 if (ctx->is_active) {
3480 update_context_time(ctx);
3481 update_cgrp_time_from_event(event);
3482 }
3483
3484 update_event_times(event);
3485 if (event->state != PERF_EVENT_STATE_ACTIVE)
3486 goto unlock;
3487
3488 if (!data->group) {
3489 pmu->read(event);
3490 data->ret = 0;
3491 goto unlock;
3492 }
3493
3494 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3495
3496 pmu->read(event);
3497
3498 list_for_each_entry(sub, &event->sibling_list, group_entry) {
3499 update_event_times(sub);
3500 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3501
3502
3503
3504
3505 sub->pmu->read(sub);
3506 }
3507 }
3508
3509 data->ret = pmu->commit_txn(pmu);
3510
3511unlock:
3512 raw_spin_unlock(&ctx->lock);
3513}
3514
3515static inline u64 perf_event_count(struct perf_event *event)
3516{
3517 if (event->pmu->count)
3518 return event->pmu->count(event);
3519
3520 return __perf_event_count(event);
3521}
3522
3523
3524
3525
3526
3527
3528
3529
3530
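/*
 * NMI-safe read of a local event, i.e. one that is bound to the current
 * task or to this CPU, has no inherited children to sum up, and provides
 * no custom ->count() method.  The WARN_ON_ONCE()s below spell out those
 * constraints.
 */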
3531u64 perf_event_read_local(struct perf_event *event)
3532{
3533 unsigned long flags;
3534 u64 val;
3535
3536
3537
3538
3539
3540 local_irq_save(flags);
3541
3542
3543 WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3544 event->hw.target != current);
3545
3546
3547 WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3548 event->cpu != smp_processor_id());
3549
3550
3551
3552
3553
3554 WARN_ON_ONCE(event->attr.inherit);
3555
3556
3557
3558
3559
3560 WARN_ON_ONCE(event->pmu->count);
3561
3562
3563
3564
3565
3566
3567 if (event->oncpu == smp_processor_id())
3568 event->pmu->read(event);
3569
3570 val = local64_read(&event->count);
3571 local_irq_restore(flags);
3572
3573 return val;
3574}
3575
3576static int perf_event_read(struct perf_event *event, bool group)
3577{
3578 int ret = 0, cpu_to_read, local_cpu;
3579
3580
3581
3582
3583
3584 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3585 struct perf_read_data data = {
3586 .event = event,
3587 .group = group,
3588 .ret = 0,
3589 };
3590
3591 local_cpu = get_cpu();
3592 cpu_to_read = find_cpu_to_read(event, local_cpu);
3593 put_cpu();
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
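 /*
  * The return value of smp_call_function_single() is purposely
  * ignored: if the event was scheduled out (or its CPU went away)
  * in the meantime, its count has already been updated, so we end
  * up with a current value either way.
  */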
3605 (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);
3606 ret = data.ret;
3607 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3608 struct perf_event_context *ctx = event->ctx;
3609 unsigned long flags;
3610
3611 raw_spin_lock_irqsave(&ctx->lock, flags);
3612
3613
3614
3615
3616
3617 if (ctx->is_active) {
3618 update_context_time(ctx);
3619 update_cgrp_time_from_event(event);
3620 }
3621 if (group)
3622 update_group_times(event);
3623 else
3624 update_event_times(event);
3625 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3626 }
3627
3628 return ret;
3629}
3630
3631
3632
3633
3634static void __perf_event_init_context(struct perf_event_context *ctx)
3635{
3636 raw_spin_lock_init(&ctx->lock);
3637 mutex_init(&ctx->mutex);
3638 INIT_LIST_HEAD(&ctx->active_ctx_list);
3639 INIT_LIST_HEAD(&ctx->pinned_groups);
3640 INIT_LIST_HEAD(&ctx->flexible_groups);
3641 INIT_LIST_HEAD(&ctx->event_list);
3642 atomic_set(&ctx->refcount, 1);
3643}
3644
3645static struct perf_event_context *
3646alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3647{
3648 struct perf_event_context *ctx;
3649
3650 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3651 if (!ctx)
3652 return NULL;
3653
3654 __perf_event_init_context(ctx);
3655 if (task) {
3656 ctx->task = task;
3657 get_task_struct(task);
3658 }
3659 ctx->pmu = pmu;
3660
3661 return ctx;
3662}
3663
3664static struct task_struct *
3665find_lively_task_by_vpid(pid_t vpid)
3666{
3667 struct task_struct *task;
3668
3669 rcu_read_lock();
3670 if (!vpid)
3671 task = current;
3672 else
3673 task = find_task_by_vpid(vpid);
3674 if (task)
3675 get_task_struct(task);
3676 rcu_read_unlock();
3677
3678 if (!task)
3679 return ERR_PTR(-ESRCH);
3680
3681 return task;
3682}
3683
3684
3685
3686
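/*
 * Find (or allocate) the context for this event, returned with an extra
 * reference and an elevated pin_count.
 */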
3687static struct perf_event_context *
3688find_get_context(struct pmu *pmu, struct task_struct *task,
3689 struct perf_event *event)
3690{
3691 struct perf_event_context *ctx, *clone_ctx = NULL;
3692 struct perf_cpu_context *cpuctx;
3693 void *task_ctx_data = NULL;
3694 unsigned long flags;
3695 int ctxn, err;
3696 int cpu = event->cpu;
3697
3698 if (!task) {
3699
3700 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3701 return ERR_PTR(-EACCES);
3702
3703
3704
3705
3706
3707
3708 if (!cpu_online(cpu))
3709 return ERR_PTR(-ENODEV);
3710
3711 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3712 ctx = &cpuctx->ctx;
3713 get_ctx(ctx);
3714 ++ctx->pin_count;
3715
3716 return ctx;
3717 }
3718
3719 err = -EINVAL;
3720 ctxn = pmu->task_ctx_nr;
3721 if (ctxn < 0)
3722 goto errout;
3723
3724 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3725 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3726 if (!task_ctx_data) {
3727 err = -ENOMEM;
3728 goto errout;
3729 }
3730 }
3731
3732retry:
3733 ctx = perf_lock_task_context(task, ctxn, &flags);
3734 if (ctx) {
3735 clone_ctx = unclone_ctx(ctx);
3736 ++ctx->pin_count;
3737
3738 if (task_ctx_data && !ctx->task_ctx_data) {
3739 ctx->task_ctx_data = task_ctx_data;
3740 task_ctx_data = NULL;
3741 }
3742 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3743
3744 if (clone_ctx)
3745 put_ctx(clone_ctx);
3746 } else {
3747 ctx = alloc_perf_context(pmu, task);
3748 err = -ENOMEM;
3749 if (!ctx)
3750 goto errout;
3751
3752 if (task_ctx_data) {
3753 ctx->task_ctx_data = task_ctx_data;
3754 task_ctx_data = NULL;
3755 }
3756
3757 err = 0;
3758 mutex_lock(&task->perf_event_mutex);
3759
3760
3761
3762
3763 if (task->flags & PF_EXITING)
3764 err = -ESRCH;
3765 else if (task->perf_event_ctxp[ctxn])
3766 err = -EAGAIN;
3767 else {
3768 get_ctx(ctx);
3769 ++ctx->pin_count;
3770 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3771 }
3772 mutex_unlock(&task->perf_event_mutex);
3773
3774 if (unlikely(err)) {
3775 put_ctx(ctx);
3776
3777 if (err == -EAGAIN)
3778 goto retry;
3779 goto errout;
3780 }
3781 }
3782
3783 kfree(task_ctx_data);
3784 return ctx;
3785
3786errout:
3787 kfree(task_ctx_data);
3788 return ERR_PTR(err);
3789}
3790
3791static void perf_event_free_filter(struct perf_event *event);
3792static void perf_event_free_bpf_prog(struct perf_event *event);
3793
3794static void free_event_rcu(struct rcu_head *head)
3795{
3796 struct perf_event *event;
3797
3798 event = container_of(head, struct perf_event, rcu_head);
3799 if (event->ns)
3800 put_pid_ns(event->ns);
3801 perf_event_free_filter(event);
3802 kfree(event);
3803}
3804
3805static void ring_buffer_attach(struct perf_event *event,
3806 struct ring_buffer *rb);
3807
3808static void detach_sb_event(struct perf_event *event)
3809{
3810 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
3811
3812 raw_spin_lock(&pel->lock);
3813 list_del_rcu(&event->sb_list);
3814 raw_spin_unlock(&pel->lock);
3815}
3816
3817static bool is_sb_event(struct perf_event *event)
3818{
3819 struct perf_event_attr *attr = &event->attr;
3820
3821 if (event->parent)
3822 return false;
3823
3824 if (event->attach_state & PERF_ATTACH_TASK)
3825 return false;
3826
3827 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
3828 attr->comm || attr->comm_exec ||
3829 attr->task ||
3830 attr->context_switch)
3831 return true;
3832 return false;
3833}
3834
3835static void unaccount_pmu_sb_event(struct perf_event *event)
3836{
3837 if (is_sb_event(event))
3838 detach_sb_event(event);
3839}
3840
3841static void unaccount_event_cpu(struct perf_event *event, int cpu)
3842{
3843 if (event->parent)
3844 return;
3845
3846 if (is_cgroup_event(event))
3847 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3848}
3849
3850#ifdef CONFIG_NO_HZ_FULL
3851static DEFINE_SPINLOCK(nr_freq_lock);
3852#endif
3853
3854static void unaccount_freq_event_nohz(void)
3855{
3856#ifdef CONFIG_NO_HZ_FULL
3857 spin_lock(&nr_freq_lock);
3858 if (atomic_dec_and_test(&nr_freq_events))
3859 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
3860 spin_unlock(&nr_freq_lock);
3861#endif
3862}
3863
3864static void unaccount_freq_event(void)
3865{
3866 if (tick_nohz_full_enabled())
3867 unaccount_freq_event_nohz();
3868 else
3869 atomic_dec(&nr_freq_events);
3870}
3871
3872static void unaccount_event(struct perf_event *event)
3873{
3874 bool dec = false;
3875
3876 if (event->parent)
3877 return;
3878
3879 if (event->attach_state & PERF_ATTACH_TASK)
3880 dec = true;
3881 if (event->attr.mmap || event->attr.mmap_data)
3882 atomic_dec(&nr_mmap_events);
3883 if (event->attr.comm)
3884 atomic_dec(&nr_comm_events);
3885 if (event->attr.task)
3886 atomic_dec(&nr_task_events);
3887 if (event->attr.freq)
3888 unaccount_freq_event();
3889 if (event->attr.context_switch) {
3890 dec = true;
3891 atomic_dec(&nr_switch_events);
3892 }
3893 if (is_cgroup_event(event))
3894 dec = true;
3895 if (has_branch_stack(event))
3896 dec = true;
3897
3898 if (dec) {
3899 if (!atomic_add_unless(&perf_sched_count, -1, 1))
3900 schedule_delayed_work(&perf_sched_work, HZ);
3901 }
3902
3903 unaccount_event_cpu(event, event->cpu);
3904
3905 unaccount_pmu_sb_event(event);
3906}
3907
3908static void perf_sched_delayed(struct work_struct *work)
3909{
3910 mutex_lock(&perf_sched_mutex);
3911 if (atomic_dec_and_test(&perf_sched_count))
3912 static_branch_disable(&perf_sched_events);
3913 mutex_unlock(&perf_sched_mutex);
3914}
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
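/*
 * PMUs with PERF_PMU_CAP_EXCLUSIVE may not mix per-task and per-cpu
 * events: exclusive_cnt counts task events as positive and cpu events as
 * negative, so the two kinds exclude each other on the same PMU.
 */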
3928static int exclusive_event_init(struct perf_event *event)
3929{
3930 struct pmu *pmu = event->pmu;
3931
3932 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3933 return 0;
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948 if (event->attach_state & PERF_ATTACH_TASK) {
3949 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3950 return -EBUSY;
3951 } else {
3952 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3953 return -EBUSY;
3954 }
3955
3956 return 0;
3957}
3958
3959static void exclusive_event_destroy(struct perf_event *event)
3960{
3961 struct pmu *pmu = event->pmu;
3962
3963 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3964 return;
3965
3966
3967 if (event->attach_state & PERF_ATTACH_TASK)
3968 atomic_dec(&pmu->exclusive_cnt);
3969 else
3970 atomic_inc(&pmu->exclusive_cnt);
3971}
3972
3973static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3974{
3975 if ((e1->pmu == e2->pmu) &&
3976 (e1->cpu == e2->cpu ||
3977 e1->cpu == -1 ||
3978 e2->cpu == -1))
3979 return true;
3980 return false;
3981}
3982
3983
3984static bool exclusive_event_installable(struct perf_event *event,
3985 struct perf_event_context *ctx)
3986{
3987 struct perf_event *iter_event;
3988 struct pmu *pmu = event->pmu;
3989
3990 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3991 return true;
3992
3993 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3994 if (exclusive_event_match(iter_event, event))
3995 return false;
3996 }
3997
3998 return true;
3999}
4000
4001static void perf_addr_filters_splice(struct perf_event *event,
4002 struct list_head *head);
4003
4004static void _free_event(struct perf_event *event)
4005{
4006 irq_work_sync(&event->pending);
4007
4008 unaccount_event(event);
4009
4010 if (event->rb) {
4011
4012
4013
4014
4015
4016
4017 mutex_lock(&event->mmap_mutex);
4018 ring_buffer_attach(event, NULL);
4019 mutex_unlock(&event->mmap_mutex);
4020 }
4021
4022 if (is_cgroup_event(event))
4023 perf_detach_cgroup(event);
4024
4025 if (!event->parent) {
4026 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4027 put_callchain_buffers();
4028 }
4029
4030 perf_event_free_bpf_prog(event);
4031 perf_addr_filters_splice(event, NULL);
4032 kfree(event->addr_filters_offs);
4033
4034 if (event->destroy)
4035 event->destroy(event);
4036
4037 if (event->ctx)
4038 put_ctx(event->ctx);
4039
4040 exclusive_event_destroy(event);
4041 module_put(event->pmu->module);
4042
4043 call_rcu(&event->rcu_head, free_event_rcu);
4044}
4045
4046
4047
4048
4049
4050static void free_event(struct perf_event *event)
4051{
4052 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4053 "unexpected event refcount: %ld; ptr=%p\n",
4054 atomic_long_read(&event->refcount), event)) {
4055
4056 return;
4057 }
4058
4059 _free_event(event);
4060}
4061
4062
4063
4064
4065static void perf_remove_from_owner(struct perf_event *event)
4066{
4067 struct task_struct *owner;
4068
4069 rcu_read_lock();
4070
4071
4072
4073
4074
4075
4076 owner = lockless_dereference(event->owner);
4077 if (owner) {
4078
4079
4080
4081
4082
4083 get_task_struct(owner);
4084 }
4085 rcu_read_unlock();
4086
4087 if (owner) {
4088
4089
4090
4091
4092
4093
4094
4095
4096 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4097
4098
4099
4100
4101
4102
4103
4104 if (event->owner) {
4105 list_del_init(&event->owner_entry);
4106 smp_store_release(&event->owner, NULL);
4107 }
4108 mutex_unlock(&owner->perf_event_mutex);
4109 put_task_struct(owner);
4110 }
4111}
4112
4113static void put_event(struct perf_event *event)
4114{
4115 if (!atomic_long_dec_and_test(&event->refcount))
4116 return;
4117
4118 _free_event(event);
4119}
4120
4121
4122
4123
4124
4125
4126int perf_event_release_kernel(struct perf_event *event)
4127{
4128 struct perf_event_context *ctx = event->ctx;
4129 struct perf_event *child, *tmp;
4130
4131
4132
4133
4134
4135 if (!ctx) {
4136 WARN_ON_ONCE(event->attach_state &
4137 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4138 goto no_ctx;
4139 }
4140
4141 if (!is_kernel_event(event))
4142 perf_remove_from_owner(event);
4143
4144 ctx = perf_event_ctx_lock(event);
4145 WARN_ON_ONCE(ctx->parent_ctx);
4146 perf_remove_from_context(event, DETACH_GROUP);
4147
4148 raw_spin_lock_irq(&ctx->lock);
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
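 /*
  * Mark the event DEAD while holding ctx->lock; anybody taking
  * child_mutex after this point must observe it and refrain from
  * adding new children, which guarantees the loop below sees every
  * child there will ever be.
  */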
4160 event->state = PERF_EVENT_STATE_DEAD;
4161 raw_spin_unlock_irq(&ctx->lock);
4162
4163 perf_event_ctx_unlock(event, ctx);
4164
4165again:
4166 mutex_lock(&event->child_mutex);
4167 list_for_each_entry(child, &event->child_list, child_list) {
4168
4169
4170
4171
4172
4173 ctx = lockless_dereference(child->ctx);
4174
4175
4176
4177
4178
4179
4180
4181
4182 get_ctx(ctx);
4183
4184
4185
4186
4187
4188
4189 mutex_unlock(&event->child_mutex);
4190 mutex_lock(&ctx->mutex);
4191 mutex_lock(&event->child_mutex);
4192
4193
4194
4195
4196
4197
4198 tmp = list_first_entry_or_null(&event->child_list,
4199 struct perf_event, child_list);
4200 if (tmp == child) {
4201 perf_remove_from_context(child, DETACH_GROUP);
4202 list_del(&child->child_list);
4203 free_event(child);
4204
4205
4206
4207
4208 put_event(event);
4209 }
4210
4211 mutex_unlock(&event->child_mutex);
4212 mutex_unlock(&ctx->mutex);
4213 put_ctx(ctx);
4214 goto again;
4215 }
4216 mutex_unlock(&event->child_mutex);
4217
4218no_ctx:
4219 put_event(event);
4220 return 0;
4221}
4222EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4223
4224
4225
4226
4227static int perf_release(struct inode *inode, struct file *file)
4228{
4229 perf_event_release_kernel(file->private_data);
4230 return 0;
4231}
4232
4233u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4234{
4235 struct perf_event *child;
4236 u64 total = 0;
4237
4238 *enabled = 0;
4239 *running = 0;
4240
4241 mutex_lock(&event->child_mutex);
4242
4243 (void)perf_event_read(event, false);
4244 total += perf_event_count(event);
4245
4246 *enabled += event->total_time_enabled +
4247 atomic64_read(&event->child_total_time_enabled);
4248 *running += event->total_time_running +
4249 atomic64_read(&event->child_total_time_running);
4250
4251 list_for_each_entry(child, &event->child_list, child_list) {
4252 (void)perf_event_read(child, false);
4253 total += perf_event_count(child);
4254 *enabled += child->total_time_enabled;
4255 *running += child->total_time_running;
4256 }
4257 mutex_unlock(&event->child_mutex);
4258
4259 return total;
4260}
4261EXPORT_SYMBOL_GPL(perf_event_read_value);
4262
4263static int __perf_read_group_add(struct perf_event *leader,
4264 u64 read_format, u64 *values)
4265{
4266 struct perf_event *sub;
4267 int n = 1;
4268 int ret;
4269
4270 ret = perf_event_read(leader, true);
4271 if (ret)
4272 return ret;
4273
4274
4275
4276
4277
4278
4279 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4280 values[n++] += leader->total_time_enabled +
4281 atomic64_read(&leader->child_total_time_enabled);
4282 }
4283
4284 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4285 values[n++] += leader->total_time_running +
4286 atomic64_read(&leader->child_total_time_running);
4287 }
4288
4289
4290
4291
4292 values[n++] += perf_event_count(leader);
4293 if (read_format & PERF_FORMAT_ID)
4294 values[n++] = primary_event_id(leader);
4295
4296 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4297 values[n++] += perf_event_count(sub);
4298 if (read_format & PERF_FORMAT_ID)
4299 values[n++] = primary_event_id(sub);
4300 }
4301
4302 return 0;
4303}
4304
4305static int perf_read_group(struct perf_event *event,
4306 u64 read_format, char __user *buf)
4307{
4308 struct perf_event *leader = event->group_leader, *child;
4309 struct perf_event_context *ctx = leader->ctx;
4310 int ret;
4311 u64 *values;
4312
4313 lockdep_assert_held(&ctx->mutex);
4314
4315 values = kzalloc(event->read_size, GFP_KERNEL);
4316 if (!values)
4317 return -ENOMEM;
4318
4319 values[0] = 1 + leader->nr_siblings;
4320
4321
4322
4323
4324
4325 mutex_lock(&leader->child_mutex);
4326
4327 ret = __perf_read_group_add(leader, read_format, values);
4328 if (ret)
4329 goto unlock;
4330
4331 list_for_each_entry(child, &leader->child_list, child_list) {
4332 ret = __perf_read_group_add(child, read_format, values);
4333 if (ret)
4334 goto unlock;
4335 }
4336
4337 mutex_unlock(&leader->child_mutex);
4338
4339 ret = event->read_size;
4340 if (copy_to_user(buf, values, event->read_size))
4341 ret = -EFAULT;
4342 goto out;
4343
4344unlock:
4345 mutex_unlock(&leader->child_mutex);
4346out:
4347 kfree(values);
4348 return ret;
4349}
4350
4351static int perf_read_one(struct perf_event *event,
4352 u64 read_format, char __user *buf)
4353{
4354 u64 enabled, running;
4355 u64 values[4];
4356 int n = 0;
4357
4358 values[n++] = perf_event_read_value(event, &enabled, &running);
4359 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4360 values[n++] = enabled;
4361 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4362 values[n++] = running;
4363 if (read_format & PERF_FORMAT_ID)
4364 values[n++] = primary_event_id(event);
4365
4366 if (copy_to_user(buf, values, n * sizeof(u64)))
4367 return -EFAULT;
4368
4369 return n * sizeof(u64);
4370}
4371
4372static bool is_event_hup(struct perf_event *event)
4373{
4374 bool no_children;
4375
4376 if (event->state > PERF_EVENT_STATE_EXIT)
4377 return false;
4378
4379 mutex_lock(&event->child_mutex);
4380 no_children = list_empty(&event->child_list);
4381 mutex_unlock(&event->child_mutex);
4382 return no_children;
4383}
4384
4385
4386
4387
4388static ssize_t
4389__perf_read(struct perf_event *event, char __user *buf, size_t count)
4390{
4391 u64 read_format = event->attr.read_format;
4392 int ret;
4393
4394
4395
4396
4397
4398
4399 if (event->state == PERF_EVENT_STATE_ERROR)
4400 return 0;
4401
4402 if (count < event->read_size)
4403 return -ENOSPC;
4404
4405 WARN_ON_ONCE(event->ctx->parent_ctx);
4406 if (read_format & PERF_FORMAT_GROUP)
4407 ret = perf_read_group(event, read_format, buf);
4408 else
4409 ret = perf_read_one(event, read_format, buf);
4410
4411 return ret;
4412}
4413
4414static ssize_t
4415perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4416{
4417 struct perf_event *event = file->private_data;
4418 struct perf_event_context *ctx;
4419 int ret;
4420
4421 ctx = perf_event_ctx_lock(event);
4422 ret = __perf_read(event, buf, count);
4423 perf_event_ctx_unlock(event, ctx);
4424
4425 return ret;
4426}
4427
4428static unsigned int perf_poll(struct file *file, poll_table *wait)
4429{
4430 struct perf_event *event = file->private_data;
4431 struct ring_buffer *rb;
4432 unsigned int events = POLLHUP;
4433
4434 poll_wait(file, &event->waitq, wait);
4435
4436 if (is_event_hup(event))
4437 return events;
4438
4439
4440
4441
4442
4443 mutex_lock(&event->mmap_mutex);
4444 rb = event->rb;
4445 if (rb)
4446 events = atomic_xchg(&rb->poll, 0);
4447 mutex_unlock(&event->mmap_mutex);
4448 return events;
4449}
4450
4451static void _perf_event_reset(struct perf_event *event)
4452{
4453 (void)perf_event_read(event, false);
4454 local64_set(&event->count, 0);
4455 perf_event_update_userpage(event);
4456}
4457
4458
4459
4460
4461
4462
4463
4464static void perf_event_for_each_child(struct perf_event *event,
4465 void (*func)(struct perf_event *))
4466{
4467 struct perf_event *child;
4468
4469 WARN_ON_ONCE(event->ctx->parent_ctx);
4470
4471 mutex_lock(&event->child_mutex);
4472 func(event);
4473 list_for_each_entry(child, &event->child_list, child_list)
4474 func(child);
4475 mutex_unlock(&event->child_mutex);
4476}
4477
4478static void perf_event_for_each(struct perf_event *event,
4479 void (*func)(struct perf_event *))
4480{
4481 struct perf_event_context *ctx = event->ctx;
4482 struct perf_event *sibling;
4483
4484 lockdep_assert_held(&ctx->mutex);
4485
4486 event = event->group_leader;
4487
4488 perf_event_for_each_child(event, func);
4489 list_for_each_entry(sibling, &event->sibling_list, group_entry)
4490 perf_event_for_each_child(sibling, func);
4491}
4492
4493static void __perf_event_period(struct perf_event *event,
4494 struct perf_cpu_context *cpuctx,
4495 struct perf_event_context *ctx,
4496 void *info)
4497{
4498 u64 value = *((u64 *)info);
4499 bool active;
4500
4501 if (event->attr.freq) {
4502 event->attr.sample_freq = value;
4503 } else {
4504 event->attr.sample_period = value;
4505 event->hw.sample_period = value;
4506 }
4507
4508 active = (event->state == PERF_EVENT_STATE_ACTIVE);
4509 if (active) {
4510 perf_pmu_disable(ctx->pmu);
4511
4512
4513
4514
4515 if (event->hw.interrupts == MAX_INTERRUPTS) {
4516 event->hw.interrupts = 0;
4517 perf_log_throttle(event, 1);
4518 }
4519 event->pmu->stop(event, PERF_EF_UPDATE);
4520 }
4521
4522 local64_set(&event->hw.period_left, 0);
4523
4524 if (active) {
4525 event->pmu->start(event, PERF_EF_RELOAD);
4526 perf_pmu_enable(ctx->pmu);
4527 }
4528}
4529
4530static int perf_event_period(struct perf_event *event, u64 __user *arg)
4531{
4532 u64 value;
4533
4534 if (!is_sampling_event(event))
4535 return -EINVAL;
4536
4537 if (copy_from_user(&value, arg, sizeof(value)))
4538 return -EFAULT;
4539
4540 if (!value)
4541 return -EINVAL;
4542
4543 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4544 return -EINVAL;
4545
4546 event_function_call(event, __perf_event_period, &value);
4547
4548 return 0;
4549}
4550
4551static const struct file_operations perf_fops;
4552
4553static inline int perf_fget_light(int fd, struct fd *p)
4554{
4555 struct fd f = fdget(fd);
4556 if (!f.file)
4557 return -EBADF;
4558
4559 if (f.file->f_op != &perf_fops) {
4560 fdput(f);
4561 return -EBADF;
4562 }
4563 *p = f;
4564 return 0;
4565}
4566
4567static int perf_event_set_output(struct perf_event *event,
4568 struct perf_event *output_event);
4569static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4570static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4571
4572static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4573{
4574 void (*func)(struct perf_event *);
4575 u32 flags = arg;
4576
4577 switch (cmd) {
4578 case PERF_EVENT_IOC_ENABLE:
4579 func = _perf_event_enable;
4580 break;
4581 case PERF_EVENT_IOC_DISABLE:
4582 func = _perf_event_disable;
4583 break;
4584 case PERF_EVENT_IOC_RESET:
4585 func = _perf_event_reset;
4586 break;
4587
4588 case PERF_EVENT_IOC_REFRESH:
4589 return _perf_event_refresh(event, arg);
4590
4591 case PERF_EVENT_IOC_PERIOD:
4592 return perf_event_period(event, (u64 __user *)arg);
4593
4594 case PERF_EVENT_IOC_ID:
4595 {
4596 u64 id = primary_event_id(event);
4597
4598 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4599 return -EFAULT;
4600 return 0;
4601 }
4602
4603 case PERF_EVENT_IOC_SET_OUTPUT:
4604 {
4605 int ret;
4606 if (arg != -1) {
4607 struct perf_event *output_event;
4608 struct fd output;
4609 ret = perf_fget_light(arg, &output);
4610 if (ret)
4611 return ret;
4612 output_event = output.file->private_data;
4613 ret = perf_event_set_output(event, output_event);
4614 fdput(output);
4615 } else {
4616 ret = perf_event_set_output(event, NULL);
4617 }
4618 return ret;
4619 }
4620
4621 case PERF_EVENT_IOC_SET_FILTER:
4622 return perf_event_set_filter(event, (void __user *)arg);
4623
4624 case PERF_EVENT_IOC_SET_BPF:
4625 return perf_event_set_bpf_prog(event, arg);
4626
4627 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
4628 struct ring_buffer *rb;
4629
4630 rcu_read_lock();
4631 rb = rcu_dereference(event->rb);
4632 if (!rb || !rb->nr_pages) {
4633 rcu_read_unlock();
4634 return -EINVAL;
4635 }
4636 rb_toggle_paused(rb, !!arg);
4637 rcu_read_unlock();
4638 return 0;
4639 }
4640 default:
4641 return -ENOTTY;
4642 }
4643
4644 if (flags & PERF_IOC_FLAG_GROUP)
4645 perf_event_for_each(event, func);
4646 else
4647 perf_event_for_each_child(event, func);
4648
4649 return 0;
4650}
4651
4652static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4653{
4654 struct perf_event *event = file->private_data;
4655 struct perf_event_context *ctx;
4656 long ret;
4657
4658 ctx = perf_event_ctx_lock(event);
4659 ret = _perf_ioctl(event, cmd, arg);
4660 perf_event_ctx_unlock(event, ctx);
4661
4662 return ret;
4663}
4664
4665#ifdef CONFIG_COMPAT
4666static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4667 unsigned long arg)
4668{
4669 switch (_IOC_NR(cmd)) {
4670 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4671 case _IOC_NR(PERF_EVENT_IOC_ID):
4672
4673 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4674 cmd &= ~IOCSIZE_MASK;
4675 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4676 }
4677 break;
4678 }
4679 return perf_ioctl(file, cmd, arg);
4680}
4681#else
4682# define perf_compat_ioctl NULL
4683#endif
4684
4685int perf_event_task_enable(void)
4686{
4687 struct perf_event_context *ctx;
4688 struct perf_event *event;
4689
4690 mutex_lock(&current->perf_event_mutex);
4691 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4692 ctx = perf_event_ctx_lock(event);
4693 perf_event_for_each_child(event, _perf_event_enable);
4694 perf_event_ctx_unlock(event, ctx);
4695 }
4696 mutex_unlock(&current->perf_event_mutex);
4697
4698 return 0;
4699}
4700
4701int perf_event_task_disable(void)
4702{
4703 struct perf_event_context *ctx;
4704 struct perf_event *event;
4705
4706 mutex_lock(&current->perf_event_mutex);
4707 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4708 ctx = perf_event_ctx_lock(event);
4709 perf_event_for_each_child(event, _perf_event_disable);
4710 perf_event_ctx_unlock(event, ctx);
4711 }
4712 mutex_unlock(&current->perf_event_mutex);
4713
4714 return 0;
4715}
4716
4717static int perf_event_index(struct perf_event *event)
4718{
4719 if (event->hw.state & PERF_HES_STOPPED)
4720 return 0;
4721
4722 if (event->state != PERF_EVENT_STATE_ACTIVE)
4723 return 0;
4724
4725 return event->pmu->event_idx(event);
4726}
4727
4728static void calc_timer_values(struct perf_event *event,
4729 u64 *now,
4730 u64 *enabled,
4731 u64 *running)
4732{
4733 u64 ctx_time;
4734
4735 *now = perf_clock();
4736 ctx_time = event->shadow_ctx_time + *now;
4737 *enabled = ctx_time - event->tstamp_enabled;
4738 *running = ctx_time - event->tstamp_running;
4739}
4740
4741static void perf_event_init_userpage(struct perf_event *event)
4742{
4743 struct perf_event_mmap_page *userpg;
4744 struct ring_buffer *rb;
4745
4746 rcu_read_lock();
4747 rb = rcu_dereference(event->rb);
4748 if (!rb)
4749 goto unlock;
4750
4751 userpg = rb->user_page;
4752
4753
4754 userpg->cap_bit0_is_deprecated = 1;
4755 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4756 userpg->data_offset = PAGE_SIZE;
4757 userpg->data_size = perf_data_size(rb);
4758
4759unlock:
4760 rcu_read_unlock();
4761}
4762
4763void __weak arch_perf_update_userpage(
4764 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4765{
4766}
4767
4768
4769
4770
4771
4772
4773void perf_event_update_userpage(struct perf_event *event)
4774{
4775 struct perf_event_mmap_page *userpg;
4776 struct ring_buffer *rb;
4777 u64 enabled, running, now;
4778
4779 rcu_read_lock();
4780 rb = rcu_dereference(event->rb);
4781 if (!rb)
4782 goto unlock;
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793 calc_timer_values(event, &now, &enabled, &running);
4794
4795 userpg = rb->user_page;
4796
4797
4798
4799
4800 preempt_disable();
4801 ++userpg->lock;
4802 barrier();
4803 userpg->index = perf_event_index(event);
4804 userpg->offset = perf_event_count(event);
4805 if (userpg->index)
4806 userpg->offset -= local64_read(&event->hw.prev_count);
4807
4808 userpg->time_enabled = enabled +
4809 atomic64_read(&event->child_total_time_enabled);
4810
4811 userpg->time_running = running +
4812 atomic64_read(&event->child_total_time_running);
4813
4814 arch_perf_update_userpage(event, userpg, now);
4815
4816 barrier();
4817 ++userpg->lock;
4818 preempt_enable();
4819unlock:
4820 rcu_read_unlock();
4821}
4822
4823static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4824{
4825 struct perf_event *event = vma->vm_file->private_data;
4826 struct ring_buffer *rb;
4827 int ret = VM_FAULT_SIGBUS;
4828
4829 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4830 if (vmf->pgoff == 0)
4831 ret = 0;
4832 return ret;
4833 }
4834
4835 rcu_read_lock();
4836 rb = rcu_dereference(event->rb);
4837 if (!rb)
4838 goto unlock;
4839
4840 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4841 goto unlock;
4842
4843 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4844 if (!vmf->page)
4845 goto unlock;
4846
4847 get_page(vmf->page);
4848 vmf->page->mapping = vma->vm_file->f_mapping;
4849 vmf->page->index = vmf->pgoff;
4850
4851 ret = 0;
4852unlock:
4853 rcu_read_unlock();
4854
4855 return ret;
4856}
4857
4858static void ring_buffer_attach(struct perf_event *event,
4859 struct ring_buffer *rb)
4860{
4861 struct ring_buffer *old_rb = NULL;
4862 unsigned long flags;
4863
4864 if (event->rb) {
4865
4866
4867
4868
4869 WARN_ON_ONCE(event->rcu_pending);
4870
4871 old_rb = event->rb;
4872 spin_lock_irqsave(&old_rb->event_lock, flags);
4873 list_del_rcu(&event->rb_entry);
4874 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4875
4876 event->rcu_batches = get_state_synchronize_rcu();
4877 event->rcu_pending = 1;
4878 }
4879
4880 if (rb) {
4881 if (event->rcu_pending) {
4882 cond_synchronize_rcu(event->rcu_batches);
4883 event->rcu_pending = 0;
4884 }
4885
4886 spin_lock_irqsave(&rb->event_lock, flags);
4887 list_add_rcu(&event->rb_entry, &rb->event_list);
4888 spin_unlock_irqrestore(&rb->event_lock, flags);
4889 }
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901 if (has_aux(event))
4902 perf_event_stop(event, 0);
4903
4904 rcu_assign_pointer(event->rb, rb);
4905
4906 if (old_rb) {
4907 ring_buffer_put(old_rb);
4908
4909
4910
4911
4912
4913 wake_up_all(&event->waitq);
4914 }
4915}
4916
4917static void ring_buffer_wakeup(struct perf_event *event)
4918{
4919 struct ring_buffer *rb;
4920
4921 rcu_read_lock();
4922 rb = rcu_dereference(event->rb);
4923 if (rb) {
4924 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4925 wake_up_all(&event->waitq);
4926 }
4927 rcu_read_unlock();
4928}
4929
4930struct ring_buffer *ring_buffer_get(struct perf_event *event)
4931{
4932 struct ring_buffer *rb;
4933
4934 rcu_read_lock();
4935 rb = rcu_dereference(event->rb);
4936 if (rb) {
4937 if (!atomic_inc_not_zero(&rb->refcount))
4938 rb = NULL;
4939 }
4940 rcu_read_unlock();
4941
4942 return rb;
4943}
4944
4945void ring_buffer_put(struct ring_buffer *rb)
4946{
4947 if (!atomic_dec_and_test(&rb->refcount))
4948 return;
4949
4950 WARN_ON_ONCE(!list_empty(&rb->event_list));
4951
4952 call_rcu(&rb->rcu_head, rb_free_rcu);
4953}
4954
4955static void perf_mmap_open(struct vm_area_struct *vma)
4956{
4957 struct perf_event *event = vma->vm_file->private_data;
4958
4959 atomic_inc(&event->mmap_count);
4960 atomic_inc(&event->rb->mmap_count);
4961
4962 if (vma->vm_pgoff)
4963 atomic_inc(&event->rb->aux_mmap_count);
4964
4965 if (event->pmu->event_mapped)
4966 event->pmu->event_mapped(event);
4967}
4968
4969static void perf_pmu_output_stop(struct perf_event *event);
4970
4971
4972
4973
4974
4975
4976
4977
4978
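/*
 * A ring buffer can be mmap()ed by several events (redirected output), so
 * on unmap we may have to detach it not just from this event but from
 * every other event still attached to the same buffer, and only then
 * undo the VM accounting.
 */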
4979static void perf_mmap_close(struct vm_area_struct *vma)
4980{
4981 struct perf_event *event = vma->vm_file->private_data;
4982
4983 struct ring_buffer *rb = ring_buffer_get(event);
4984 struct user_struct *mmap_user = rb->mmap_user;
4985 int mmap_locked = rb->mmap_locked;
4986 unsigned long size = perf_data_size(rb);
4987
4988 if (event->pmu->event_unmapped)
4989 event->pmu->event_unmapped(event);
4990
4991
4992
4993
4994
4995
4996 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4997 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4998
4999
5000
5001
5002
5003
5004 perf_pmu_output_stop(event);
5005
5006
5007 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5008 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5009
5010
5011 rb_free_aux(rb);
5012 WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5013
5014 mutex_unlock(&event->mmap_mutex);
5015 }
5016
5017 atomic_dec(&rb->mmap_count);
5018
5019 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5020 goto out_put;
5021
5022 ring_buffer_attach(event, NULL);
5023 mutex_unlock(&event->mmap_mutex);
5024
5025
5026 if (atomic_read(&rb->mmap_count))
5027 goto out_put;
5028
5029
5030
5031
5032
5033
5034again:
5035 rcu_read_lock();
5036 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5037 if (!atomic_long_inc_not_zero(&event->refcount)) {
5038
5039
5040
5041
5042 continue;
5043 }
5044 rcu_read_unlock();
5045
5046 mutex_lock(&event->mmap_mutex);
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057 if (event->rb == rb)
5058 ring_buffer_attach(event, NULL);
5059
5060 mutex_unlock(&event->mmap_mutex);
5061 put_event(event);
5062
5063
5064
5065
5066
5067 goto again;
5068 }
5069 rcu_read_unlock();
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5081 vma->vm_mm->pinned_vm -= mmap_locked;
5082 free_uid(mmap_user);
5083
5084out_put:
5085 ring_buffer_put(rb);
5086}
5087
5088static const struct vm_operations_struct perf_mmap_vmops = {
5089 .open = perf_mmap_open,
5090 .close = perf_mmap_close,
5091 .fault = perf_mmap_fault,
5092 .page_mkwrite = perf_mmap_fault,
5093};
5094
5095static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5096{
5097 struct perf_event *event = file->private_data;
5098 unsigned long user_locked, user_lock_limit;
5099 struct user_struct *user = current_user();
5100 unsigned long locked, lock_limit;
5101 struct ring_buffer *rb = NULL;
5102 unsigned long vma_size;
5103 unsigned long nr_pages;
5104 long user_extra = 0, extra = 0;
5105 int ret = 0, flags = 0;
5106
5107
5108
5109
5110
5111
5112 if (event->cpu == -1 && event->attr.inherit)
5113 return -EINVAL;
5114
5115 if (!(vma->vm_flags & VM_SHARED))
5116 return -EINVAL;
5117
5118 vma_size = vma->vm_end - vma->vm_start;
5119
5120 if (vma->vm_pgoff == 0) {
5121 nr_pages = (vma_size / PAGE_SIZE) - 1;
5122 } else {
5123
5124
5125
5126
5127
5128 u64 aux_offset, aux_size;
5129
5130 if (!event->rb)
5131 return -EINVAL;
5132
5133 nr_pages = vma_size / PAGE_SIZE;
5134
5135 mutex_lock(&event->mmap_mutex);
5136 ret = -EINVAL;
5137
5138 rb = event->rb;
5139 if (!rb)
5140 goto aux_unlock;
5141
5142 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
5143 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
5144
5145 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5146 goto aux_unlock;
5147
5148 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5149 goto aux_unlock;
5150
5151
5152 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5153 goto aux_unlock;
5154
5155 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5156 goto aux_unlock;
5157
5158
5159 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5160 goto aux_unlock;
5161
5162 if (!is_power_of_2(nr_pages))
5163 goto aux_unlock;
5164
5165 if (!atomic_inc_not_zero(&rb->mmap_count))
5166 goto aux_unlock;
5167
5168 if (rb_has_aux(rb)) {
5169 atomic_inc(&rb->aux_mmap_count);
5170 ret = 0;
5171 goto unlock;
5172 }
5173
5174 atomic_set(&rb->aux_mmap_count, 1);
5175 user_extra = nr_pages;
5176
5177 goto accounting;
5178 }
5179
5180
5181
5182
5183
5184 if (nr_pages != 0 && !is_power_of_2(nr_pages))
5185 return -EINVAL;
5186
5187 if (vma_size != PAGE_SIZE * (1 + nr_pages))
5188 return -EINVAL;
5189
5190 WARN_ON_ONCE(event->ctx->parent_ctx);
5191again:
5192 mutex_lock(&event->mmap_mutex);
5193 if (event->rb) {
5194 if (event->rb->nr_pages != nr_pages) {
5195 ret = -EINVAL;
5196 goto unlock;
5197 }
5198
5199 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5200
5201
5202
5203
5204
5205 mutex_unlock(&event->mmap_mutex);
5206 goto again;
5207 }
5208
5209 goto unlock;
5210 }
5211
5212 user_extra = nr_pages + 1;
5213
5214accounting:
5215 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5216
5217
5218
5219
5220 user_lock_limit *= num_online_cpus();
5221
5222 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5223
5224 if (user_locked > user_lock_limit)
5225 extra = user_locked - user_lock_limit;
5226
5227 lock_limit = rlimit(RLIMIT_MEMLOCK);
5228 lock_limit >>= PAGE_SHIFT;
5229 locked = vma->vm_mm->pinned_vm + extra;
5230
5231 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5232 !capable(CAP_IPC_LOCK)) {
5233 ret = -EPERM;
5234 goto unlock;
5235 }
5236
5237 WARN_ON(!rb && event->rb);
5238
5239 if (vma->vm_flags & VM_WRITE)
5240 flags |= RING_BUFFER_WRITABLE;
5241
5242 if (!rb) {
5243 rb = rb_alloc(nr_pages,
5244 event->attr.watermark ? event->attr.wakeup_watermark : 0,
5245 event->cpu, flags);
5246
5247 if (!rb) {
5248 ret = -ENOMEM;
5249 goto unlock;
5250 }
5251
5252 atomic_set(&rb->mmap_count, 1);
5253 rb->mmap_user = get_current_user();
5254 rb->mmap_locked = extra;
5255
5256 ring_buffer_attach(event, rb);
5257
5258 perf_event_init_userpage(event);
5259 perf_event_update_userpage(event);
5260 } else {
5261 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5262 event->attr.aux_watermark, flags);
5263 if (!ret)
5264 rb->aux_mmap_locked = extra;
5265 }
5266
5267unlock:
5268 if (!ret) {
5269 atomic_long_add(user_extra, &user->locked_vm);
5270 vma->vm_mm->pinned_vm += extra;
5271
5272 atomic_inc(&event->mmap_count);
5273 } else if (rb) {
5274 atomic_dec(&rb->mmap_count);
5275 }
5276aux_unlock:
5277 mutex_unlock(&event->mmap_mutex);
5278
5279
5280
5281
5282
5283 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5284 vma->vm_ops = &perf_mmap_vmops;
5285
5286 if (event->pmu->event_mapped)
5287 event->pmu->event_mapped(event);
5288
5289 return ret;
5290}
5291
5292static int perf_fasync(int fd, struct file *filp, int on)
5293{
5294 struct inode *inode = file_inode(filp);
5295 struct perf_event *event = filp->private_data;
5296 int retval;
5297
5298 inode_lock(inode);
5299 retval = fasync_helper(fd, filp, on, &event->fasync);
5300 inode_unlock(inode);
5301
5302 if (retval < 0)
5303 return retval;
5304
5305 return 0;
5306}
5307
5308static const struct file_operations perf_fops = {
5309 .llseek = no_llseek,
5310 .release = perf_release,
5311 .read = perf_read,
5312 .poll = perf_poll,
5313 .unlocked_ioctl = perf_ioctl,
5314 .compat_ioctl = perf_compat_ioctl,
5315 .mmap = perf_mmap,
5316 .fasync = perf_fasync,
5317};
5318
5319
5320
5321
5322
5323
5324
5325
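/*
 * Only the parent event carries fasync state; inherited children redirect
 * wakeups to it.
 */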
5326static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5327{
5328
5329 if (event->parent)
5330 event = event->parent;
5331 return &event->fasync;
5332}
5333
5334void perf_event_wakeup(struct perf_event *event)
5335{
5336 ring_buffer_wakeup(event);
5337
5338 if (event->pending_kill) {
5339 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5340 event->pending_kill = 0;
5341 }
5342}
5343
5344static void perf_pending_event(struct irq_work *entry)
5345{
5346 struct perf_event *event = container_of(entry,
5347 struct perf_event, pending);
5348 int rctx;
5349
5350 rctx = perf_swevent_get_recursion_context();
5351
5352
5353
5354
5355
5356 if (event->pending_disable) {
5357 event->pending_disable = 0;
5358 perf_event_disable_local(event);
5359 }
5360
5361 if (event->pending_wakeup) {
5362 event->pending_wakeup = 0;
5363 perf_event_wakeup(event);
5364 }
5365
5366 if (rctx >= 0)
5367 perf_swevent_put_recursion_context(rctx);
5368}
5369
5370
5371
5372
5373
5374
5375struct perf_guest_info_callbacks *perf_guest_cbs;
5376
5377int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5378{
5379 perf_guest_cbs = cbs;
5380 return 0;
5381}
5382EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5383
5384int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5385{
5386 perf_guest_cbs = NULL;
5387 return 0;
5388}
5389EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5390
5391static void
5392perf_output_sample_regs(struct perf_output_handle *handle,
5393 struct pt_regs *regs, u64 mask)
5394{
5395 int bit;
5396 DECLARE_BITMAP(_mask, 64);
5397
5398 bitmap_from_u64(_mask, mask);
5399 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5400 u64 val;
5401
5402 val = perf_reg_value(regs, bit);
5403 perf_output_put(handle, val);
5404 }
5405}
5406
5407static void perf_sample_regs_user(struct perf_regs *regs_user,
5408 struct pt_regs *regs,
5409 struct pt_regs *regs_user_copy)
5410{
5411 if (user_mode(regs)) {
5412 regs_user->abi = perf_reg_abi(current);
5413 regs_user->regs = regs;
5414 } else if (current->mm) {
5415 perf_get_regs_user(regs_user, regs, regs_user_copy);
5416 } else {
5417 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5418 regs_user->regs = NULL;
5419 }
5420}
5421
5422static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5423 struct pt_regs *regs)
5424{
5425 regs_intr->regs = regs;
5426 regs_intr->abi = perf_reg_abi(current);
5427}
5428
/*
 * Get the remaining size of the user stack, based on the current user
 * stack pointer.
 *
 * Ideally this would walk the stack VMA to bound the size precisely, but
 * that cannot be done safely from interrupt context, so TASK_SIZE is used
 * as the upper limit instead.
 */
5437static u64 perf_ustack_task_size(struct pt_regs *regs)
5438{
5439 unsigned long addr = perf_user_stack_pointer(regs);
5440
5441 if (!addr || addr >= TASK_SIZE)
5442 return 0;
5443
5444 return TASK_SIZE - addr;
5445}
5446
5447static u16
5448perf_sample_ustack_size(u16 stack_size, u16 header_size,
5449 struct pt_regs *regs)
5450{
5451 u64 task_size;
5452
	/* No regs means no user stack pointer, hence nothing to dump. */
5454 if (!regs)
5455 return 0;
5456
	/*
	 * Clamp the requested dump size so that it:
	 *
	 * - does not exceed what is left of the user stack (bounded by
	 *   TASK_SIZE), and
	 *
	 * - still fits, together with the header, into the u16-sized
	 *   sample; if not, it is trimmed further below.
	 */
5467 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5468 stack_size = min(stack_size, (u16) task_size);
5469
5470
5471 header_size += 2 * sizeof(u64);
5472
5473
5474 if ((u16) (header_size + stack_size) < header_size) {
5475
5476
5477
5478
5479 stack_size = USHRT_MAX - header_size - sizeof(u64);
5480 stack_size = round_up(stack_size, sizeof(u64));
5481 }
5482
5483 return stack_size;
5484}
5485
5486static void
5487perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5488 struct pt_regs *regs)
5489{
5490
5491 if (!regs) {
5492 u64 size = 0;
5493 perf_output_put(handle, size);
5494 } else {
5495 unsigned long sp;
5496 unsigned int rem;
5497 u64 dyn_size;
5498
		/*
		 * The dump layout is:
		 *
		 *   static size  - the size requested by the user, or the
		 *                  largest amount that fits in the sample
		 *   data         - the raw user stack bytes
		 *   dynamic size - how much of 'data' was actually copied
		 */
5511 perf_output_put(handle, dump_size);
5512
5513
5514 sp = perf_user_stack_pointer(regs);
5515 rem = __output_copy_user(handle, (void *) sp, dump_size);
5516 dyn_size = dump_size - rem;
5517
5518 perf_output_skip(handle, rem);
5519
5520
5521 perf_output_put(handle, dyn_size);
5522 }
5523}
5524
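/*
 * Fill in the identifier fields (TID, TIME, ID, STREAM_ID, CPU) selected
 * by the event's sample_type; header->size is grown by the precomputed
 * id_header_size.
 */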
5525static void __perf_event_header__init_id(struct perf_event_header *header,
5526 struct perf_sample_data *data,
5527 struct perf_event *event)
5528{
5529 u64 sample_type = event->attr.sample_type;
5530
5531 data->type = sample_type;
5532 header->size += event->id_header_size;
5533
5534 if (sample_type & PERF_SAMPLE_TID) {
5535
5536 data->tid_entry.pid = perf_event_pid(event, current);
5537 data->tid_entry.tid = perf_event_tid(event, current);
5538 }
5539
5540 if (sample_type & PERF_SAMPLE_TIME)
5541 data->time = perf_event_clock(event);
5542
5543 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5544 data->id = primary_event_id(event);
5545
5546 if (sample_type & PERF_SAMPLE_STREAM_ID)
5547 data->stream_id = event->id;
5548
5549 if (sample_type & PERF_SAMPLE_CPU) {
5550 data->cpu_entry.cpu = raw_smp_processor_id();
5551 data->cpu_entry.reserved = 0;
5552 }
5553}
5554
5555void perf_event_header__init_id(struct perf_event_header *header,
5556 struct perf_sample_data *data,
5557 struct perf_event *event)
5558{
5559 if (event->attr.sample_id_all)
5560 __perf_event_header__init_id(header, data, event);
5561}
5562
5563static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5564 struct perf_sample_data *data)
5565{
5566 u64 sample_type = data->type;
5567
5568 if (sample_type & PERF_SAMPLE_TID)
5569 perf_output_put(handle, data->tid_entry);
5570
5571 if (sample_type & PERF_SAMPLE_TIME)
5572 perf_output_put(handle, data->time);
5573
5574 if (sample_type & PERF_SAMPLE_ID)
5575 perf_output_put(handle, data->id);
5576
5577 if (sample_type & PERF_SAMPLE_STREAM_ID)
5578 perf_output_put(handle, data->stream_id);
5579
5580 if (sample_type & PERF_SAMPLE_CPU)
5581 perf_output_put(handle, data->cpu_entry);
5582
5583 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5584 perf_output_put(handle, data->id);
5585}
5586
5587void perf_event__output_id_sample(struct perf_event *event,
5588 struct perf_output_handle *handle,
5589 struct perf_sample_data *sample)
5590{
5591 if (event->attr.sample_id_all)
5592 __perf_event__output_id_sample(handle, sample);
5593}
5594
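/*
 * Emit the read_format body for a single (non-group) event:
 *
 *   { u64 value;
 *     { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *     { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *     { u64 id;           } && PERF_FORMAT_ID
 *   }
 */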
5595static void perf_output_read_one(struct perf_output_handle *handle,
5596 struct perf_event *event,
5597 u64 enabled, u64 running)
5598{
5599 u64 read_format = event->attr.read_format;
5600 u64 values[4];
5601 int n = 0;
5602
5603 values[n++] = perf_event_count(event);
5604 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5605 values[n++] = enabled +
5606 atomic64_read(&event->child_total_time_enabled);
5607 }
5608 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5609 values[n++] = running +
5610 atomic64_read(&event->child_total_time_running);
5611 }
5612 if (read_format & PERF_FORMAT_ID)
5613 values[n++] = primary_event_id(event);
5614
5615 __output_copy(handle, values, n * sizeof(u64));
5616}
5617
/*
 * Emit the PERF_FORMAT_GROUP layout: nr, the optional enabled/running
 * times, then a { value[, id] } pair for the leader and for each sibling.
 */
5621static void perf_output_read_group(struct perf_output_handle *handle,
5622 struct perf_event *event,
5623 u64 enabled, u64 running)
5624{
5625 struct perf_event *leader = event->group_leader, *sub;
5626 u64 read_format = event->attr.read_format;
5627 u64 values[5];
5628 int n = 0;
5629
5630 values[n++] = 1 + leader->nr_siblings;
5631
5632 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5633 values[n++] = enabled;
5634
5635 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5636 values[n++] = running;
5637
5638 if (leader != event)
5639 leader->pmu->read(leader);
5640
5641 values[n++] = perf_event_count(leader);
5642 if (read_format & PERF_FORMAT_ID)
5643 values[n++] = primary_event_id(leader);
5644
5645 __output_copy(handle, values, n * sizeof(u64));
5646
5647 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5648 n = 0;
5649
5650 if ((sub != event) &&
5651 (sub->state == PERF_EVENT_STATE_ACTIVE))
5652 sub->pmu->read(sub);
5653
5654 values[n++] = perf_event_count(sub);
5655 if (read_format & PERF_FORMAT_ID)
5656 values[n++] = primary_event_id(sub);
5657
5658 __output_copy(handle, values, n * sizeof(u64));
5659 }
5660}
5661
5662#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5663 PERF_FORMAT_TOTAL_TIME_RUNNING)
5664
5665static void perf_output_read(struct perf_output_handle *handle,
5666 struct perf_event *event)
5667{
5668 u64 enabled = 0, running = 0, now;
5669 u64 read_format = event->attr.read_format;
5670
5671
	/*
	 * Compute total_time_enabled and total_time_running from the
	 * snapshot values taken when the event was last scheduled in.
	 *
	 * We cannot simply call update_context_time() here because this
	 * path can run in NMI context, where taking the relevant locks
	 * is not an option.
	 */
5680 if (read_format & PERF_FORMAT_TOTAL_TIMES)
5681 calc_timer_values(event, &now, &enabled, &running);
5682
5683 if (event->attr.read_format & PERF_FORMAT_GROUP)
5684 perf_output_read_group(handle, event, enabled, running);
5685 else
5686 perf_output_read_one(handle, event, enabled, running);
5687}
5688
5689void perf_output_sample(struct perf_output_handle *handle,
5690 struct perf_event_header *header,
5691 struct perf_sample_data *data,
5692 struct perf_event *event)
5693{
5694 u64 sample_type = data->type;
5695
5696 perf_output_put(handle, *header);
5697
5698 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5699 perf_output_put(handle, data->id);
5700
5701 if (sample_type & PERF_SAMPLE_IP)
5702 perf_output_put(handle, data->ip);
5703
5704 if (sample_type & PERF_SAMPLE_TID)
5705 perf_output_put(handle, data->tid_entry);
5706
5707 if (sample_type & PERF_SAMPLE_TIME)
5708 perf_output_put(handle, data->time);
5709
5710 if (sample_type & PERF_SAMPLE_ADDR)
5711 perf_output_put(handle, data->addr);
5712
5713 if (sample_type & PERF_SAMPLE_ID)
5714 perf_output_put(handle, data->id);
5715
5716 if (sample_type & PERF_SAMPLE_STREAM_ID)
5717 perf_output_put(handle, data->stream_id);
5718
5719 if (sample_type & PERF_SAMPLE_CPU)
5720 perf_output_put(handle, data->cpu_entry);
5721
5722 if (sample_type & PERF_SAMPLE_PERIOD)
5723 perf_output_put(handle, data->period);
5724
5725 if (sample_type & PERF_SAMPLE_READ)
5726 perf_output_read(handle, event);
5727
5728 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		if (data->callchain) {
			/* The leading nr field plus nr entries, all u64. */
			int size = 1 + data->callchain->nr;

			size *= sizeof(u64);

			__output_copy(handle, data->callchain, size);
5738 } else {
5739 u64 nr = 0;
5740 perf_output_put(handle, nr);
5741 }
5742 }
5743
5744 if (sample_type & PERF_SAMPLE_RAW) {
5745 struct perf_raw_record *raw = data->raw;
5746
5747 if (raw) {
5748 struct perf_raw_frag *frag = &raw->frag;
5749
5750 perf_output_put(handle, raw->size);
5751 do {
5752 if (frag->copy) {
5753 __output_custom(handle, frag->copy,
5754 frag->data, frag->size);
5755 } else {
5756 __output_copy(handle, frag->data,
5757 frag->size);
5758 }
5759 if (perf_raw_frag_last(frag))
5760 break;
5761 frag = frag->next;
5762 } while (1);
5763 if (frag->pad)
5764 __output_skip(handle, NULL, frag->pad);
5765 } else {
5766 struct {
5767 u32 size;
5768 u32 data;
5769 } raw = {
5770 .size = sizeof(u32),
5771 .data = 0,
5772 };
5773 perf_output_put(handle, raw);
5774 }
5775 }
5776
5777 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5778 if (data->br_stack) {
5779 size_t size;
5780
5781 size = data->br_stack->nr
5782 * sizeof(struct perf_branch_entry);
5783
5784 perf_output_put(handle, data->br_stack->nr);
5785 perf_output_copy(handle, data->br_stack->entries, size);
5786 } else {
5787
5788
5789
5790 u64 nr = 0;
5791 perf_output_put(handle, nr);
5792 }
5793 }
5794
5795 if (sample_type & PERF_SAMPLE_REGS_USER) {
5796 u64 abi = data->regs_user.abi;
5797
5798
5799
5800
5801
5802 perf_output_put(handle, abi);
5803
5804 if (abi) {
5805 u64 mask = event->attr.sample_regs_user;
5806 perf_output_sample_regs(handle,
5807 data->regs_user.regs,
5808 mask);
5809 }
5810 }
5811
5812 if (sample_type & PERF_SAMPLE_STACK_USER) {
5813 perf_output_sample_ustack(handle,
5814 data->stack_user_size,
5815 data->regs_user.regs);
5816 }
5817
5818 if (sample_type & PERF_SAMPLE_WEIGHT)
5819 perf_output_put(handle, data->weight);
5820
5821 if (sample_type & PERF_SAMPLE_DATA_SRC)
5822 perf_output_put(handle, data->data_src.val);
5823
5824 if (sample_type & PERF_SAMPLE_TRANSACTION)
5825 perf_output_put(handle, data->txn);
5826
5827 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5828 u64 abi = data->regs_intr.abi;
5829
5830
5831
5832
5833 perf_output_put(handle, abi);
5834
5835 if (abi) {
5836 u64 mask = event->attr.sample_regs_intr;
5837
5838 perf_output_sample_regs(handle,
5839 data->regs_intr.regs,
5840 mask);
5841 }
5842 }
5843
5844 if (!event->attr.watermark) {
5845 int wakeup_events = event->attr.wakeup_events;
5846
5847 if (wakeup_events) {
5848 struct ring_buffer *rb = handle->rb;
5849 int events = local_inc_return(&rb->events);
5850
5851 if (events >= wakeup_events) {
5852 local_sub(wakeup_events, &rb->events);
5853 local_inc(&rb->wakeup);
5854 }
5855 }
5856 }
5857}
5858
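/*
 * Gather everything the sample needs (callchain, raw data, registers,
 * user stack) and compute the final header->size.  The layout decided
 * here must match what perf_output_sample() writes out.
 */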
5859void perf_prepare_sample(struct perf_event_header *header,
5860 struct perf_sample_data *data,
5861 struct perf_event *event,
5862 struct pt_regs *regs)
5863{
5864 u64 sample_type = event->attr.sample_type;
5865
5866 header->type = PERF_RECORD_SAMPLE;
5867 header->size = sizeof(*header) + event->header_size;
5868
5869 header->misc = 0;
5870 header->misc |= perf_misc_flags(regs);
5871
5872 __perf_event_header__init_id(header, data, event);
5873
5874 if (sample_type & PERF_SAMPLE_IP)
5875 data->ip = perf_instruction_pointer(regs);
5876
5877 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5878 int size = 1;
5879
5880 data->callchain = perf_callchain(event, regs);
5881
5882 if (data->callchain)
5883 size += data->callchain->nr;
5884
5885 header->size += size * sizeof(u64);
5886 }
5887
5888 if (sample_type & PERF_SAMPLE_RAW) {
5889 struct perf_raw_record *raw = data->raw;
5890 int size;
5891
5892 if (raw) {
5893 struct perf_raw_frag *frag = &raw->frag;
5894 u32 sum = 0;
5895
5896 do {
5897 sum += frag->size;
5898 if (perf_raw_frag_last(frag))
5899 break;
5900 frag = frag->next;
5901 } while (1);
5902
5903 size = round_up(sum + sizeof(u32), sizeof(u64));
5904 raw->size = size - sizeof(u32);
5905 frag->pad = raw->size - sum;
5906 } else {
5907 size = sizeof(u64);
5908 }
5909
5910 header->size += size;
5911 }
5912
5913 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5914 int size = sizeof(u64);
5915 if (data->br_stack) {
5916 size += data->br_stack->nr
5917 * sizeof(struct perf_branch_entry);
5918 }
5919 header->size += size;
5920 }
5921
5922 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5923 perf_sample_regs_user(&data->regs_user, regs,
5924 &data->regs_user_copy);
5925
5926 if (sample_type & PERF_SAMPLE_REGS_USER) {
5927
5928 int size = sizeof(u64);
5929
5930 if (data->regs_user.regs) {
5931 u64 mask = event->attr.sample_regs_user;
5932 size += hweight64(mask) * sizeof(u64);
5933 }
5934
5935 header->size += size;
5936 }
5937
5938 if (sample_type & PERF_SAMPLE_STACK_USER) {
5939
5940
5941
5942
5943
5944
5945 u16 stack_size = event->attr.sample_stack_user;
5946 u16 size = sizeof(u64);
5947
5948 stack_size = perf_sample_ustack_size(stack_size, header->size,
5949 data->regs_user.regs);
5950
5951
5952
5953
5954
5955
5956 if (stack_size)
5957 size += sizeof(u64) + stack_size;
5958
5959 data->stack_user_size = stack_size;
5960 header->size += size;
5961 }
5962
5963 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5964
5965 int size = sizeof(u64);
5966
5967 perf_sample_regs_intr(&data->regs_intr, regs);
5968
5969 if (data->regs_intr.regs) {
5970 u64 mask = event->attr.sample_regs_intr;
5971
5972 size += hweight64(mask) * sizeof(u64);
5973 }
5974
5975 header->size += size;
5976 }
5977}
5978
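/*
 * Common output path: prepare the sample, reserve space in the ring
 * buffer through the given output_begin() variant, write the record and
 * close the handle; the whole sequence runs under rcu_read_lock().
 */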
static __always_inline void
5980__perf_event_output(struct perf_event *event,
5981 struct perf_sample_data *data,
5982 struct pt_regs *regs,
5983 int (*output_begin)(struct perf_output_handle *,
5984 struct perf_event *,
5985 unsigned int))
5986{
5987 struct perf_output_handle handle;
5988 struct perf_event_header header;
5989
5990
5991 rcu_read_lock();
5992
5993 perf_prepare_sample(&header, data, event, regs);
5994
5995 if (output_begin(&handle, event, header.size))
5996 goto exit;
5997
5998 perf_output_sample(&handle, &header, data, event);
5999
6000 perf_output_end(&handle);
6001
6002exit:
6003 rcu_read_unlock();
6004}
6005
6006void
6007perf_event_output_forward(struct perf_event *event,
6008 struct perf_sample_data *data,
6009 struct pt_regs *regs)
6010{
6011 __perf_event_output(event, data, regs, perf_output_begin_forward);
6012}
6013
6014void
6015perf_event_output_backward(struct perf_event *event,
6016 struct perf_sample_data *data,
6017 struct pt_regs *regs)
6018{
6019 __perf_event_output(event, data, regs, perf_output_begin_backward);
6020}
6021
6022void
6023perf_event_output(struct perf_event *event,
6024 struct perf_sample_data *data,
6025 struct pt_regs *regs)
6026{
6027 __perf_event_output(event, data, regs, perf_output_begin);
6028}
6029
6030
/*
 * Read event notification (PERF_RECORD_READ).
 */
6034struct perf_read_event {
6035 struct perf_event_header header;
6036
6037 u32 pid;
6038 u32 tid;
6039};
6040
6041static void
6042perf_event_read_event(struct perf_event *event,
6043 struct task_struct *task)
6044{
6045 struct perf_output_handle handle;
6046 struct perf_sample_data sample;
6047 struct perf_read_event read_event = {
6048 .header = {
6049 .type = PERF_RECORD_READ,
6050 .misc = 0,
6051 .size = sizeof(read_event) + event->read_size,
6052 },
6053 .pid = perf_event_pid(event, task),
6054 .tid = perf_event_tid(event, task),
6055 };
6056 int ret;
6057
6058 perf_event_header__init_id(&read_event.header, &sample, event);
6059 ret = perf_output_begin(&handle, event, read_event.header.size);
6060 if (ret)
6061 return;
6062
6063 perf_output_put(&handle, read_event);
6064 perf_output_read(&handle, event);
6065 perf_event__output_id_sample(event, &handle, &sample);
6066
6067 perf_output_end(&handle);
6068}
6069
6070typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6071
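/*
 * Feed each event of a context to @output.  Unless @all is set, events
 * that are switched off (state below INACTIVE) or that fail
 * event_filter_match() are skipped.
 */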
6072static void
6073perf_iterate_ctx(struct perf_event_context *ctx,
6074 perf_iterate_f output,
6075 void *data, bool all)
6076{
6077 struct perf_event *event;
6078
6079 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6080 if (!all) {
6081 if (event->state < PERF_EVENT_STATE_INACTIVE)
6082 continue;
6083 if (!event_filter_match(event))
6084 continue;
6085 }
6086
6087 output(event, data);
6088 }
6089}
6090
6091static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6092{
6093 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6094 struct perf_event *event;
6095
6096 list_for_each_entry_rcu(event, &pel->list, sb_list) {
		/*
		 * Skip events that are not fully formed yet; ensure that
		 * if we observe event->ctx, both event and ctx will be
		 * complete enough.
		 */
6102 if (!smp_load_acquire(&event->ctx))
6103 continue;
6104
6105 if (event->state < PERF_EVENT_STATE_INACTIVE)
6106 continue;
6107 if (!event_filter_match(event))
6108 continue;
6109 output(event, data);
6110 }
6111}
6112
/*
 * Iterate all events that need to receive side-band records.
 *
 * New callers should make sure account_pmu_sb_event() covers their event,
 * otherwise records might not get delivered to it.
 */
6119static void
6120perf_iterate_sb(perf_iterate_f output, void *data,
6121 struct perf_event_context *task_ctx)
6122{
6123 struct perf_event_context *ctx;
6124 int ctxn;
6125
6126 rcu_read_lock();
6127 preempt_disable();
6128
	/*
	 * If a task context is given, only notify that context itself;
	 * task_ctx is only set for EXIT events, just before the task
	 * context is released.
	 */
6134 if (task_ctx) {
6135 perf_iterate_ctx(task_ctx, output, data, false);
6136 goto done;
6137 }
6138
6139 perf_iterate_sb_cpu(output, data);
6140
6141 for_each_task_context_nr(ctxn) {
6142 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6143 if (ctx)
6144 perf_iterate_ctx(ctx, output, data, false);
6145 }
6146done:
6147 preempt_enable();
6148 rcu_read_unlock();
6149}
6150
/*
 * Clear all file-based address filters at exec; they will have to be
 * re-instated when (and if) those objects are mmapped again.
 */
6155static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6156{
6157 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6158 struct perf_addr_filter *filter;
6159 unsigned int restart = 0, count = 0;
6160 unsigned long flags;
6161
6162 if (!has_addr_filter(event))
6163 return;
6164
6165 raw_spin_lock_irqsave(&ifh->lock, flags);
6166 list_for_each_entry(filter, &ifh->list, entry) {
6167 if (filter->inode) {
6168 event->addr_filters_offs[count] = 0;
6169 restart++;
6170 }
6171
6172 count++;
6173 }
6174
6175 if (restart)
6176 event->addr_filters_gen++;
6177 raw_spin_unlock_irqrestore(&ifh->lock, flags);
6178
6179 if (restart)
6180 perf_event_stop(event, 1);
6181}
6182
6183void perf_event_exec(void)
6184{
6185 struct perf_event_context *ctx;
6186 int ctxn;
6187
6188 rcu_read_lock();
6189 for_each_task_context_nr(ctxn) {
6190 ctx = current->perf_event_ctxp[ctxn];
6191 if (!ctx)
6192 continue;
6193
6194 perf_event_enable_on_exec(ctxn);
6195
6196 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6197 true);
6198 }
6199 rcu_read_unlock();
6200}
6201
6202struct remote_output {
6203 struct ring_buffer *rb;
6204 int err;
6205};
6206
6207static void __perf_event_output_stop(struct perf_event *event, void *data)
6208{
6209 struct perf_event *parent = event->parent;
6210 struct remote_output *ro = data;
6211 struct ring_buffer *rb = ro->rb;
6212 struct stop_event_data sd = {
6213 .event = event,
6214 };
6215
6216 if (!has_aux(event))
6217 return;
6218
6219 if (!parent)
6220 parent = event;
6221
	/*
	 * With inheritance the parent is the one that links to the ring
	 * buffer, while the child is the one actually writing to it, so
	 * compare the parent's rb against the buffer we are stopping
	 * output to.
	 */
6232 if (rcu_dereference(parent->rb) == rb)
6233 ro->err = __perf_event_stop(&sd);
6234}
6235
6236static int __perf_pmu_output_stop(void *info)
6237{
6238 struct perf_event *event = info;
6239 struct pmu *pmu = event->pmu;
6240 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6241 struct remote_output ro = {
6242 .rb = event->rb,
6243 };
6244
6245 rcu_read_lock();
6246 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6247 if (cpuctx->task_ctx)
6248 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6249 &ro, false);
6250 rcu_read_unlock();
6251
6252 return ro.err;
6253}
6254
6255static void perf_pmu_output_stop(struct perf_event *event)
6256{
6257 struct perf_event *iter;
6258 int err, cpu;
6259
6260restart:
6261 rcu_read_lock();
6262 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
		/*
		 * Per-CPU events are stopped on the CPU they are bound to;
		 * task events (cpu == -1) are stopped on whichever CPU they
		 * are currently running on, if any.
		 */
6269 cpu = iter->cpu;
6270 if (cpu == -1)
6271 cpu = READ_ONCE(iter->oncpu);
6272
6273 if (cpu == -1)
6274 continue;
6275
6276 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6277 if (err == -EAGAIN) {
6278 rcu_read_unlock();
6279 goto restart;
6280 }
6281 }
6282 rcu_read_unlock();
6283}
6284
/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
 */
6291struct perf_task_event {
6292 struct task_struct *task;
6293 struct perf_event_context *task_ctx;
6294
6295 struct {
6296 struct perf_event_header header;
6297
6298 u32 pid;
6299 u32 ppid;
6300 u32 tid;
6301 u32 ptid;
6302 u64 time;
6303 } event_id;
6304};
6305
6306static int perf_event_task_match(struct perf_event *event)
6307{
6308 return event->attr.comm || event->attr.mmap ||
6309 event->attr.mmap2 || event->attr.mmap_data ||
6310 event->attr.task;
6311}
6312
6313static void perf_event_task_output(struct perf_event *event,
6314 void *data)
6315{
6316 struct perf_task_event *task_event = data;
6317 struct perf_output_handle handle;
6318 struct perf_sample_data sample;
6319 struct task_struct *task = task_event->task;
6320 int ret, size = task_event->event_id.header.size;
6321
6322 if (!perf_event_task_match(event))
6323 return;
6324
6325 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6326
6327 ret = perf_output_begin(&handle, event,
6328 task_event->event_id.header.size);
6329 if (ret)
6330 goto out;
6331
6332 task_event->event_id.pid = perf_event_pid(event, task);
6333 task_event->event_id.ppid = perf_event_pid(event, current);
6334
6335 task_event->event_id.tid = perf_event_tid(event, task);
6336 task_event->event_id.ptid = perf_event_tid(event, current);
6337
6338 task_event->event_id.time = perf_event_clock(event);
6339
6340 perf_output_put(&handle, task_event->event_id);
6341
6342 perf_event__output_id_sample(event, &handle, &sample);
6343
6344 perf_output_end(&handle);
6345out:
6346 task_event->event_id.header.size = size;
6347}
6348
6349static void perf_event_task(struct task_struct *task,
6350 struct perf_event_context *task_ctx,
6351 int new)
6352{
6353 struct perf_task_event task_event;
6354
6355 if (!atomic_read(&nr_comm_events) &&
6356 !atomic_read(&nr_mmap_events) &&
6357 !atomic_read(&nr_task_events))
6358 return;
6359
6360 task_event = (struct perf_task_event){
6361 .task = task,
6362 .task_ctx = task_ctx,
6363 .event_id = {
6364 .header = {
6365 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6366 .misc = 0,
6367 .size = sizeof(task_event.event_id),
6368 },
6369
6370
6371
6372
6373
6374 },
6375 };
6376
6377 perf_iterate_sb(perf_event_task_output,
6378 &task_event,
6379 task_ctx);
6380}
6381
6382void perf_event_fork(struct task_struct *task)
6383{
6384 perf_event_task(task, NULL, 1);
6385}
6386
/*
 * comm tracking
 */
6391struct perf_comm_event {
6392 struct task_struct *task;
6393 char *comm;
6394 int comm_size;
6395
6396 struct {
6397 struct perf_event_header header;
6398
6399 u32 pid;
6400 u32 tid;
6401 } event_id;
6402};
6403
6404static int perf_event_comm_match(struct perf_event *event)
6405{
6406 return event->attr.comm;
6407}
6408
6409static void perf_event_comm_output(struct perf_event *event,
6410 void *data)
6411{
6412 struct perf_comm_event *comm_event = data;
6413 struct perf_output_handle handle;
6414 struct perf_sample_data sample;
6415 int size = comm_event->event_id.header.size;
6416 int ret;
6417
6418 if (!perf_event_comm_match(event))
6419 return;
6420
6421 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
6422 ret = perf_output_begin(&handle, event,
6423 comm_event->event_id.header.size);
6424
6425 if (ret)
6426 goto out;
6427
6428 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
6429 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
6430
6431 perf_output_put(&handle, comm_event->event_id);
6432 __output_copy(&handle, comm_event->comm,
6433 comm_event->comm_size);
6434
6435 perf_event__output_id_sample(event, &handle, &sample);
6436
6437 perf_output_end(&handle);
6438out:
6439 comm_event->event_id.header.size = size;
6440}
6441
6442static void perf_event_comm_event(struct perf_comm_event *comm_event)
6443{
6444 char comm[TASK_COMM_LEN];
6445 unsigned int size;
6446
6447 memset(comm, 0, sizeof(comm));
6448 strlcpy(comm, comm_event->task->comm, sizeof(comm));
6449 size = ALIGN(strlen(comm)+1, sizeof(u64));
6450
6451 comm_event->comm = comm;
6452 comm_event->comm_size = size;
6453
6454 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
6455
6456 perf_iterate_sb(perf_event_comm_output,
6457 comm_event,
6458 NULL);
6459}
6460
6461void perf_event_comm(struct task_struct *task, bool exec)
6462{
6463 struct perf_comm_event comm_event;
6464
6465 if (!atomic_read(&nr_comm_events))
6466 return;
6467
6468 comm_event = (struct perf_comm_event){
6469 .task = task,
6470
6471
6472 .event_id = {
6473 .header = {
6474 .type = PERF_RECORD_COMM,
6475 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
6476
6477 },
6478
6479
6480 },
6481 };
6482
6483 perf_event_comm_event(&comm_event);
6484}
6485
/*
 * mmap tracking
 */
6490struct perf_mmap_event {
6491 struct vm_area_struct *vma;
6492
6493 const char *file_name;
6494 int file_size;
6495 int maj, min;
6496 u64 ino;
6497 u64 ino_generation;
6498 u32 prot, flags;
6499
6500 struct {
6501 struct perf_event_header header;
6502
6503 u32 pid;
6504 u32 tid;
6505 u64 start;
6506 u64 len;
6507 u64 pgoff;
6508 } event_id;
6509};
6510
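/*
 * Data (non-executable) mappings are only of interest to events that
 * asked for mmap_data; executable mappings match events that asked for
 * mmap or mmap2 records.
 */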
6511static int perf_event_mmap_match(struct perf_event *event,
6512 void *data)
6513{
6514 struct perf_mmap_event *mmap_event = data;
6515 struct vm_area_struct *vma = mmap_event->vma;
6516 int executable = vma->vm_flags & VM_EXEC;
6517
6518 return (!executable && event->attr.mmap_data) ||
6519 (executable && (event->attr.mmap || event->attr.mmap2));
6520}
6521
6522static void perf_event_mmap_output(struct perf_event *event,
6523 void *data)
6524{
6525 struct perf_mmap_event *mmap_event = data;
6526 struct perf_output_handle handle;
6527 struct perf_sample_data sample;
6528 int size = mmap_event->event_id.header.size;
6529 int ret;
6530
6531 if (!perf_event_mmap_match(event, data))
6532 return;
6533
6534 if (event->attr.mmap2) {
6535 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
6536 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
6537 mmap_event->event_id.header.size += sizeof(mmap_event->min);
6538 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
6539 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
6540 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
6541 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
6542 }
6543
6544 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
6545 ret = perf_output_begin(&handle, event,
6546 mmap_event->event_id.header.size);
6547 if (ret)
6548 goto out;
6549
6550 mmap_event->event_id.pid = perf_event_pid(event, current);
6551 mmap_event->event_id.tid = perf_event_tid(event, current);
6552
6553 perf_output_put(&handle, mmap_event->event_id);
6554
6555 if (event->attr.mmap2) {
6556 perf_output_put(&handle, mmap_event->maj);
6557 perf_output_put(&handle, mmap_event->min);
6558 perf_output_put(&handle, mmap_event->ino);
6559 perf_output_put(&handle, mmap_event->ino_generation);
6560 perf_output_put(&handle, mmap_event->prot);
6561 perf_output_put(&handle, mmap_event->flags);
6562 }
6563
6564 __output_copy(&handle, mmap_event->file_name,
6565 mmap_event->file_size);
6566
6567 perf_event__output_id_sample(event, &handle, &sample);
6568
6569 perf_output_end(&handle);
6570out:
6571 mmap_event->event_id.header.size = size;
6572}
6573
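/*
 * Build the MMAP/MMAP2 payload for a vma: resolve the file path (or a
 * synthetic name such as "[heap]", "[stack]" or "//anon"), record
 * device/inode numbers and prot/flags, then emit it through the
 * side-band iterator.
 */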
6574static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
6575{
6576 struct vm_area_struct *vma = mmap_event->vma;
6577 struct file *file = vma->vm_file;
6578 int maj = 0, min = 0;
6579 u64 ino = 0, gen = 0;
6580 u32 prot = 0, flags = 0;
6581 unsigned int size;
6582 char tmp[16];
6583 char *buf = NULL;
6584 char *name;
6585
6586 if (file) {
6587 struct inode *inode;
6588 dev_t dev;
6589
6590 buf = kmalloc(PATH_MAX, GFP_KERNEL);
6591 if (!buf) {
6592 name = "//enomem";
6593 goto cpy_name;
6594 }
6595
6596
6597
6598
6599
6600 name = file_path(file, buf, PATH_MAX - sizeof(u64));
6601 if (IS_ERR(name)) {
6602 name = "//toolong";
6603 goto cpy_name;
6604 }
6605 inode = file_inode(vma->vm_file);
6606 dev = inode->i_sb->s_dev;
6607 ino = inode->i_ino;
6608 gen = inode->i_generation;
6609 maj = MAJOR(dev);
6610 min = MINOR(dev);
6611
6612 if (vma->vm_flags & VM_READ)
6613 prot |= PROT_READ;
6614 if (vma->vm_flags & VM_WRITE)
6615 prot |= PROT_WRITE;
6616 if (vma->vm_flags & VM_EXEC)
6617 prot |= PROT_EXEC;
6618
6619 if (vma->vm_flags & VM_MAYSHARE)
6620 flags = MAP_SHARED;
6621 else
6622 flags = MAP_PRIVATE;
6623
6624 if (vma->vm_flags & VM_DENYWRITE)
6625 flags |= MAP_DENYWRITE;
6626 if (vma->vm_flags & VM_MAYEXEC)
6627 flags |= MAP_EXECUTABLE;
6628 if (vma->vm_flags & VM_LOCKED)
6629 flags |= MAP_LOCKED;
6630 if (vma->vm_flags & VM_HUGETLB)
6631 flags |= MAP_HUGETLB;
6632
6633 goto got_name;
6634 } else {
6635 if (vma->vm_ops && vma->vm_ops->name) {
6636 name = (char *) vma->vm_ops->name(vma);
6637 if (name)
6638 goto cpy_name;
6639 }
6640
6641 name = (char *)arch_vma_name(vma);
6642 if (name)
6643 goto cpy_name;
6644
6645 if (vma->vm_start <= vma->vm_mm->start_brk &&
6646 vma->vm_end >= vma->vm_mm->brk) {
6647 name = "[heap]";
6648 goto cpy_name;
6649 }
6650 if (vma->vm_start <= vma->vm_mm->start_stack &&
6651 vma->vm_end >= vma->vm_mm->start_stack) {
6652 name = "[stack]";
6653 goto cpy_name;
6654 }
6655
6656 name = "//anon";
6657 goto cpy_name;
6658 }
6659
6660cpy_name:
6661 strlcpy(tmp, name, sizeof(tmp));
6662 name = tmp;
6663got_name:
6664
6665
6666
6667
6668
6669 size = strlen(name)+1;
6670 while (!IS_ALIGNED(size, sizeof(u64)))
6671 name[size++] = '\0';
6672
6673 mmap_event->file_name = name;
6674 mmap_event->file_size = size;
6675 mmap_event->maj = maj;
6676 mmap_event->min = min;
6677 mmap_event->ino = ino;
6678 mmap_event->ino_generation = gen;
6679 mmap_event->prot = prot;
6680 mmap_event->flags = flags;
6681
6682 if (!(vma->vm_flags & VM_EXEC))
6683 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
6684
6685 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
6686
6687 perf_iterate_sb(perf_event_mmap_output,
6688 mmap_event,
6689 NULL);
6690
6691 kfree(buf);
6692}
6693
/*
 * Check whether a file and offset range overlap the given address filter.
 */
6697static bool perf_addr_filter_match(struct perf_addr_filter *filter,
6698 struct file *file, unsigned long offset,
6699 unsigned long size)
6700{
6701 if (filter->inode != file->f_inode)
6702 return false;
6703
6704 if (filter->offset > offset + size)
6705 return false;
6706
6707 if (filter->offset + filter->size < offset)
6708 return false;
6709
6710 return true;
6711}
6712
6713static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
6714{
6715 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6716 struct vm_area_struct *vma = data;
6717 unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
6718 struct file *file = vma->vm_file;
6719 struct perf_addr_filter *filter;
6720 unsigned int restart = 0, count = 0;
6721
6722 if (!has_addr_filter(event))
6723 return;
6724
6725 if (!file)
6726 return;
6727
6728 raw_spin_lock_irqsave(&ifh->lock, flags);
6729 list_for_each_entry(filter, &ifh->list, entry) {
6730 if (perf_addr_filter_match(filter, file, off,
6731 vma->vm_end - vma->vm_start)) {
6732 event->addr_filters_offs[count] = vma->vm_start;
6733 restart++;
6734 }
6735
6736 count++;
6737 }
6738
6739 if (restart)
6740 event->addr_filters_gen++;
6741 raw_spin_unlock_irqrestore(&ifh->lock, flags);
6742
6743 if (restart)
6744 perf_event_stop(event, 1);
6745}
6746
/*
 * Adjust the current task's events' address filters to a newly mapped vma.
 */
6750static void perf_addr_filters_adjust(struct vm_area_struct *vma)
6751{
6752 struct perf_event_context *ctx;
6753 int ctxn;
6754
	/*
	 * Data tracing isn't supported yet, so only executable mappings are
	 * of interest here:
	 */
6759 if (!(vma->vm_flags & VM_EXEC))
6760 return;
6761
6762 rcu_read_lock();
6763 for_each_task_context_nr(ctxn) {
6764 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6765 if (!ctx)
6766 continue;
6767
6768 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
6769 }
6770 rcu_read_unlock();
6771}
6772
6773void perf_event_mmap(struct vm_area_struct *vma)
6774{
6775 struct perf_mmap_event mmap_event;
6776
6777 if (!atomic_read(&nr_mmap_events))
6778 return;
6779
6780 mmap_event = (struct perf_mmap_event){
6781 .vma = vma,
6782
6783
6784 .event_id = {
6785 .header = {
6786 .type = PERF_RECORD_MMAP,
6787 .misc = PERF_RECORD_MISC_USER,
6788
6789 },
6790
6791
6792 .start = vma->vm_start,
6793 .len = vma->vm_end - vma->vm_start,
6794 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
6795 },
6796
6797
6798
6799
6800
6801
6802 };
6803
6804 perf_addr_filters_adjust(vma);
6805 perf_event_mmap_event(&mmap_event);
6806}
6807
6808void perf_event_aux_event(struct perf_event *event, unsigned long head,
6809 unsigned long size, u64 flags)
6810{
6811 struct perf_output_handle handle;
6812 struct perf_sample_data sample;
6813 struct perf_aux_event {
6814 struct perf_event_header header;
6815 u64 offset;
6816 u64 size;
6817 u64 flags;
6818 } rec = {
6819 .header = {
6820 .type = PERF_RECORD_AUX,
6821 .misc = 0,
6822 .size = sizeof(rec),
6823 },
6824 .offset = head,
6825 .size = size,
6826 .flags = flags,
6827 };
6828 int ret;
6829
6830 perf_event_header__init_id(&rec.header, &sample, event);
6831 ret = perf_output_begin(&handle, event, rec.header.size);
6832
6833 if (ret)
6834 return;
6835
6836 perf_output_put(&handle, rec);
6837 perf_event__output_id_sample(event, &handle, &sample);
6838
6839 perf_output_end(&handle);
6840}
6841
/*
 * Lost/dropped samples logging (PERF_RECORD_LOST_SAMPLES).
 */
6845void perf_log_lost_samples(struct perf_event *event, u64 lost)
6846{
6847 struct perf_output_handle handle;
6848 struct perf_sample_data sample;
6849 int ret;
6850
6851 struct {
6852 struct perf_event_header header;
6853 u64 lost;
6854 } lost_samples_event = {
6855 .header = {
6856 .type = PERF_RECORD_LOST_SAMPLES,
6857 .misc = 0,
6858 .size = sizeof(lost_samples_event),
6859 },
6860 .lost = lost,
6861 };
6862
6863 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
6864
6865 ret = perf_output_begin(&handle, event,
6866 lost_samples_event.header.size);
6867 if (ret)
6868 return;
6869
6870 perf_output_put(&handle, lost_samples_event);
6871 perf_event__output_id_sample(event, &handle, &sample);
6872 perf_output_end(&handle);
6873}
6874
/*
 * Context switch tracking (PERF_RECORD_SWITCH / PERF_RECORD_SWITCH_CPU_WIDE).
 */
6879struct perf_switch_event {
6880 struct task_struct *task;
6881 struct task_struct *next_prev;
6882
6883 struct {
6884 struct perf_event_header header;
6885 u32 next_prev_pid;
6886 u32 next_prev_tid;
6887 } event_id;
6888};
6889
6890static int perf_event_switch_match(struct perf_event *event)
6891{
6892 return event->attr.context_switch;
6893}
6894
6895static void perf_event_switch_output(struct perf_event *event, void *data)
6896{
6897 struct perf_switch_event *se = data;
6898 struct perf_output_handle handle;
6899 struct perf_sample_data sample;
6900 int ret;
6901
6902 if (!perf_event_switch_match(event))
6903 return;
6904
6905
6906 if (event->ctx->task) {
6907 se->event_id.header.type = PERF_RECORD_SWITCH;
6908 se->event_id.header.size = sizeof(se->event_id.header);
6909 } else {
6910 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
6911 se->event_id.header.size = sizeof(se->event_id);
6912 se->event_id.next_prev_pid =
6913 perf_event_pid(event, se->next_prev);
6914 se->event_id.next_prev_tid =
6915 perf_event_tid(event, se->next_prev);
6916 }
6917
6918 perf_event_header__init_id(&se->event_id.header, &sample, event);
6919
6920 ret = perf_output_begin(&handle, event, se->event_id.header.size);
6921 if (ret)
6922 return;
6923
6924 if (event->ctx->task)
6925 perf_output_put(&handle, se->event_id.header);
6926 else
6927 perf_output_put(&handle, se->event_id);
6928
6929 perf_event__output_id_sample(event, &handle, &sample);
6930
6931 perf_output_end(&handle);
6932}
6933
6934static void perf_event_switch(struct task_struct *task,
6935 struct task_struct *next_prev, bool sched_in)
6936{
6937 struct perf_switch_event switch_event;
6938
6939
6940
6941 switch_event = (struct perf_switch_event){
6942 .task = task,
6943 .next_prev = next_prev,
6944 .event_id = {
6945 .header = {
6946
6947 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
6948
6949 },
6950
6951
6952 },
6953 };
6954
6955 perf_iterate_sb(perf_event_switch_output,
6956 &switch_event,
6957 NULL);
6958}
6959
6960
/*
 * IRQ throttle logging (PERF_RECORD_THROTTLE / PERF_RECORD_UNTHROTTLE).
 */
6964static void perf_log_throttle(struct perf_event *event, int enable)
6965{
6966 struct perf_output_handle handle;
6967 struct perf_sample_data sample;
6968 int ret;
6969
6970 struct {
6971 struct perf_event_header header;
6972 u64 time;
6973 u64 id;
6974 u64 stream_id;
6975 } throttle_event = {
6976 .header = {
6977 .type = PERF_RECORD_THROTTLE,
6978 .misc = 0,
6979 .size = sizeof(throttle_event),
6980 },
6981 .time = perf_event_clock(event),
6982 .id = primary_event_id(event),
6983 .stream_id = event->id,
6984 };
6985
6986 if (enable)
6987 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
6988
6989 perf_event_header__init_id(&throttle_event.header, &sample, event);
6990
6991 ret = perf_output_begin(&handle, event,
6992 throttle_event.header.size);
6993 if (ret)
6994 return;
6995
6996 perf_output_put(&handle, throttle_event);
6997 perf_event__output_id_sample(event, &handle, &sample);
6998 perf_output_end(&handle);
6999}
7000
7001static void perf_log_itrace_start(struct perf_event *event)
7002{
7003 struct perf_output_handle handle;
7004 struct perf_sample_data sample;
7005 struct perf_aux_event {
7006 struct perf_event_header header;
7007 u32 pid;
7008 u32 tid;
7009 } rec;
7010 int ret;
7011
7012 if (event->parent)
7013 event = event->parent;
7014
7015 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7016 event->hw.itrace_started)
7017 return;
7018
7019 rec.header.type = PERF_RECORD_ITRACE_START;
7020 rec.header.misc = 0;
7021 rec.header.size = sizeof(rec);
7022 rec.pid = perf_event_pid(event, current);
7023 rec.tid = perf_event_tid(event, current);
7024
7025 perf_event_header__init_id(&rec.header, &sample, event);
7026 ret = perf_output_begin(&handle, event, rec.header.size);
7027
7028 if (ret)
7029 return;
7030
7031 perf_output_put(&handle, rec);
7032 perf_event__output_id_sample(event, &handle, &sample);
7033
7034 perf_output_end(&handle);
7035}
7036
/*
 * Generic event overflow handling, sampling.
 */
7041static int __perf_event_overflow(struct perf_event *event,
7042 int throttle, struct perf_sample_data *data,
7043 struct pt_regs *regs)
7044{
7045 int events = atomic_read(&event->event_limit);
7046 struct hw_perf_event *hwc = &event->hw;
7047 u64 seq;
7048 int ret = 0;
7049
7050
	/*
	 * Non-sampling counters might still use the PMI to fold short
	 * hardware counters; ignore those here.
	 */
7054 if (unlikely(!is_sampling_event(event)))
7055 return 0;
7056
7057 seq = __this_cpu_read(perf_throttled_seq);
7058 if (seq != hwc->interrupts_seq) {
7059 hwc->interrupts_seq = seq;
7060 hwc->interrupts = 1;
7061 } else {
7062 hwc->interrupts++;
7063 if (unlikely(throttle
7064 && hwc->interrupts >= max_samples_per_tick)) {
7065 __this_cpu_inc(perf_throttled_count);
7066 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
7067 hwc->interrupts = MAX_INTERRUPTS;
7068 perf_log_throttle(event, 0);
7069 ret = 1;
7070 }
7071 }
7072
7073 if (event->attr.freq) {
7074 u64 now = perf_clock();
7075 s64 delta = now - hwc->freq_time_stamp;
7076
7077 hwc->freq_time_stamp = now;
7078
7079 if (delta > 0 && delta < 2*TICK_NSEC)
7080 perf_adjust_period(event, delta, hwc->last_period, true);
7081 }
7082
	/*
	 * Signal readers: POLL_IN by default; if the event_limit is
	 * exhausted, switch to POLL_HUP and disable the event.
	 */
7087
7088 event->pending_kill = POLL_IN;
7089 if (events && atomic_dec_and_test(&event->event_limit)) {
7090 ret = 1;
7091 event->pending_kill = POLL_HUP;
7092
7093 perf_event_disable_inatomic(event);
7094 }
7095
7096 READ_ONCE(event->overflow_handler)(event, data, regs);
7097
7098 if (*perf_event_fasync(event) && event->pending_kill) {
7099 event->pending_wakeup = 1;
7100 irq_work_queue(&event->pending);
7101 }
7102
7103 return ret;
7104}
7105
7106int perf_event_overflow(struct perf_event *event,
7107 struct perf_sample_data *data,
7108 struct pt_regs *regs)
7109{
7110 return __perf_event_overflow(event, 1, data, regs);
7111}
7112
7113
/*
 * Generic software event infrastructure.
 */
7117struct swevent_htable {
7118 struct swevent_hlist *swevent_hlist;
7119 struct mutex hlist_mutex;
7120 int hlist_refcount;
7121
7122
7123 int recursion[PERF_NR_CONTEXTS];
7124};
7125
7126static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
7127
/*
 * Software events directly increment event->count and keep a second value
 * in event->hw.period_left to count intervals.  period_left is kept in the
 * range [-sample_period, 0] so that its sign can be used as the overflow
 * trigger.
 */
7135u64 perf_swevent_set_period(struct perf_event *event)
7136{
7137 struct hw_perf_event *hwc = &event->hw;
7138 u64 period = hwc->last_period;
7139 u64 nr, offset;
7140 s64 old, val;
7141
7142 hwc->last_period = hwc->sample_period;
7143
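	/*
	 * Example: with period == 100 and a pending period_left of 250,
	 * nr = (100 + 250) / 100 = 3 overflows are reported and
	 * period_left becomes 250 - 3 * 100 = -50.
	 */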
7144again:
7145 old = val = local64_read(&hwc->period_left);
7146 if (val < 0)
7147 return 0;
7148
7149 nr = div64_u64(period + val, period);
7150 offset = nr * period;
7151 val -= offset;
7152 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
7153 goto again;
7154
7155 return nr;
7156}
7157
7158static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
7159 struct perf_sample_data *data,
7160 struct pt_regs *regs)
7161{
7162 struct hw_perf_event *hwc = &event->hw;
7163 int throttle = 0;
7164
7165 if (!overflow)
7166 overflow = perf_swevent_set_period(event);
7167
7168 if (hwc->interrupts == MAX_INTERRUPTS)
7169 return;
7170
7171 for (; overflow; overflow--) {
7172 if (__perf_event_overflow(event, throttle,
7173 data, regs)) {
7174
7175
7176
7177
7178 break;
7179 }
7180 throttle = 1;
7181 }
7182}
7183
7184static void perf_swevent_event(struct perf_event *event, u64 nr,
7185 struct perf_sample_data *data,
7186 struct pt_regs *regs)
7187{
7188 struct hw_perf_event *hwc = &event->hw;
7189
7190 local64_add(nr, &event->count);
7191
7192 if (!regs)
7193 return;
7194
7195 if (!is_sampling_event(event))
7196 return;
7197
7198 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
7199 data->period = nr;
7200 return perf_swevent_overflow(event, 1, data, regs);
7201 } else
7202 data->period = event->hw.last_period;
7203
7204 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
7205 return perf_swevent_overflow(event, 1, data, regs);
7206
7207 if (local64_add_negative(nr, &hwc->period_left))
7208 return;
7209
7210 perf_swevent_overflow(event, 0, data, regs);
7211}
7212
7213static int perf_exclude_event(struct perf_event *event,
7214 struct pt_regs *regs)
7215{
7216 if (event->hw.state & PERF_HES_STOPPED)
7217 return 1;
7218
7219 if (regs) {
7220 if (event->attr.exclude_user && user_mode(regs))
7221 return 1;
7222
7223 if (event->attr.exclude_kernel && !user_mode(regs))
7224 return 1;
7225 }
7226
7227 return 0;
7228}
7229
7230static int perf_swevent_match(struct perf_event *event,
7231 enum perf_type_id type,
7232 u32 event_id,
7233 struct perf_sample_data *data,
7234 struct pt_regs *regs)
7235{
7236 if (event->attr.type != type)
7237 return 0;
7238
7239 if (event->attr.config != event_id)
7240 return 0;
7241
7242 if (perf_exclude_event(event, regs))
7243 return 0;
7244
7245 return 1;
7246}
7247
7248static inline u64 swevent_hash(u64 type, u32 event_id)
7249{
7250 u64 val = event_id | (type << 32);
7251
7252 return hash_64(val, SWEVENT_HLIST_BITS);
7253}
7254
7255static inline struct hlist_head *
7256__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
7257{
7258 u64 hash = swevent_hash(type, event_id);
7259
7260 return &hlist->heads[hash];
7261}
7262
7263
7264static inline struct hlist_head *
7265find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
7266{
7267 struct swevent_hlist *hlist;
7268
7269 hlist = rcu_dereference(swhash->swevent_hlist);
7270 if (!hlist)
7271 return NULL;
7272
7273 return __find_swevent_head(hlist, type, event_id);
7274}
7275
7276
7277static inline struct hlist_head *
7278find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
7279{
7280 struct swevent_hlist *hlist;
7281 u32 event_id = event->attr.config;
7282 u64 type = event->attr.type;
7283
	/*
	 * Event scheduling is always serialized against hlist allocation
	 * and release, and we hold ctx->lock here, so the lockdep-checked
	 * dereference below is safe.
	 */
7289 hlist = rcu_dereference_protected(swhash->swevent_hlist,
7290 lockdep_is_held(&event->ctx->lock));
7291 if (!hlist)
7292 return NULL;
7293
7294 return __find_swevent_head(hlist, type, event_id);
7295}
7296
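/*
 * Hash (type, event_id) into this CPU's swevent hlist and deliver the
 * hit to every event on that chain that matches type, config and the
 * exclude_user/exclude_kernel constraints.
 */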
7297static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
7298 u64 nr,
7299 struct perf_sample_data *data,
7300 struct pt_regs *regs)
7301{
7302 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7303 struct perf_event *event;
7304 struct hlist_head *head;
7305
7306 rcu_read_lock();
7307 head = find_swevent_head_rcu(swhash, type, event_id);
7308 if (!head)
7309 goto end;
7310
7311 hlist_for_each_entry_rcu(event, head, hlist_entry) {
7312 if (perf_swevent_match(event, type, event_id, data, regs))
7313 perf_swevent_event(event, nr, data, regs);
7314 }
7315end:
7316 rcu_read_unlock();
7317}
7318
7319DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
7320
7321int perf_swevent_get_recursion_context(void)
7322{
7323 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7324
7325 return get_recursion_context(swhash->recursion);
7326}
7327EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
7328
7329void perf_swevent_put_recursion_context(int rctx)
7330{
7331 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7332
7333 put_recursion_context(swhash->recursion, rctx);
7334}
7335
7336void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7337{
7338 struct perf_sample_data data;
7339
7340 if (WARN_ON_ONCE(!regs))
7341 return;
7342
7343 perf_sample_data_init(&data, addr, 0);
7344 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
7345}
7346
7347void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7348{
7349 int rctx;
7350
7351 preempt_disable_notrace();
7352 rctx = perf_swevent_get_recursion_context();
7353 if (unlikely(rctx < 0))
7354 goto fail;
7355
7356 ___perf_sw_event(event_id, nr, regs, addr);
7357
7358 perf_swevent_put_recursion_context(rctx);
7359fail:
7360 preempt_enable_notrace();
7361}
7362
7363static void perf_swevent_read(struct perf_event *event)
7364{
7365}
7366
7367static int perf_swevent_add(struct perf_event *event, int flags)
7368{
7369 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7370 struct hw_perf_event *hwc = &event->hw;
7371 struct hlist_head *head;
7372
7373 if (is_sampling_event(event)) {
7374 hwc->last_period = hwc->sample_period;
7375 perf_swevent_set_period(event);
7376 }
7377
7378 hwc->state = !(flags & PERF_EF_START);
7379
7380 head = find_swevent_head(swhash, event);
7381 if (WARN_ON_ONCE(!head))
7382 return -EINVAL;
7383
7384 hlist_add_head_rcu(&event->hlist_entry, head);
7385 perf_event_update_userpage(event);
7386
7387 return 0;
7388}
7389
7390static void perf_swevent_del(struct perf_event *event, int flags)
7391{
7392 hlist_del_rcu(&event->hlist_entry);
7393}
7394
7395static void perf_swevent_start(struct perf_event *event, int flags)
7396{
7397 event->hw.state = 0;
7398}
7399
7400static void perf_swevent_stop(struct perf_event *event, int flags)
7401{
7402 event->hw.state = PERF_HES_STOPPED;
7403}
7404
7405
7406static inline struct swevent_hlist *
7407swevent_hlist_deref(struct swevent_htable *swhash)
7408{
7409 return rcu_dereference_protected(swhash->swevent_hlist,
7410 lockdep_is_held(&swhash->hlist_mutex));
7411}
7412
7413static void swevent_hlist_release(struct swevent_htable *swhash)
7414{
7415 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
7416
7417 if (!hlist)
7418 return;
7419
7420 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
7421 kfree_rcu(hlist, rcu_head);
7422}
7423
7424static void swevent_hlist_put_cpu(int cpu)
7425{
7426 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7427
7428 mutex_lock(&swhash->hlist_mutex);
7429
7430 if (!--swhash->hlist_refcount)
7431 swevent_hlist_release(swhash);
7432
7433 mutex_unlock(&swhash->hlist_mutex);
7434}
7435
7436static void swevent_hlist_put(void)
7437{
7438 int cpu;
7439
7440 for_each_possible_cpu(cpu)
7441 swevent_hlist_put_cpu(cpu);
7442}
7443
7444static int swevent_hlist_get_cpu(int cpu)
7445{
7446 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7447 int err = 0;
7448
7449 mutex_lock(&swhash->hlist_mutex);
7450 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
7451 struct swevent_hlist *hlist;
7452
7453 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
7454 if (!hlist) {
7455 err = -ENOMEM;
7456 goto exit;
7457 }
7458 rcu_assign_pointer(swhash->swevent_hlist, hlist);
7459 }
7460 swhash->hlist_refcount++;
7461exit:
7462 mutex_unlock(&swhash->hlist_mutex);
7463
7464 return err;
7465}
7466
7467static int swevent_hlist_get(void)
7468{
7469 int err, cpu, failed_cpu;
7470
7471 get_online_cpus();
7472 for_each_possible_cpu(cpu) {
7473 err = swevent_hlist_get_cpu(cpu);
7474 if (err) {
7475 failed_cpu = cpu;
7476 goto fail;
7477 }
7478 }
7479 put_online_cpus();
7480
7481 return 0;
7482fail:
7483 for_each_possible_cpu(cpu) {
7484 if (cpu == failed_cpu)
7485 break;
7486 swevent_hlist_put_cpu(cpu);
7487 }
7488
7489 put_online_cpus();
7490 return err;
7491}
7492
7493struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
7494
7495static void sw_perf_event_destroy(struct perf_event *event)
7496{
7497 u64 event_id = event->attr.config;
7498
7499 WARN_ON(event->parent);
7500
7501 static_key_slow_dec(&perf_swevent_enabled[event_id]);
7502 swevent_hlist_put();
7503}
7504
7505static int perf_swevent_init(struct perf_event *event)
7506{
7507 u64 event_id = event->attr.config;
7508
7509 if (event->attr.type != PERF_TYPE_SOFTWARE)
7510 return -ENOENT;
7511
7512
7513
7514
7515 if (has_branch_stack(event))
7516 return -EOPNOTSUPP;
7517
7518 switch (event_id) {
7519 case PERF_COUNT_SW_CPU_CLOCK:
7520 case PERF_COUNT_SW_TASK_CLOCK:
7521 return -ENOENT;
7522
7523 default:
7524 break;
7525 }
7526
7527 if (event_id >= PERF_COUNT_SW_MAX)
7528 return -ENOENT;
7529
7530 if (!event->parent) {
7531 int err;
7532
7533 err = swevent_hlist_get();
7534 if (err)
7535 return err;
7536
7537 static_key_slow_inc(&perf_swevent_enabled[event_id]);
7538 event->destroy = sw_perf_event_destroy;
7539 }
7540
7541 return 0;
7542}
7543
7544static struct pmu perf_swevent = {
7545 .task_ctx_nr = perf_sw_context,
7546
7547 .capabilities = PERF_PMU_CAP_NO_NMI,
7548
7549 .event_init = perf_swevent_init,
7550 .add = perf_swevent_add,
7551 .del = perf_swevent_del,
7552 .start = perf_swevent_start,
7553 .stop = perf_swevent_stop,
7554 .read = perf_swevent_read,
7555};
7556
7557#ifdef CONFIG_EVENT_TRACING
7558
7559static int perf_tp_filter_match(struct perf_event *event,
7560 struct perf_sample_data *data)
7561{
7562 void *record = data->raw->frag.data;
7563
7564
7565 if (event->parent)
7566 event = event->parent;
7567
7568 if (likely(!event->filter) || filter_match_preds(event->filter, record))
7569 return 1;
7570 return 0;
7571}
7572
7573static int perf_tp_event_match(struct perf_event *event,
7574 struct perf_sample_data *data,
7575 struct pt_regs *regs)
7576{
7577 if (event->hw.state & PERF_HES_STOPPED)
7578 return 0;
7579
	/*
	 * All tracepoints fire in kernel context, so an event that excludes
	 * the kernel can never match.
	 */
7582 if (event->attr.exclude_kernel)
7583 return 0;
7584
7585 if (!perf_tp_filter_match(event, data))
7586 return 0;
7587
7588 return 1;
7589}
7590
7591void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
7592 struct trace_event_call *call, u64 count,
7593 struct pt_regs *regs, struct hlist_head *head,
7594 struct task_struct *task)
7595{
7596 struct bpf_prog *prog = call->prog;
7597
7598 if (prog) {
7599 *(struct pt_regs **)raw_data = regs;
7600 if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
7601 perf_swevent_put_recursion_context(rctx);
7602 return;
7603 }
7604 }
7605 perf_tp_event(call->event.type, count, raw_data, size, regs, head,
7606 rctx, task);
7607}
7608EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
7609
7610void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
7611 struct pt_regs *regs, struct hlist_head *head, int rctx,
7612 struct task_struct *task)
7613{
7614 struct perf_sample_data data;
7615 struct perf_event *event;
7616
7617 struct perf_raw_record raw = {
7618 .frag = {
7619 .size = entry_size,
7620 .data = record,
7621 },
7622 };
7623
7624 perf_sample_data_init(&data, 0, 0);
7625 data.raw = &raw;
7626
7627 perf_trace_buf_update(record, event_type);
7628
7629 hlist_for_each_entry_rcu(event, head, hlist_entry) {
7630 if (perf_tp_event_match(event, &data, regs))
7631 perf_swevent_event(event, count, &data, regs);
7632 }
7633
7634
	/*
	 * If a target task was specified and it is not the current one, also
	 * deliver the event to matching tracepoint events in that task's
	 * software context.
	 */
7638 if (task && task != current) {
7639 struct perf_event_context *ctx;
7640 struct trace_entry *entry = record;
7641
7642 rcu_read_lock();
7643 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
7644 if (!ctx)
7645 goto unlock;
7646
7647 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7648 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7649 continue;
7650 if (event->attr.config != entry->type)
7651 continue;
7652 if (perf_tp_event_match(event, &data, regs))
7653 perf_swevent_event(event, count, &data, regs);
7654 }
7655unlock:
7656 rcu_read_unlock();
7657 }
7658
7659 perf_swevent_put_recursion_context(rctx);
7660}
7661EXPORT_SYMBOL_GPL(perf_tp_event);
7662
7663static void tp_perf_event_destroy(struct perf_event *event)
7664{
7665 perf_trace_destroy(event);
7666}
7667
7668static int perf_tp_event_init(struct perf_event *event)
7669{
7670 int err;
7671
7672 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7673 return -ENOENT;
7674
7675
7676
7677
7678 if (has_branch_stack(event))
7679 return -EOPNOTSUPP;
7680
7681 err = perf_trace_init(event);
7682 if (err)
7683 return err;
7684
7685 event->destroy = tp_perf_event_destroy;
7686
7687 return 0;
7688}
7689
7690static struct pmu perf_tracepoint = {
7691 .task_ctx_nr = perf_sw_context,
7692
7693 .event_init = perf_tp_event_init,
7694 .add = perf_trace_add,
7695 .del = perf_trace_del,
7696 .start = perf_swevent_start,
7697 .stop = perf_swevent_stop,
7698 .read = perf_swevent_read,
7699};
7700
7701static inline void perf_tp_register(void)
7702{
7703 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
7704}
7705
7706static void perf_event_free_filter(struct perf_event *event)
7707{
7708 ftrace_profile_free_filter(event);
7709}
7710
7711#ifdef CONFIG_BPF_SYSCALL
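/*
 * Overflow handler used when a BPF program is attached to a hardware or
 * software event: run the program first and only chain to the original
 * overflow handler if it returns non-zero.  bpf_prog_active guards
 * against recursing into BPF from this path.
 */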
7712static void bpf_overflow_handler(struct perf_event *event,
7713 struct perf_sample_data *data,
7714 struct pt_regs *regs)
7715{
7716 struct bpf_perf_event_data_kern ctx = {
7717 .data = data,
7718 .regs = regs,
7719 };
7720 int ret = 0;
7721
7722 preempt_disable();
7723 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
7724 goto out;
7725 rcu_read_lock();
7726 ret = BPF_PROG_RUN(event->prog, (void *)&ctx);
7727 rcu_read_unlock();
7728out:
7729 __this_cpu_dec(bpf_prog_active);
7730 preempt_enable();
7731 if (!ret)
7732 return;
7733
7734 event->orig_overflow_handler(event, data, regs);
7735}
7736
7737static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
7738{
7739 struct bpf_prog *prog;
7740
7741 if (event->overflow_handler_context)
		/* hw breakpoint or kernel counter; can't take a BPF handler */
7743 return -EINVAL;
7744
7745 if (event->prog)
7746 return -EEXIST;
7747
7748 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
7749 if (IS_ERR(prog))
7750 return PTR_ERR(prog);
7751
7752 event->prog = prog;
7753 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
7754 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
7755 return 0;
7756}
7757
7758static void perf_event_free_bpf_handler(struct perf_event *event)
7759{
7760 struct bpf_prog *prog = event->prog;
7761
7762 if (!prog)
7763 return;
7764
7765 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
7766 event->prog = NULL;
7767 bpf_prog_put(prog);
7768}
7769#else
7770static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
7771{
7772 return -EOPNOTSUPP;
7773}
7774static void perf_event_free_bpf_handler(struct perf_event *event)
7775{
7776}
7777#endif
7778
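/*
 * Attach a BPF program to an event.  Hardware and software events get it
 * installed as their overflow handler; tracepoint and k/uprobe events get
 * it attached to the underlying tp_event, after checking that the program
 * type matches and that it does not read beyond the tracepoint's fields.
 */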
7779static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7780{
7781 bool is_kprobe, is_tracepoint;
7782 struct bpf_prog *prog;
7783
7784 if (event->attr.type == PERF_TYPE_HARDWARE ||
7785 event->attr.type == PERF_TYPE_SOFTWARE)
7786 return perf_event_set_bpf_handler(event, prog_fd);
7787
7788 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7789 return -EINVAL;
7790
7791 if (event->tp_event->prog)
7792 return -EEXIST;
7793
7794 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
7795 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
7796 if (!is_kprobe && !is_tracepoint)
		/* BPF can only be attached to k/uprobe or tracepoint events */
7798 return -EINVAL;
7799
7800 prog = bpf_prog_get(prog_fd);
7801 if (IS_ERR(prog))
7802 return PTR_ERR(prog);
7803
7804 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
7805 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
		/* valid fd, but the program type does not match the event */
7807 bpf_prog_put(prog);
7808 return -EINVAL;
7809 }
7810
7811 if (is_tracepoint) {
7812 int off = trace_event_get_offsets(event->tp_event);
7813
7814 if (prog->aux->max_ctx_offset > off) {
7815 bpf_prog_put(prog);
7816 return -EACCES;
7817 }
7818 }
7819 event->tp_event->prog = prog;
7820
7821 return 0;
7822}
7823
7824static void perf_event_free_bpf_prog(struct perf_event *event)
7825{
7826 struct bpf_prog *prog;
7827
7828 perf_event_free_bpf_handler(event);
7829
7830 if (!event->tp_event)
7831 return;
7832
7833 prog = event->tp_event->prog;
7834 if (prog) {
7835 event->tp_event->prog = NULL;
7836 bpf_prog_put(prog);
7837 }
7838}
7839
7840#else
7841
7842static inline void perf_tp_register(void)
7843{
7844}
7845
7846static void perf_event_free_filter(struct perf_event *event)
7847{
7848}
7849
7850static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7851{
7852 return -ENOENT;
7853}
7854
7855static void perf_event_free_bpf_prog(struct perf_event *event)
7856{
7857}
7858#endif
7859
7860#ifdef CONFIG_HAVE_HW_BREAKPOINT
7861void perf_bp_event(struct perf_event *bp, void *data)
7862{
7863 struct perf_sample_data sample;
7864 struct pt_regs *regs = data;
7865
7866 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
7867
7868 if (!bp->hw.state && !perf_exclude_event(bp, regs))
7869 perf_swevent_event(bp, 1, &sample, regs);
7870}
7871#endif
7872
7873
7874
7875
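/*
 * Allocate a new address filter, preferably on the node of the event's CPU,
 * and append it to the supplied filter list.
 */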
7876static struct perf_addr_filter *
7877perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
7878{
7879 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
7880 struct perf_addr_filter *filter;
7881
7882 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
7883 if (!filter)
7884 return NULL;
7885
7886 INIT_LIST_HEAD(&filter->entry);
7887 list_add_tail(&filter->entry, filters);
7888
7889 return filter;
7890}
7891
7892static void free_filters_list(struct list_head *filters)
7893{
7894 struct perf_addr_filter *filter, *iter;
7895
7896 list_for_each_entry_safe(filter, iter, filters, entry) {
7897 if (filter->inode)
7898 iput(filter->inode);
7899 list_del(&filter->entry);
7900 kfree(filter);
7901 }
7902}
7903
7904
7905
7906
7907static void perf_addr_filters_splice(struct perf_event *event,
7908 struct list_head *head)
7909{
7910 unsigned long flags;
7911 LIST_HEAD(list);
7912
7913 if (!has_addr_filter(event))
7914 return;
7915
7916
7917 if (event->parent)
7918 return;
7919
7920 raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
7921
7922 list_splice_init(&event->addr_filters.list, &list);
7923 if (head)
7924 list_splice(head, &event->addr_filters.list);
7925
7926 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
7927
7928 free_filters_list(&list);
7929}
7930
7931
7932
7933
7934
7935
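/*
 * Scan the task's VMAs for a file mapping that matches @filter and return
 * the start address of that mapping, or 0 if none matches.
 * Called with mm->mmap_sem held for reading.
 */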
7936static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
7937 struct mm_struct *mm)
7938{
7939 struct vm_area_struct *vma;
7940
7941 for (vma = mm->mmap; vma; vma = vma->vm_next) {
7942 struct file *file = vma->vm_file;
7943 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
7944 unsigned long vma_size = vma->vm_end - vma->vm_start;
7945
7946 if (!file)
7947 continue;
7948
7949 if (!perf_addr_filter_match(filter, file, off, vma_size))
7950 continue;
7951
7952 return vma->vm_start;
7953 }
7954
7955 return 0;
7956}
7957
7958
7959
7960
7961
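/*
 * Recalculate the event's address filter offsets against the task's current
 * memory map, then restart the event so the new offsets take effect.
 */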
7962static void perf_event_addr_filters_apply(struct perf_event *event)
7963{
7964 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7965 struct task_struct *task = READ_ONCE(event->ctx->task);
7966 struct perf_addr_filter *filter;
7967 struct mm_struct *mm = NULL;
7968 unsigned int count = 0;
7969 unsigned long flags;
7970
7971
7972
7973
7974
7975 if (task == TASK_TOMBSTONE)
7976 return;
7977
7978 mm = get_task_mm(event->ctx->task);
7979 if (!mm)
7980 goto restart;
7981
7982 down_read(&mm->mmap_sem);
7983
7984 raw_spin_lock_irqsave(&ifh->lock, flags);
7985 list_for_each_entry(filter, &ifh->list, entry) {
7986 event->addr_filters_offs[count] = 0;
7987
7988
7989
7990
7991
7992 if (filter->inode)
7993 event->addr_filters_offs[count] =
7994 perf_addr_filter_apply(filter, mm);
7995
7996 count++;
7997 }
7998
7999 event->addr_filters_gen++;
8000 raw_spin_unlock_irqrestore(&ifh->lock, flags);
8001
8002 up_read(&mm->mmap_sem);
8003
8004 mmput(mm);
8005
8006restart:
8007 perf_event_stop(event, 1);
8008}
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
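/*
 * Address filter string parsing.
 *
 * Filters are "ACTION SOURCE" pairs, where ACTION is one of "filter",
 * "start" or "stop" and SOURCE (matching if_tokens below) is either a
 * kernel address or range ("<start>" or "<start>/<size>") or an object
 * file address or range ("<start>@<file>" or "<start>/<size>@<file>").
 * Tokens are separated by spaces, commas or newlines, e.g.:
 *
 *	filter 0x1000/0x2000@/usr/lib/libfoo.so
 *
 * (the path above is only an example; any regular file works).
 */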
8028enum {
8029 IF_ACT_NONE = -1,
8030 IF_ACT_FILTER,
8031 IF_ACT_START,
8032 IF_ACT_STOP,
8033 IF_SRC_FILE,
8034 IF_SRC_KERNEL,
8035 IF_SRC_FILEADDR,
8036 IF_SRC_KERNELADDR,
8037};
8038
8039enum {
8040 IF_STATE_ACTION = 0,
8041 IF_STATE_SOURCE,
8042 IF_STATE_END,
8043};
8044
8045static const match_table_t if_tokens = {
8046 { IF_ACT_FILTER, "filter" },
8047 { IF_ACT_START, "start" },
8048 { IF_ACT_STOP, "stop" },
8049 { IF_SRC_FILE, "%u/%u@%s" },
8050 { IF_SRC_KERNEL, "%u/%u" },
8051 { IF_SRC_FILEADDR, "%u@%s" },
8052 { IF_SRC_KERNELADDR, "%u" },
8053 { IF_ACT_NONE, NULL },
8054};
8055
8056
8057
8058
8059static int
8060perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8061 struct list_head *filters)
8062{
8063 struct perf_addr_filter *filter = NULL;
8064 char *start, *orig, *filename = NULL;
8065 struct path path;
8066 substring_t args[MAX_OPT_ARGS];
8067 int state = IF_STATE_ACTION, token;
8068 unsigned int kernel = 0;
8069 int ret = -EINVAL;
8070
8071 orig = fstr = kstrdup(fstr, GFP_KERNEL);
8072 if (!fstr)
8073 return -ENOMEM;
8074
8075 while ((start = strsep(&fstr, " ,\n")) != NULL) {
8076 ret = -EINVAL;
8077
8078 if (!*start)
8079 continue;
8080
8081
8082 if (state == IF_STATE_ACTION) {
8083 filter = perf_addr_filter_new(event, filters);
8084 if (!filter)
8085 goto fail;
8086 }
8087
8088 token = match_token(start, if_tokens, args);
8089 switch (token) {
8090 case IF_ACT_FILTER:
8091 case IF_ACT_START:
8092 filter->filter = 1;
8093
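			/* fall through */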
8094 case IF_ACT_STOP:
8095 if (state != IF_STATE_ACTION)
8096 goto fail;
8097
8098 state = IF_STATE_SOURCE;
8099 break;
8100
8101 case IF_SRC_KERNELADDR:
8102 case IF_SRC_KERNEL:
8103 kernel = 1;
8104
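			/* fall through */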
8105 case IF_SRC_FILEADDR:
8106 case IF_SRC_FILE:
8107 if (state != IF_STATE_SOURCE)
8108 goto fail;
8109
8110 if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
8111 filter->range = 1;
8112
8113 *args[0].to = 0;
8114 ret = kstrtoul(args[0].from, 0, &filter->offset);
8115 if (ret)
8116 goto fail;
8117
8118 if (filter->range) {
8119 *args[1].to = 0;
8120 ret = kstrtoul(args[1].from, 0, &filter->size);
8121 if (ret)
8122 goto fail;
8123 }
8124
8125 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
8126 int fpos = filter->range ? 2 : 1;
8127
8128 filename = match_strdup(&args[fpos]);
8129 if (!filename) {
8130 ret = -ENOMEM;
8131 goto fail;
8132 }
8133 }
8134
8135 state = IF_STATE_END;
8136 break;
8137
8138 default:
8139 goto fail;
8140 }
8141
8142
8143
8144
8145
8146
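		/*
		 * A complete "action source" pair has been parsed: check it
		 * against the event's attributes and, for file-based filters,
		 * resolve the path to an inode before accepting it.
		 */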
8147 if (state == IF_STATE_END) {
8148 if (kernel && event->attr.exclude_kernel)
8149 goto fail;
8150
8151 if (!kernel) {
8152 if (!filename)
8153 goto fail;
8154
8155
8156 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
8157 if (ret)
8158 goto fail_free_name;
8159
8160 filter->inode = igrab(d_inode(path.dentry));
8161 path_put(&path);
8162 kfree(filename);
8163 filename = NULL;
8164
8165 ret = -EINVAL;
8166 if (!filter->inode ||
8167 !S_ISREG(filter->inode->i_mode))
8168
8169 goto fail;
8170 }
8171
8172
8173 state = IF_STATE_ACTION;
8174 filter = NULL;
8175 }
8176 }
8177
8178 if (state != IF_STATE_ACTION)
8179 goto fail;
8180
8181 kfree(orig);
8182
8183 return 0;
8184
8185fail_free_name:
8186 kfree(filename);
8187fail:
8188 free_filters_list(filters);
8189 kfree(orig);
8190
8191 return ret;
8192}
8193
8194static int
8195perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
8196{
8197 LIST_HEAD(filters);
8198 int ret;
8199
8200
8201
8202
8203
8204 lockdep_assert_held(&event->ctx->mutex);
8205
8206 if (WARN_ON_ONCE(event->parent))
8207 return -EINVAL;
8208
8209
8210
8211
8212
8213
8214
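	/*
	 * For now address filters are only supported for per-task events;
	 * CPU-wide (task-less) contexts are rejected below.
	 */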
8215 if (!event->ctx->task)
8216 return -EOPNOTSUPP;
8217
8218 ret = perf_event_parse_addr_filter(event, filter_str, &filters);
8219 if (ret)
8220 return ret;
8221
8222 ret = event->pmu->addr_filters_validate(&filters);
8223 if (ret) {
8224 free_filters_list(&filters);
8225 return ret;
8226 }
8227
8228
8229 perf_addr_filters_splice(event, &filters);
8230
8231
8232 perf_event_for_each_child(event, perf_event_addr_filters_apply);
8233
8234 return ret;
8235}
8236
8237static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8238{
8239 char *filter_str;
8240 int ret = -EINVAL;
8241
8242 if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
8243 !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
8244 !has_addr_filter(event))
8245 return -EINVAL;
8246
8247 filter_str = strndup_user(arg, PAGE_SIZE);
8248 if (IS_ERR(filter_str))
8249 return PTR_ERR(filter_str);
8250
8251 if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
8252 event->attr.type == PERF_TYPE_TRACEPOINT)
8253 ret = ftrace_profile_set_filter(event, event->attr.config,
8254 filter_str);
8255 else if (has_addr_filter(event))
8256 ret = perf_event_set_addr_filter(event, filter_str);
8257
8258 kfree(filter_str);
8259 return ret;
8260}
8261
8262
8263
8264
8265
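/*
 * hrtimer based sampling for software events: the timer fires once per
 * sampling period, reads the event and feeds a synthetic overflow into the
 * generic overflow path.
 */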
8266static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
8267{
8268 enum hrtimer_restart ret = HRTIMER_RESTART;
8269 struct perf_sample_data data;
8270 struct pt_regs *regs;
8271 struct perf_event *event;
8272 u64 period;
8273
8274 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
8275
8276 if (event->state != PERF_EVENT_STATE_ACTIVE)
8277 return HRTIMER_NORESTART;
8278
8279 event->pmu->read(event);
8280
8281 perf_sample_data_init(&data, 0, event->hw.last_period);
8282 regs = get_irq_regs();
8283
8284 if (regs && !perf_exclude_event(event, regs)) {
8285 if (!(event->attr.exclude_idle && is_idle_task(current)))
8286 if (__perf_event_overflow(event, 1, &data, regs))
8287 ret = HRTIMER_NORESTART;
8288 }
8289
8290 period = max_t(u64, 10000, event->hw.sample_period);
8291 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
8292
8293 return ret;
8294}
8295
8296static void perf_swevent_start_hrtimer(struct perf_event *event)
8297{
8298 struct hw_perf_event *hwc = &event->hw;
8299 s64 period;
8300
8301 if (!is_sampling_event(event))
8302 return;
8303
8304 period = local64_read(&hwc->period_left);
8305 if (period) {
8306 if (period < 0)
8307 period = 10000;
8308
8309 local64_set(&hwc->period_left, 0);
8310 } else {
8311 period = max_t(u64, 10000, hwc->sample_period);
8312 }
8313 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
8314 HRTIMER_MODE_REL_PINNED);
8315}
8316
8317static void perf_swevent_cancel_hrtimer(struct perf_event *event)
8318{
8319 struct hw_perf_event *hwc = &event->hw;
8320
8321 if (is_sampling_event(event)) {
8322 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
8323 local64_set(&hwc->period_left, ktime_to_ns(remaining));
8324
8325 hrtimer_cancel(&hwc->hrtimer);
8326 }
8327}
8328
8329static void perf_swevent_init_hrtimer(struct perf_event *event)
8330{
8331 struct hw_perf_event *hwc = &event->hw;
8332
8333 if (!is_sampling_event(event))
8334 return;
8335
8336 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
8337 hwc->hrtimer.function = perf_swevent_hrtimer;
8338
8339
8340
8341
8342
8343 if (event->attr.freq) {
8344 long freq = event->attr.sample_freq;
8345
8346 event->attr.sample_period = NSEC_PER_SEC / freq;
8347 hwc->sample_period = event->attr.sample_period;
8348 local64_set(&hwc->period_left, hwc->sample_period);
8349 hwc->last_period = hwc->sample_period;
8350 event->attr.freq = 0;
8351 }
8352}
8353
8354
8355
8356
8357
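/*
 * Software event: cpu wall-time clock, backed by local_clock().
 */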
8358static void cpu_clock_event_update(struct perf_event *event)
8359{
8360 s64 prev;
8361 u64 now;
8362
8363 now = local_clock();
8364 prev = local64_xchg(&event->hw.prev_count, now);
8365 local64_add(now - prev, &event->count);
8366}
8367
8368static void cpu_clock_event_start(struct perf_event *event, int flags)
8369{
8370 local64_set(&event->hw.prev_count, local_clock());
8371 perf_swevent_start_hrtimer(event);
8372}
8373
8374static void cpu_clock_event_stop(struct perf_event *event, int flags)
8375{
8376 perf_swevent_cancel_hrtimer(event);
8377 cpu_clock_event_update(event);
8378}
8379
8380static int cpu_clock_event_add(struct perf_event *event, int flags)
8381{
8382 if (flags & PERF_EF_START)
8383 cpu_clock_event_start(event, flags);
8384 perf_event_update_userpage(event);
8385
8386 return 0;
8387}
8388
8389static void cpu_clock_event_del(struct perf_event *event, int flags)
8390{
8391 cpu_clock_event_stop(event, flags);
8392}
8393
8394static void cpu_clock_event_read(struct perf_event *event)
8395{
8396 cpu_clock_event_update(event);
8397}
8398
8399static int cpu_clock_event_init(struct perf_event *event)
8400{
8401 if (event->attr.type != PERF_TYPE_SOFTWARE)
8402 return -ENOENT;
8403
8404 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
8405 return -ENOENT;
8406
8407
8408
8409
8410 if (has_branch_stack(event))
8411 return -EOPNOTSUPP;
8412
8413 perf_swevent_init_hrtimer(event);
8414
8415 return 0;
8416}
8417
8418static struct pmu perf_cpu_clock = {
8419 .task_ctx_nr = perf_sw_context,
8420
8421 .capabilities = PERF_PMU_CAP_NO_NMI,
8422
8423 .event_init = cpu_clock_event_init,
8424 .add = cpu_clock_event_add,
8425 .del = cpu_clock_event_del,
8426 .start = cpu_clock_event_start,
8427 .stop = cpu_clock_event_stop,
8428 .read = cpu_clock_event_read,
8429};
8430
8431
8432
8433
8434
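/*
 * Software event: task time clock, based on the context's time.
 */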
8435static void task_clock_event_update(struct perf_event *event, u64 now)
8436{
8437 u64 prev;
8438 s64 delta;
8439
8440 prev = local64_xchg(&event->hw.prev_count, now);
8441 delta = now - prev;
8442 local64_add(delta, &event->count);
8443}
8444
8445static void task_clock_event_start(struct perf_event *event, int flags)
8446{
8447 local64_set(&event->hw.prev_count, event->ctx->time);
8448 perf_swevent_start_hrtimer(event);
8449}
8450
8451static void task_clock_event_stop(struct perf_event *event, int flags)
8452{
8453 perf_swevent_cancel_hrtimer(event);
8454 task_clock_event_update(event, event->ctx->time);
8455}
8456
8457static int task_clock_event_add(struct perf_event *event, int flags)
8458{
8459 if (flags & PERF_EF_START)
8460 task_clock_event_start(event, flags);
8461 perf_event_update_userpage(event);
8462
8463 return 0;
8464}
8465
8466static void task_clock_event_del(struct perf_event *event, int flags)
8467{
8468 task_clock_event_stop(event, PERF_EF_UPDATE);
8469}
8470
8471static void task_clock_event_read(struct perf_event *event)
8472{
8473 u64 now = perf_clock();
8474 u64 delta = now - event->ctx->timestamp;
8475 u64 time = event->ctx->time + delta;
8476
8477 task_clock_event_update(event, time);
8478}
8479
8480static int task_clock_event_init(struct perf_event *event)
8481{
8482 if (event->attr.type != PERF_TYPE_SOFTWARE)
8483 return -ENOENT;
8484
8485 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
8486 return -ENOENT;
8487
8488
8489
8490
8491 if (has_branch_stack(event))
8492 return -EOPNOTSUPP;
8493
8494 perf_swevent_init_hrtimer(event);
8495
8496 return 0;
8497}
8498
8499static struct pmu perf_task_clock = {
8500 .task_ctx_nr = perf_sw_context,
8501
8502 .capabilities = PERF_PMU_CAP_NO_NMI,
8503
8504 .event_init = task_clock_event_init,
8505 .add = task_clock_event_add,
8506 .del = task_clock_event_del,
8507 .start = task_clock_event_start,
8508 .stop = task_clock_event_stop,
8509 .read = task_clock_event_read,
8510};
8511
8512static void perf_pmu_nop_void(struct pmu *pmu)
8513{
8514}
8515
8516static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
8517{
8518}
8519
8520static int perf_pmu_nop_int(struct pmu *pmu)
8521{
8522 return 0;
8523}
8524
8525static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
8526
8527static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
8528{
8529 __this_cpu_write(nop_txn_flags, flags);
8530
8531 if (flags & ~PERF_PMU_TXN_ADD)
8532 return;
8533
8534 perf_pmu_disable(pmu);
8535}
8536
8537static int perf_pmu_commit_txn(struct pmu *pmu)
8538{
8539 unsigned int flags = __this_cpu_read(nop_txn_flags);
8540
8541 __this_cpu_write(nop_txn_flags, 0);
8542
8543 if (flags & ~PERF_PMU_TXN_ADD)
8544 return 0;
8545
8546 perf_pmu_enable(pmu);
8547 return 0;
8548}
8549
8550static void perf_pmu_cancel_txn(struct pmu *pmu)
8551{
8552 unsigned int flags = __this_cpu_read(nop_txn_flags);
8553
8554 __this_cpu_write(nop_txn_flags, 0);
8555
8556 if (flags & ~PERF_PMU_TXN_ADD)
8557 return;
8558
8559 perf_pmu_enable(pmu);
8560}
8561
8562static int perf_event_idx_default(struct perf_event *event)
8563{
8564 return 0;
8565}
8566
8567
8568
8569
8570
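/*
 * PMUs that share a task context number (task_ctx_nr) also share their
 * per-CPU context; look up an already registered one so it can be reused.
 */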
8571static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
8572{
8573 struct pmu *pmu;
8574
8575 if (ctxn < 0)
8576 return NULL;
8577
8578 list_for_each_entry(pmu, &pmus, entry) {
8579 if (pmu->task_ctx_nr == ctxn)
8580 return pmu->pmu_cpu_context;
8581 }
8582
8583 return NULL;
8584}
8585
8586static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
8587{
8588 int cpu;
8589
8590 for_each_possible_cpu(cpu) {
8591 struct perf_cpu_context *cpuctx;
8592
8593 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
8594
8595 if (cpuctx->unique_pmu == old_pmu)
8596 cpuctx->unique_pmu = pmu;
8597 }
8598}
8599
8600static void free_pmu_context(struct pmu *pmu)
8601{
8602 struct pmu *i;
8603
8604 mutex_lock(&pmus_lock);
8605
8606
8607
8608 list_for_each_entry(i, &pmus, entry) {
8609 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
8610 update_pmu_context(i, pmu);
8611 goto out;
8612 }
8613 }
8614
8615 free_percpu(pmu->pmu_cpu_context);
8616out:
8617 mutex_unlock(&pmus_lock);
8618}
8619
8620
8621
8622
8623static ssize_t nr_addr_filters_show(struct device *dev,
8624 struct device_attribute *attr,
8625 char *page)
8626{
8627 struct pmu *pmu = dev_get_drvdata(dev);
8628
8629 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
8630}
8631DEVICE_ATTR_RO(nr_addr_filters);
8632
8633static struct idr pmu_idr;
8634
8635static ssize_t
8636type_show(struct device *dev, struct device_attribute *attr, char *page)
8637{
8638 struct pmu *pmu = dev_get_drvdata(dev);
8639
8640 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
8641}
8642static DEVICE_ATTR_RO(type);
8643
8644static ssize_t
8645perf_event_mux_interval_ms_show(struct device *dev,
8646 struct device_attribute *attr,
8647 char *page)
8648{
8649 struct pmu *pmu = dev_get_drvdata(dev);
8650
8651 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
8652}
8653
8654static DEFINE_MUTEX(mux_interval_mutex);
8655
8656static ssize_t
8657perf_event_mux_interval_ms_store(struct device *dev,
8658 struct device_attribute *attr,
8659 const char *buf, size_t count)
8660{
8661 struct pmu *pmu = dev_get_drvdata(dev);
8662 int timer, cpu, ret;
8663
8664 ret = kstrtoint(buf, 0, &timer);
8665 if (ret)
8666 return ret;
8667
8668 if (timer < 1)
8669 return -EINVAL;
8670
8671
8672 if (timer == pmu->hrtimer_interval_ms)
8673 return count;
8674
8675 mutex_lock(&mux_interval_mutex);
8676 pmu->hrtimer_interval_ms = timer;
8677
8678
8679 get_online_cpus();
8680 for_each_online_cpu(cpu) {
8681 struct perf_cpu_context *cpuctx;
8682 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
8683 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
8684
8685 cpu_function_call(cpu,
8686 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
8687 }
8688 put_online_cpus();
8689 mutex_unlock(&mux_interval_mutex);
8690
8691 return count;
8692}
8693static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
8694
8695static struct attribute *pmu_dev_attrs[] = {
8696 &dev_attr_type.attr,
8697 &dev_attr_perf_event_mux_interval_ms.attr,
8698 NULL,
8699};
8700ATTRIBUTE_GROUPS(pmu_dev);
8701
8702static int pmu_bus_running;
8703static struct bus_type pmu_bus = {
8704 .name = "event_source",
8705 .dev_groups = pmu_dev_groups,
8706};
8707
8708static void pmu_dev_release(struct device *dev)
8709{
8710 kfree(dev);
8711}
8712
8713static int pmu_dev_alloc(struct pmu *pmu)
8714{
8715 int ret = -ENOMEM;
8716
8717 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
8718 if (!pmu->dev)
8719 goto out;
8720
8721 pmu->dev->groups = pmu->attr_groups;
8722 device_initialize(pmu->dev);
8723 ret = dev_set_name(pmu->dev, "%s", pmu->name);
8724 if (ret)
8725 goto free_dev;
8726
8727 dev_set_drvdata(pmu->dev, pmu);
8728 pmu->dev->bus = &pmu_bus;
8729 pmu->dev->release = pmu_dev_release;
8730 ret = device_add(pmu->dev);
8731 if (ret)
8732 goto free_dev;
8733
8734
8735 if (pmu->nr_addr_filters)
8736 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
8737
8738 if (ret)
8739 goto del_dev;
8740
8741out:
8742 return ret;
8743
8744del_dev:
8745 device_del(pmu->dev);
8746
8747free_dev:
8748 put_device(pmu->dev);
8749 goto out;
8750}
8751
8752static struct lock_class_key cpuctx_mutex;
8753static struct lock_class_key cpuctx_lock;
8754
8755int perf_pmu_register(struct pmu *pmu, const char *name, int type)
8756{
8757 int cpu, ret;
8758
8759 mutex_lock(&pmus_lock);
8760 ret = -ENOMEM;
8761 pmu->pmu_disable_count = alloc_percpu(int);
8762 if (!pmu->pmu_disable_count)
8763 goto unlock;
8764
8765 pmu->type = -1;
8766 if (!name)
8767 goto skip_type;
8768 pmu->name = name;
8769
8770 if (type < 0) {
8771 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
8772 if (type < 0) {
8773 ret = type;
8774 goto free_pdc;
8775 }
8776 }
8777 pmu->type = type;
8778
8779 if (pmu_bus_running) {
8780 ret = pmu_dev_alloc(pmu);
8781 if (ret)
8782 goto free_idr;
8783 }
8784
8785skip_type:
8786 if (pmu->task_ctx_nr == perf_hw_context) {
8787 static int hw_context_taken = 0;
8788
8789
8790
8791
8792
8793
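		/*
		 * Only one hardware PMU may claim perf_hw_context unless it
		 * advertises PERF_PMU_CAP_HETEROGENEOUS_CPUS; any further
		 * claimant is demoted to an invalid context with a warning.
		 */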
8794 if (WARN_ON_ONCE(hw_context_taken &&
8795 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
8796 pmu->task_ctx_nr = perf_invalid_context;
8797
8798 hw_context_taken = 1;
8799 }
8800
8801 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
8802 if (pmu->pmu_cpu_context)
8803 goto got_cpu_context;
8804
8805 ret = -ENOMEM;
8806 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
8807 if (!pmu->pmu_cpu_context)
8808 goto free_dev;
8809
8810 for_each_possible_cpu(cpu) {
8811 struct perf_cpu_context *cpuctx;
8812
8813 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
8814 __perf_event_init_context(&cpuctx->ctx);
8815 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
8816 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
8817 cpuctx->ctx.pmu = pmu;
8818
8819 __perf_mux_hrtimer_init(cpuctx, cpu);
8820
8821 cpuctx->unique_pmu = pmu;
8822 }
8823
8824got_cpu_context:
8825 if (!pmu->start_txn) {
8826 if (pmu->pmu_enable) {
8827
8828
8829
8830
8831
8832 pmu->start_txn = perf_pmu_start_txn;
8833 pmu->commit_txn = perf_pmu_commit_txn;
8834 pmu->cancel_txn = perf_pmu_cancel_txn;
8835 } else {
8836 pmu->start_txn = perf_pmu_nop_txn;
8837 pmu->commit_txn = perf_pmu_nop_int;
8838 pmu->cancel_txn = perf_pmu_nop_void;
8839 }
8840 }
8841
8842 if (!pmu->pmu_enable) {
8843 pmu->pmu_enable = perf_pmu_nop_void;
8844 pmu->pmu_disable = perf_pmu_nop_void;
8845 }
8846
8847 if (!pmu->event_idx)
8848 pmu->event_idx = perf_event_idx_default;
8849
8850 list_add_rcu(&pmu->entry, &pmus);
8851 atomic_set(&pmu->exclusive_cnt, 0);
8852 ret = 0;
8853unlock:
8854 mutex_unlock(&pmus_lock);
8855
8856 return ret;
8857
8858free_dev:
8859 device_del(pmu->dev);
8860 put_device(pmu->dev);
8861
8862free_idr:
8863 if (pmu->type >= PERF_TYPE_MAX)
8864 idr_remove(&pmu_idr, pmu->type);
8865
8866free_pdc:
8867 free_percpu(pmu->pmu_disable_count);
8868 goto unlock;
8869}
8870EXPORT_SYMBOL_GPL(perf_pmu_register);
8871
8872void perf_pmu_unregister(struct pmu *pmu)
8873{
8874 int remove_device;
8875
8876 mutex_lock(&pmus_lock);
8877 remove_device = pmu_bus_running;
8878 list_del_rcu(&pmu->entry);
8879 mutex_unlock(&pmus_lock);
8880
8881
8882
8883
8884
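	/*
	 * The pmus list is walked under both SRCU and regular RCU, so wait
	 * for both grace periods before tearing anything down.
	 */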
8885 synchronize_srcu(&pmus_srcu);
8886 synchronize_rcu();
8887
8888 free_percpu(pmu->pmu_disable_count);
8889 if (pmu->type >= PERF_TYPE_MAX)
8890 idr_remove(&pmu_idr, pmu->type);
8891 if (remove_device) {
8892 if (pmu->nr_addr_filters)
8893 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
8894 device_del(pmu->dev);
8895 put_device(pmu->dev);
8896 }
8897 free_pmu_context(pmu);
8898}
8899EXPORT_SYMBOL_GPL(perf_pmu_unregister);
8900
8901static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
8902{
8903 struct perf_event_context *ctx = NULL;
8904 int ret;
8905
8906 if (!try_module_get(pmu->module))
8907 return -ENODEV;
8908
8909 if (event->group_leader != event) {
8910
8911
8912
8913
8914 ctx = perf_event_ctx_lock_nested(event->group_leader,
8915 SINGLE_DEPTH_NESTING);
8916 BUG_ON(!ctx);
8917 }
8918
8919 event->pmu = pmu;
8920 ret = pmu->event_init(event);
8921
8922 if (ctx)
8923 perf_event_ctx_unlock(event->group_leader, ctx);
8924
8925 if (ret)
8926 module_put(pmu->module);
8927
8928 return ret;
8929}
8930
8931static struct pmu *perf_init_event(struct perf_event *event)
8932{
8933 struct pmu *pmu = NULL;
8934 int idx;
8935 int ret;
8936
8937 idx = srcu_read_lock(&pmus_srcu);
8938
8939 rcu_read_lock();
8940 pmu = idr_find(&pmu_idr, event->attr.type);
8941 rcu_read_unlock();
8942 if (pmu) {
8943 ret = perf_try_init_event(pmu, event);
8944 if (ret)
8945 pmu = ERR_PTR(ret);
8946 goto unlock;
8947 }
8948
8949 list_for_each_entry_rcu(pmu, &pmus, entry) {
8950 ret = perf_try_init_event(pmu, event);
8951 if (!ret)
8952 goto unlock;
8953
8954 if (ret != -ENOENT) {
8955 pmu = ERR_PTR(ret);
8956 goto unlock;
8957 }
8958 }
8959 pmu = ERR_PTR(-ENOENT);
8960unlock:
8961 srcu_read_unlock(&pmus_srcu, idx);
8962
8963 return pmu;
8964}
8965
8966static void attach_sb_event(struct perf_event *event)
8967{
8968 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
8969
8970 raw_spin_lock(&pel->lock);
8971 list_add_rcu(&event->sb_list, &pel->list);
8972 raw_spin_unlock(&pel->lock);
8973}
8974
8975
8976
8977
8978
8979
8980
8981
8982static void account_pmu_sb_event(struct perf_event *event)
8983{
8984 if (is_sb_event(event))
8985 attach_sb_event(event);
8986}
8987
8988static void account_event_cpu(struct perf_event *event, int cpu)
8989{
8990 if (event->parent)
8991 return;
8992
8993 if (is_cgroup_event(event))
8994 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
8995}
8996
8997
8998static void account_freq_event_nohz(void)
8999{
9000#ifdef CONFIG_NO_HZ_FULL
9001
9002 spin_lock(&nr_freq_lock);
9003 if (atomic_inc_return(&nr_freq_events) == 1)
9004 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
9005 spin_unlock(&nr_freq_lock);
9006#endif
9007}
9008
9009static void account_freq_event(void)
9010{
9011 if (tick_nohz_full_enabled())
9012 account_freq_event_nohz();
9013 else
9014 atomic_inc(&nr_freq_events);
9015}
9016
9017
9018static void account_event(struct perf_event *event)
9019{
9020 bool inc = false;
9021
9022 if (event->parent)
9023 return;
9024
9025 if (event->attach_state & PERF_ATTACH_TASK)
9026 inc = true;
9027 if (event->attr.mmap || event->attr.mmap_data)
9028 atomic_inc(&nr_mmap_events);
9029 if (event->attr.comm)
9030 atomic_inc(&nr_comm_events);
9031 if (event->attr.task)
9032 atomic_inc(&nr_task_events);
9033 if (event->attr.freq)
9034 account_freq_event();
9035 if (event->attr.context_switch) {
9036 atomic_inc(&nr_switch_events);
9037 inc = true;
9038 }
9039 if (has_branch_stack(event))
9040 inc = true;
9041 if (is_cgroup_event(event))
9042 inc = true;
9043
9044 if (inc) {
9045 if (atomic_inc_not_zero(&perf_sched_count))
9046 goto enabled;
9047
9048 mutex_lock(&perf_sched_mutex);
9049 if (!atomic_read(&perf_sched_count)) {
9050 static_branch_enable(&perf_sched_events);
9051
9052
9053
9054
9055
9056 synchronize_sched();
9057 }
9058
9059
9060
9061
9062 atomic_inc(&perf_sched_count);
9063 mutex_unlock(&perf_sched_mutex);
9064 }
9065enabled:
9066
9067 account_event_cpu(event, event->cpu);
9068
9069 account_pmu_sb_event(event);
9070}
9071
9072
9073
9074
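/*
 * Allocate and initialize an event structure; returns an ERR_PTR() on error.
 */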
9075static struct perf_event *
9076perf_event_alloc(struct perf_event_attr *attr, int cpu,
9077 struct task_struct *task,
9078 struct perf_event *group_leader,
9079 struct perf_event *parent_event,
9080 perf_overflow_handler_t overflow_handler,
9081 void *context, int cgroup_fd)
9082{
9083 struct pmu *pmu;
9084 struct perf_event *event;
9085 struct hw_perf_event *hwc;
9086 long err = -EINVAL;
9087
9088 if ((unsigned)cpu >= nr_cpu_ids) {
9089 if (!task || cpu != -1)
9090 return ERR_PTR(-EINVAL);
9091 }
9092
9093 event = kzalloc(sizeof(*event), GFP_KERNEL);
9094 if (!event)
9095 return ERR_PTR(-ENOMEM);
9096
9097
9098
9099
9100
9101 if (!group_leader)
9102 group_leader = event;
9103
9104 mutex_init(&event->child_mutex);
9105 INIT_LIST_HEAD(&event->child_list);
9106
9107 INIT_LIST_HEAD(&event->group_entry);
9108 INIT_LIST_HEAD(&event->event_entry);
9109 INIT_LIST_HEAD(&event->sibling_list);
9110 INIT_LIST_HEAD(&event->rb_entry);
9111 INIT_LIST_HEAD(&event->active_entry);
9112 INIT_LIST_HEAD(&event->addr_filters.list);
9113 INIT_HLIST_NODE(&event->hlist_entry);
9114
9115
9116 init_waitqueue_head(&event->waitq);
9117 init_irq_work(&event->pending, perf_pending_event);
9118
9119 mutex_init(&event->mmap_mutex);
9120 raw_spin_lock_init(&event->addr_filters.lock);
9121
9122 atomic_long_set(&event->refcount, 1);
9123 event->cpu = cpu;
9124 event->attr = *attr;
9125 event->group_leader = group_leader;
9126 event->pmu = NULL;
9127 event->oncpu = -1;
9128
9129 event->parent = parent_event;
9130
9131 event->ns = get_pid_ns(task_active_pid_ns(current));
9132 event->id = atomic64_inc_return(&perf_event_id);
9133
9134 event->state = PERF_EVENT_STATE_INACTIVE;
9135
9136 if (task) {
9137 event->attach_state = PERF_ATTACH_TASK;
9138
9139
9140
9141
9142
9143 event->hw.target = task;
9144 }
9145
9146 event->clock = &local_clock;
9147 if (parent_event)
9148 event->clock = parent_event->clock;
9149
9150 if (!overflow_handler && parent_event) {
9151 overflow_handler = parent_event->overflow_handler;
9152 context = parent_event->overflow_handler_context;
9153#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
9154 if (overflow_handler == bpf_overflow_handler) {
9155 struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
9156
9157 if (IS_ERR(prog)) {
9158 err = PTR_ERR(prog);
9159 goto err_ns;
9160 }
9161 event->prog = prog;
9162 event->orig_overflow_handler =
9163 parent_event->orig_overflow_handler;
9164 }
9165#endif
9166 }
9167
9168 if (overflow_handler) {
9169 event->overflow_handler = overflow_handler;
9170 event->overflow_handler_context = context;
9171 } else if (is_write_backward(event)) {
9172 event->overflow_handler = perf_event_output_backward;
9173 event->overflow_handler_context = NULL;
9174 } else {
9175 event->overflow_handler = perf_event_output_forward;
9176 event->overflow_handler_context = NULL;
9177 }
9178
9179 perf_event__state_init(event);
9180
9181 pmu = NULL;
9182
9183 hwc = &event->hw;
9184 hwc->sample_period = attr->sample_period;
9185 if (attr->freq && attr->sample_freq)
9186 hwc->sample_period = 1;
9187 hwc->last_period = hwc->sample_period;
9188
9189 local64_set(&hwc->period_left, hwc->sample_period);
9190
9191
9192
9193
9194 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
9195 goto err_ns;
9196
9197 if (!has_branch_stack(event))
9198 event->attr.branch_sample_type = 0;
9199
9200 if (cgroup_fd != -1) {
9201 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
9202 if (err)
9203 goto err_ns;
9204 }
9205
9206 pmu = perf_init_event(event);
9207 if (!pmu)
9208 goto err_ns;
9209 else if (IS_ERR(pmu)) {
9210 err = PTR_ERR(pmu);
9211 goto err_ns;
9212 }
9213
9214 err = exclusive_event_init(event);
9215 if (err)
9216 goto err_pmu;
9217
9218 if (has_addr_filter(event)) {
9219 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
9220 sizeof(unsigned long),
9221 GFP_KERNEL);
9222 if (!event->addr_filters_offs)
9223 goto err_per_task;
9224
9225
9226 event->addr_filters_gen = 1;
9227 }
9228
9229 if (!event->parent) {
9230 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
9231 err = get_callchain_buffers(attr->sample_max_stack);
9232 if (err)
9233 goto err_addr_filters;
9234 }
9235 }
9236
9237
9238 account_event(event);
9239
9240 return event;
9241
9242err_addr_filters:
9243 kfree(event->addr_filters_offs);
9244
9245err_per_task:
9246 exclusive_event_destroy(event);
9247
9248err_pmu:
9249 if (event->destroy)
9250 event->destroy(event);
9251 module_put(pmu->module);
9252err_ns:
9253 if (is_cgroup_event(event))
9254 perf_detach_cgroup(event);
9255 if (event->ns)
9256 put_pid_ns(event->ns);
9257 kfree(event);
9258
9259 return ERR_PTR(err);
9260}
9261
9262static int perf_copy_attr(struct perf_event_attr __user *uattr,
9263 struct perf_event_attr *attr)
9264{
9265 u32 size;
9266 int ret;
9267
9268 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
9269 return -EFAULT;
9270
9271
9272
9273
9274 memset(attr, 0, sizeof(*attr));
9275
9276 ret = get_user(size, &uattr->size);
9277 if (ret)
9278 return ret;
9279
9280 if (size > PAGE_SIZE)
9281 goto err_size;
9282
9283 if (!size)
9284 size = PERF_ATTR_SIZE_VER0;
9285
9286 if (size < PERF_ATTR_SIZE_VER0)
9287 goto err_size;
9288
9289
9290
9291
9292
9293
9294
9295 if (size > sizeof(*attr)) {
9296 unsigned char __user *addr;
9297 unsigned char __user *end;
9298 unsigned char val;
9299
9300 addr = (void __user *)uattr + sizeof(*attr);
9301 end = (void __user *)uattr + size;
9302
9303 for (; addr < end; addr++) {
9304 ret = get_user(val, addr);
9305 if (ret)
9306 return ret;
9307 if (val)
9308 goto err_size;
9309 }
9310 size = sizeof(*attr);
9311 }
9312
9313 ret = copy_from_user(attr, uattr, size);
9314 if (ret)
9315 return -EFAULT;
9316
9317 if (attr->__reserved_1)
9318 return -EINVAL;
9319
9320 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
9321 return -EINVAL;
9322
9323 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
9324 return -EINVAL;
9325
9326 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
9327 u64 mask = attr->branch_sample_type;
9328
9329
9330 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
9331 return -EINVAL;
9332
9333
9334 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
9335 return -EINVAL;
9336
9337
9338 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
9339
9340
9341 if (!attr->exclude_kernel)
9342 mask |= PERF_SAMPLE_BRANCH_KERNEL;
9343
9344 if (!attr->exclude_user)
9345 mask |= PERF_SAMPLE_BRANCH_USER;
9346
9347 if (!attr->exclude_hv)
9348 mask |= PERF_SAMPLE_BRANCH_HV;
9349
9350
9351
9352 attr->branch_sample_type = mask;
9353 }
9354
9355 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
9356 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9357 return -EACCES;
9358 }
9359
9360 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
9361 ret = perf_reg_validate(attr->sample_regs_user);
9362 if (ret)
9363 return ret;
9364 }
9365
9366 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
9367 if (!arch_perf_have_user_stack_dump())
9368 return -ENOSYS;
9369
9370
9371
9372
9373
9374
9375 if (attr->sample_stack_user >= USHRT_MAX)
9376 ret = -EINVAL;
9377 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
9378 ret = -EINVAL;
9379 }
9380
9381 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
9382 ret = perf_reg_validate(attr->sample_regs_intr);
9383out:
9384 return ret;
9385
9386err_size:
9387 put_user(sizeof(*attr), &uattr->size);
9388 ret = -E2BIG;
9389 goto out;
9390}
9391
9392static int
9393perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
9394{
9395 struct ring_buffer *rb = NULL;
9396 int ret = -EINVAL;
9397
9398 if (!output_event)
9399 goto set;
9400
9401
9402 if (event == output_event)
9403 goto out;
9404
9405
9406
9407
9408 if (output_event->cpu != event->cpu)
9409 goto out;
9410
9411
9412
9413
9414 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
9415 goto out;
9416
9417
9418
9419
9420 if (output_event->clock != event->clock)
9421 goto out;
9422
9423
9424
9425
9426
9427 if (is_write_backward(output_event) != is_write_backward(event))
9428 goto out;
9429
9430
9431
9432
9433 if (has_aux(event) && has_aux(output_event) &&
9434 event->pmu != output_event->pmu)
9435 goto out;
9436
9437set:
9438 mutex_lock(&event->mmap_mutex);
9439
9440 if (atomic_read(&event->mmap_count))
9441 goto unlock;
9442
9443 if (output_event) {
9444
9445 rb = ring_buffer_get(output_event);
9446 if (!rb)
9447 goto unlock;
9448 }
9449
9450 ring_buffer_attach(event, rb);
9451
9452 ret = 0;
9453unlock:
9454 mutex_unlock(&event->mmap_mutex);
9455
9456out:
9457 return ret;
9458}
9459
9460static void mutex_lock_double(struct mutex *a, struct mutex *b)
9461{
9462 if (b < a)
9463 swap(a, b);
9464
9465 mutex_lock(a);
9466 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
9467}
9468
9469static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
9470{
9471 bool nmi_safe = false;
9472
9473 switch (clk_id) {
9474 case CLOCK_MONOTONIC:
9475 event->clock = &ktime_get_mono_fast_ns;
9476 nmi_safe = true;
9477 break;
9478
9479 case CLOCK_MONOTONIC_RAW:
9480 event->clock = &ktime_get_raw_fast_ns;
9481 nmi_safe = true;
9482 break;
9483
9484 case CLOCK_REALTIME:
9485 event->clock = &ktime_get_real_ns;
9486 break;
9487
9488 case CLOCK_BOOTTIME:
9489 event->clock = &ktime_get_boot_ns;
9490 break;
9491
9492 case CLOCK_TAI:
9493 event->clock = &ktime_get_tai_ns;
9494 break;
9495
9496 default:
9497 return -EINVAL;
9498 }
9499
9500 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
9501 return -EINVAL;
9502
9503 return 0;
9504}
9505
9506
9507
9508
9509
9510
9511
9512
9513
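/*
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:	target pid
 * @cpu:	target cpu
 * @group_fd:	group leader event fd
 * @flags:	PERF_FLAG_* modifiers
 */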
9514SYSCALL_DEFINE5(perf_event_open,
9515 struct perf_event_attr __user *, attr_uptr,
9516 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
9517{
9518 struct perf_event *group_leader = NULL, *output_event = NULL;
9519 struct perf_event *event, *sibling;
9520 struct perf_event_attr attr;
9521 struct perf_event_context *ctx, *uninitialized_var(gctx);
9522 struct file *event_file = NULL;
9523 struct fd group = {NULL, 0};
9524 struct task_struct *task = NULL;
9525 struct pmu *pmu;
9526 int event_fd;
9527 int move_group = 0;
9528 int err;
9529 int f_flags = O_RDWR;
9530 int cgroup_fd = -1;
9531
9532
9533 if (flags & ~PERF_FLAG_ALL)
9534 return -EINVAL;
9535
9536 err = perf_copy_attr(attr_uptr, &attr);
9537 if (err)
9538 return err;
9539
9540 if (!attr.exclude_kernel) {
9541 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9542 return -EACCES;
9543 }
9544
9545 if (attr.freq) {
9546 if (attr.sample_freq > sysctl_perf_event_sample_rate)
9547 return -EINVAL;
9548 } else {
9549 if (attr.sample_period & (1ULL << 63))
9550 return -EINVAL;
9551 }
9552
9553 if (!attr.sample_max_stack)
9554 attr.sample_max_stack = sysctl_perf_event_max_stack;
9555
9556
9557
9558
9559
9560
9561
9562 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
9563 return -EINVAL;
9564
9565 if (flags & PERF_FLAG_FD_CLOEXEC)
9566 f_flags |= O_CLOEXEC;
9567
9568 event_fd = get_unused_fd_flags(f_flags);
9569 if (event_fd < 0)
9570 return event_fd;
9571
9572 if (group_fd != -1) {
9573 err = perf_fget_light(group_fd, &group);
9574 if (err)
9575 goto err_fd;
9576 group_leader = group.file->private_data;
9577 if (flags & PERF_FLAG_FD_OUTPUT)
9578 output_event = group_leader;
9579 if (flags & PERF_FLAG_FD_NO_GROUP)
9580 group_leader = NULL;
9581 }
9582
9583 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
9584 task = find_lively_task_by_vpid(pid);
9585 if (IS_ERR(task)) {
9586 err = PTR_ERR(task);
9587 goto err_group_fd;
9588 }
9589 }
9590
9591 if (task && group_leader &&
9592 group_leader->attr.inherit != attr.inherit) {
9593 err = -EINVAL;
9594 goto err_task;
9595 }
9596
9597 get_online_cpus();
9598
9599 if (task) {
9600 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
9601 if (err)
9602 goto err_cpus;
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612 err = -EACCES;
9613 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
9614 goto err_cred;
9615 }
9616
9617 if (flags & PERF_FLAG_PID_CGROUP)
9618 cgroup_fd = pid;
9619
9620 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
9621 NULL, NULL, cgroup_fd);
9622 if (IS_ERR(event)) {
9623 err = PTR_ERR(event);
9624 goto err_cred;
9625 }
9626
9627 if (is_sampling_event(event)) {
9628 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
9629 err = -EOPNOTSUPP;
9630 goto err_alloc;
9631 }
9632 }
9633
9634
9635
9636
9637
9638 pmu = event->pmu;
9639
9640 if (attr.use_clockid) {
9641 err = perf_event_set_clock(event, attr.clockid);
9642 if (err)
9643 goto err_alloc;
9644 }
9645
9646 if (pmu->task_ctx_nr == perf_sw_context)
9647 event->event_caps |= PERF_EV_CAP_SOFTWARE;
9648
9649 if (group_leader &&
9650 (is_software_event(event) != is_software_event(group_leader))) {
9651 if (is_software_event(event)) {
9652
9653
9654
9655
9656
9657
9658
9659
9660 pmu = group_leader->pmu;
9661 } else if (is_software_event(group_leader) &&
9662 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
9663
9664
9665
9666
9667
9668 move_group = 1;
9669 }
9670 }
9671
9672
9673
9674
9675 ctx = find_get_context(pmu, task, event);
9676 if (IS_ERR(ctx)) {
9677 err = PTR_ERR(ctx);
9678 goto err_alloc;
9679 }
9680
9681 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
9682 err = -EBUSY;
9683 goto err_context;
9684 }
9685
9686
9687
9688
9689 if (group_leader) {
9690 err = -EINVAL;
9691
9692
9693
9694
9695
9696 if (group_leader->group_leader != group_leader)
9697 goto err_context;
9698
9699
9700 if (group_leader->clock != event->clock)
9701 goto err_context;
9702
9703
9704
9705
9706
9707 if (move_group) {
9708
9709
9710
9711
9712 if (group_leader->ctx->task != ctx->task)
9713 goto err_context;
9714
9715
9716
9717
9718
9719
9720 if (group_leader->cpu != event->cpu)
9721 goto err_context;
9722 } else {
9723 if (group_leader->ctx != ctx)
9724 goto err_context;
9725 }
9726
9727
9728
9729
9730 if (attr.exclusive || attr.pinned)
9731 goto err_context;
9732 }
9733
9734 if (output_event) {
9735 err = perf_event_set_output(event, output_event);
9736 if (err)
9737 goto err_context;
9738 }
9739
9740 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
9741 f_flags);
9742 if (IS_ERR(event_file)) {
9743 err = PTR_ERR(event_file);
9744 event_file = NULL;
9745 goto err_context;
9746 }
9747
9748 if (move_group) {
9749 gctx = group_leader->ctx;
9750 mutex_lock_double(&gctx->mutex, &ctx->mutex);
9751 if (gctx->task == TASK_TOMBSTONE) {
9752 err = -ESRCH;
9753 goto err_locked;
9754 }
9755 } else {
9756 mutex_lock(&ctx->mutex);
9757 }
9758
9759 if (ctx->task == TASK_TOMBSTONE) {
9760 err = -ESRCH;
9761 goto err_locked;
9762 }
9763
9764 if (!perf_event_validate_size(event)) {
9765 err = -E2BIG;
9766 goto err_locked;
9767 }
9768
9769
9770
9771
9772
9773 if (!exclusive_event_installable(event, ctx)) {
9774
9775 WARN_ON_ONCE(move_group);
9776
9777 err = -EBUSY;
9778 goto err_locked;
9779 }
9780
9781 WARN_ON_ONCE(ctx->parent_ctx);
9782
9783
9784
9785
9786
9787
9788 if (move_group) {
9789
9790
9791
9792
9793 perf_remove_from_context(group_leader, 0);
9794
9795 list_for_each_entry(sibling, &group_leader->sibling_list,
9796 group_entry) {
9797 perf_remove_from_context(sibling, 0);
9798 put_ctx(gctx);
9799 }
9800
9801
9802
9803
9804
9805 synchronize_rcu();
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817 list_for_each_entry(sibling, &group_leader->sibling_list,
9818 group_entry) {
9819 perf_event__state_init(sibling);
9820 perf_install_in_context(ctx, sibling, sibling->cpu);
9821 get_ctx(ctx);
9822 }
9823
9824
9825
9826
9827
9828
9829 perf_event__state_init(group_leader);
9830 perf_install_in_context(ctx, group_leader, group_leader->cpu);
9831 get_ctx(ctx);
9832
9833
9834
9835
9836
9837
9838 put_ctx(gctx);
9839 }
9840
9841
9842
9843
9844
9845
9846
9847 perf_event__header_size(event);
9848 perf_event__id_header_size(event);
9849
9850 event->owner = current;
9851
9852 perf_install_in_context(ctx, event, event->cpu);
9853 perf_unpin_context(ctx);
9854
9855 if (move_group)
9856 mutex_unlock(&gctx->mutex);
9857 mutex_unlock(&ctx->mutex);
9858
9859 if (task) {
9860 mutex_unlock(&task->signal->cred_guard_mutex);
9861 put_task_struct(task);
9862 }
9863
9864 put_online_cpus();
9865
9866 mutex_lock(&current->perf_event_mutex);
9867 list_add_tail(&event->owner_entry, &current->perf_event_list);
9868 mutex_unlock(&current->perf_event_mutex);
9869
9870
9871
9872
9873
9874
9875
9876 fdput(group);
9877 fd_install(event_fd, event_file);
9878 return event_fd;
9879
9880err_locked:
9881 if (move_group)
9882 mutex_unlock(&gctx->mutex);
9883 mutex_unlock(&ctx->mutex);
9884
9885 fput(event_file);
9886err_context:
9887 perf_unpin_context(ctx);
9888 put_ctx(ctx);
9889err_alloc:
9890
9891
9892
9893
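	/*
	 * If event_file was created, the fput() above drops the last file
	 * reference and its ->release() frees the event; only free it here
	 * when no file exists yet.
	 */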
9894 if (!event_file)
9895 free_event(event);
9896err_cred:
9897 if (task)
9898 mutex_unlock(&task->signal->cred_guard_mutex);
9899err_cpus:
9900 put_online_cpus();
9901err_task:
9902 if (task)
9903 put_task_struct(task);
9904err_group_fd:
9905 fdput(group);
9906err_fd:
9907 put_unused_fd(event_fd);
9908 return err;
9909}
9910
9911
9912
9913
9914
9915
9916
9917
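/*
 * perf_event_create_kernel_counter - create and install an event for
 * in-kernel use
 *
 * @attr:	attributes of the counter to create
 * @cpu:	cpu on which the counter is bound
 * @task:	task to profile (NULL for per-cpu)
 * @overflow_handler:	callback to trigger when the event overflows
 * @context:	context data passed to the overflow handler
 */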
9918struct perf_event *
9919perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
9920 struct task_struct *task,
9921 perf_overflow_handler_t overflow_handler,
9922 void *context)
9923{
9924 struct perf_event_context *ctx;
9925 struct perf_event *event;
9926 int err;
9927
9928
9929
9930
9931
9932 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
9933 overflow_handler, context, -1);
9934 if (IS_ERR(event)) {
9935 err = PTR_ERR(event);
9936 goto err;
9937 }
9938
9939
9940 event->owner = TASK_TOMBSTONE;
9941
9942 ctx = find_get_context(event->pmu, task, event);
9943 if (IS_ERR(ctx)) {
9944 err = PTR_ERR(ctx);
9945 goto err_free;
9946 }
9947
9948 WARN_ON_ONCE(ctx->parent_ctx);
9949 mutex_lock(&ctx->mutex);
9950 if (ctx->task == TASK_TOMBSTONE) {
9951 err = -ESRCH;
9952 goto err_unlock;
9953 }
9954
9955 if (!exclusive_event_installable(event, ctx)) {
9956 err = -EBUSY;
9957 goto err_unlock;
9958 }
9959
9960 perf_install_in_context(ctx, event, cpu);
9961 perf_unpin_context(ctx);
9962 mutex_unlock(&ctx->mutex);
9963
9964 return event;
9965
9966err_unlock:
9967 mutex_unlock(&ctx->mutex);
9968 perf_unpin_context(ctx);
9969 put_ctx(ctx);
9970err_free:
9971 free_event(event);
9972err:
9973 return ERR_PTR(err);
9974}
9975EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
9976
9977void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
9978{
9979 struct perf_event_context *src_ctx;
9980 struct perf_event_context *dst_ctx;
9981 struct perf_event *event, *tmp;
9982 LIST_HEAD(events);
9983
9984 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
9985 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
9986
9987
9988
9989
9990
9991 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
9992 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
9993 event_entry) {
9994 perf_remove_from_context(event, 0);
9995 unaccount_event_cpu(event, src_cpu);
9996 put_ctx(src_ctx);
9997 list_add(&event->migrate_entry, &events);
9998 }
9999
10000
10001
10002
10003 synchronize_rcu();
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10014 if (event->group_leader == event)
10015 continue;
10016
10017 list_del(&event->migrate_entry);
10018 if (event->state >= PERF_EVENT_STATE_OFF)
10019 event->state = PERF_EVENT_STATE_INACTIVE;
10020 account_event_cpu(event, dst_cpu);
10021 perf_install_in_context(dst_ctx, event, dst_cpu);
10022 get_ctx(dst_ctx);
10023 }
10024
10025
10026
10027
10028
10029 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10030 list_del(&event->migrate_entry);
10031 if (event->state >= PERF_EVENT_STATE_OFF)
10032 event->state = PERF_EVENT_STATE_INACTIVE;
10033 account_event_cpu(event, dst_cpu);
10034 perf_install_in_context(dst_ctx, event, dst_cpu);
10035 get_ctx(dst_ctx);
10036 }
10037 mutex_unlock(&dst_ctx->mutex);
10038 mutex_unlock(&src_ctx->mutex);
10039}
10040EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
10041
10042static void sync_child_event(struct perf_event *child_event,
10043 struct task_struct *child)
10044{
10045 struct perf_event *parent_event = child_event->parent;
10046 u64 child_val;
10047
10048 if (child_event->attr.inherit_stat)
10049 perf_event_read_event(child_event, child);
10050
10051 child_val = perf_event_count(child_event);
10052
10053
10054
10055
10056 atomic64_add(child_val, &parent_event->child_count);
10057 atomic64_add(child_event->total_time_enabled,
10058 &parent_event->child_total_time_enabled);
10059 atomic64_add(child_event->total_time_running,
10060 &parent_event->child_total_time_running);
10061}
10062
10063static void
10064perf_event_exit_event(struct perf_event *child_event,
10065 struct perf_event_context *child_ctx,
10066 struct task_struct *child)
10067{
10068 struct perf_event *parent_event = child_event->parent;
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082 raw_spin_lock_irq(&child_ctx->lock);
10083 WARN_ON_ONCE(child_ctx->is_active);
10084
10085 if (parent_event)
10086 perf_group_detach(child_event);
10087 list_del_event(child_event, child_ctx);
10088 child_event->state = PERF_EVENT_STATE_EXIT;
10089 raw_spin_unlock_irq(&child_ctx->lock);
10090
10091
10092
10093
10094 if (!parent_event) {
10095 perf_event_wakeup(child_event);
10096 return;
10097 }
10098
10099
10100
10101
10102 sync_child_event(child_event, child);
10103
10104
10105
10106
10107 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
10108 mutex_lock(&parent_event->child_mutex);
10109 list_del_init(&child_event->child_list);
10110 mutex_unlock(&parent_event->child_mutex);
10111
10112
10113
10114
10115 perf_event_wakeup(parent_event);
10116 free_event(child_event);
10117 put_event(parent_event);
10118}
10119
10120static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
10121{
10122 struct perf_event_context *child_ctx, *clone_ctx = NULL;
10123 struct perf_event *child_event, *next;
10124
10125 WARN_ON_ONCE(child != current);
10126
10127 child_ctx = perf_pin_task_context(child, ctxn);
10128 if (!child_ctx)
10129 return;
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141 mutex_lock(&child_ctx->mutex);
10142
10143
10144
10145
10146
10147
10148 raw_spin_lock_irq(&child_ctx->lock);
10149 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
10150
10151
10152
10153
10154
10155 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
10156 put_ctx(child_ctx);
10157 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
10158 put_task_struct(current);
10159
10160 clone_ctx = unclone_ctx(child_ctx);
10161 raw_spin_unlock_irq(&child_ctx->lock);
10162
10163 if (clone_ctx)
10164 put_ctx(clone_ctx);
10165
10166
10167
10168
10169
10170
10171 perf_event_task(child, child_ctx, 0);
10172
10173 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
10174 perf_event_exit_event(child_event, child_ctx, child);
10175
10176 mutex_unlock(&child_ctx->mutex);
10177
10178 put_ctx(child_ctx);
10179}
10180
10181
10182
10183
10184
10185
10186
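/*
 * Called from the task exit path: drop ownership of all events created by
 * @child and tear down its perf contexts, feeding counts back to the parent
 * events.
 */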
10187void perf_event_exit_task(struct task_struct *child)
10188{
10189 struct perf_event *event, *tmp;
10190 int ctxn;
10191
10192 mutex_lock(&child->perf_event_mutex);
10193 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
10194 owner_entry) {
10195 list_del_init(&event->owner_entry);
10196
10197
10198
10199
10200
10201
10202 smp_store_release(&event->owner, NULL);
10203 }
10204 mutex_unlock(&child->perf_event_mutex);
10205
10206 for_each_task_context_nr(ctxn)
10207 perf_event_exit_task_context(child, ctxn);
10208
10209
10210
10211
10212
10213
10214
10215 perf_event_task(child, NULL, 0);
10216}
10217
10218static void perf_free_event(struct perf_event *event,
10219 struct perf_event_context *ctx)
10220{
10221 struct perf_event *parent = event->parent;
10222
10223 if (WARN_ON_ONCE(!parent))
10224 return;
10225
10226 mutex_lock(&parent->child_mutex);
10227 list_del_init(&event->child_list);
10228 mutex_unlock(&parent->child_mutex);
10229
10230 put_event(parent);
10231
10232 raw_spin_lock_irq(&ctx->lock);
10233 perf_group_detach(event);
10234 list_del_event(event, ctx);
10235 raw_spin_unlock_irq(&ctx->lock);
10236 free_event(event);
10237}
10238
10239
10240
10241
10242
10243
10244
10245
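/*
 * Tear down all events and contexts of @task that were created by
 * inheritance; used, among others, on the error path of
 * perf_event_init_task() below.
 */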
10246void perf_event_free_task(struct task_struct *task)
10247{
10248 struct perf_event_context *ctx;
10249 struct perf_event *event, *tmp;
10250 int ctxn;
10251
10252 for_each_task_context_nr(ctxn) {
10253 ctx = task->perf_event_ctxp[ctxn];
10254 if (!ctx)
10255 continue;
10256
10257 mutex_lock(&ctx->mutex);
10258again:
10259 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
10260 group_entry)
10261 perf_free_event(event, ctx);
10262
10263 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
10264 group_entry)
10265 perf_free_event(event, ctx);
10266
10267 if (!list_empty(&ctx->pinned_groups) ||
10268 !list_empty(&ctx->flexible_groups))
10269 goto again;
10270
10271 mutex_unlock(&ctx->mutex);
10272
10273 put_ctx(ctx);
10274 }
10275}
10276
10277void perf_event_delayed_put(struct task_struct *task)
10278{
10279 int ctxn;
10280
10281 for_each_task_context_nr(ctxn)
10282 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
10283}
10284
10285struct file *perf_event_get(unsigned int fd)
10286{
10287 struct file *file;
10288
10289 file = fget_raw(fd);
10290 if (!file)
10291 return ERR_PTR(-EBADF);
10292
10293 if (file->f_op != &perf_fops) {
10294 fput(file);
10295 return ERR_PTR(-EBADF);
10296 }
10297
10298 return file;
10299}
10300
10301const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
10302{
10303 if (!event)
10304 return ERR_PTR(-EINVAL);
10305
10306 return &event->attr;
10307}
10308
10309
10310
10311
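/*
 * Inherit an event from parent task to child task.
 *
 * Inheritance is always done from the topmost ancestor: if the parent event
 * is itself a clone, the child is attached to the original event instead, so
 * all children hang off one parent.
 */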
10312static struct perf_event *
10313inherit_event(struct perf_event *parent_event,
10314 struct task_struct *parent,
10315 struct perf_event_context *parent_ctx,
10316 struct task_struct *child,
10317 struct perf_event *group_leader,
10318 struct perf_event_context *child_ctx)
10319{
10320 enum perf_event_active_state parent_state = parent_event->state;
10321 struct perf_event *child_event;
10322 unsigned long flags;
10323
10324
10325
10326
10327
10328
10329
10330 if (parent_event->parent)
10331 parent_event = parent_event->parent;
10332
10333 child_event = perf_event_alloc(&parent_event->attr,
10334 parent_event->cpu,
10335 child,
10336 group_leader, parent_event,
10337 NULL, NULL, -1);
10338 if (IS_ERR(child_event))
10339 return child_event;
10340
10341
10342
10343
10344
10345
10346
10347 mutex_lock(&parent_event->child_mutex);
10348 if (is_orphaned_event(parent_event) ||
10349 !atomic_long_inc_not_zero(&parent_event->refcount)) {
10350 mutex_unlock(&parent_event->child_mutex);
10351 free_event(child_event);
10352 return NULL;
10353 }
10354
10355 get_ctx(child_ctx);
10356
10357
10358
10359
10360
10361
10362 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
10363 child_event->state = PERF_EVENT_STATE_INACTIVE;
10364 else
10365 child_event->state = PERF_EVENT_STATE_OFF;
10366
10367 if (parent_event->attr.freq) {
10368 u64 sample_period = parent_event->hw.sample_period;
10369 struct hw_perf_event *hwc = &child_event->hw;
10370
10371 hwc->sample_period = sample_period;
10372 hwc->last_period = sample_period;
10373
10374 local64_set(&hwc->period_left, sample_period);
10375 }
10376
10377 child_event->ctx = child_ctx;
10378 child_event->overflow_handler = parent_event->overflow_handler;
10379 child_event->overflow_handler_context
10380 = parent_event->overflow_handler_context;
10381
10382
10383
10384
10385 perf_event__header_size(child_event);
10386 perf_event__id_header_size(child_event);
10387
10388
10389
10390
10391 raw_spin_lock_irqsave(&child_ctx->lock, flags);
10392 add_event_to_ctx(child_event, child_ctx);
10393 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
10394
10395
10396
10397
10398 list_add_tail(&child_event->child_list, &parent_event->child_list);
10399 mutex_unlock(&parent_event->child_mutex);
10400
10401 return child_event;
10402}
10403
10404static int inherit_group(struct perf_event *parent_event,
10405 struct task_struct *parent,
10406 struct perf_event_context *parent_ctx,
10407 struct task_struct *child,
10408 struct perf_event_context *child_ctx)
10409{
10410 struct perf_event *leader;
10411 struct perf_event *sub;
10412 struct perf_event *child_ctr;
10413
10414 leader = inherit_event(parent_event, parent, parent_ctx,
10415 child, NULL, child_ctx);
10416 if (IS_ERR(leader))
10417 return PTR_ERR(leader);
10418 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
10419 child_ctr = inherit_event(sub, parent, parent_ctx,
10420 child, leader, child_ctx);
10421 if (IS_ERR(child_ctr))
10422 return PTR_ERR(child_ctr);
10423 }
10424 return 0;
10425}
10426
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child, int ctxn,
		   int *inherited_all)
{
	int ret;
	struct perf_event_context *child_ctx;

	if (!event->attr.inherit) {
		*inherited_all = 0;
		return 0;
	}

	child_ctx = child->perf_event_ctxp[ctxn];
	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
		if (!child_ctx)
			return -ENOMEM;

		child->perf_event_ctxp[ctxn] = child_ctx;
	}

	ret = inherit_group(event, parent, parent_ctx,
			    child, child_ctx);

	if (ret)
		*inherited_all = 0;

	return ret;
}

/*
 * Initialize one perf_event context (ctxn) in the child's task_struct by
 * inheriting from the parent's (current's) context.
 */
static int perf_event_init_context(struct task_struct *child, int ctxn)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	unsigned long flags;
	int ret = 0;

	if (likely(!parent->perf_event_ctxp[ctxn]))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent, ctxn);
	if (!parent_ctx)
		return 0;

	/*
	 * No need to re-check parent_ctx != NULL below; since we saw
	 * it non-NULL here, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			break;
	}

	/*
	 * We can't hold ctx->lock when iterating the ->flexible_groups list
	 * due to allocations, but we need to prevent rotation because
	 * rotate_ctx() will change the list from interrupt context.
	 */
	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 1;
	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			break;
	}

	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 0;

	child_ctx = child->perf_event_ctxp[ctxn];

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 *
		 * Note that if the parent is a clone, the same context
		 * is shared with its parent (which may itself be shared
		 * with a grandparent).
		 */
		cloned_ctx = parent_ctx->parent_ctx;
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);
	put_ctx(parent_ctx);

	return ret;
}

/*
 * Initialize the perf_event state in task_struct at fork time.
 */
int perf_event_init_task(struct task_struct *child)
{
	int ctxn, ret;

	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	for_each_task_context_nr(ctxn) {
		ret = perf_event_init_context(child, ctxn);
		if (ret) {
			perf_event_free_task(child);
			return ret;
		}
	}

	return 0;
}

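/*
 * Early boot: set up the per-CPU state (software event hash, context and
 * side-band event lists) for every possible CPU.
 */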
static void __init perf_event_init_all_cpus(void)
{
	struct swevent_htable *swhash;
	int cpu;

	for_each_possible_cpu(cpu) {
		swhash = &per_cpu(swevent_htable, cpu);
		mutex_init(&swhash->hlist_mutex);
		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));

		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
	}
}

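/*
 * CPU hotplug "online" callback: if software events are already in use,
 * (re)allocate this CPU's swevent hash list.
 */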
int perf_event_init_cpu(unsigned int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
		struct swevent_hlist *hlist;

		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
		WARN_ON(!hlist);
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
	}
	mutex_unlock(&swhash->hlist_mutex);
	return 0;
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
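/*
 * Runs on the outgoing CPU via smp_call_function_single(): detach every
 * event from this CPU's context so the CPU can go away.
 */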
static void __perf_event_exit_context(void *__info)
{
	struct perf_event_context *ctx = __info;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event *event;

	raw_spin_lock(&ctx->lock);
	list_for_each_entry(event, &ctx->event_list, event_entry)
		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
	raw_spin_unlock(&ctx->lock);
}

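/*
 * For each registered PMU, clear this CPU's context by running
 * __perf_event_exit_context() on the CPU itself.
 */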
static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_event_context *ctx;
	struct pmu *pmu;
	int idx;

	idx = srcu_read_lock(&pmus_srcu);
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;

		mutex_lock(&ctx->mutex);
		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
		mutex_unlock(&ctx->mutex);
	}
	srcu_read_unlock(&pmus_srcu, idx);
}
#else

static void perf_event_exit_cpu_context(int cpu) { }

#endif

int perf_event_exit_cpu(unsigned int cpu)
{
	perf_event_exit_cpu_context(cpu);
	return 0;
}

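/*
 * Reboot/kexec notifier: quiesce perf by tearing down the events on
 * every online CPU before the machine goes down.
 */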
static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
	int cpu;

	for_each_online_cpu(cpu)
		perf_event_exit_cpu(cpu);

	return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
	.notifier_call = perf_reboot,
	.priority = INT_MIN,
};

void __init perf_event_init(void)
{
	int ret;

	idr_init(&pmu_idr);

	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
	perf_pmu_register(&perf_cpu_clock, NULL, -1);
	perf_pmu_register(&perf_task_clock, NULL, -1);
	perf_tp_register();
	perf_event_init_cpu(smp_processor_id());
	register_reboot_notifier(&perf_reboot_notifier);

	ret = init_hw_breakpoint();
	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

	/*
	 * Build time assertion that we keep the data_head at the intended
	 * location.  IOW, validation we got the __reserved[] size right.
	 */
	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
		     != 1024);
}

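/*
 * sysfs "show" helper for PMU event attributes that carry a precomputed
 * event string.
 */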
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s\n", pmu_attr->event_str);

	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);

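/*
 * Late init: register the PMU bus and create sysfs devices for every PMU
 * that was registered before the bus existed.
 */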
static int __init perf_event_sysfs_init(void)
{
	struct pmu *pmu;
	int ret;

	mutex_lock(&pmus_lock);

	ret = bus_register(&pmu_bus);
	if (ret)
		goto unlock;

	list_for_each_entry(pmu, &pmus, entry) {
		if (!pmu->name || pmu->type < 0)
			continue;

		ret = pmu_dev_alloc(pmu);
		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
	}
	pmu_bus_running = 1;
	ret = 0;

unlock:
	mutex_unlock(&pmus_lock);

	return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct perf_cgroup *jc;

	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
	if (!jc)
		return ERR_PTR(-ENOMEM);

	jc->info = alloc_percpu(struct perf_cgroup_info);
	if (!jc->info) {
		kfree(jc);
		return ERR_PTR(-ENOMEM);
	}

	return &jc->css;
}

static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);

	free_percpu(jc->info);
	kfree(jc);
}

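/*
 * Runs on the task's CPU via task_function_call(): switch the task's
 * cgroup events out and back in so it starts monitoring under its new
 * cgroup.
 */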
static int __perf_cgroup_move(void *info)
{
	struct task_struct *task = info;

	rcu_read_lock();
	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
	rcu_read_unlock();
	return 0;
}

static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		task_function_call(task, __perf_cgroup_move, task);
}

struct cgroup_subsys perf_event_cgrp_subsys = {
	.css_alloc = perf_cgroup_css_alloc,
	.css_free = perf_cgroup_css_free,
	.attach = perf_cgroup_attach,
};
#endif