/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>

#include "internal.h"

#include <asm/irq_regs.h>

static struct workqueue_struct *perf_wq;

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

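/*
 * Invoked via smp_call_function_single(); runs tfc->func on the current CPU.
 * If a target task was specified, only run the function while that task is
 * the current task on this CPU, otherwise report -EAGAIN so the caller can
 * retry.
 */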
static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		tfc->ret = -EAGAIN;
		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.
 *
 * returns: @func return value, or
 *	    -ESRCH  - when the process isn't running
 *	    -EAGAIN - when the process moved away
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -ESRCH, /* No such (running) process */
	};

	if (task_curr(p))
		smp_call_function_single(task_cpu(p), remote_function, &data, 1);

	return data.ret;
}

/**
 * cpu_function_call - call a function on the cpu
 * @cpu:	target cpu to queue this function on
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

#define EVENT_OWNER_KERNEL ((void *) -1)

static bool is_kernel_event(struct perf_event *event)
{
	return event->owner == EVENT_OWNER_KERNEL;
}

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
		       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
	(PERF_SAMPLE_BRANCH_KERNEL |\
	 PERF_SAMPLE_BRANCH_HV)

enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

/* Minimum for 512 kiB + 1 field */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;

static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;

static int perf_sample_allowed_ns __read_mostly =
	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

static void update_perf_cpu_limits(void)
{
	u64 tmp = perf_sample_period_ns;

	tmp *= sysctl_perf_cpu_time_max_percent;
	do_div(tmp, 100);
	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
}

static int perf_rotate_context(struct perf_cpu_context *cpuctx);

int perf_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
	update_perf_cpu_limits();

	return 0;
}

int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp,
				loff_t *ppos)
{
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	update_perf_cpu_limits();

	return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);

static void perf_duration_warn(struct irq_work *w)
{
	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
	u64 avg_local_sample_len;
	u64 local_samples_len;

	local_samples_len = __this_cpu_read(running_sample_length);
	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;

	printk_ratelimited(KERN_WARNING
			"perf interrupt took too long (%lld > %lld), lowering "
			"kernel.perf_event_max_sample_rate to %d\n",
			avg_local_sample_len, allowed_ns >> 1,
			sysctl_perf_event_sample_rate);
}

static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

void perf_sample_event_took(u64 sample_len_ns)
{
	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
	u64 avg_local_sample_len;
	u64 local_samples_len;

	if (allowed_ns == 0)
		return;

	/* decay the counter by 1 average sample */
	local_samples_len = __this_cpu_read(running_sample_length);
	local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
	local_samples_len += sample_len_ns;
	__this_cpu_write(running_sample_length, local_samples_len);

	/*
	 * note: this will be biased artificially low until we have
	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
	 * from having to maintain a count.
	 */
	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;

	if (avg_local_sample_len <= allowed_ns)
		return;

	if (max_samples_per_tick <= 1)
		return;

	max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

	update_perf_cpu_limits();

	if (!irq_work_queue(&perf_duration_work)) {
		early_printk("perf interrupt took too long (%lld > %lld), lowering "
			     "kernel.perf_event_max_sample_rate to %d\n",
			     avg_local_sample_len, allowed_ns >> 1,
			     sysctl_perf_event_sample_rate);
	}
}

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type,
			     struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void) { }

extern __weak const char *perf_pmu_name(void)
{
	return "pmu";
}

static inline u64 perf_clock(void)
{
	return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
	return event->clock();
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx)
		raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		raw_spin_unlock(&ctx->lock);
	raw_spin_unlock(&cpuctx->ctx.lock);
}
361
362#ifdef CONFIG_CGROUP_PERF
363
364static inline bool
365perf_cgroup_match(struct perf_event *event)
366{
367 struct perf_event_context *ctx = event->ctx;
368 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
369
370
371 if (!event->cgrp)
372 return true;
373
374
375 if (!cpuctx->cgrp)
376 return false;
377
378
379
380
381
382
383
384 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
385 event->cgrp->css.cgroup);
386}
387
388static inline void perf_detach_cgroup(struct perf_event *event)
389{
390 css_put(&event->cgrp->css);
391 event->cgrp = NULL;
392}
393
394static inline int is_cgroup_event(struct perf_event *event)
395{
396 return event->cgrp != NULL;
397}
398
399static inline u64 perf_cgroup_event_time(struct perf_event *event)
400{
401 struct perf_cgroup_info *t;
402
403 t = per_cpu_ptr(event->cgrp->info, event->cpu);
404 return t->time;
405}
406
407static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
408{
409 struct perf_cgroup_info *info;
410 u64 now;
411
412 now = perf_clock();
413
414 info = this_cpu_ptr(cgrp->info);
415
416 info->time += now - info->timestamp;
417 info->timestamp = now;
418}
419
420static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
421{
422 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
423 if (cgrp_out)
424 __update_cgrp_time(cgrp_out);
425}
426
427static inline void update_cgrp_time_from_event(struct perf_event *event)
428{
429 struct perf_cgroup *cgrp;
430
431
432
433
434
435 if (!is_cgroup_event(event))
436 return;
437
438 cgrp = perf_cgroup_from_task(current, event->ctx);
439
440
441
442 if (cgrp == event->cgrp)
443 __update_cgrp_time(event->cgrp);
444}
445
446static inline void
447perf_cgroup_set_timestamp(struct task_struct *task,
448 struct perf_event_context *ctx)
449{
450 struct perf_cgroup *cgrp;
451 struct perf_cgroup_info *info;
452
453
454
455
456
457
458 if (!task || !ctx->nr_cgroups)
459 return;
460
461 cgrp = perf_cgroup_from_task(task, ctx);
462 info = this_cpu_ptr(cgrp->info);
463 info->timestamp = ctx->timestamp;
464}
465
466#define PERF_CGROUP_SWOUT 0x1
467#define PERF_CGROUP_SWIN 0x2
468
469
470
471
472
473
474
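/*
 * Reschedule events based on the cgroup constraint of the current task.
 *
 * mode SWOUT: schedule out all cgroup events on this CPU
 * mode SWIN:  schedule in cgroup events according to @task's cgroup
 */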
475static void perf_cgroup_switch(struct task_struct *task, int mode)
476{
477 struct perf_cpu_context *cpuctx;
478 struct pmu *pmu;
479 unsigned long flags;
480
481
482
483
484
485
486 local_irq_save(flags);
487
488
489
490
491
492
493 list_for_each_entry_rcu(pmu, &pmus, entry) {
494 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
495 if (cpuctx->unique_pmu != pmu)
496 continue;
497
498
499
500
501
502
503
504
505 if (cpuctx->ctx.nr_cgroups > 0) {
506 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
507 perf_pmu_disable(cpuctx->ctx.pmu);
508
509 if (mode & PERF_CGROUP_SWOUT) {
510 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
511
512
513
514
515 cpuctx->cgrp = NULL;
516 }
517
518 if (mode & PERF_CGROUP_SWIN) {
519 WARN_ON_ONCE(cpuctx->cgrp);
520
521
522
523
524
525
526
527 cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
528 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
529 }
530 perf_pmu_enable(cpuctx->ctx.pmu);
531 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
532 }
533 }
534
535 local_irq_restore(flags);
536}
537
538static inline void perf_cgroup_sched_out(struct task_struct *task,
539 struct task_struct *next)
540{
541 struct perf_cgroup *cgrp1;
542 struct perf_cgroup *cgrp2 = NULL;
543
544 rcu_read_lock();
545
546
547
548
549
550 cgrp1 = perf_cgroup_from_task(task, NULL);
551
552
553
554
555
556 if (next)
557 cgrp2 = perf_cgroup_from_task(next, NULL);
558
559
560
561
562
563
564 if (cgrp1 != cgrp2)
565 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
566
567 rcu_read_unlock();
568}
569
570static inline void perf_cgroup_sched_in(struct task_struct *prev,
571 struct task_struct *task)
572{
573 struct perf_cgroup *cgrp1;
574 struct perf_cgroup *cgrp2 = NULL;
575
576 rcu_read_lock();
577
578
579
580
581
582 cgrp1 = perf_cgroup_from_task(task, NULL);
583
584
585 cgrp2 = perf_cgroup_from_task(prev, NULL);
586
587
588
589
590
591
592 if (cgrp1 != cgrp2)
593 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
594
595 rcu_read_unlock();
596}
597
598static inline int perf_cgroup_connect(int fd, struct perf_event *event,
599 struct perf_event_attr *attr,
600 struct perf_event *group_leader)
601{
602 struct perf_cgroup *cgrp;
603 struct cgroup_subsys_state *css;
604 struct fd f = fdget(fd);
605 int ret = 0;
606
607 if (!f.file)
608 return -EBADF;
609
610 css = css_tryget_online_from_dir(f.file->f_path.dentry,
611 &perf_event_cgrp_subsys);
612 if (IS_ERR(css)) {
613 ret = PTR_ERR(css);
614 goto out;
615 }
616
617 cgrp = container_of(css, struct perf_cgroup, css);
618 event->cgrp = cgrp;
619
620
621
622
623
624
625 if (group_leader && group_leader->cgrp != cgrp) {
626 perf_detach_cgroup(event);
627 ret = -EINVAL;
628 }
629out:
630 fdput(f);
631 return ret;
632}
633
634static inline void
635perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
636{
637 struct perf_cgroup_info *t;
638 t = per_cpu_ptr(event->cgrp->info, event->cpu);
639 event->shadow_ctx_time = now - t->timestamp;
640}
641
642static inline void
643perf_cgroup_defer_enabled(struct perf_event *event)
644{
645
646
647
648
649
650
651 if (is_cgroup_event(event) && !perf_cgroup_match(event))
652 event->cgrp_defer_enabled = 1;
653}
654
655static inline void
656perf_cgroup_mark_enabled(struct perf_event *event,
657 struct perf_event_context *ctx)
658{
659 struct perf_event *sub;
660 u64 tstamp = perf_event_time(event);
661
662 if (!event->cgrp_defer_enabled)
663 return;
664
665 event->cgrp_defer_enabled = 0;
666
667 event->tstamp_enabled = tstamp - event->total_time_enabled;
668 list_for_each_entry(sub, &event->sibling_list, group_entry) {
669 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
670 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
671 sub->cgrp_defer_enabled = 0;
672 }
673 }
674}
675#else
676
677static inline bool
678perf_cgroup_match(struct perf_event *event)
679{
680 return true;
681}
682
683static inline void perf_detach_cgroup(struct perf_event *event)
684{}
685
686static inline int is_cgroup_event(struct perf_event *event)
687{
688 return 0;
689}
690
691static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
692{
693 return 0;
694}
695
696static inline void update_cgrp_time_from_event(struct perf_event *event)
697{
698}
699
700static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
701{
702}
703
704static inline void perf_cgroup_sched_out(struct task_struct *task,
705 struct task_struct *next)
706{
707}
708
709static inline void perf_cgroup_sched_in(struct task_struct *prev,
710 struct task_struct *task)
711{
712}
713
714static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
715 struct perf_event_attr *attr,
716 struct perf_event *group_leader)
717{
718 return -EINVAL;
719}
720
721static inline void
722perf_cgroup_set_timestamp(struct task_struct *task,
723 struct perf_event_context *ctx)
724{
725}
726
static inline void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}
731
732static inline void
733perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
734{
735}
736
737static inline u64 perf_cgroup_event_time(struct perf_event *event)
738{
739 return 0;
740}
741
742static inline void
743perf_cgroup_defer_enabled(struct perf_event *event)
744{
745}
746
747static inline void
748perf_cgroup_mark_enabled(struct perf_event *event,
749 struct perf_event_context *ctx)
750{
751}
752#endif
753
/*
 * set default to be dependent on timer tick just
 * like original code
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * function must be called with interrupts disabled
 */
762static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
763{
764 struct perf_cpu_context *cpuctx;
765 int rotations = 0;
766
767 WARN_ON(!irqs_disabled());
768
769 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
770 rotations = perf_rotate_context(cpuctx);
771
772 raw_spin_lock(&cpuctx->hrtimer_lock);
773 if (rotations)
774 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
775 else
776 cpuctx->hrtimer_active = 0;
777 raw_spin_unlock(&cpuctx->hrtimer_lock);
778
779 return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
780}
781
782static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
783{
784 struct hrtimer *timer = &cpuctx->hrtimer;
785 struct pmu *pmu = cpuctx->ctx.pmu;
786 u64 interval;
787
788
789 if (pmu->task_ctx_nr == perf_sw_context)
790 return;
791
792
793
794
795
796 interval = pmu->hrtimer_interval_ms;
797 if (interval < 1)
798 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
799
800 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
801
802 raw_spin_lock_init(&cpuctx->hrtimer_lock);
803 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
804 timer->function = perf_mux_hrtimer_handler;
805}
806
807static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
808{
809 struct hrtimer *timer = &cpuctx->hrtimer;
810 struct pmu *pmu = cpuctx->ctx.pmu;
811 unsigned long flags;
812
813
814 if (pmu->task_ctx_nr == perf_sw_context)
815 return 0;
816
817 raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
818 if (!cpuctx->hrtimer_active) {
819 cpuctx->hrtimer_active = 1;
820 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
821 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
822 }
823 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
824
825 return 0;
826}
827
void perf_pmu_disable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!(*count)++)
		pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!--(*count))
		pmu->pmu_enable(pmu);
}

static DEFINE_PER_CPU(struct list_head, active_ctx_list);

/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 * disabled, while perf_event_task_tick is called from IRQ context.
 */
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
	struct list_head *head = this_cpu_ptr(&active_ctx_list);

	WARN_ON(!irqs_disabled());

	WARN_ON(!list_empty(&ctx->active_ctx_list));

	list_add(&ctx->active_ctx_list, head);
}

static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
	WARN_ON(!irqs_disabled());

	WARN_ON(list_empty(&ctx->active_ctx_list));

	list_del_init(&ctx->active_ctx_list);
}

static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx->task_ctx_data);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context(), event->ctx can change under us.  The places
 * that change it hold both the 'old' and the 'new' ctx::mutex, so holding
 * ctx::mutex pins the event<->ctx relation.
 *
 * perf_event_ctx_lock_nested() therefore takes a reference on the current
 * event->ctx under RCU, acquires its mutex, and retries if the event moved
 * to a different context in the meantime.
 */
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
	struct perf_event_context *ctx;

again:
	rcu_read_lock();
	ctx = ACCESS_ONCE(event->ctx);
	if (!atomic_inc_not_zero(&ctx->refcount)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	mutex_lock_nested(&ctx->mutex, nesting);
	if (event->ctx != ctx) {
		mutex_unlock(&ctx->mutex);
		put_ctx(ctx);
		goto again;
	}

	return ctx;
}

static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
	return perf_event_ctx_lock_nested(event, 0);
}

static void perf_event_ctx_unlock(struct perf_event *event,
				  struct perf_event_context *ctx)
{
	mutex_unlock(&ctx->mutex);
	put_ctx(ctx);
}

993
994
995
996
997
998static __must_check struct perf_event_context *
999unclone_ctx(struct perf_event_context *ctx)
1000{
1001 struct perf_event_context *parent_ctx = ctx->parent_ctx;
1002
1003 lockdep_assert_held(&ctx->lock);
1004
1005 if (parent_ctx)
1006 ctx->parent_ctx = NULL;
1007 ctx->generation++;
1008
1009 return parent_ctx;
1010}
1011
1012static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1013{
1014
1015
1016
1017 if (event->parent)
1018 event = event->parent;
1019
1020 return task_tgid_nr_ns(p, event->ns);
1021}
1022
1023static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1024{
1025
1026
1027
1028 if (event->parent)
1029 event = event->parent;
1030
1031 return task_pid_nr_ns(p, event->ns);
1032}
1033
1034
1035
1036
1037
1038static u64 primary_event_id(struct perf_event *event)
1039{
1040 u64 id = event->id;
1041
1042 if (event->parent)
1043 id = event->parent->id;
1044
1045 return id;
1046}
1047
1048
1049
1050
1051
1052
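/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved around and freed.
 */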
1053static struct perf_event_context *
1054perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1055{
1056 struct perf_event_context *ctx;
1057
1058retry:
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068 local_irq_save(*flags);
1069 rcu_read_lock();
1070 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1071 if (ctx) {
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082 raw_spin_lock(&ctx->lock);
1083 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1084 raw_spin_unlock(&ctx->lock);
1085 rcu_read_unlock();
1086 local_irq_restore(*flags);
1087 goto retry;
1088 }
1089
1090 if (!atomic_inc_not_zero(&ctx->refcount)) {
1091 raw_spin_unlock(&ctx->lock);
1092 ctx = NULL;
1093 }
1094 }
1095 rcu_read_unlock();
1096 if (!ctx)
1097 local_irq_restore(*flags);
1098 return ctx;
1099}
1100
1101
1102
1103
1104
1105
1106static struct perf_event_context *
1107perf_pin_task_context(struct task_struct *task, int ctxn)
1108{
1109 struct perf_event_context *ctx;
1110 unsigned long flags;
1111
1112 ctx = perf_lock_task_context(task, ctxn, &flags);
1113 if (ctx) {
1114 ++ctx->pin_count;
1115 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1116 }
1117 return ctx;
1118}
1119
1120static void perf_unpin_context(struct perf_event_context *ctx)
1121{
1122 unsigned long flags;
1123
1124 raw_spin_lock_irqsave(&ctx->lock, flags);
1125 --ctx->pin_count;
1126 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1127}
1128
1129
1130
1131
1132static void update_context_time(struct perf_event_context *ctx)
1133{
1134 u64 now = perf_clock();
1135
1136 ctx->time += now - ctx->timestamp;
1137 ctx->timestamp = now;
1138}
1139
1140static u64 perf_event_time(struct perf_event *event)
1141{
1142 struct perf_event_context *ctx = event->ctx;
1143
1144 if (is_cgroup_event(event))
1145 return perf_cgroup_event_time(event);
1146
1147 return ctx ? ctx->time : 0;
1148}
1149
1150
1151
1152
1153
1154static void update_event_times(struct perf_event *event)
1155{
1156 struct perf_event_context *ctx = event->ctx;
1157 u64 run_end;
1158
1159 if (event->state < PERF_EVENT_STATE_INACTIVE ||
1160 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1161 return;
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172 if (is_cgroup_event(event))
1173 run_end = perf_cgroup_event_time(event);
1174 else if (ctx->is_active)
1175 run_end = ctx->time;
1176 else
1177 run_end = event->tstamp_stopped;
1178
1179 event->total_time_enabled = run_end - event->tstamp_enabled;
1180
1181 if (event->state == PERF_EVENT_STATE_INACTIVE)
1182 run_end = event->tstamp_stopped;
1183 else
1184 run_end = perf_event_time(event);
1185
1186 event->total_time_running = run_end - event->tstamp_running;
1187
1188}
1189
1190
1191
1192
1193static void update_group_times(struct perf_event *leader)
1194{
1195 struct perf_event *event;
1196
1197 update_event_times(leader);
1198 list_for_each_entry(event, &leader->sibling_list, group_entry)
1199 update_event_times(event);
1200}
1201
1202static struct list_head *
1203ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1204{
1205 if (event->attr.pinned)
1206 return &ctx->pinned_groups;
1207 else
1208 return &ctx->flexible_groups;
1209}
1210
1211
1212
1213
1214
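/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */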
1215static void
1216list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1217{
1218 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1219 event->attach_state |= PERF_ATTACH_CONTEXT;
1220
1221
1222
1223
1224
1225
1226 if (event->group_leader == event) {
1227 struct list_head *list;
1228
1229 if (is_software_event(event))
1230 event->group_flags |= PERF_GROUP_SOFTWARE;
1231
1232 list = ctx_group_list(event, ctx);
1233 list_add_tail(&event->group_entry, list);
1234 }
1235
1236 if (is_cgroup_event(event))
1237 ctx->nr_cgroups++;
1238
1239 list_add_rcu(&event->event_entry, &ctx->event_list);
1240 ctx->nr_events++;
1241 if (event->attr.inherit_stat)
1242 ctx->nr_stat++;
1243
1244 ctx->generation++;
1245}
1246
1247
1248
1249
1250static inline void perf_event__state_init(struct perf_event *event)
1251{
1252 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1253 PERF_EVENT_STATE_INACTIVE;
1254}
1255
1256static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1257{
1258 int entry = sizeof(u64);
1259 int size = 0;
1260 int nr = 1;
1261
1262 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1263 size += sizeof(u64);
1264
1265 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1266 size += sizeof(u64);
1267
1268 if (event->attr.read_format & PERF_FORMAT_ID)
1269 entry += sizeof(u64);
1270
1271 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1272 nr += nr_siblings;
1273 size += sizeof(u64);
1274 }
1275
1276 size += entry * nr;
1277 event->read_size = size;
1278}
1279
1280static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1281{
1282 struct perf_sample_data *data;
1283 u16 size = 0;
1284
1285 if (sample_type & PERF_SAMPLE_IP)
1286 size += sizeof(data->ip);
1287
1288 if (sample_type & PERF_SAMPLE_ADDR)
1289 size += sizeof(data->addr);
1290
1291 if (sample_type & PERF_SAMPLE_PERIOD)
1292 size += sizeof(data->period);
1293
1294 if (sample_type & PERF_SAMPLE_WEIGHT)
1295 size += sizeof(data->weight);
1296
1297 if (sample_type & PERF_SAMPLE_READ)
1298 size += event->read_size;
1299
1300 if (sample_type & PERF_SAMPLE_DATA_SRC)
1301 size += sizeof(data->data_src.val);
1302
1303 if (sample_type & PERF_SAMPLE_TRANSACTION)
1304 size += sizeof(data->txn);
1305
1306 event->header_size = size;
1307}
1308
1309
1310
1311
1312
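/*
 * Called at perf_event creation and when events are attached/detached from a
 * group.
 */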
1313static void perf_event__header_size(struct perf_event *event)
1314{
1315 __perf_event_read_size(event,
1316 event->group_leader->nr_siblings);
1317 __perf_event_header_size(event, event->attr.sample_type);
1318}
1319
1320static void perf_event__id_header_size(struct perf_event *event)
1321{
1322 struct perf_sample_data *data;
1323 u64 sample_type = event->attr.sample_type;
1324 u16 size = 0;
1325
1326 if (sample_type & PERF_SAMPLE_TID)
1327 size += sizeof(data->tid_entry);
1328
1329 if (sample_type & PERF_SAMPLE_TIME)
1330 size += sizeof(data->time);
1331
1332 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1333 size += sizeof(data->id);
1334
1335 if (sample_type & PERF_SAMPLE_ID)
1336 size += sizeof(data->id);
1337
1338 if (sample_type & PERF_SAMPLE_STREAM_ID)
1339 size += sizeof(data->stream_id);
1340
1341 if (sample_type & PERF_SAMPLE_CPU)
1342 size += sizeof(data->cpu_entry);
1343
1344 event->id_header_size = size;
1345}
1346
1347static bool perf_event_validate_size(struct perf_event *event)
1348{
1349
1350
1351
1352
1353 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1354 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1355 perf_event__id_header_size(event);
1356
1357
1358
1359
1360
1361 if (event->read_size + event->header_size +
1362 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1363 return false;
1364
1365 return true;
1366}
1367
1368static void perf_group_attach(struct perf_event *event)
1369{
1370 struct perf_event *group_leader = event->group_leader, *pos;
1371
1372
1373
1374
1375 if (event->attach_state & PERF_ATTACH_GROUP)
1376 return;
1377
1378 event->attach_state |= PERF_ATTACH_GROUP;
1379
1380 if (group_leader == event)
1381 return;
1382
1383 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1384
1385 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1386 !is_software_event(event))
1387 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1388
1389 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1390 group_leader->nr_siblings++;
1391
1392 perf_event__header_size(group_leader);
1393
1394 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1395 perf_event__header_size(pos);
1396}
1397
1398
1399
1400
1401
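/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */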
1402static void
1403list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1404{
1405 struct perf_cpu_context *cpuctx;
1406
1407 WARN_ON_ONCE(event->ctx != ctx);
1408 lockdep_assert_held(&ctx->lock);
1409
1410
1411
1412
1413 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1414 return;
1415
1416 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1417
1418 if (is_cgroup_event(event)) {
1419 ctx->nr_cgroups--;
1420 cpuctx = __get_cpu_context(ctx);
1421
1422
1423
1424
1425
1426 if (!ctx->nr_cgroups)
1427 cpuctx->cgrp = NULL;
1428 }
1429
1430 ctx->nr_events--;
1431 if (event->attr.inherit_stat)
1432 ctx->nr_stat--;
1433
1434 list_del_rcu(&event->event_entry);
1435
1436 if (event->group_leader == event)
1437 list_del_init(&event->group_entry);
1438
1439 update_group_times(event);
1440
1441
1442
1443
1444
1445
1446
1447
1448 if (event->state > PERF_EVENT_STATE_OFF)
1449 event->state = PERF_EVENT_STATE_OFF;
1450
1451 ctx->generation++;
1452}
1453
1454static void perf_group_detach(struct perf_event *event)
1455{
1456 struct perf_event *sibling, *tmp;
1457 struct list_head *list = NULL;
1458
1459
1460
1461
1462 if (!(event->attach_state & PERF_ATTACH_GROUP))
1463 return;
1464
1465 event->attach_state &= ~PERF_ATTACH_GROUP;
1466
1467
1468
1469
1470 if (event->group_leader != event) {
1471 list_del_init(&event->group_entry);
1472 event->group_leader->nr_siblings--;
1473 goto out;
1474 }
1475
1476 if (!list_empty(&event->group_entry))
1477 list = &event->group_entry;
1478
1479
1480
1481
1482
1483
1484 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1485 if (list)
1486 list_move_tail(&sibling->group_entry, list);
1487 sibling->group_leader = sibling;
1488
1489
1490 sibling->group_flags = event->group_flags;
1491
1492 WARN_ON_ONCE(sibling->ctx != event->ctx);
1493 }
1494
1495out:
1496 perf_event__header_size(event->group_leader);
1497
1498 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1499 perf_event__header_size(tmp);
1500}
1501
1502
1503
1504
1505static bool is_orphaned_event(struct perf_event *event)
1506{
1507 return event && !is_kernel_event(event) && !event->owner;
1508}
1509
1510
1511
1512
1513
1514static bool is_orphaned_child(struct perf_event *event)
1515{
1516 return is_orphaned_event(event->parent);
1517}
1518
1519static void orphans_remove_work(struct work_struct *work);
1520
1521static void schedule_orphans_remove(struct perf_event_context *ctx)
1522{
1523 if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1524 return;
1525
1526 if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1527 get_ctx(ctx);
1528 ctx->orphans_remove_sched = true;
1529 }
1530}
1531
1532static int __init perf_workqueue_init(void)
1533{
1534 perf_wq = create_singlethread_workqueue("perf");
1535 WARN(!perf_wq, "failed to create perf workqueue\n");
1536 return perf_wq ? 0 : -1;
1537}
1538
1539core_initcall(perf_workqueue_init);
1540
1541static inline int pmu_filter_match(struct perf_event *event)
1542{
1543 struct pmu *pmu = event->pmu;
1544 return pmu->filter_match ? pmu->filter_match(event) : 1;
1545}
1546
1547static inline int
1548event_filter_match(struct perf_event *event)
1549{
1550 return (event->cpu == -1 || event->cpu == smp_processor_id())
1551 && perf_cgroup_match(event) && pmu_filter_match(event);
1552}
1553
1554static void
1555event_sched_out(struct perf_event *event,
1556 struct perf_cpu_context *cpuctx,
1557 struct perf_event_context *ctx)
1558{
1559 u64 tstamp = perf_event_time(event);
1560 u64 delta;
1561
1562 WARN_ON_ONCE(event->ctx != ctx);
1563 lockdep_assert_held(&ctx->lock);
1564
1565
1566
1567
1568
1569
1570
1571 if (event->state == PERF_EVENT_STATE_INACTIVE
1572 && !event_filter_match(event)) {
1573 delta = tstamp - event->tstamp_stopped;
1574 event->tstamp_running += delta;
1575 event->tstamp_stopped = tstamp;
1576 }
1577
1578 if (event->state != PERF_EVENT_STATE_ACTIVE)
1579 return;
1580
1581 perf_pmu_disable(event->pmu);
1582
1583 event->state = PERF_EVENT_STATE_INACTIVE;
1584 if (event->pending_disable) {
1585 event->pending_disable = 0;
1586 event->state = PERF_EVENT_STATE_OFF;
1587 }
1588 event->tstamp_stopped = tstamp;
1589 event->pmu->del(event, 0);
1590 event->oncpu = -1;
1591
1592 if (!is_software_event(event))
1593 cpuctx->active_oncpu--;
1594 if (!--ctx->nr_active)
1595 perf_event_ctx_deactivate(ctx);
1596 if (event->attr.freq && event->attr.sample_freq)
1597 ctx->nr_freq--;
1598 if (event->attr.exclusive || !cpuctx->active_oncpu)
1599 cpuctx->exclusive = 0;
1600
1601 if (is_orphaned_child(event))
1602 schedule_orphans_remove(ctx);
1603
1604 perf_pmu_enable(event->pmu);
1605}
1606
1607static void
1608group_sched_out(struct perf_event *group_event,
1609 struct perf_cpu_context *cpuctx,
1610 struct perf_event_context *ctx)
1611{
1612 struct perf_event *event;
1613 int state = group_event->state;
1614
1615 event_sched_out(group_event, cpuctx, ctx);
1616
1617
1618
1619
1620 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1621 event_sched_out(event, cpuctx, ctx);
1622
1623 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1624 cpuctx->exclusive = 0;
1625}
1626
1627struct remove_event {
1628 struct perf_event *event;
1629 bool detach_group;
1630};
1631
1632
1633
1634
1635
1636
1637
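/*
 * Cross CPU call to remove a performance event.
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */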
1638static int __perf_remove_from_context(void *info)
1639{
1640 struct remove_event *re = info;
1641 struct perf_event *event = re->event;
1642 struct perf_event_context *ctx = event->ctx;
1643 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1644
1645 raw_spin_lock(&ctx->lock);
1646 event_sched_out(event, cpuctx, ctx);
1647 if (re->detach_group)
1648 perf_group_detach(event);
1649 list_del_event(event, ctx);
1650 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1651 ctx->is_active = 0;
1652 cpuctx->task_ctx = NULL;
1653 }
1654 raw_spin_unlock(&ctx->lock);
1655
1656 return 0;
1657}
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
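/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * CPU events are removed with a smp call. For task events we only
 * call when the task is on a CPU.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.
 */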
1673static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1674{
1675 struct perf_event_context *ctx = event->ctx;
1676 struct task_struct *task = ctx->task;
1677 struct remove_event re = {
1678 .event = event,
1679 .detach_group = detach_group,
1680 };
1681
1682 lockdep_assert_held(&ctx->mutex);
1683
1684 if (!task) {
1685
1686
1687
1688
1689
1690
1691 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1692 return;
1693 }
1694
1695retry:
1696 if (!task_function_call(task, __perf_remove_from_context, &re))
1697 return;
1698
1699 raw_spin_lock_irq(&ctx->lock);
1700
1701
1702
1703
1704 if (ctx->is_active) {
1705 raw_spin_unlock_irq(&ctx->lock);
1706
1707
1708
1709
1710 task = ctx->task;
1711 goto retry;
1712 }
1713
1714
1715
1716
1717
1718 if (detach_group)
1719 perf_group_detach(event);
1720 list_del_event(event, ctx);
1721 raw_spin_unlock_irq(&ctx->lock);
1722}
1723
1724
1725
1726
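/*
 * Cross CPU call to disable a performance event
 */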
1727int __perf_event_disable(void *info)
1728{
1729 struct perf_event *event = info;
1730 struct perf_event_context *ctx = event->ctx;
1731 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1732
1733
1734
1735
1736
1737
1738
1739
1740 if (ctx->task && cpuctx->task_ctx != ctx)
1741 return -EINVAL;
1742
1743 raw_spin_lock(&ctx->lock);
1744
1745
1746
1747
1748
1749 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1750 update_context_time(ctx);
1751 update_cgrp_time_from_event(event);
1752 update_group_times(event);
1753 if (event == event->group_leader)
1754 group_sched_out(event, cpuctx, ctx);
1755 else
1756 event_sched_out(event, cpuctx, ctx);
1757 event->state = PERF_EVENT_STATE_OFF;
1758 }
1759
1760 raw_spin_unlock(&ctx->lock);
1761
1762 return 0;
1763}
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
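/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.
 */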
1778static void _perf_event_disable(struct perf_event *event)
1779{
1780 struct perf_event_context *ctx = event->ctx;
1781 struct task_struct *task = ctx->task;
1782
1783 if (!task) {
1784
1785
1786
1787 cpu_function_call(event->cpu, __perf_event_disable, event);
1788 return;
1789 }
1790
1791retry:
1792 if (!task_function_call(task, __perf_event_disable, event))
1793 return;
1794
1795 raw_spin_lock_irq(&ctx->lock);
1796
1797
1798
1799 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1800 raw_spin_unlock_irq(&ctx->lock);
1801
1802
1803
1804
1805 task = ctx->task;
1806 goto retry;
1807 }
1808
1809
1810
1811
1812
1813 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1814 update_group_times(event);
1815 event->state = PERF_EVENT_STATE_OFF;
1816 }
1817 raw_spin_unlock_irq(&ctx->lock);
1818}
1819
1820
1821
1822
1823
1824void perf_event_disable(struct perf_event *event)
1825{
1826 struct perf_event_context *ctx;
1827
1828 ctx = perf_event_ctx_lock(event);
1829 _perf_event_disable(event);
1830 perf_event_ctx_unlock(event, ctx);
1831}
1832EXPORT_SYMBOL_GPL(perf_event_disable);
1833
1834static void perf_set_shadow_time(struct perf_event *event,
1835 struct perf_event_context *ctx,
1836 u64 tstamp)
1837{
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863 if (is_cgroup_event(event))
1864 perf_cgroup_set_shadow_time(event, tstamp);
1865 else
1866 event->shadow_ctx_time = tstamp - ctx->timestamp;
1867}
1868
1869#define MAX_INTERRUPTS (~0ULL)
1870
1871static void perf_log_throttle(struct perf_event *event, int enable);
1872static void perf_log_itrace_start(struct perf_event *event);
1873
1874static int
1875event_sched_in(struct perf_event *event,
1876 struct perf_cpu_context *cpuctx,
1877 struct perf_event_context *ctx)
1878{
1879 u64 tstamp = perf_event_time(event);
1880 int ret = 0;
1881
1882 lockdep_assert_held(&ctx->lock);
1883
1884 if (event->state <= PERF_EVENT_STATE_OFF)
1885 return 0;
1886
1887 event->state = PERF_EVENT_STATE_ACTIVE;
1888 event->oncpu = smp_processor_id();
1889
1890
1891
1892
1893
1894
1895 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1896 perf_log_throttle(event, 1);
1897 event->hw.interrupts = 0;
1898 }
1899
1900
1901
1902
1903 smp_wmb();
1904
1905 perf_pmu_disable(event->pmu);
1906
1907 perf_set_shadow_time(event, ctx, tstamp);
1908
1909 perf_log_itrace_start(event);
1910
1911 if (event->pmu->add(event, PERF_EF_START)) {
1912 event->state = PERF_EVENT_STATE_INACTIVE;
1913 event->oncpu = -1;
1914 ret = -EAGAIN;
1915 goto out;
1916 }
1917
1918 event->tstamp_running += tstamp - event->tstamp_stopped;
1919
1920 if (!is_software_event(event))
1921 cpuctx->active_oncpu++;
1922 if (!ctx->nr_active++)
1923 perf_event_ctx_activate(ctx);
1924 if (event->attr.freq && event->attr.sample_freq)
1925 ctx->nr_freq++;
1926
1927 if (event->attr.exclusive)
1928 cpuctx->exclusive = 1;
1929
1930 if (is_orphaned_child(event))
1931 schedule_orphans_remove(ctx);
1932
1933out:
1934 perf_pmu_enable(event->pmu);
1935
1936 return ret;
1937}
1938
1939static int
1940group_sched_in(struct perf_event *group_event,
1941 struct perf_cpu_context *cpuctx,
1942 struct perf_event_context *ctx)
1943{
1944 struct perf_event *event, *partial_group = NULL;
1945 struct pmu *pmu = ctx->pmu;
1946 u64 now = ctx->time;
1947 bool simulate = false;
1948
1949 if (group_event->state == PERF_EVENT_STATE_OFF)
1950 return 0;
1951
1952 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
1953
1954 if (event_sched_in(group_event, cpuctx, ctx)) {
1955 pmu->cancel_txn(pmu);
1956 perf_mux_hrtimer_restart(cpuctx);
1957 return -EAGAIN;
1958 }
1959
1960
1961
1962
1963 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1964 if (event_sched_in(event, cpuctx, ctx)) {
1965 partial_group = event;
1966 goto group_error;
1967 }
1968 }
1969
1970 if (!pmu->commit_txn(pmu))
1971 return 0;
1972
1973group_error:
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1989 if (event == partial_group)
1990 simulate = true;
1991
1992 if (simulate) {
1993 event->tstamp_running += now - event->tstamp_stopped;
1994 event->tstamp_stopped = now;
1995 } else {
1996 event_sched_out(event, cpuctx, ctx);
1997 }
1998 }
1999 event_sched_out(group_event, cpuctx, ctx);
2000
2001 pmu->cancel_txn(pmu);
2002
2003 perf_mux_hrtimer_restart(cpuctx);
2004
2005 return -EAGAIN;
2006}
2007
2008
2009
2010
2011static int group_can_go_on(struct perf_event *event,
2012 struct perf_cpu_context *cpuctx,
2013 int can_add_hw)
2014{
2015
2016
2017
2018 if (event->group_flags & PERF_GROUP_SOFTWARE)
2019 return 1;
2020
2021
2022
2023
2024 if (cpuctx->exclusive)
2025 return 0;
2026
2027
2028
2029
2030 if (event->attr.exclusive && cpuctx->active_oncpu)
2031 return 0;
2032
2033
2034
2035
2036 return can_add_hw;
2037}
2038
2039static void add_event_to_ctx(struct perf_event *event,
2040 struct perf_event_context *ctx)
2041{
2042 u64 tstamp = perf_event_time(event);
2043
2044 list_add_event(event, ctx);
2045 perf_group_attach(event);
2046 event->tstamp_enabled = tstamp;
2047 event->tstamp_running = tstamp;
2048 event->tstamp_stopped = tstamp;
2049}
2050
2051static void task_ctx_sched_out(struct perf_event_context *ctx);
2052static void
2053ctx_sched_in(struct perf_event_context *ctx,
2054 struct perf_cpu_context *cpuctx,
2055 enum event_type_t event_type,
2056 struct task_struct *task);
2057
2058static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2059 struct perf_event_context *ctx,
2060 struct task_struct *task)
2061{
2062 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2063 if (ctx)
2064 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2065 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2066 if (ctx)
2067 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2068}
2069
2070
2071
2072
2073
2074
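/*
 * Cross CPU call to install and enable a performance event.
 *
 * Must be called with ctx->mutex held.
 */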
2075static int __perf_install_in_context(void *info)
2076{
2077 struct perf_event *event = info;
2078 struct perf_event_context *ctx = event->ctx;
2079 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2080 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2081 struct task_struct *task = current;
2082
2083 perf_ctx_lock(cpuctx, task_ctx);
2084 perf_pmu_disable(cpuctx->ctx.pmu);
2085
2086
2087
2088
2089 if (task_ctx)
2090 task_ctx_sched_out(task_ctx);
2091
2092
2093
2094
2095
2096 if (ctx->task && task_ctx != ctx) {
2097 if (task_ctx)
2098 raw_spin_unlock(&task_ctx->lock);
2099 raw_spin_lock(&ctx->lock);
2100 task_ctx = ctx;
2101 }
2102
2103 if (task_ctx) {
2104 cpuctx->task_ctx = task_ctx;
2105 task = task_ctx->task;
2106 }
2107
2108 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2109
2110 update_context_time(ctx);
2111
2112
2113
2114
2115
2116 update_cgrp_time_from_event(event);
2117
2118 add_event_to_ctx(event, ctx);
2119
2120
2121
2122
2123 perf_event_sched_in(cpuctx, task_ctx, task);
2124
2125 perf_pmu_enable(cpuctx->ctx.pmu);
2126 perf_ctx_unlock(cpuctx, task_ctx);
2127
2128 return 0;
2129}
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
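/*
 * Attach a performance event to a context.
 *
 * For CPU contexts this is done with a cross-CPU call; for task contexts we
 * only install the event while the task is running, retrying if the context
 * becomes active before we can add the event under ctx->lock.
 */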
2141static void
2142perf_install_in_context(struct perf_event_context *ctx,
2143 struct perf_event *event,
2144 int cpu)
2145{
2146 struct task_struct *task = ctx->task;
2147
2148 lockdep_assert_held(&ctx->mutex);
2149
2150 event->ctx = ctx;
2151 if (event->cpu != -1)
2152 event->cpu = cpu;
2153
2154 if (!task) {
2155
2156
2157
2158
2159 cpu_function_call(cpu, __perf_install_in_context, event);
2160 return;
2161 }
2162
2163retry:
2164 if (!task_function_call(task, __perf_install_in_context, event))
2165 return;
2166
2167 raw_spin_lock_irq(&ctx->lock);
2168
2169
2170
2171
2172 if (ctx->is_active) {
2173 raw_spin_unlock_irq(&ctx->lock);
2174
2175
2176
2177
2178 task = ctx->task;
2179 goto retry;
2180 }
2181
2182
2183
2184
2185
2186 add_event_to_ctx(event, ctx);
2187 raw_spin_unlock_irq(&ctx->lock);
2188}
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198static void __perf_event_mark_enabled(struct perf_event *event)
2199{
2200 struct perf_event *sub;
2201 u64 tstamp = perf_event_time(event);
2202
2203 event->state = PERF_EVENT_STATE_INACTIVE;
2204 event->tstamp_enabled = tstamp - event->total_time_enabled;
2205 list_for_each_entry(sub, &event->sibling_list, group_entry) {
2206 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2207 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2208 }
2209}
2210
2211
2212
2213
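/*
 * Cross CPU call to enable a performance event
 */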
2214static int __perf_event_enable(void *info)
2215{
2216 struct perf_event *event = info;
2217 struct perf_event_context *ctx = event->ctx;
2218 struct perf_event *leader = event->group_leader;
2219 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2220 int err;
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231 if (!ctx->is_active)
2232 return -EINVAL;
2233
2234 raw_spin_lock(&ctx->lock);
2235 update_context_time(ctx);
2236
2237 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2238 goto unlock;
2239
2240
2241
2242
2243 perf_cgroup_set_timestamp(current, ctx);
2244
2245 __perf_event_mark_enabled(event);
2246
2247 if (!event_filter_match(event)) {
2248 if (is_cgroup_event(event))
2249 perf_cgroup_defer_enabled(event);
2250 goto unlock;
2251 }
2252
2253
2254
2255
2256
2257 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
2258 goto unlock;
2259
2260 if (!group_can_go_on(event, cpuctx, 1)) {
2261 err = -EEXIST;
2262 } else {
2263 if (event == leader)
2264 err = group_sched_in(event, cpuctx, ctx);
2265 else
2266 err = event_sched_in(event, cpuctx, ctx);
2267 }
2268
2269 if (err) {
2270
2271
2272
2273
2274 if (leader != event) {
2275 group_sched_out(leader, cpuctx, ctx);
2276 perf_mux_hrtimer_restart(cpuctx);
2277 }
2278 if (leader->attr.pinned) {
2279 update_group_times(leader);
2280 leader->state = PERF_EVENT_STATE_ERROR;
2281 }
2282 }
2283
2284unlock:
2285 raw_spin_unlock(&ctx->lock);
2286
2287 return 0;
2288}
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
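/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.
 */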
2299static void _perf_event_enable(struct perf_event *event)
2300{
2301 struct perf_event_context *ctx = event->ctx;
2302 struct task_struct *task = ctx->task;
2303
2304 if (!task) {
2305
2306
2307
2308 cpu_function_call(event->cpu, __perf_event_enable, event);
2309 return;
2310 }
2311
2312 raw_spin_lock_irq(&ctx->lock);
2313 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2314 goto out;
2315
2316
2317
2318
2319
2320
2321
2322
2323 if (event->state == PERF_EVENT_STATE_ERROR)
2324 event->state = PERF_EVENT_STATE_OFF;
2325
2326retry:
2327 if (!ctx->is_active) {
2328 __perf_event_mark_enabled(event);
2329 goto out;
2330 }
2331
2332 raw_spin_unlock_irq(&ctx->lock);
2333
2334 if (!task_function_call(task, __perf_event_enable, event))
2335 return;
2336
2337 raw_spin_lock_irq(&ctx->lock);
2338
2339
2340
2341
2342
2343 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2344
2345
2346
2347
2348 task = ctx->task;
2349 goto retry;
2350 }
2351
2352out:
2353 raw_spin_unlock_irq(&ctx->lock);
2354}
2355
2356
2357
2358
2359void perf_event_enable(struct perf_event *event)
2360{
2361 struct perf_event_context *ctx;
2362
2363 ctx = perf_event_ctx_lock(event);
2364 _perf_event_enable(event);
2365 perf_event_ctx_unlock(event, ctx);
2366}
2367EXPORT_SYMBOL_GPL(perf_event_enable);
2368
2369static int _perf_event_refresh(struct perf_event *event, int refresh)
2370{
2371
2372
2373
2374 if (event->attr.inherit || !is_sampling_event(event))
2375 return -EINVAL;
2376
2377 atomic_add(refresh, &event->event_limit);
2378 _perf_event_enable(event);
2379
2380 return 0;
2381}
2382
2383
2384
2385
2386int perf_event_refresh(struct perf_event *event, int refresh)
2387{
2388 struct perf_event_context *ctx;
2389 int ret;
2390
2391 ctx = perf_event_ctx_lock(event);
2392 ret = _perf_event_refresh(event, refresh);
2393 perf_event_ctx_unlock(event, ctx);
2394
2395 return ret;
2396}
2397EXPORT_SYMBOL_GPL(perf_event_refresh);
2398
2399static void ctx_sched_out(struct perf_event_context *ctx,
2400 struct perf_cpu_context *cpuctx,
2401 enum event_type_t event_type)
2402{
2403 struct perf_event *event;
2404 int is_active = ctx->is_active;
2405
2406 ctx->is_active &= ~event_type;
2407 if (likely(!ctx->nr_events))
2408 return;
2409
2410 update_context_time(ctx);
2411 update_cgrp_time_from_cpuctx(cpuctx);
2412 if (!ctx->nr_active)
2413 return;
2414
2415 perf_pmu_disable(ctx->pmu);
2416 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
2417 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2418 group_sched_out(event, cpuctx, ctx);
2419 }
2420
2421 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
2422 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2423 group_sched_out(event, cpuctx, ctx);
2424 }
2425 perf_pmu_enable(ctx->pmu);
2426}
2427
2428
2429
2430
2431
2432
2433
2434
2435
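/*
 * Test whether two contexts are equivalent, i.e. whether they have both been
 * cloned from the same version of the same context.
 */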
2436static int context_equiv(struct perf_event_context *ctx1,
2437 struct perf_event_context *ctx2)
2438{
2439 lockdep_assert_held(&ctx1->lock);
2440 lockdep_assert_held(&ctx2->lock);
2441
2442
2443 if (ctx1->pin_count || ctx2->pin_count)
2444 return 0;
2445
2446
2447 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2448 return 1;
2449
2450
2451 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2452 return 1;
2453
2454
2455
2456
2457
2458 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2459 ctx1->parent_gen == ctx2->parent_gen)
2460 return 1;
2461
2462
2463 return 0;
2464}
2465
2466static void __perf_event_sync_stat(struct perf_event *event,
2467 struct perf_event *next_event)
2468{
2469 u64 value;
2470
2471 if (!event->attr.inherit_stat)
2472 return;
2473
2474
2475
2476
2477
2478
2479
2480
2481 switch (event->state) {
2482 case PERF_EVENT_STATE_ACTIVE:
2483 event->pmu->read(event);
2484
2485
2486 case PERF_EVENT_STATE_INACTIVE:
2487 update_event_times(event);
2488 break;
2489
2490 default:
2491 break;
2492 }
2493
2494
2495
2496
2497
2498 value = local64_read(&next_event->count);
2499 value = local64_xchg(&event->count, value);
2500 local64_set(&next_event->count, value);
2501
2502 swap(event->total_time_enabled, next_event->total_time_enabled);
2503 swap(event->total_time_running, next_event->total_time_running);
2504
2505
2506
2507
2508 perf_event_update_userpage(event);
2509 perf_event_update_userpage(next_event);
2510}
2511
2512static void perf_event_sync_stat(struct perf_event_context *ctx,
2513 struct perf_event_context *next_ctx)
2514{
2515 struct perf_event *event, *next_event;
2516
2517 if (!ctx->nr_stat)
2518 return;
2519
2520 update_context_time(ctx);
2521
2522 event = list_first_entry(&ctx->event_list,
2523 struct perf_event, event_entry);
2524
2525 next_event = list_first_entry(&next_ctx->event_list,
2526 struct perf_event, event_entry);
2527
2528 while (&event->event_entry != &ctx->event_list &&
2529 &next_event->event_entry != &next_ctx->event_list) {
2530
2531 __perf_event_sync_stat(event, next_event);
2532
2533 event = list_next_entry(event, event_entry);
2534 next_event = list_next_entry(next_event, event_entry);
2535 }
2536}
2537
2538static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2539 struct task_struct *next)
2540{
2541 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2542 struct perf_event_context *next_ctx;
2543 struct perf_event_context *parent, *next_parent;
2544 struct perf_cpu_context *cpuctx;
2545 int do_switch = 1;
2546
2547 if (likely(!ctx))
2548 return;
2549
2550 cpuctx = __get_cpu_context(ctx);
2551 if (!cpuctx->task_ctx)
2552 return;
2553
2554 rcu_read_lock();
2555 next_ctx = next->perf_event_ctxp[ctxn];
2556 if (!next_ctx)
2557 goto unlock;
2558
2559 parent = rcu_dereference(ctx->parent_ctx);
2560 next_parent = rcu_dereference(next_ctx->parent_ctx);
2561
2562
2563 if (!parent && !next_parent)
2564 goto unlock;
2565
2566 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576 raw_spin_lock(&ctx->lock);
2577 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2578 if (context_equiv(ctx, next_ctx)) {
2579
2580
2581
2582
2583 task->perf_event_ctxp[ctxn] = next_ctx;
2584 next->perf_event_ctxp[ctxn] = ctx;
2585 ctx->task = next;
2586 next_ctx->task = task;
2587
2588 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2589
2590 do_switch = 0;
2591
2592 perf_event_sync_stat(ctx, next_ctx);
2593 }
2594 raw_spin_unlock(&next_ctx->lock);
2595 raw_spin_unlock(&ctx->lock);
2596 }
2597unlock:
2598 rcu_read_unlock();
2599
2600 if (do_switch) {
2601 raw_spin_lock(&ctx->lock);
2602 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2603 cpuctx->task_ctx = NULL;
2604 raw_spin_unlock(&ctx->lock);
2605 }
2606}
2607
2608void perf_sched_cb_dec(struct pmu *pmu)
2609{
2610 this_cpu_dec(perf_sched_cb_usages);
2611}
2612
2613void perf_sched_cb_inc(struct pmu *pmu)
2614{
2615 this_cpu_inc(perf_sched_cb_usages);
2616}
2617
2618
2619
2620
2621
2622static void perf_pmu_sched_task(struct task_struct *prev,
2623 struct task_struct *next,
2624 bool sched_in)
2625{
2626 struct perf_cpu_context *cpuctx;
2627 struct pmu *pmu;
2628 unsigned long flags;
2629
2630 if (prev == next)
2631 return;
2632
2633 local_irq_save(flags);
2634
2635 rcu_read_lock();
2636
2637 list_for_each_entry_rcu(pmu, &pmus, entry) {
2638 if (pmu->sched_task) {
2639 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2640
2641 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2642
2643 perf_pmu_disable(pmu);
2644
2645 pmu->sched_task(cpuctx->task_ctx, sched_in);
2646
2647 perf_pmu_enable(pmu);
2648
2649 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2650 }
2651 }
2652
2653 rcu_read_unlock();
2654
2655 local_irq_restore(flags);
2656}
2657
2658static void perf_event_switch(struct task_struct *task,
2659 struct task_struct *next_prev, bool sched_in);
2660
2661#define for_each_task_context_nr(ctxn) \
2662 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
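/*
 * Called from the scheduler, with interrupts disabled, to schedule out the
 * perf events of the task that is being switched away from: per-PMU
 * sched_task callbacks, context-switch software events, the task's event
 * contexts, and (if needed) cgroup events.
 */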
2675void __perf_event_task_sched_out(struct task_struct *task,
2676 struct task_struct *next)
2677{
2678 int ctxn;
2679
2680 if (__this_cpu_read(perf_sched_cb_usages))
2681 perf_pmu_sched_task(task, next, false);
2682
2683 if (atomic_read(&nr_switch_events))
2684 perf_event_switch(task, next, false);
2685
2686 for_each_task_context_nr(ctxn)
2687 perf_event_context_sched_out(task, ctxn, next);
2688
2689
2690
2691
2692
2693
2694 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2695 perf_cgroup_sched_out(task, next);
2696}
2697
2698static void task_ctx_sched_out(struct perf_event_context *ctx)
2699{
2700 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2701
2702 if (!cpuctx->task_ctx)
2703 return;
2704
2705 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2706 return;
2707
2708 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2709 cpuctx->task_ctx = NULL;
2710}
2711
2712
2713
2714
2715static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2716 enum event_type_t event_type)
2717{
2718 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2719}
2720
2721static void
2722ctx_pinned_sched_in(struct perf_event_context *ctx,
2723 struct perf_cpu_context *cpuctx)
2724{
2725 struct perf_event *event;
2726
2727 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2728 if (event->state <= PERF_EVENT_STATE_OFF)
2729 continue;
2730 if (!event_filter_match(event))
2731 continue;
2732
2733
2734 if (is_cgroup_event(event))
2735 perf_cgroup_mark_enabled(event, ctx);
2736
2737 if (group_can_go_on(event, cpuctx, 1))
2738 group_sched_in(event, cpuctx, ctx);
2739
2740
2741
2742
2743
2744 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2745 update_group_times(event);
2746 event->state = PERF_EVENT_STATE_ERROR;
2747 }
2748 }
2749}
2750
2751static void
2752ctx_flexible_sched_in(struct perf_event_context *ctx,
2753 struct perf_cpu_context *cpuctx)
2754{
2755 struct perf_event *event;
2756 int can_add_hw = 1;
2757
2758 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2759
2760 if (event->state <= PERF_EVENT_STATE_OFF)
2761 continue;
2762
2763
2764
2765
2766 if (!event_filter_match(event))
2767 continue;
2768
2769
2770 if (is_cgroup_event(event))
2771 perf_cgroup_mark_enabled(event, ctx);
2772
2773 if (group_can_go_on(event, cpuctx, can_add_hw)) {
2774 if (group_sched_in(event, cpuctx, ctx))
2775 can_add_hw = 0;
2776 }
2777 }
2778}
2779
2780static void
2781ctx_sched_in(struct perf_event_context *ctx,
2782 struct perf_cpu_context *cpuctx,
2783 enum event_type_t event_type,
2784 struct task_struct *task)
2785{
2786 u64 now;
2787 int is_active = ctx->is_active;
2788
2789 ctx->is_active |= event_type;
2790 if (likely(!ctx->nr_events))
2791 return;
2792
2793 now = perf_clock();
2794 ctx->timestamp = now;
2795 perf_cgroup_set_timestamp(task, ctx);
2796
2797
2798
2799
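	/*
	 * Schedule in the pinned groups first so they get the best chance
	 * of going onto the PMU; the flexible (rotated) groups follow.
	 */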
2800 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2801 ctx_pinned_sched_in(ctx, cpuctx);
2802
2803
2804 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2805 ctx_flexible_sched_in(ctx, cpuctx);
2806}
2807
2808static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2809 enum event_type_t event_type,
2810 struct task_struct *task)
2811{
2812 struct perf_event_context *ctx = &cpuctx->ctx;
2813
2814 ctx_sched_in(ctx, cpuctx, event_type, task);
2815}
2816
2817static void perf_event_context_sched_in(struct perf_event_context *ctx,
2818 struct task_struct *task)
2819{
2820 struct perf_cpu_context *cpuctx;
2821
2822 cpuctx = __get_cpu_context(ctx);
2823 if (cpuctx->task_ctx == ctx)
2824 return;
2825
2826 perf_ctx_lock(cpuctx, ctx);
2827 perf_pmu_disable(ctx->pmu);
2828
2829
2830
2831
2832
2833 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2834
2835 if (ctx->nr_events)
2836 cpuctx->task_ctx = ctx;
2837
2838 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2839
2840 perf_pmu_enable(ctx->pmu);
2841 perf_ctx_unlock(cpuctx, ctx);
2842}
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855void __perf_event_task_sched_in(struct task_struct *prev,
2856 struct task_struct *task)
2857{
2858 struct perf_event_context *ctx;
2859 int ctxn;
2860
2861 for_each_task_context_nr(ctxn) {
2862 ctx = task->perf_event_ctxp[ctxn];
2863 if (likely(!ctx))
2864 continue;
2865
2866 perf_event_context_sched_in(ctx, task);
2867 }
2868
2869
2870
2871
2872
2873 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2874 perf_cgroup_sched_in(prev, task);
2875
2876 if (atomic_read(&nr_switch_events))
2877 perf_event_switch(task, prev, true);
2878
2879 if (__this_cpu_read(perf_sched_cb_usages))
2880 perf_pmu_sched_task(prev, task, true);
2881}
2882
2883static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2884{
2885 u64 frequency = event->attr.sample_freq;
2886 u64 sec = NSEC_PER_SEC;
2887 u64 divisor, dividend;
2888
2889 int count_fls, nsec_fls, frequency_fls, sec_fls;
2890
2891 count_fls = fls64(count);
2892 nsec_fls = fls64(nsec);
2893 frequency_fls = fls64(frequency);
2894 sec_fls = 30;
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
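	/*
	 * We observed @count events over @nsec nanoseconds and want
	 * @frequency samples per second, so the target period is:
	 *
	 *	period = (count * NSEC_PER_SEC) / (nsec * frequency)
	 *
	 * Either product can overflow 64 bits, so REDUCE_FLS() drops one
	 * bit of precision from whichever operand of a pair currently has
	 * more significant bits, until the products fit.
	 */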
2910#define REDUCE_FLS(a, b) \
2911do { \
2912 if (a##_fls > b##_fls) { \
2913 a >>= 1; \
2914 a##_fls--; \
2915 } else { \
2916 b >>= 1; \
2917 b##_fls--; \
2918 } \
2919} while (0)
2920
2921
2922
2923
2924

2925 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2926 REDUCE_FLS(nsec, frequency);
2927 REDUCE_FLS(sec, count);
2928 }
2929
2930 if (count_fls + sec_fls > 64) {
2931 divisor = nsec * frequency;
2932
2933 while (count_fls + sec_fls > 64) {
2934 REDUCE_FLS(count, sec);
2935 divisor >>= 1;
2936 }
2937
2938 dividend = count * sec;
2939 } else {
2940 dividend = count * sec;
2941
2942 while (nsec_fls + frequency_fls > 64) {
2943 REDUCE_FLS(nsec, frequency);
2944 dividend >>= 1;
2945 }
2946
2947 divisor = nsec * frequency;
2948 }
2949
2950 if (!divisor)
2951 return dividend;
2952
2953 return div64_u64(dividend, divisor);
2954}
2955
2956static DEFINE_PER_CPU(int, perf_throttled_count);
2957static DEFINE_PER_CPU(u64, perf_throttled_seq);
2958
2959static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2960{
2961 struct hw_perf_event *hwc = &event->hw;
2962 s64 period, sample_period;
2963 s64 delta;
2964
2965 period = perf_calculate_period(event, nsec, count);
2966
2967 delta = (s64)(period - hwc->sample_period);
2968 delta = (delta + 7) / 8;
2969
2970 sample_period = hwc->sample_period + delta;
2971
2972 if (!sample_period)
2973 sample_period = 1;
2974
2975 hwc->sample_period = sample_period;
2976
2977 if (local64_read(&hwc->period_left) > 8*sample_period) {
2978 if (disable)
2979 event->pmu->stop(event, PERF_EF_UPDATE);
2980
2981 local64_set(&hwc->period_left, 0);
2982
2983 if (disable)
2984 event->pmu->start(event, PERF_EF_RELOAD);
2985 }
2986}
2987
2988
2989
2990
2991
2992
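/*
 * Per-tick pass over a context's events: restart events that were throttled
 * during the last tick and, for freq-based sampling events, recompute the
 * sample period from the number of events counted since the previous tick.
 * Doing both in one walk avoids iterating the event list twice.
 */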
2993static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2994 int needs_unthr)
2995{
2996 struct perf_event *event;
2997 struct hw_perf_event *hwc;
2998 u64 now, period = TICK_NSEC;
2999 s64 delta;
3000
3001
3002
3003
3004
3005
3006 if (!(ctx->nr_freq || needs_unthr))
3007 return;
3008
3009 raw_spin_lock(&ctx->lock);
3010 perf_pmu_disable(ctx->pmu);
3011
3012 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3013 if (event->state != PERF_EVENT_STATE_ACTIVE)
3014 continue;
3015
3016 if (!event_filter_match(event))
3017 continue;
3018
3019 perf_pmu_disable(event->pmu);
3020
3021 hwc = &event->hw;
3022
3023 if (hwc->interrupts == MAX_INTERRUPTS) {
3024 hwc->interrupts = 0;
3025 perf_log_throttle(event, 1);
3026 event->pmu->start(event, 0);
3027 }
3028
3029 if (!event->attr.freq || !event->attr.sample_freq)
3030 goto next;
3031
3032
3033
3034
3035 event->pmu->stop(event, PERF_EF_UPDATE);
3036
3037 now = local64_read(&event->count);
3038 delta = now - hwc->freq_count_stamp;
3039 hwc->freq_count_stamp = now;
3040
3041
3042
3043
3044
3045
3046
3047
3048 if (delta > 0)
3049 perf_adjust_period(event, period, delta, false);
3050
3051 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3052 next:
3053 perf_pmu_enable(event->pmu);
3054 }
3055
3056 perf_pmu_enable(ctx->pmu);
3057 raw_spin_unlock(&ctx->lock);
3058}
3059
3060
3061
3062
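/*
 * Round-robin the flexible groups so that, over time, every group gets a
 * turn on the PMU.  Rotation can be temporarily suppressed through
 * ctx->rotate_disable.
 */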
3063static void rotate_ctx(struct perf_event_context *ctx)
3064{
3065
3066
3067
3068
3069 if (!ctx->rotate_disable)
3070 list_rotate_left(&ctx->flexible_groups);
3071}
3072
3073static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3074{
3075 struct perf_event_context *ctx = NULL;
3076 int rotate = 0;
3077
3078 if (cpuctx->ctx.nr_events) {
3079 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3080 rotate = 1;
3081 }
3082
3083 ctx = cpuctx->task_ctx;
3084 if (ctx && ctx->nr_events) {
3085 if (ctx->nr_events != ctx->nr_active)
3086 rotate = 1;
3087 }
3088
3089 if (!rotate)
3090 goto done;
3091
3092 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3093 perf_pmu_disable(cpuctx->ctx.pmu);
3094
3095 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3096 if (ctx)
3097 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3098
3099 rotate_ctx(&cpuctx->ctx);
3100 if (ctx)
3101 rotate_ctx(ctx);
3102
3103 perf_event_sched_in(cpuctx, ctx, current);
3104
3105 perf_pmu_enable(cpuctx->ctx.pmu);
3106 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3107done:
3108
3109 return rotate;
3110}
3111
3112#ifdef CONFIG_NO_HZ_FULL
3113bool perf_event_can_stop_tick(void)
3114{
3115 if (atomic_read(&nr_freq_events) ||
3116 __this_cpu_read(perf_throttled_count))
3117 return false;
3118 else
3119 return true;
3120}
3121#endif
3122
3123void perf_event_task_tick(void)
3124{
3125 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3126 struct perf_event_context *ctx, *tmp;
3127 int throttled;
3128
3129 WARN_ON(!irqs_disabled());
3130
3131 __this_cpu_inc(perf_throttled_seq);
3132 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3133
3134 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3135 perf_adjust_freq_unthr_context(ctx, throttled);
3136}
3137
3138static int event_enable_on_exec(struct perf_event *event,
3139 struct perf_event_context *ctx)
3140{
3141 if (!event->attr.enable_on_exec)
3142 return 0;
3143
3144 event->attr.enable_on_exec = 0;
3145 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3146 return 0;
3147
3148 __perf_event_mark_enabled(event);
3149
3150 return 1;
3151}
3152
3153
3154
3155
3156
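/*
 * Enable all events in current's context @ctxn that were created with
 * attr.enable_on_exec set; called from perf_event_exec().  The context is
 * scheduled out, the flagged events are marked enabled (and the context
 * uncloned if any were), then the context is scheduled back in.
 */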
3157static void perf_event_enable_on_exec(int ctxn)
3158{
3159 struct perf_event_context *ctx, *clone_ctx = NULL;
3160 struct perf_event *event;
3161 unsigned long flags;
3162 int enabled = 0;
3163 int ret;
3164
3165 local_irq_save(flags);
3166 ctx = current->perf_event_ctxp[ctxn];
3167 if (!ctx || !ctx->nr_events)
3168 goto out;
3169
3170
3171
3172
3173
3174
3175
3176
3177 perf_cgroup_sched_out(current, NULL);
3178
3179 raw_spin_lock(&ctx->lock);
3180 task_ctx_sched_out(ctx);
3181
3182 list_for_each_entry(event, &ctx->event_list, event_entry) {
3183 ret = event_enable_on_exec(event, ctx);
3184 if (ret)
3185 enabled = 1;
3186 }
3187
3188
3189
3190
3191 if (enabled)
3192 clone_ctx = unclone_ctx(ctx);
3193
3194 raw_spin_unlock(&ctx->lock);
3195
3196
3197
3198
3199 perf_event_context_sched_in(ctx, ctx->task);
3200out:
3201 local_irq_restore(flags);
3202
3203 if (clone_ctx)
3204 put_ctx(clone_ctx);
3205}
3206
3207void perf_event_exec(void)
3208{
3209 int ctxn;
3210
3211 rcu_read_lock();
3212 for_each_task_context_nr(ctxn)
3213 perf_event_enable_on_exec(ctxn);
3214 rcu_read_unlock();
3215}
3216
3217struct perf_read_data {
3218 struct perf_event *event;
3219 bool group;
3220 int ret;
3221};
3222
3223
3224
3225
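/*
 * Cross-CPU call (see perf_event_read()) to read an event's count on the
 * CPU it is active on; bail if the context is no longer current there.
 * For a group read, open a PERF_PMU_TXN_READ transaction and read all
 * active siblings as well.
 */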
3226static void __perf_event_read(void *info)
3227{
3228 struct perf_read_data *data = info;
3229 struct perf_event *sub, *event = data->event;
3230 struct perf_event_context *ctx = event->ctx;
3231 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3232 struct pmu *pmu = event->pmu;
3233
3234
3235
3236
3237
3238
3239
3240
3241 if (ctx->task && cpuctx->task_ctx != ctx)
3242 return;
3243
3244 raw_spin_lock(&ctx->lock);
3245 if (ctx->is_active) {
3246 update_context_time(ctx);
3247 update_cgrp_time_from_event(event);
3248 }
3249
3250 update_event_times(event);
3251 if (event->state != PERF_EVENT_STATE_ACTIVE)
3252 goto unlock;
3253
3254 if (!data->group) {
3255 pmu->read(event);
3256 data->ret = 0;
3257 goto unlock;
3258 }
3259
3260 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3261
3262 pmu->read(event);
3263
3264 list_for_each_entry(sub, &event->sibling_list, group_entry) {
3265 update_event_times(sub);
3266 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3267
3268
3269
3270
3271 sub->pmu->read(sub);
3272 }
3273 }
3274
3275 data->ret = pmu->commit_txn(pmu);
3276
3277unlock:
3278 raw_spin_unlock(&ctx->lock);
3279}
3280
3281static inline u64 perf_event_count(struct perf_event *event)
3282{
3283 if (event->pmu->count)
3284 return event->pmu->count(event);
3285
3286 return __perf_event_count(event);
3287}
3288
3289
3290
3291
3292
3293
3294
3295
3296
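/*
 * Read an event count without any cross calls: only valid for events on the
 * current task or the current CPU, that are not inherited and have no custom
 * ->count() method -- the WARN_ON_ONCE()s below spell out those restrictions.
 */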
3297u64 perf_event_read_local(struct perf_event *event)
3298{
3299 unsigned long flags;
3300 u64 val;
3301
3302
3303
3304
3305
3306 local_irq_save(flags);
3307
3308
3309 WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3310 event->hw.target != current);
3311
3312
3313 WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3314 event->cpu != smp_processor_id());
3315
3316
3317
3318
3319
3320 WARN_ON_ONCE(event->attr.inherit);
3321
3322
3323
3324
3325
3326 WARN_ON_ONCE(event->pmu->count);
3327
3328
3329
3330
3331
3332
3333 if (event->oncpu == smp_processor_id())
3334 event->pmu->read(event);
3335
3336 val = local64_read(&event->count);
3337 local_irq_restore(flags);
3338
3339 return val;
3340}
3341
3342static int perf_event_read(struct perf_event *event, bool group)
3343{
3344 int ret = 0;
3345
3346
3347
3348
3349
3350 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3351 struct perf_read_data data = {
3352 .event = event,
3353 .group = group,
3354 .ret = 0,
3355 };
3356 smp_call_function_single(event->oncpu,
3357 __perf_event_read, &data, 1);
3358 ret = data.ret;
3359 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3360 struct perf_event_context *ctx = event->ctx;
3361 unsigned long flags;
3362
3363 raw_spin_lock_irqsave(&ctx->lock, flags);
3364
3365
3366
3367
3368
3369 if (ctx->is_active) {
3370 update_context_time(ctx);
3371 update_cgrp_time_from_event(event);
3372 }
3373 if (group)
3374 update_group_times(event);
3375 else
3376 update_event_times(event);
3377 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3378 }
3379
3380 return ret;
3381}
3382
3383
3384
3385
3386static void __perf_event_init_context(struct perf_event_context *ctx)
3387{
3388 raw_spin_lock_init(&ctx->lock);
3389 mutex_init(&ctx->mutex);
3390 INIT_LIST_HEAD(&ctx->active_ctx_list);
3391 INIT_LIST_HEAD(&ctx->pinned_groups);
3392 INIT_LIST_HEAD(&ctx->flexible_groups);
3393 INIT_LIST_HEAD(&ctx->event_list);
3394 atomic_set(&ctx->refcount, 1);
3395 INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3396}
3397
3398static struct perf_event_context *
3399alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3400{
3401 struct perf_event_context *ctx;
3402
3403 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3404 if (!ctx)
3405 return NULL;
3406
3407 __perf_event_init_context(ctx);
3408 if (task) {
3409 ctx->task = task;
3410 get_task_struct(task);
3411 }
3412 ctx->pmu = pmu;
3413
3414 return ctx;
3415}
3416
3417static struct task_struct *
3418find_lively_task_by_vpid(pid_t vpid)
3419{
3420 struct task_struct *task;
3421 int err;
3422
3423 rcu_read_lock();
3424 if (!vpid)
3425 task = current;
3426 else
3427 task = find_task_by_vpid(vpid);
3428 if (task)
3429 get_task_struct(task);
3430 rcu_read_unlock();
3431
3432 if (!task)
3433 return ERR_PTR(-ESRCH);
3434
3435
3436 err = -EACCES;
3437 if (!ptrace_may_access(task, PTRACE_MODE_READ))
3438 goto errout;
3439
3440 return task;
3441errout:
3442 put_task_struct(task);
3443 return ERR_PTR(err);
3444
3445}
3446
3447
3448
3449
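/*
 * Return the pinned context for @event: the per-CPU context for CPU events,
 * otherwise the task context of @task, allocating and installing one if the
 * task does not have it yet.
 */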
3450static struct perf_event_context *
3451find_get_context(struct pmu *pmu, struct task_struct *task,
3452 struct perf_event *event)
3453{
3454 struct perf_event_context *ctx, *clone_ctx = NULL;
3455 struct perf_cpu_context *cpuctx;
3456 void *task_ctx_data = NULL;
3457 unsigned long flags;
3458 int ctxn, err;
3459 int cpu = event->cpu;
3460
3461 if (!task) {
3462
3463 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3464 return ERR_PTR(-EACCES);
3465
3466
3467
3468
3469
3470
3471 if (!cpu_online(cpu))
3472 return ERR_PTR(-ENODEV);
3473
3474 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3475 ctx = &cpuctx->ctx;
3476 get_ctx(ctx);
3477 ++ctx->pin_count;
3478
3479 return ctx;
3480 }
3481
3482 err = -EINVAL;
3483 ctxn = pmu->task_ctx_nr;
3484 if (ctxn < 0)
3485 goto errout;
3486
3487 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3488 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3489 if (!task_ctx_data) {
3490 err = -ENOMEM;
3491 goto errout;
3492 }
3493 }
3494
3495retry:
3496 ctx = perf_lock_task_context(task, ctxn, &flags);
3497 if (ctx) {
3498 clone_ctx = unclone_ctx(ctx);
3499 ++ctx->pin_count;
3500
3501 if (task_ctx_data && !ctx->task_ctx_data) {
3502 ctx->task_ctx_data = task_ctx_data;
3503 task_ctx_data = NULL;
3504 }
3505 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3506
3507 if (clone_ctx)
3508 put_ctx(clone_ctx);
3509 } else {
3510 ctx = alloc_perf_context(pmu, task);
3511 err = -ENOMEM;
3512 if (!ctx)
3513 goto errout;
3514
3515 if (task_ctx_data) {
3516 ctx->task_ctx_data = task_ctx_data;
3517 task_ctx_data = NULL;
3518 }
3519
3520 err = 0;
3521 mutex_lock(&task->perf_event_mutex);
3522
3523
3524
3525
3526 if (task->flags & PF_EXITING)
3527 err = -ESRCH;
3528 else if (task->perf_event_ctxp[ctxn])
3529 err = -EAGAIN;
3530 else {
3531 get_ctx(ctx);
3532 ++ctx->pin_count;
3533 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3534 }
3535 mutex_unlock(&task->perf_event_mutex);
3536
3537 if (unlikely(err)) {
3538 put_ctx(ctx);
3539
3540 if (err == -EAGAIN)
3541 goto retry;
3542 goto errout;
3543 }
3544 }
3545
3546 kfree(task_ctx_data);
3547 return ctx;
3548
3549errout:
3550 kfree(task_ctx_data);
3551 return ERR_PTR(err);
3552}
3553
3554static void perf_event_free_filter(struct perf_event *event);
3555static void perf_event_free_bpf_prog(struct perf_event *event);
3556
3557static void free_event_rcu(struct rcu_head *head)
3558{
3559 struct perf_event *event;
3560
3561 event = container_of(head, struct perf_event, rcu_head);
3562 if (event->ns)
3563 put_pid_ns(event->ns);
3564 perf_event_free_filter(event);
3565 kfree(event);
3566}
3567
3568static void ring_buffer_attach(struct perf_event *event,
3569 struct ring_buffer *rb);
3570
3571static void unaccount_event_cpu(struct perf_event *event, int cpu)
3572{
3573 if (event->parent)
3574 return;
3575
3576 if (is_cgroup_event(event))
3577 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3578}
3579
3580static void unaccount_event(struct perf_event *event)
3581{
3582 if (event->parent)
3583 return;
3584
3585 if (event->attach_state & PERF_ATTACH_TASK)
3586 static_key_slow_dec_deferred(&perf_sched_events);
3587 if (event->attr.mmap || event->attr.mmap_data)
3588 atomic_dec(&nr_mmap_events);
3589 if (event->attr.comm)
3590 atomic_dec(&nr_comm_events);
3591 if (event->attr.task)
3592 atomic_dec(&nr_task_events);
3593 if (event->attr.freq)
3594 atomic_dec(&nr_freq_events);
3595 if (event->attr.context_switch) {
3596 static_key_slow_dec_deferred(&perf_sched_events);
3597 atomic_dec(&nr_switch_events);
3598 }
3599 if (is_cgroup_event(event))
3600 static_key_slow_dec_deferred(&perf_sched_events);
3601 if (has_branch_stack(event))
3602 static_key_slow_dec_deferred(&perf_sched_events);
3603
3604 unaccount_event_cpu(event, event->cpu);
3605}
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
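/*
 * PMUs with PERF_PMU_CAP_EXCLUSIVE may only carry either per-task events or
 * per-CPU events at any one time.  pmu->exclusive_cnt counts task events as
 * positive and CPU events as negative, so mixing the two fails with -EBUSY.
 */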
3619static int exclusive_event_init(struct perf_event *event)
3620{
3621 struct pmu *pmu = event->pmu;
3622
3623 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3624 return 0;
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639 if (event->attach_state & PERF_ATTACH_TASK) {
3640 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3641 return -EBUSY;
3642 } else {
3643 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3644 return -EBUSY;
3645 }
3646
3647 return 0;
3648}
3649
3650static void exclusive_event_destroy(struct perf_event *event)
3651{
3652 struct pmu *pmu = event->pmu;
3653
3654 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3655 return;
3656
3657
3658 if (event->attach_state & PERF_ATTACH_TASK)
3659 atomic_dec(&pmu->exclusive_cnt);
3660 else
3661 atomic_inc(&pmu->exclusive_cnt);
3662}
3663
3664static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3665{
3666 if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3667 (e1->cpu == e2->cpu ||
3668 e1->cpu == -1 ||
3669 e2->cpu == -1))
3670 return true;
3671 return false;
3672}
3673
3674
3675static bool exclusive_event_installable(struct perf_event *event,
3676 struct perf_event_context *ctx)
3677{
3678 struct perf_event *iter_event;
3679 struct pmu *pmu = event->pmu;
3680
3681 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3682 return true;
3683
3684 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3685 if (exclusive_event_match(iter_event, event))
3686 return false;
3687 }
3688
3689 return true;
3690}
3691
3692static void __free_event(struct perf_event *event)
3693{
3694 if (!event->parent) {
3695 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3696 put_callchain_buffers();
3697 }
3698
3699 perf_event_free_bpf_prog(event);
3700
3701 if (event->destroy)
3702 event->destroy(event);
3703
3704 if (event->ctx)
3705 put_ctx(event->ctx);
3706
3707 if (event->pmu) {
3708 exclusive_event_destroy(event);
3709 module_put(event->pmu->module);
3710 }
3711
3712 call_rcu(&event->rcu_head, free_event_rcu);
3713}
3714
3715static void _free_event(struct perf_event *event)
3716{
3717 irq_work_sync(&event->pending);
3718
3719 unaccount_event(event);
3720
3721 if (event->rb) {
3722
3723
3724
3725
3726
3727
3728 mutex_lock(&event->mmap_mutex);
3729 ring_buffer_attach(event, NULL);
3730 mutex_unlock(&event->mmap_mutex);
3731 }
3732
3733 if (is_cgroup_event(event))
3734 perf_detach_cgroup(event);
3735
3736 __free_event(event);
3737}
3738
3739
3740
3741
3742
3743static void free_event(struct perf_event *event)
3744{
3745 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3746 "unexpected event refcount: %ld; ptr=%p\n",
3747 atomic_long_read(&event->refcount), event)) {
3748
3749 return;
3750 }
3751
3752 _free_event(event);
3753}
3754
3755
3756
3757
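/*
 * Detach an event from its owner task's perf_event_list, taking a reference
 * on the owner so it cannot go away while we hold its perf_event_mutex.
 */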
3758static void perf_remove_from_owner(struct perf_event *event)
3759{
3760 struct task_struct *owner;
3761
3762 rcu_read_lock();
3763 owner = ACCESS_ONCE(event->owner);
3764
3765
3766
3767
3768
3769
3770 smp_read_barrier_depends();
3771 if (owner) {
3772
3773
3774
3775
3776
3777 get_task_struct(owner);
3778 }
3779 rcu_read_unlock();
3780
3781 if (owner) {
3782
3783
3784
3785
3786
3787
3788
3789
3790 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3791
3792
3793
3794
3795
3796
3797
3798 if (event->owner)
3799 list_del_init(&event->owner_entry);
3800 mutex_unlock(&owner->perf_event_mutex);
3801 put_task_struct(owner);
3802 }
3803}
3804
3805static void put_event(struct perf_event *event)
3806{
3807 struct perf_event_context *ctx;
3808
3809 if (!atomic_long_dec_and_test(&event->refcount))
3810 return;
3811
3812 if (!is_kernel_event(event))
3813 perf_remove_from_owner(event);
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827 ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
3828 WARN_ON_ONCE(ctx->parent_ctx);
3829 perf_remove_from_context(event, true);
3830 perf_event_ctx_unlock(event, ctx);
3831
3832 _free_event(event);
3833}
3834
3835int perf_event_release_kernel(struct perf_event *event)
3836{
3837 put_event(event);
3838 return 0;
3839}
3840EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3841
3842
3843
3844
3845static int perf_release(struct inode *inode, struct file *file)
3846{
3847 put_event(file->private_data);
3848 return 0;
3849}
3850
3851
3852
3853
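/*
 * Deferred work that removes and frees child events whose parent event has
 * already been closed (orphaned events), dropping the parent reference each
 * such child still held.
 */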
3854static void orphans_remove_work(struct work_struct *work)
3855{
3856 struct perf_event_context *ctx;
3857 struct perf_event *event, *tmp;
3858
3859 ctx = container_of(work, struct perf_event_context,
3860 orphans_remove.work);
3861
3862 mutex_lock(&ctx->mutex);
3863 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3864 struct perf_event *parent_event = event->parent;
3865
3866 if (!is_orphaned_child(event))
3867 continue;
3868
3869 perf_remove_from_context(event, true);
3870
3871 mutex_lock(&parent_event->child_mutex);
3872 list_del_init(&event->child_list);
3873 mutex_unlock(&parent_event->child_mutex);
3874
3875 free_event(event);
3876 put_event(parent_event);
3877 }
3878
3879 raw_spin_lock_irq(&ctx->lock);
3880 ctx->orphans_remove_sched = false;
3881 raw_spin_unlock_irq(&ctx->lock);
3882 mutex_unlock(&ctx->mutex);
3883
3884 put_ctx(ctx);
3885}
3886
3887u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3888{
3889 struct perf_event *child;
3890 u64 total = 0;
3891
3892 *enabled = 0;
3893 *running = 0;
3894
3895 mutex_lock(&event->child_mutex);
3896
3897 (void)perf_event_read(event, false);
3898 total += perf_event_count(event);
3899
3900 *enabled += event->total_time_enabled +
3901 atomic64_read(&event->child_total_time_enabled);
3902 *running += event->total_time_running +
3903 atomic64_read(&event->child_total_time_running);
3904
3905 list_for_each_entry(child, &event->child_list, child_list) {
3906 (void)perf_event_read(child, false);
3907 total += perf_event_count(child);
3908 *enabled += child->total_time_enabled;
3909 *running += child->total_time_running;
3910 }
3911 mutex_unlock(&event->child_mutex);
3912
3913 return total;
3914}
3915EXPORT_SYMBOL_GPL(perf_event_read_value);
3916
3917static int __perf_read_group_add(struct perf_event *leader,
3918 u64 read_format, u64 *values)
3919{
3920 struct perf_event *sub;
3921 int n = 1;
3922 int ret;
3923
3924 ret = perf_event_read(leader, true);
3925 if (ret)
3926 return ret;
3927
3928
3929
3930
3931
3932
3933 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3934 values[n++] += leader->total_time_enabled +
3935 atomic64_read(&leader->child_total_time_enabled);
3936 }
3937
3938 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3939 values[n++] += leader->total_time_running +
3940 atomic64_read(&leader->child_total_time_running);
3941 }
3942
3943
3944
3945
3946 values[n++] += perf_event_count(leader);
3947 if (read_format & PERF_FORMAT_ID)
3948 values[n++] = primary_event_id(leader);
3949
3950 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3951 values[n++] += perf_event_count(sub);
3952 if (read_format & PERF_FORMAT_ID)
3953 values[n++] = primary_event_id(sub);
3954 }
3955
3956 return 0;
3957}
3958
3959static int perf_read_group(struct perf_event *event,
3960 u64 read_format, char __user *buf)
3961{
3962 struct perf_event *leader = event->group_leader, *child;
3963 struct perf_event_context *ctx = leader->ctx;
3964 int ret;
3965 u64 *values;
3966
3967 lockdep_assert_held(&ctx->mutex);
3968
3969 values = kzalloc(event->read_size, GFP_KERNEL);
3970 if (!values)
3971 return -ENOMEM;
3972
3973 values[0] = 1 + leader->nr_siblings;
3974
3975
3976
3977
3978
3979 mutex_lock(&leader->child_mutex);
3980
3981 ret = __perf_read_group_add(leader, read_format, values);
3982 if (ret)
3983 goto unlock;
3984
3985 list_for_each_entry(child, &leader->child_list, child_list) {
3986 ret = __perf_read_group_add(child, read_format, values);
3987 if (ret)
3988 goto unlock;
3989 }
3990
3991 mutex_unlock(&leader->child_mutex);
3992
3993 ret = event->read_size;
3994 if (copy_to_user(buf, values, event->read_size))
3995 ret = -EFAULT;
3996 goto out;
3997
3998unlock:
3999 mutex_unlock(&leader->child_mutex);
4000out:
4001 kfree(values);
4002 return ret;
4003}
4004
4005static int perf_read_one(struct perf_event *event,
4006 u64 read_format, char __user *buf)
4007{
4008 u64 enabled, running;
4009 u64 values[4];
4010 int n = 0;
4011
4012 values[n++] = perf_event_read_value(event, &enabled, &running);
4013 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4014 values[n++] = enabled;
4015 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4016 values[n++] = running;
4017 if (read_format & PERF_FORMAT_ID)
4018 values[n++] = primary_event_id(event);
4019
4020 if (copy_to_user(buf, values, n * sizeof(u64)))
4021 return -EFAULT;
4022
4023 return n * sizeof(u64);
4024}
4025
4026static bool is_event_hup(struct perf_event *event)
4027{
4028 bool no_children;
4029
4030 if (event->state != PERF_EVENT_STATE_EXIT)
4031 return false;
4032
4033 mutex_lock(&event->child_mutex);
4034 no_children = list_empty(&event->child_list);
4035 mutex_unlock(&event->child_mutex);
4036 return no_children;
4037}
4038
4039
4040
4041
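/*
 * Read the event value into a userspace buffer; @count must be at least
 * event->read_size, and an event in error state reads as end-of-file.
 */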
4042static ssize_t
4043__perf_read(struct perf_event *event, char __user *buf, size_t count)
4044{
4045 u64 read_format = event->attr.read_format;
4046 int ret;
4047
4048
4049
4050
4051
4052
4053 if (event->state == PERF_EVENT_STATE_ERROR)
4054 return 0;
4055
4056 if (count < event->read_size)
4057 return -ENOSPC;
4058
4059 WARN_ON_ONCE(event->ctx->parent_ctx);
4060 if (read_format & PERF_FORMAT_GROUP)
4061 ret = perf_read_group(event, read_format, buf);
4062 else
4063 ret = perf_read_one(event, read_format, buf);
4064
4065 return ret;
4066}
4067
4068static ssize_t
4069perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4070{
4071 struct perf_event *event = file->private_data;
4072 struct perf_event_context *ctx;
4073 int ret;
4074
4075 ctx = perf_event_ctx_lock(event);
4076 ret = __perf_read(event, buf, count);
4077 perf_event_ctx_unlock(event, ctx);
4078
4079 return ret;
4080}
4081
4082static unsigned int perf_poll(struct file *file, poll_table *wait)
4083{
4084 struct perf_event *event = file->private_data;
4085 struct ring_buffer *rb;
4086 unsigned int events = POLLHUP;
4087
4088 poll_wait(file, &event->waitq, wait);
4089
4090 if (is_event_hup(event))
4091 return events;
4092
4093
4094
4095
4096
4097 mutex_lock(&event->mmap_mutex);
4098 rb = event->rb;
4099 if (rb)
4100 events = atomic_xchg(&rb->poll, 0);
4101 mutex_unlock(&event->mmap_mutex);
4102 return events;
4103}
4104
4105static void _perf_event_reset(struct perf_event *event)
4106{
4107 (void)perf_event_read(event, false);
4108 local64_set(&event->count, 0);
4109 perf_event_update_userpage(event);
4110}
4111
4112
4113
4114
4115
4116
4117
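/*
 * Apply @func to @event and every inherited child, holding the event's
 * child_mutex so the child list cannot change underneath us.
 */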
4118static void perf_event_for_each_child(struct perf_event *event,
4119 void (*func)(struct perf_event *))
4120{
4121 struct perf_event *child;
4122
4123 WARN_ON_ONCE(event->ctx->parent_ctx);
4124
4125 mutex_lock(&event->child_mutex);
4126 func(event);
4127 list_for_each_entry(child, &event->child_list, child_list)
4128 func(child);
4129 mutex_unlock(&event->child_mutex);
4130}
4131
4132static void perf_event_for_each(struct perf_event *event,
4133 void (*func)(struct perf_event *))
4134{
4135 struct perf_event_context *ctx = event->ctx;
4136 struct perf_event *sibling;
4137
4138 lockdep_assert_held(&ctx->mutex);
4139
4140 event = event->group_leader;
4141
4142 perf_event_for_each_child(event, func);
4143 list_for_each_entry(sibling, &event->sibling_list, group_entry)
4144 perf_event_for_each_child(sibling, func);
4145}
4146
4147struct period_event {
4148 struct perf_event *event;
4149 u64 value;
4150};
4151
4152static int __perf_event_period(void *info)
4153{
4154 struct period_event *pe = info;
4155 struct perf_event *event = pe->event;
4156 struct perf_event_context *ctx = event->ctx;
4157 u64 value = pe->value;
4158 bool active;
4159
4160 raw_spin_lock(&ctx->lock);
4161 if (event->attr.freq) {
4162 event->attr.sample_freq = value;
4163 } else {
4164 event->attr.sample_period = value;
4165 event->hw.sample_period = value;
4166 }
4167
4168 active = (event->state == PERF_EVENT_STATE_ACTIVE);
4169 if (active) {
4170 perf_pmu_disable(ctx->pmu);
4171 event->pmu->stop(event, PERF_EF_UPDATE);
4172 }
4173
4174 local64_set(&event->hw.period_left, 0);
4175
4176 if (active) {
4177 event->pmu->start(event, PERF_EF_RELOAD);
4178 perf_pmu_enable(ctx->pmu);
4179 }
4180 raw_spin_unlock(&ctx->lock);
4181
4182 return 0;
4183}
4184
4185static int perf_event_period(struct perf_event *event, u64 __user *arg)
4186{
4187 struct period_event pe = { .event = event, };
4188 struct perf_event_context *ctx = event->ctx;
4189 struct task_struct *task;
4190 u64 value;
4191
4192 if (!is_sampling_event(event))
4193 return -EINVAL;
4194
4195 if (copy_from_user(&value, arg, sizeof(value)))
4196 return -EFAULT;
4197
4198 if (!value)
4199 return -EINVAL;
4200
4201 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4202 return -EINVAL;
4203
4204 task = ctx->task;
4205 pe.value = value;
4206
4207 if (!task) {
4208 cpu_function_call(event->cpu, __perf_event_period, &pe);
4209 return 0;
4210 }
4211
4212retry:
4213 if (!task_function_call(task, __perf_event_period, &pe))
4214 return 0;
4215
4216 raw_spin_lock_irq(&ctx->lock);
4217 if (ctx->is_active) {
4218 raw_spin_unlock_irq(&ctx->lock);
4219 task = ctx->task;
4220 goto retry;
4221 }
4222
4223 if (event->attr.freq) {
4224 event->attr.sample_freq = value;
4225 } else {
4226 event->attr.sample_period = value;
4227 event->hw.sample_period = value;
4228 }
4229
4230 local64_set(&event->hw.period_left, 0);
4231 raw_spin_unlock_irq(&ctx->lock);
4232
4233 return 0;
4234}
4235
4236static const struct file_operations perf_fops;
4237
4238static inline int perf_fget_light(int fd, struct fd *p)
4239{
4240 struct fd f = fdget(fd);
4241 if (!f.file)
4242 return -EBADF;
4243
4244 if (f.file->f_op != &perf_fops) {
4245 fdput(f);
4246 return -EBADF;
4247 }
4248 *p = f;
4249 return 0;
4250}
4251
4252static int perf_event_set_output(struct perf_event *event,
4253 struct perf_event *output_event);
4254static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4255static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4256
4257static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4258{
4259 void (*func)(struct perf_event *);
4260 u32 flags = arg;
4261
4262 switch (cmd) {
4263 case PERF_EVENT_IOC_ENABLE:
4264 func = _perf_event_enable;
4265 break;
4266 case PERF_EVENT_IOC_DISABLE:
4267 func = _perf_event_disable;
4268 break;
4269 case PERF_EVENT_IOC_RESET:
4270 func = _perf_event_reset;
4271 break;
4272
4273 case PERF_EVENT_IOC_REFRESH:
4274 return _perf_event_refresh(event, arg);
4275
4276 case PERF_EVENT_IOC_PERIOD:
4277 return perf_event_period(event, (u64 __user *)arg);
4278
4279 case PERF_EVENT_IOC_ID:
4280 {
4281 u64 id = primary_event_id(event);
4282
4283 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4284 return -EFAULT;
4285 return 0;
4286 }
4287
4288 case PERF_EVENT_IOC_SET_OUTPUT:
4289 {
4290 int ret;
4291 if (arg != -1) {
4292 struct perf_event *output_event;
4293 struct fd output;
4294 ret = perf_fget_light(arg, &output);
4295 if (ret)
4296 return ret;
4297 output_event = output.file->private_data;
4298 ret = perf_event_set_output(event, output_event);
4299 fdput(output);
4300 } else {
4301 ret = perf_event_set_output(event, NULL);
4302 }
4303 return ret;
4304 }
4305
4306 case PERF_EVENT_IOC_SET_FILTER:
4307 return perf_event_set_filter(event, (void __user *)arg);
4308
4309 case PERF_EVENT_IOC_SET_BPF:
4310 return perf_event_set_bpf_prog(event, arg);
4311
4312 default:
4313 return -ENOTTY;
4314 }
4315
4316 if (flags & PERF_IOC_FLAG_GROUP)
4317 perf_event_for_each(event, func);
4318 else
4319 perf_event_for_each_child(event, func);
4320
4321 return 0;
4322}
4323
4324static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4325{
4326 struct perf_event *event = file->private_data;
4327 struct perf_event_context *ctx;
4328 long ret;
4329
4330 ctx = perf_event_ctx_lock(event);
4331 ret = _perf_ioctl(event, cmd, arg);
4332 perf_event_ctx_unlock(event, ctx);
4333
4334 return ret;
4335}
4336
4337#ifdef CONFIG_COMPAT
4338static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4339 unsigned long arg)
4340{
4341 switch (_IOC_NR(cmd)) {
4342 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4343 case _IOC_NR(PERF_EVENT_IOC_ID):
4344
4345 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4346 cmd &= ~IOCSIZE_MASK;
4347 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4348 }
4349 break;
4350 }
4351 return perf_ioctl(file, cmd, arg);
4352}
4353#else
4354# define perf_compat_ioctl NULL
4355#endif
4356
4357int perf_event_task_enable(void)
4358{
4359 struct perf_event_context *ctx;
4360 struct perf_event *event;
4361
4362	mutex_lock(&current->perf_event_mutex);
4363	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4364		ctx = perf_event_ctx_lock(event);
4365		perf_event_for_each_child(event, _perf_event_enable);
4366		perf_event_ctx_unlock(event, ctx);
4367	}
4368	mutex_unlock(&current->perf_event_mutex);
4369
4370 return 0;
4371}
4372
4373int perf_event_task_disable(void)
4374{
4375 struct perf_event_context *ctx;
4376 struct perf_event *event;
4377
4378	mutex_lock(&current->perf_event_mutex);
4379	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4380		ctx = perf_event_ctx_lock(event);
4381		perf_event_for_each_child(event, _perf_event_disable);
4382		perf_event_ctx_unlock(event, ctx);
4383	}
4384	mutex_unlock(&current->perf_event_mutex);
4385
4386 return 0;
4387}
4388
4389static int perf_event_index(struct perf_event *event)
4390{
4391 if (event->hw.state & PERF_HES_STOPPED)
4392 return 0;
4393
4394 if (event->state != PERF_EVENT_STATE_ACTIVE)
4395 return 0;
4396
4397 return event->pmu->event_idx(event);
4398}
4399
4400static void calc_timer_values(struct perf_event *event,
4401 u64 *now,
4402 u64 *enabled,
4403 u64 *running)
4404{
4405 u64 ctx_time;
4406
4407 *now = perf_clock();
4408 ctx_time = event->shadow_ctx_time + *now;
4409 *enabled = ctx_time - event->tstamp_enabled;
4410 *running = ctx_time - event->tstamp_running;
4411}
4412
4413static void perf_event_init_userpage(struct perf_event *event)
4414{
4415 struct perf_event_mmap_page *userpg;
4416 struct ring_buffer *rb;
4417
4418 rcu_read_lock();
4419 rb = rcu_dereference(event->rb);
4420 if (!rb)
4421 goto unlock;
4422
4423 userpg = rb->user_page;
4424
4425
4426 userpg->cap_bit0_is_deprecated = 1;
4427 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4428 userpg->data_offset = PAGE_SIZE;
4429 userpg->data_size = perf_data_size(rb);
4430
4431unlock:
4432 rcu_read_unlock();
4433}
4434
4435void __weak arch_perf_update_userpage(
4436 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4437{
4438}
4439
4440
4441
4442
4443
4444
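/*
 * Refresh the fields userspace reads through the mmap()ed control page.
 * The ++userpg->lock / barrier() pairs form a seqcount so a concurrent
 * reader can detect and retry a torn update.
 */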
4445void perf_event_update_userpage(struct perf_event *event)
4446{
4447 struct perf_event_mmap_page *userpg;
4448 struct ring_buffer *rb;
4449 u64 enabled, running, now;
4450
4451 rcu_read_lock();
4452 rb = rcu_dereference(event->rb);
4453 if (!rb)
4454 goto unlock;
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465 calc_timer_values(event, &now, &enabled, &running);
4466
4467 userpg = rb->user_page;
4468
4469
4470
4471
4472 preempt_disable();
4473 ++userpg->lock;
4474 barrier();
4475 userpg->index = perf_event_index(event);
4476 userpg->offset = perf_event_count(event);
4477 if (userpg->index)
4478 userpg->offset -= local64_read(&event->hw.prev_count);
4479
4480 userpg->time_enabled = enabled +
4481 atomic64_read(&event->child_total_time_enabled);
4482
4483 userpg->time_running = running +
4484 atomic64_read(&event->child_total_time_running);
4485
4486 arch_perf_update_userpage(event, userpg, now);
4487
4488 barrier();
4489 ++userpg->lock;
4490 preempt_enable();
4491unlock:
4492 rcu_read_unlock();
4493}
4494
4495static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4496{
4497 struct perf_event *event = vma->vm_file->private_data;
4498 struct ring_buffer *rb;
4499 int ret = VM_FAULT_SIGBUS;
4500
4501 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4502 if (vmf->pgoff == 0)
4503 ret = 0;
4504 return ret;
4505 }
4506
4507 rcu_read_lock();
4508 rb = rcu_dereference(event->rb);
4509 if (!rb)
4510 goto unlock;
4511
4512 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4513 goto unlock;
4514
4515 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4516 if (!vmf->page)
4517 goto unlock;
4518
4519 get_page(vmf->page);
4520 vmf->page->mapping = vma->vm_file->f_mapping;
4521 vmf->page->index = vmf->pgoff;
4522
4523 ret = 0;
4524unlock:
4525 rcu_read_unlock();
4526
4527 return ret;
4528}
4529
4530static void ring_buffer_attach(struct perf_event *event,
4531 struct ring_buffer *rb)
4532{
4533 struct ring_buffer *old_rb = NULL;
4534 unsigned long flags;
4535
4536 if (event->rb) {
4537
4538
4539
4540
4541 WARN_ON_ONCE(event->rcu_pending);
4542
4543 old_rb = event->rb;
4544 spin_lock_irqsave(&old_rb->event_lock, flags);
4545 list_del_rcu(&event->rb_entry);
4546 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4547
4548 event->rcu_batches = get_state_synchronize_rcu();
4549 event->rcu_pending = 1;
4550 }
4551
4552 if (rb) {
4553 if (event->rcu_pending) {
4554 cond_synchronize_rcu(event->rcu_batches);
4555 event->rcu_pending = 0;
4556 }
4557
4558 spin_lock_irqsave(&rb->event_lock, flags);
4559 list_add_rcu(&event->rb_entry, &rb->event_list);
4560 spin_unlock_irqrestore(&rb->event_lock, flags);
4561 }
4562
4563 rcu_assign_pointer(event->rb, rb);
4564
4565 if (old_rb) {
4566 ring_buffer_put(old_rb);
4567
4568
4569
4570
4571
4572 wake_up_all(&event->waitq);
4573 }
4574}
4575
4576static void ring_buffer_wakeup(struct perf_event *event)
4577{
4578 struct ring_buffer *rb;
4579
4580 rcu_read_lock();
4581 rb = rcu_dereference(event->rb);
4582 if (rb) {
4583 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4584 wake_up_all(&event->waitq);
4585 }
4586 rcu_read_unlock();
4587}
4588
4589struct ring_buffer *ring_buffer_get(struct perf_event *event)
4590{
4591 struct ring_buffer *rb;
4592
4593 rcu_read_lock();
4594 rb = rcu_dereference(event->rb);
4595 if (rb) {
4596 if (!atomic_inc_not_zero(&rb->refcount))
4597 rb = NULL;
4598 }
4599 rcu_read_unlock();
4600
4601 return rb;
4602}
4603
4604void ring_buffer_put(struct ring_buffer *rb)
4605{
4606 if (!atomic_dec_and_test(&rb->refcount))
4607 return;
4608
4609 WARN_ON_ONCE(!list_empty(&rb->event_list));
4610
4611 call_rcu(&rb->rcu_head, rb_free_rcu);
4612}
4613
4614static void perf_mmap_open(struct vm_area_struct *vma)
4615{
4616 struct perf_event *event = vma->vm_file->private_data;
4617
4618 atomic_inc(&event->mmap_count);
4619 atomic_inc(&event->rb->mmap_count);
4620
4621 if (vma->vm_pgoff)
4622 atomic_inc(&event->rb->aux_mmap_count);
4623
4624 if (event->pmu->event_mapped)
4625 event->pmu->event_mapped(event);
4626}
4627
4628
4629
4630
4631
4632
4633
4634
4635
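/*
 * A ring buffer can be mapped by several events (e.g. via
 * PERF_EVENT_IOC_SET_OUTPUT), so once the last mmap of this buffer goes
 * away we must detach it from every event still using it and undo the
 * memory-lock accounting done at mmap() time.
 */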
4636static void perf_mmap_close(struct vm_area_struct *vma)
4637{
4638 struct perf_event *event = vma->vm_file->private_data;
4639
4640 struct ring_buffer *rb = ring_buffer_get(event);
4641 struct user_struct *mmap_user = rb->mmap_user;
4642 int mmap_locked = rb->mmap_locked;
4643 unsigned long size = perf_data_size(rb);
4644
4645 if (event->pmu->event_unmapped)
4646 event->pmu->event_unmapped(event);
4647
4648
4649
4650
4651
4652
4653 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4654 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4655 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4656 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4657
4658 rb_free_aux(rb);
4659 mutex_unlock(&event->mmap_mutex);
4660 }
4661
4662 atomic_dec(&rb->mmap_count);
4663
4664 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
4665 goto out_put;
4666
4667 ring_buffer_attach(event, NULL);
4668 mutex_unlock(&event->mmap_mutex);
4669
4670
4671 if (atomic_read(&rb->mmap_count))
4672 goto out_put;
4673
4674
4675
4676
4677
4678
4679again:
4680 rcu_read_lock();
4681 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4682 if (!atomic_long_inc_not_zero(&event->refcount)) {
4683
4684
4685
4686
4687 continue;
4688 }
4689 rcu_read_unlock();
4690
4691 mutex_lock(&event->mmap_mutex);
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702 if (event->rb == rb)
4703 ring_buffer_attach(event, NULL);
4704
4705 mutex_unlock(&event->mmap_mutex);
4706 put_event(event);
4707
4708
4709
4710
4711
4712 goto again;
4713 }
4714 rcu_read_unlock();
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4726 vma->vm_mm->pinned_vm -= mmap_locked;
4727 free_uid(mmap_user);
4728
4729out_put:
4730 ring_buffer_put(rb);
4731}
4732
4733static const struct vm_operations_struct perf_mmap_vmops = {
4734 .open = perf_mmap_open,
4735 .close = perf_mmap_close,
4736 .fault = perf_mmap_fault,
4737 .page_mkwrite = perf_mmap_fault,
4738};
4739
4740static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4741{
4742 struct perf_event *event = file->private_data;
4743 unsigned long user_locked, user_lock_limit;
4744 struct user_struct *user = current_user();
4745 unsigned long locked, lock_limit;
4746 struct ring_buffer *rb = NULL;
4747 unsigned long vma_size;
4748 unsigned long nr_pages;
4749 long user_extra = 0, extra = 0;
4750 int ret = 0, flags = 0;
4751
4752
4753
4754
4755
4756
4757 if (event->cpu == -1 && event->attr.inherit)
4758 return -EINVAL;
4759
4760 if (!(vma->vm_flags & VM_SHARED))
4761 return -EINVAL;
4762
4763 vma_size = vma->vm_end - vma->vm_start;
4764
4765 if (vma->vm_pgoff == 0) {
4766 nr_pages = (vma_size / PAGE_SIZE) - 1;
4767 } else {
4768
4769
4770
4771
4772
4773 u64 aux_offset, aux_size;
4774
4775 if (!event->rb)
4776 return -EINVAL;
4777
4778 nr_pages = vma_size / PAGE_SIZE;
4779
4780 mutex_lock(&event->mmap_mutex);
4781 ret = -EINVAL;
4782
4783 rb = event->rb;
4784 if (!rb)
4785 goto aux_unlock;
4786
4787 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4788 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4789
4790 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4791 goto aux_unlock;
4792
4793 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4794 goto aux_unlock;
4795
4796
4797 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4798 goto aux_unlock;
4799
4800 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4801 goto aux_unlock;
4802
4803
4804 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4805 goto aux_unlock;
4806
4807 if (!is_power_of_2(nr_pages))
4808 goto aux_unlock;
4809
4810 if (!atomic_inc_not_zero(&rb->mmap_count))
4811 goto aux_unlock;
4812
4813 if (rb_has_aux(rb)) {
4814 atomic_inc(&rb->aux_mmap_count);
4815 ret = 0;
4816 goto unlock;
4817 }
4818
4819 atomic_set(&rb->aux_mmap_count, 1);
4820 user_extra = nr_pages;
4821
4822 goto accounting;
4823 }
4824
4825
4826
4827
4828
4829 if (nr_pages != 0 && !is_power_of_2(nr_pages))
4830 return -EINVAL;
4831
4832 if (vma_size != PAGE_SIZE * (1 + nr_pages))
4833 return -EINVAL;
4834
4835 WARN_ON_ONCE(event->ctx->parent_ctx);
4836again:
4837 mutex_lock(&event->mmap_mutex);
4838 if (event->rb) {
4839 if (event->rb->nr_pages != nr_pages) {
4840 ret = -EINVAL;
4841 goto unlock;
4842 }
4843
4844 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4845
4846
4847
4848
4849
4850 mutex_unlock(&event->mmap_mutex);
4851 goto again;
4852 }
4853
4854 goto unlock;
4855 }
4856
4857 user_extra = nr_pages + 1;
4858
4859accounting:
4860 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4861
4862
4863
4864
4865 user_lock_limit *= num_online_cpus();
4866
4867 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4868
4869 if (user_locked > user_lock_limit)
4870 extra = user_locked - user_lock_limit;
4871
4872 lock_limit = rlimit(RLIMIT_MEMLOCK);
4873 lock_limit >>= PAGE_SHIFT;
4874 locked = vma->vm_mm->pinned_vm + extra;
4875
4876 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4877 !capable(CAP_IPC_LOCK)) {
4878 ret = -EPERM;
4879 goto unlock;
4880 }
4881
4882 WARN_ON(!rb && event->rb);
4883
4884 if (vma->vm_flags & VM_WRITE)
4885 flags |= RING_BUFFER_WRITABLE;
4886
4887 if (!rb) {
4888 rb = rb_alloc(nr_pages,
4889 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4890 event->cpu, flags);
4891
4892 if (!rb) {
4893 ret = -ENOMEM;
4894 goto unlock;
4895 }
4896
4897 atomic_set(&rb->mmap_count, 1);
4898 rb->mmap_user = get_current_user();
4899 rb->mmap_locked = extra;
4900
4901 ring_buffer_attach(event, rb);
4902
4903 perf_event_init_userpage(event);
4904 perf_event_update_userpage(event);
4905 } else {
4906 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4907 event->attr.aux_watermark, flags);
4908 if (!ret)
4909 rb->aux_mmap_locked = extra;
4910 }
4911
4912unlock:
4913 if (!ret) {
4914 atomic_long_add(user_extra, &user->locked_vm);
4915 vma->vm_mm->pinned_vm += extra;
4916
4917 atomic_inc(&event->mmap_count);
4918 } else if (rb) {
4919 atomic_dec(&rb->mmap_count);
4920 }
4921aux_unlock:
4922 mutex_unlock(&event->mmap_mutex);
4923
4924
4925
4926
4927
4928 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4929 vma->vm_ops = &perf_mmap_vmops;
4930
4931 if (event->pmu->event_mapped)
4932 event->pmu->event_mapped(event);
4933
4934 return ret;
4935}
4936
4937static int perf_fasync(int fd, struct file *filp, int on)
4938{
4939 struct inode *inode = file_inode(filp);
4940 struct perf_event *event = filp->private_data;
4941 int retval;
4942
4943 mutex_lock(&inode->i_mutex);
4944 retval = fasync_helper(fd, filp, on, &event->fasync);
4945 mutex_unlock(&inode->i_mutex);
4946
4947 if (retval < 0)
4948 return retval;
4949
4950 return 0;
4951}
4952
4953static const struct file_operations perf_fops = {
4954 .llseek = no_llseek,
4955 .release = perf_release,
4956 .read = perf_read,
4957 .poll = perf_poll,
4958 .unlocked_ioctl = perf_ioctl,
4959 .compat_ioctl = perf_compat_ioctl,
4960 .mmap = perf_mmap,
4961 .fasync = perf_fasync,
4962};
4963
4964
4965
4966
4967
4968
4969
4970
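/*
 * Inherited events have no file of their own; their SIGIO delivery goes
 * through the parent's fd, so resolve to the parent's fasync list.
 */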
4971static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
4972{
4973
4974 if (event->parent)
4975 event = event->parent;
4976 return &event->fasync;
4977}
4978
4979void perf_event_wakeup(struct perf_event *event)
4980{
4981 ring_buffer_wakeup(event);
4982
4983 if (event->pending_kill) {
4984 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
4985 event->pending_kill = 0;
4986 }
4987}
4988
4989static void perf_pending_event(struct irq_work *entry)
4990{
4991 struct perf_event *event = container_of(entry,
4992 struct perf_event, pending);
4993 int rctx;
4994
4995 rctx = perf_swevent_get_recursion_context();
4996
4997
4998
4999
5000
5001 if (event->pending_disable) {
5002 event->pending_disable = 0;
5003 __perf_event_disable(event);
5004 }
5005
5006 if (event->pending_wakeup) {
5007 event->pending_wakeup = 0;
5008 perf_event_wakeup(event);
5009 }
5010
5011 if (rctx >= 0)
5012 perf_swevent_put_recursion_context(rctx);
5013}
5014
5015
5016
5017
5018
5019
5020struct perf_guest_info_callbacks *perf_guest_cbs;
5021
5022int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5023{
5024 perf_guest_cbs = cbs;
5025 return 0;
5026}
5027EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5028
5029int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5030{
5031 perf_guest_cbs = NULL;
5032 return 0;
5033}
5034EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5035
5036static void
5037perf_output_sample_regs(struct perf_output_handle *handle,
5038 struct pt_regs *regs, u64 mask)
5039{
5040 int bit;
5041
5042 for_each_set_bit(bit, (const unsigned long *) &mask,
5043 sizeof(mask) * BITS_PER_BYTE) {
5044 u64 val;
5045
5046 val = perf_reg_value(regs, bit);
5047 perf_output_put(handle, val);
5048 }
5049}
5050
5051static void perf_sample_regs_user(struct perf_regs *regs_user,
5052 struct pt_regs *regs,
5053 struct pt_regs *regs_user_copy)
5054{
5055 if (user_mode(regs)) {
5056 regs_user->abi = perf_reg_abi(current);
5057 regs_user->regs = regs;
5058 } else if (current->mm) {
5059 perf_get_regs_user(regs_user, regs, regs_user_copy);
5060 } else {
5061 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5062 regs_user->regs = NULL;
5063 }
5064}
5065
5066static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5067 struct pt_regs *regs)
5068{
5069 regs_intr->regs = regs;
5070 regs_intr->abi = perf_reg_abi(current);
5071}
5072
5073
5074
5075
5076
5077
5078
5079
5080
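/*
 * Bound the user stack dump by the distance from the user stack pointer to
 * TASK_SIZE; returns 0 if there is no valid user stack pointer.
 */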
5081static u64 perf_ustack_task_size(struct pt_regs *regs)
5082{
5083 unsigned long addr = perf_user_stack_pointer(regs);
5084
5085 if (!addr || addr >= TASK_SIZE)
5086 return 0;
5087
5088 return TASK_SIZE - addr;
5089}
5090
5091static u16
5092perf_sample_ustack_size(u16 stack_size, u16 header_size,
5093 struct pt_regs *regs)
5094{
5095 u64 task_size;
5096
5097
5098 if (!regs)
5099 return 0;
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5112 stack_size = min(stack_size, (u16) task_size);
5113
5114
5115 header_size += 2 * sizeof(u64);
5116
5117
5118 if ((u16) (header_size + stack_size) < header_size) {
5119
5120
5121
5122
5123 stack_size = USHRT_MAX - header_size - sizeof(u64);
5124 stack_size = round_up(stack_size, sizeof(u64));
5125 }
5126
5127 return stack_size;
5128}
5129
5130static void
5131perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5132 struct pt_regs *regs)
5133{
5134
5135 if (!regs) {
5136 u64 size = 0;
5137 perf_output_put(handle, size);
5138 } else {
5139 unsigned long sp;
5140 unsigned int rem;
5141 u64 dyn_size;
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155 perf_output_put(handle, dump_size);
5156
5157
5158 sp = perf_user_stack_pointer(regs);
5159 rem = __output_copy_user(handle, (void *) sp, dump_size);
5160 dyn_size = dump_size - rem;
5161
5162 perf_output_skip(handle, rem);
5163
5164
5165 perf_output_put(handle, dyn_size);
5166 }
5167}
5168
5169static void __perf_event_header__init_id(struct perf_event_header *header,
5170 struct perf_sample_data *data,
5171 struct perf_event *event)
5172{
5173 u64 sample_type = event->attr.sample_type;
5174
5175 data->type = sample_type;
5176 header->size += event->id_header_size;
5177
5178 if (sample_type & PERF_SAMPLE_TID) {
5179
5180 data->tid_entry.pid = perf_event_pid(event, current);
5181 data->tid_entry.tid = perf_event_tid(event, current);
5182 }
5183
5184 if (sample_type & PERF_SAMPLE_TIME)
5185 data->time = perf_event_clock(event);
5186
5187 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5188 data->id = primary_event_id(event);
5189
5190 if (sample_type & PERF_SAMPLE_STREAM_ID)
5191 data->stream_id = event->id;
5192
5193 if (sample_type & PERF_SAMPLE_CPU) {
5194 data->cpu_entry.cpu = raw_smp_processor_id();
5195 data->cpu_entry.reserved = 0;
5196 }
5197}
5198
5199void perf_event_header__init_id(struct perf_event_header *header,
5200 struct perf_sample_data *data,
5201 struct perf_event *event)
5202{
5203 if (event->attr.sample_id_all)
5204 __perf_event_header__init_id(header, data, event);
5205}
5206
5207static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5208 struct perf_sample_data *data)
5209{
5210 u64 sample_type = data->type;
5211
5212 if (sample_type & PERF_SAMPLE_TID)
5213 perf_output_put(handle, data->tid_entry);
5214
5215 if (sample_type & PERF_SAMPLE_TIME)
5216 perf_output_put(handle, data->time);
5217
5218 if (sample_type & PERF_SAMPLE_ID)
5219 perf_output_put(handle, data->id);
5220
5221 if (sample_type & PERF_SAMPLE_STREAM_ID)
5222 perf_output_put(handle, data->stream_id);
5223
5224 if (sample_type & PERF_SAMPLE_CPU)
5225 perf_output_put(handle, data->cpu_entry);
5226
5227 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5228 perf_output_put(handle, data->id);
5229}
5230
5231void perf_event__output_id_sample(struct perf_event *event,
5232 struct perf_output_handle *handle,
5233 struct perf_sample_data *sample)
5234{
5235 if (event->attr.sample_id_all)
5236 __perf_event__output_id_sample(handle, sample);
5237}
5238
5239static void perf_output_read_one(struct perf_output_handle *handle,
5240 struct perf_event *event,
5241 u64 enabled, u64 running)
5242{
5243 u64 read_format = event->attr.read_format;
5244 u64 values[4];
5245 int n = 0;
5246
5247 values[n++] = perf_event_count(event);
5248 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5249 values[n++] = enabled +
5250 atomic64_read(&event->child_total_time_enabled);
5251 }
5252 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5253 values[n++] = running +
5254 atomic64_read(&event->child_total_time_running);
5255 }
5256 if (read_format & PERF_FORMAT_ID)
5257 values[n++] = primary_event_id(event);
5258
5259 __output_copy(handle, values, n * sizeof(u64));
5260}
5261
5262
5263
5264
5265static void perf_output_read_group(struct perf_output_handle *handle,
5266 struct perf_event *event,
5267 u64 enabled, u64 running)
5268{
5269 struct perf_event *leader = event->group_leader, *sub;
5270 u64 read_format = event->attr.read_format;
5271 u64 values[5];
5272 int n = 0;
5273
5274 values[n++] = 1 + leader->nr_siblings;
5275
5276 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5277 values[n++] = enabled;
5278
5279 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5280 values[n++] = running;
5281
5282 if (leader != event)
5283 leader->pmu->read(leader);
5284
5285 values[n++] = perf_event_count(leader);
5286 if (read_format & PERF_FORMAT_ID)
5287 values[n++] = primary_event_id(leader);
5288
5289 __output_copy(handle, values, n * sizeof(u64));
5290
5291 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5292 n = 0;
5293
5294 if ((sub != event) &&
5295 (sub->state == PERF_EVENT_STATE_ACTIVE))
5296 sub->pmu->read(sub);
5297
5298 values[n++] = perf_event_count(sub);
5299 if (read_format & PERF_FORMAT_ID)
5300 values[n++] = primary_event_id(sub);
5301
5302 __output_copy(handle, values, n * sizeof(u64));
5303 }
5304}
5305
5306#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5307 PERF_FORMAT_TOTAL_TIME_RUNNING)
5308
5309static void perf_output_read(struct perf_output_handle *handle,
5310 struct perf_event *event)
5311{
5312 u64 enabled = 0, running = 0, now;
5313 u64 read_format = event->attr.read_format;
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324 if (read_format & PERF_FORMAT_TOTAL_TIMES)
5325 calc_timer_values(event, &now, &enabled, &running);
5326
5327 if (event->attr.read_format & PERF_FORMAT_GROUP)
5328 perf_output_read_group(handle, event, enabled, running);
5329 else
5330 perf_output_read_one(handle, event, enabled, running);
5331}
5332
5333void perf_output_sample(struct perf_output_handle *handle,
5334 struct perf_event_header *header,
5335 struct perf_sample_data *data,
5336 struct perf_event *event)
5337{
5338 u64 sample_type = data->type;
5339
5340 perf_output_put(handle, *header);
5341
5342 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5343 perf_output_put(handle, data->id);
5344
5345 if (sample_type & PERF_SAMPLE_IP)
5346 perf_output_put(handle, data->ip);
5347
5348 if (sample_type & PERF_SAMPLE_TID)
5349 perf_output_put(handle, data->tid_entry);
5350
5351 if (sample_type & PERF_SAMPLE_TIME)
5352 perf_output_put(handle, data->time);
5353
5354 if (sample_type & PERF_SAMPLE_ADDR)
5355 perf_output_put(handle, data->addr);
5356
5357 if (sample_type & PERF_SAMPLE_ID)
5358 perf_output_put(handle, data->id);
5359
5360 if (sample_type & PERF_SAMPLE_STREAM_ID)
5361 perf_output_put(handle, data->stream_id);
5362
5363 if (sample_type & PERF_SAMPLE_CPU)
5364 perf_output_put(handle, data->cpu_entry);
5365
5366 if (sample_type & PERF_SAMPLE_PERIOD)
5367 perf_output_put(handle, data->period);
5368
5369 if (sample_type & PERF_SAMPLE_READ)
5370 perf_output_read(handle, event);
5371
5372 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5373 if (data->callchain) {
5374 int size = 1;
5375
5376 if (data->callchain)
5377 size += data->callchain->nr;
5378
5379 size *= sizeof(u64);
5380
5381 __output_copy(handle, data->callchain, size);
5382 } else {
5383 u64 nr = 0;
5384 perf_output_put(handle, nr);
5385 }
5386 }
5387
5388 if (sample_type & PERF_SAMPLE_RAW) {
5389 if (data->raw) {
5390 u32 raw_size = data->raw->size;
5391 u32 real_size = round_up(raw_size + sizeof(u32),
5392 sizeof(u64)) - sizeof(u32);
5393 u64 zero = 0;
5394
5395 perf_output_put(handle, real_size);
5396 __output_copy(handle, data->raw->data, raw_size);
5397 if (real_size - raw_size)
5398 __output_copy(handle, &zero, real_size - raw_size);
5399 } else {
5400 struct {
5401 u32 size;
5402 u32 data;
5403 } raw = {
5404 .size = sizeof(u32),
5405 .data = 0,
5406 };
5407 perf_output_put(handle, raw);
5408 }
5409 }
5410
5411 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5412 if (data->br_stack) {
5413 size_t size;
5414
5415 size = data->br_stack->nr
5416 * sizeof(struct perf_branch_entry);
5417
5418 perf_output_put(handle, data->br_stack->nr);
5419 perf_output_copy(handle, data->br_stack->entries, size);
5420 } else {
			/*
			 * We always write at least the number of entries
			 * (zero here) so the record layout stays fixed.
			 */
5424 u64 nr = 0;
5425 perf_output_put(handle, nr);
5426 }
5427 }
5428
5429 if (sample_type & PERF_SAMPLE_REGS_USER) {
5430 u64 abi = data->regs_user.abi;

		/*
		 * If there are no user regs to dump, say so by emitting a
		 * first u64 of zero (abi == PERF_SAMPLE_REGS_ABI_NONE) so
		 * that no empty regs structure follows.
		 */
5436 perf_output_put(handle, abi);
5437
5438 if (abi) {
5439 u64 mask = event->attr.sample_regs_user;
5440 perf_output_sample_regs(handle,
5441 data->regs_user.regs,
5442 mask);
5443 }
5444 }
5445
5446 if (sample_type & PERF_SAMPLE_STACK_USER) {
5447 perf_output_sample_ustack(handle,
5448 data->stack_user_size,
5449 data->regs_user.regs);
5450 }
5451
5452 if (sample_type & PERF_SAMPLE_WEIGHT)
5453 perf_output_put(handle, data->weight);
5454
5455 if (sample_type & PERF_SAMPLE_DATA_SRC)
5456 perf_output_put(handle, data->data_src.val);
5457
5458 if (sample_type & PERF_SAMPLE_TRANSACTION)
5459 perf_output_put(handle, data->txn);
5460
5461 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5462 u64 abi = data->regs_intr.abi;
		/*
		 * As for the user regs above: an abi of zero tells userspace
		 * that no interrupt regs follow.
		 */
5467 perf_output_put(handle, abi);
5468
5469 if (abi) {
5470 u64 mask = event->attr.sample_regs_intr;
5471
5472 perf_output_sample_regs(handle,
5473 data->regs_intr.regs,
5474 mask);
5475 }
5476 }
5477
5478 if (!event->attr.watermark) {
5479 int wakeup_events = event->attr.wakeup_events;
5480
5481 if (wakeup_events) {
5482 struct ring_buffer *rb = handle->rb;
5483 int events = local_inc_return(&rb->events);
5484
5485 if (events >= wakeup_events) {
5486 local_sub(wakeup_events, &rb->events);
5487 local_inc(&rb->wakeup);
5488 }
5489 }
5490 }
5491}
5492
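/*
 * Compute the dynamic parts of a sample (callchain, raw data, branch stack,
 * user regs/stack, interrupt regs) and account for them in header->size so
 * that perf_output_begin() reserves the right amount of ring-buffer space.
 */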
5493void perf_prepare_sample(struct perf_event_header *header,
5494 struct perf_sample_data *data,
5495 struct perf_event *event,
5496 struct pt_regs *regs)
5497{
5498 u64 sample_type = event->attr.sample_type;
5499
5500 header->type = PERF_RECORD_SAMPLE;
5501 header->size = sizeof(*header) + event->header_size;
5502
5503 header->misc = 0;
5504 header->misc |= perf_misc_flags(regs);
5505
5506 __perf_event_header__init_id(header, data, event);
5507
5508 if (sample_type & PERF_SAMPLE_IP)
5509 data->ip = perf_instruction_pointer(regs);
5510
5511 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5512 int size = 1;
5513
5514 data->callchain = perf_callchain(event, regs);
5515
5516 if (data->callchain)
5517 size += data->callchain->nr;
5518
5519 header->size += size * sizeof(u64);
5520 }
5521
5522 if (sample_type & PERF_SAMPLE_RAW) {
5523 int size = sizeof(u32);
5524
5525 if (data->raw)
5526 size += data->raw->size;
5527 else
5528 size += sizeof(u32);
5529
5530 header->size += round_up(size, sizeof(u64));
5531 }
5532
5533 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5534 int size = sizeof(u64);
5535 if (data->br_stack) {
5536 size += data->br_stack->nr
5537 * sizeof(struct perf_branch_entry);
5538 }
5539 header->size += size;
5540 }
5541
5542 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5543 perf_sample_regs_user(&data->regs_user, regs,
5544 &data->regs_user_copy);
5545
5546 if (sample_type & PERF_SAMPLE_REGS_USER) {
		/* space for the regs-dump ABI word */
5548 int size = sizeof(u64);
5549
5550 if (data->regs_user.regs) {
5551 u64 mask = event->attr.sample_regs_user;
5552 size += hweight64(mask) * sizeof(u64);
5553 }
5554
5555 header->size += size;
5556 }
5557
5558 if (sample_type & PERF_SAMPLE_STACK_USER) {
		/*
		 * PERF_SAMPLE_STACK_USER effectively has to be processed
		 * last (or gain an extra check if new sample types are
		 * added), because it is sized to eat up whatever room
		 * remains in the sample.
		 */
5565 u16 stack_size = event->attr.sample_stack_user;
5566 u16 size = sizeof(u64);
5567
5568 stack_size = perf_sample_ustack_size(stack_size, header->size,
5569 data->regs_user.regs);
5570
		/*
		 * If there is something to dump, add space for the dump
		 * itself and for the u64 that reports the dynamic size,
		 * i.e. how much was actually dumped.
		 */
5576 if (stack_size)
5577 size += sizeof(u64) + stack_size;
5578
5579 data->stack_user_size = stack_size;
5580 header->size += size;
5581 }
5582
5583 if (sample_type & PERF_SAMPLE_REGS_INTR) {
		/* space for the regs-dump ABI word */
5585 int size = sizeof(u64);
5586
5587 perf_sample_regs_intr(&data->regs_intr, regs);
5588
5589 if (data->regs_intr.regs) {
5590 u64 mask = event->attr.sample_regs_intr;
5591
5592 size += hweight64(mask) * sizeof(u64);
5593 }
5594
5595 header->size += size;
5596 }
5597}
5598
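/*
 * Prepare and emit one sample for @event into its ring buffer.
 */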
5599void perf_event_output(struct perf_event *event,
5600 struct perf_sample_data *data,
5601 struct pt_regs *regs)
5602{
5603 struct perf_output_handle handle;
5604 struct perf_event_header header;
5605
	/* protect the callchain buffers */
5607 rcu_read_lock();
5608
5609 perf_prepare_sample(&header, data, event, regs);
5610
5611 if (perf_output_begin(&handle, event, header.size))
5612 goto exit;
5613
5614 perf_output_sample(&handle, &header, data, event);
5615
5616 perf_output_end(&handle);
5617
5618exit:
5619 rcu_read_unlock();
5620}
5621
/*
 * PERF_RECORD_READ: report an event's count for a task.
 */
5626struct perf_read_event {
5627 struct perf_event_header header;
5628
5629 u32 pid;
5630 u32 tid;
5631};
5632
5633static void
5634perf_event_read_event(struct perf_event *event,
5635 struct task_struct *task)
5636{
5637 struct perf_output_handle handle;
5638 struct perf_sample_data sample;
5639 struct perf_read_event read_event = {
5640 .header = {
5641 .type = PERF_RECORD_READ,
5642 .misc = 0,
5643 .size = sizeof(read_event) + event->read_size,
5644 },
5645 .pid = perf_event_pid(event, task),
5646 .tid = perf_event_tid(event, task),
5647 };
5648 int ret;
5649
5650 perf_event_header__init_id(&read_event.header, &sample, event);
5651 ret = perf_output_begin(&handle, event, read_event.header.size);
5652 if (ret)
5653 return;
5654
5655 perf_output_put(&handle, read_event);
5656 perf_output_read(&handle, event);
5657 perf_event__output_id_sample(event, &handle, &sample);
5658
5659 perf_output_end(&handle);
5660}
5661
5662typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5663
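/*
 * Iterate all events in @ctx and invoke @output for those that are at least
 * INACTIVE and pass event_filter_match().
 */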
5664static void
5665perf_event_aux_ctx(struct perf_event_context *ctx,
5666 perf_event_aux_output_cb output,
5667 void *data)
5668{
5669 struct perf_event *event;
5670
5671 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5672 if (event->state < PERF_EVENT_STATE_INACTIVE)
5673 continue;
5674 if (!event_filter_match(event))
5675 continue;
5676 output(event, data);
5677 }
5678}
5679
5680static void
5681perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
5682 struct perf_event_context *task_ctx)
5683{
5684 rcu_read_lock();
5685 preempt_disable();
5686 perf_event_aux_ctx(task_ctx, output, data);
5687 preempt_enable();
5688 rcu_read_unlock();
5689}
5690
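/*
 * Fan a side-band record (task, comm, mmap, switch, ...) out to every context
 * that might contain interested events: each PMU's CPU context and the
 * current task's contexts, unless a specific @task_ctx is given.
 */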
5691static void
5692perf_event_aux(perf_event_aux_output_cb output, void *data,
5693 struct perf_event_context *task_ctx)
5694{
5695 struct perf_cpu_context *cpuctx;
5696 struct perf_event_context *ctx;
5697 struct pmu *pmu;
5698 int ctxn;
5699
	/*
	 * If a task_ctx was given we only notify that context itself; it is
	 * set only for EXIT events, before the task context is released.
	 */
5706 if (task_ctx) {
5707 perf_event_aux_task_ctx(output, data, task_ctx);
5708 return;
5709 }
5710
5711 rcu_read_lock();
5712 list_for_each_entry_rcu(pmu, &pmus, entry) {
5713 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5714 if (cpuctx->unique_pmu != pmu)
5715 goto next;
5716 perf_event_aux_ctx(&cpuctx->ctx, output, data);
5717 ctxn = pmu->task_ctx_nr;
5718 if (ctxn < 0)
5719 goto next;
5720 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5721 if (ctx)
5722 perf_event_aux_ctx(ctx, output, data);
5723next:
5724 put_cpu_ptr(pmu->pmu_cpu_context);
5725 }
5726 rcu_read_unlock();
5727}
5728
/*
 * task tracking -- fork/exit
 *
 * event.type: PERF_RECORD_{FORK,EXIT}
 */
5735struct perf_task_event {
5736 struct task_struct *task;
5737 struct perf_event_context *task_ctx;
5738
5739 struct {
5740 struct perf_event_header header;
5741
5742 u32 pid;
5743 u32 ppid;
5744 u32 tid;
5745 u32 ptid;
5746 u64 time;
5747 } event_id;
5748};
5749
5750static int perf_event_task_match(struct perf_event *event)
5751{
5752 return event->attr.comm || event->attr.mmap ||
5753 event->attr.mmap2 || event->attr.mmap_data ||
5754 event->attr.task;
5755}
5756
5757static void perf_event_task_output(struct perf_event *event,
5758 void *data)
5759{
5760 struct perf_task_event *task_event = data;
5761 struct perf_output_handle handle;
5762 struct perf_sample_data sample;
5763 struct task_struct *task = task_event->task;
5764 int ret, size = task_event->event_id.header.size;
5765
5766 if (!perf_event_task_match(event))
5767 return;
5768
5769 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
5770
5771 ret = perf_output_begin(&handle, event,
5772 task_event->event_id.header.size);
5773 if (ret)
5774 goto out;
5775
5776 task_event->event_id.pid = perf_event_pid(event, task);
5777 task_event->event_id.ppid = perf_event_pid(event, current);
5778
5779 task_event->event_id.tid = perf_event_tid(event, task);
5780 task_event->event_id.ptid = perf_event_tid(event, current);
5781
5782 task_event->event_id.time = perf_event_clock(event);
5783
5784 perf_output_put(&handle, task_event->event_id);
5785
5786 perf_event__output_id_sample(event, &handle, &sample);
5787
5788 perf_output_end(&handle);
5789out:
5790 task_event->event_id.header.size = size;
5791}
5792
5793static void perf_event_task(struct task_struct *task,
5794 struct perf_event_context *task_ctx,
5795 int new)
5796{
5797 struct perf_task_event task_event;
5798
5799 if (!atomic_read(&nr_comm_events) &&
5800 !atomic_read(&nr_mmap_events) &&
5801 !atomic_read(&nr_task_events))
5802 return;
5803
5804 task_event = (struct perf_task_event){
5805 .task = task,
5806 .task_ctx = task_ctx,
5807 .event_id = {
5808 .header = {
5809 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
5810 .misc = 0,
5811 .size = sizeof(task_event.event_id),
5812 },
			/*
			 * .pid, .ppid, .tid, .ptid and .time are filled in
			 * later, per event, in perf_event_task_output().
			 */
5818 },
5819 };
5820
5821 perf_event_aux(perf_event_task_output,
5822 &task_event,
5823 task_ctx);
5824}
5825
5826void perf_event_fork(struct task_struct *task)
5827{
5828 perf_event_task(task, NULL, 1);
5829}
5830
/*
 * comm tracking: PERF_RECORD_COMM
 */
5835struct perf_comm_event {
5836 struct task_struct *task;
5837 char *comm;
5838 int comm_size;
5839
5840 struct {
5841 struct perf_event_header header;
5842
5843 u32 pid;
5844 u32 tid;
5845 } event_id;
5846};
5847
5848static int perf_event_comm_match(struct perf_event *event)
5849{
5850 return event->attr.comm;
5851}
5852
5853static void perf_event_comm_output(struct perf_event *event,
5854 void *data)
5855{
5856 struct perf_comm_event *comm_event = data;
5857 struct perf_output_handle handle;
5858 struct perf_sample_data sample;
5859 int size = comm_event->event_id.header.size;
5860 int ret;
5861
5862 if (!perf_event_comm_match(event))
5863 return;
5864
5865 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5866 ret = perf_output_begin(&handle, event,
5867 comm_event->event_id.header.size);
5868
5869 if (ret)
5870 goto out;
5871
5872 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5873 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5874
5875 perf_output_put(&handle, comm_event->event_id);
5876 __output_copy(&handle, comm_event->comm,
5877 comm_event->comm_size);
5878
5879 perf_event__output_id_sample(event, &handle, &sample);
5880
5881 perf_output_end(&handle);
5882out:
5883 comm_event->event_id.header.size = size;
5884}
5885
5886static void perf_event_comm_event(struct perf_comm_event *comm_event)
5887{
5888 char comm[TASK_COMM_LEN];
5889 unsigned int size;
5890
5891 memset(comm, 0, sizeof(comm));
5892 strlcpy(comm, comm_event->task->comm, sizeof(comm));
5893 size = ALIGN(strlen(comm)+1, sizeof(u64));
5894
5895 comm_event->comm = comm;
5896 comm_event->comm_size = size;
5897
5898 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
5899
5900 perf_event_aux(perf_event_comm_output,
5901 comm_event,
5902 NULL);
5903}
5904
5905void perf_event_comm(struct task_struct *task, bool exec)
5906{
5907 struct perf_comm_event comm_event;
5908
5909 if (!atomic_read(&nr_comm_events))
5910 return;
5911
5912 comm_event = (struct perf_comm_event){
5913 .task = task,
		/* .comm and .comm_size are set in perf_event_comm_event() */
5916 .event_id = {
5917 .header = {
5918 .type = PERF_RECORD_COMM,
5919 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
5920
5921 },
5922
5923
5924 },
5925 };
5926
5927 perf_event_comm_event(&comm_event);
5928}
5929
/*
 * mmap tracking: PERF_RECORD_MMAP / PERF_RECORD_MMAP2
 */
5934struct perf_mmap_event {
5935 struct vm_area_struct *vma;
5936
5937 const char *file_name;
5938 int file_size;
5939 int maj, min;
5940 u64 ino;
5941 u64 ino_generation;
5942 u32 prot, flags;
5943
5944 struct {
5945 struct perf_event_header header;
5946
5947 u32 pid;
5948 u32 tid;
5949 u64 start;
5950 u64 len;
5951 u64 pgoff;
5952 } event_id;
5953};
5954
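/*
 * A mapping is reported to an event if the event asked for executable
 * mappings (attr.mmap or attr.mmap2) or, for non-executable VMAs, for data
 * mappings (attr.mmap_data).
 */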
5955static int perf_event_mmap_match(struct perf_event *event,
5956 void *data)
5957{
5958 struct perf_mmap_event *mmap_event = data;
5959 struct vm_area_struct *vma = mmap_event->vma;
5960 int executable = vma->vm_flags & VM_EXEC;
5961
5962 return (!executable && event->attr.mmap_data) ||
5963 (executable && (event->attr.mmap || event->attr.mmap2));
5964}
5965
5966static void perf_event_mmap_output(struct perf_event *event,
5967 void *data)
5968{
5969 struct perf_mmap_event *mmap_event = data;
5970 struct perf_output_handle handle;
5971 struct perf_sample_data sample;
5972 int size = mmap_event->event_id.header.size;
5973 int ret;
5974
5975 if (!perf_event_mmap_match(event, data))
5976 return;
5977
5978 if (event->attr.mmap2) {
5979 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5980 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5981 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5982 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5983 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5984 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5985 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5986 }
5987
5988 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5989 ret = perf_output_begin(&handle, event,
5990 mmap_event->event_id.header.size);
5991 if (ret)
5992 goto out;
5993
5994 mmap_event->event_id.pid = perf_event_pid(event, current);
5995 mmap_event->event_id.tid = perf_event_tid(event, current);
5996
5997 perf_output_put(&handle, mmap_event->event_id);
5998
5999 if (event->attr.mmap2) {
6000 perf_output_put(&handle, mmap_event->maj);
6001 perf_output_put(&handle, mmap_event->min);
6002 perf_output_put(&handle, mmap_event->ino);
6003 perf_output_put(&handle, mmap_event->ino_generation);
6004 perf_output_put(&handle, mmap_event->prot);
6005 perf_output_put(&handle, mmap_event->flags);
6006 }
6007
6008 __output_copy(&handle, mmap_event->file_name,
6009 mmap_event->file_size);
6010
6011 perf_event__output_id_sample(event, &handle, &sample);
6012
6013 perf_output_end(&handle);
6014out:
6015 mmap_event->event_id.header.size = size;
6016}
6017
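/*
 * Resolve a name for the VMA being reported: the backing file's path when
 * there is one, otherwise a special name ("[heap]", "[stack]", "//anon", ...),
 * then pad it to a u64 boundary and fan the record out to all interested
 * events.
 */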
6018static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
6019{
6020 struct vm_area_struct *vma = mmap_event->vma;
6021 struct file *file = vma->vm_file;
6022 int maj = 0, min = 0;
6023 u64 ino = 0, gen = 0;
6024 u32 prot = 0, flags = 0;
6025 unsigned int size;
6026 char tmp[16];
6027 char *buf = NULL;
6028 char *name;
6029
6030 if (file) {
6031 struct inode *inode;
6032 dev_t dev;
6033
6034 buf = kmalloc(PATH_MAX, GFP_KERNEL);
6035 if (!buf) {
6036 name = "//enomem";
6037 goto cpy_name;
6038 }
6039
		/*
		 * file_path() works from the end of the buffer backwards, so
		 * we need to leave enough zeroed bytes after the string to
		 * cover the u64 alignment padding done later.
		 */
6044 name = file_path(file, buf, PATH_MAX - sizeof(u64));
6045 if (IS_ERR(name)) {
6046 name = "//toolong";
6047 goto cpy_name;
6048 }
6049 inode = file_inode(vma->vm_file);
6050 dev = inode->i_sb->s_dev;
6051 ino = inode->i_ino;
6052 gen = inode->i_generation;
6053 maj = MAJOR(dev);
6054 min = MINOR(dev);
6055
6056 if (vma->vm_flags & VM_READ)
6057 prot |= PROT_READ;
6058 if (vma->vm_flags & VM_WRITE)
6059 prot |= PROT_WRITE;
6060 if (vma->vm_flags & VM_EXEC)
6061 prot |= PROT_EXEC;
6062
6063 if (vma->vm_flags & VM_MAYSHARE)
6064 flags = MAP_SHARED;
6065 else
6066 flags = MAP_PRIVATE;
6067
6068 if (vma->vm_flags & VM_DENYWRITE)
6069 flags |= MAP_DENYWRITE;
6070 if (vma->vm_flags & VM_MAYEXEC)
6071 flags |= MAP_EXECUTABLE;
6072 if (vma->vm_flags & VM_LOCKED)
6073 flags |= MAP_LOCKED;
6074 if (vma->vm_flags & VM_HUGETLB)
6075 flags |= MAP_HUGETLB;
6076
6077 goto got_name;
6078 } else {
6079 if (vma->vm_ops && vma->vm_ops->name) {
6080 name = (char *) vma->vm_ops->name(vma);
6081 if (name)
6082 goto cpy_name;
6083 }
6084
6085 name = (char *)arch_vma_name(vma);
6086 if (name)
6087 goto cpy_name;
6088
6089 if (vma->vm_start <= vma->vm_mm->start_brk &&
6090 vma->vm_end >= vma->vm_mm->brk) {
6091 name = "[heap]";
6092 goto cpy_name;
6093 }
6094 if (vma->vm_start <= vma->vm_mm->start_stack &&
6095 vma->vm_end >= vma->vm_mm->start_stack) {
6096 name = "[stack]";
6097 goto cpy_name;
6098 }
6099
6100 name = "//anon";
6101 goto cpy_name;
6102 }
6103
6104cpy_name:
6105 strlcpy(tmp, name, sizeof(tmp));
6106 name = tmp;
6107got_name:
	/*
	 * The output buffer works in 8-byte units, so align the string size
	 * to a multiple of 8 and make sure the tail is zeroed so that no
	 * stale bits leak to userspace.
	 */
6113 size = strlen(name)+1;
6114 while (!IS_ALIGNED(size, sizeof(u64)))
6115 name[size++] = '\0';
6116
6117 mmap_event->file_name = name;
6118 mmap_event->file_size = size;
6119 mmap_event->maj = maj;
6120 mmap_event->min = min;
6121 mmap_event->ino = ino;
6122 mmap_event->ino_generation = gen;
6123 mmap_event->prot = prot;
6124 mmap_event->flags = flags;
6125
6126 if (!(vma->vm_flags & VM_EXEC))
6127 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
6128
6129 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
6130
6131 perf_event_aux(perf_event_mmap_output,
6132 mmap_event,
6133 NULL);
6134
6135 kfree(buf);
6136}
6137
6138void perf_event_mmap(struct vm_area_struct *vma)
6139{
6140 struct perf_mmap_event mmap_event;
6141
6142 if (!atomic_read(&nr_mmap_events))
6143 return;
6144
6145 mmap_event = (struct perf_mmap_event){
6146 .vma = vma,
		/* .file_name and .file_size are set in perf_event_mmap_event() */
6149 .event_id = {
6150 .header = {
6151 .type = PERF_RECORD_MMAP,
6152 .misc = PERF_RECORD_MISC_USER,
6153
6154 },
6155
6156
6157 .start = vma->vm_start,
6158 .len = vma->vm_end - vma->vm_start,
6159 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
6160 },
		/*
		 * .maj, .min, .ino, .ino_generation, .prot and .flags are
		 * filled in by perf_event_mmap_event() and emitted only for
		 * attr.mmap2 consumers.
		 */
6167 };
6168
6169 perf_event_mmap_event(&mmap_event);
6170}
6171
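/*
 * Emit a PERF_RECORD_AUX record describing newly written AUX ring-buffer
 * data: its offset, size and flags.
 */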
6172void perf_event_aux_event(struct perf_event *event, unsigned long head,
6173 unsigned long size, u64 flags)
6174{
6175 struct perf_output_handle handle;
6176 struct perf_sample_data sample;
6177 struct perf_aux_event {
6178 struct perf_event_header header;
6179 u64 offset;
6180 u64 size;
6181 u64 flags;
6182 } rec = {
6183 .header = {
6184 .type = PERF_RECORD_AUX,
6185 .misc = 0,
6186 .size = sizeof(rec),
6187 },
6188 .offset = head,
6189 .size = size,
6190 .flags = flags,
6191 };
6192 int ret;
6193
6194 perf_event_header__init_id(&rec.header, &sample, event);
6195 ret = perf_output_begin(&handle, event, rec.header.size);
6196
6197 if (ret)
6198 return;
6199
6200 perf_output_put(&handle, rec);
6201 perf_event__output_id_sample(event, &handle, &sample);
6202
6203 perf_output_end(&handle);
6204}
6205
/*
 * Lost/dropped samples logging: PERF_RECORD_LOST_SAMPLES
 */
6209void perf_log_lost_samples(struct perf_event *event, u64 lost)
6210{
6211 struct perf_output_handle handle;
6212 struct perf_sample_data sample;
6213 int ret;
6214
6215 struct {
6216 struct perf_event_header header;
6217 u64 lost;
6218 } lost_samples_event = {
6219 .header = {
6220 .type = PERF_RECORD_LOST_SAMPLES,
6221 .misc = 0,
6222 .size = sizeof(lost_samples_event),
6223 },
6224 .lost = lost,
6225 };
6226
6227 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
6228
6229 ret = perf_output_begin(&handle, event,
6230 lost_samples_event.header.size);
6231 if (ret)
6232 return;
6233
6234 perf_output_put(&handle, lost_samples_event);
6235 perf_event__output_id_sample(event, &handle, &sample);
6236 perf_output_end(&handle);
6237}
6238
/*
 * context_switch tracking: PERF_RECORD_SWITCH / PERF_RECORD_SWITCH_CPU_WIDE
 */
6243struct perf_switch_event {
6244 struct task_struct *task;
6245 struct task_struct *next_prev;
6246
6247 struct {
6248 struct perf_event_header header;
6249 u32 next_prev_pid;
6250 u32 next_prev_tid;
6251 } event_id;
6252};
6253
6254static int perf_event_switch_match(struct perf_event *event)
6255{
6256 return event->attr.context_switch;
6257}
6258
6259static void perf_event_switch_output(struct perf_event *event, void *data)
6260{
6261 struct perf_switch_event *se = data;
6262 struct perf_output_handle handle;
6263 struct perf_sample_data sample;
6264 int ret;
6265
6266 if (!perf_event_switch_match(event))
6267 return;
6268
	/* Only CPU-wide events are allowed to see next/prev pid/tid */
6270 if (event->ctx->task) {
6271 se->event_id.header.type = PERF_RECORD_SWITCH;
6272 se->event_id.header.size = sizeof(se->event_id.header);
6273 } else {
6274 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
6275 se->event_id.header.size = sizeof(se->event_id);
6276 se->event_id.next_prev_pid =
6277 perf_event_pid(event, se->next_prev);
6278 se->event_id.next_prev_tid =
6279 perf_event_tid(event, se->next_prev);
6280 }
6281
6282 perf_event_header__init_id(&se->event_id.header, &sample, event);
6283
6284 ret = perf_output_begin(&handle, event, se->event_id.header.size);
6285 if (ret)
6286 return;
6287
6288 if (event->ctx->task)
6289 perf_output_put(&handle, se->event_id.header);
6290 else
6291 perf_output_put(&handle, se->event_id);
6292
6293 perf_event__output_id_sample(event, &handle, &sample);
6294
6295 perf_output_end(&handle);
6296}
6297
6298static void perf_event_switch(struct task_struct *task,
6299 struct task_struct *next_prev, bool sched_in)
6300{
6301 struct perf_switch_event switch_event;
6302
	/* N.B. callers check nr_switch_events before calling in here */

6305 switch_event = (struct perf_switch_event){
6306 .task = task,
6307 .next_prev = next_prev,
6308 .event_id = {
6309 .header = {
				/* .type is chosen in perf_event_switch_output() */
6311 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
6312
6313 },
			/* .next_prev_pid/.next_prev_tid: CPU-wide events only */
6316 },
6317 };
6318
6319 perf_event_aux(perf_event_switch_output,
6320 &switch_event,
6321 NULL);
6322}
6323
/*
 * IRQ throttle logging: PERF_RECORD_THROTTLE / PERF_RECORD_UNTHROTTLE
 */
6328static void perf_log_throttle(struct perf_event *event, int enable)
6329{
6330 struct perf_output_handle handle;
6331 struct perf_sample_data sample;
6332 int ret;
6333
6334 struct {
6335 struct perf_event_header header;
6336 u64 time;
6337 u64 id;
6338 u64 stream_id;
6339 } throttle_event = {
6340 .header = {
6341 .type = PERF_RECORD_THROTTLE,
6342 .misc = 0,
6343 .size = sizeof(throttle_event),
6344 },
6345 .time = perf_event_clock(event),
6346 .id = primary_event_id(event),
6347 .stream_id = event->id,
6348 };
6349
6350 if (enable)
6351 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
6352
6353 perf_event_header__init_id(&throttle_event.header, &sample, event);
6354
6355 ret = perf_output_begin(&handle, event,
6356 throttle_event.header.size);
6357 if (ret)
6358 return;
6359
6360 perf_output_put(&handle, throttle_event);
6361 perf_event__output_id_sample(event, &handle, &sample);
6362 perf_output_end(&handle);
6363}
6364
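/*
 * Emit a PERF_RECORD_ITRACE_START record identifying the pid/tid being
 * traced, unless the PMU lacks the ITRACE capability or instruction tracing
 * has already been marked as started for this event.
 */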
6365static void perf_log_itrace_start(struct perf_event *event)
6366{
6367 struct perf_output_handle handle;
6368 struct perf_sample_data sample;
6369 struct perf_aux_event {
6370 struct perf_event_header header;
6371 u32 pid;
6372 u32 tid;
6373 } rec;
6374 int ret;
6375
6376 if (event->parent)
6377 event = event->parent;
6378
6379 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
6380 event->hw.itrace_started)
6381 return;
6382
6383 rec.header.type = PERF_RECORD_ITRACE_START;
6384 rec.header.misc = 0;
6385 rec.header.size = sizeof(rec);
6386 rec.pid = perf_event_pid(event, current);
6387 rec.tid = perf_event_tid(event, current);
6388
6389 perf_event_header__init_id(&rec.header, &sample, event);
6390 ret = perf_output_begin(&handle, event, rec.header.size);
6391
6392 if (ret)
6393 return;
6394
6395 perf_output_put(&handle, rec);
6396 perf_event__output_id_sample(event, &handle, &sample);
6397
6398 perf_output_end(&handle);
6399}
6400
/*
 * Generic event overflow handling, sampling.
 */
6405static int __perf_event_overflow(struct perf_event *event,
6406 int throttle, struct perf_sample_data *data,
6407 struct pt_regs *regs)
6408{
6409 int events = atomic_read(&event->event_limit);
6410 struct hw_perf_event *hwc = &event->hw;
6411 u64 seq;
6412 int ret = 0;
6413
	/*
	 * Non-sampling counters might still use the PMI to fold short
	 * hardware counters; ignore those here.
	 */
6418 if (unlikely(!is_sampling_event(event)))
6419 return 0;
6420
6421 seq = __this_cpu_read(perf_throttled_seq);
6422 if (seq != hwc->interrupts_seq) {
6423 hwc->interrupts_seq = seq;
6424 hwc->interrupts = 1;
6425 } else {
6426 hwc->interrupts++;
6427 if (unlikely(throttle
6428 && hwc->interrupts >= max_samples_per_tick)) {
6429 __this_cpu_inc(perf_throttled_count);
6430 hwc->interrupts = MAX_INTERRUPTS;
6431 perf_log_throttle(event, 0);
6432 tick_nohz_full_kick();
6433 ret = 1;
6434 }
6435 }
6436
6437 if (event->attr.freq) {
6438 u64 now = perf_clock();
6439 s64 delta = now - hwc->freq_time_stamp;
6440
6441 hwc->freq_time_stamp = now;
6442
6443 if (delta > 0 && delta < 2*TICK_NSEC)
6444 perf_adjust_period(event, delta, hwc->last_period, true);
6445 }
6446

	/*
	 * Decrement the user-supplied event_limit; once it reaches zero the
	 * event is disabled and its owner is signalled with POLL_HUP.
	 */
6452 event->pending_kill = POLL_IN;
6453 if (events && atomic_dec_and_test(&event->event_limit)) {
6454 ret = 1;
6455 event->pending_kill = POLL_HUP;
6456 event->pending_disable = 1;
6457 irq_work_queue(&event->pending);
6458 }
6459
6460 if (event->overflow_handler)
6461 event->overflow_handler(event, data, regs);
6462 else
6463 perf_event_output(event, data, regs);
6464
6465 if (*perf_event_fasync(event) && event->pending_kill) {
6466 event->pending_wakeup = 1;
6467 irq_work_queue(&event->pending);
6468 }
6469
6470 return ret;
6471}
6472
6473int perf_event_overflow(struct perf_event *event,
6474 struct perf_sample_data *data,
6475 struct pt_regs *regs)
6476{
6477 return __perf_event_overflow(event, 1, data, regs);
6478}
6479
/*
 * Generic software event infrastructure
 */
6484struct swevent_htable {
6485 struct swevent_hlist *swevent_hlist;
6486 struct mutex hlist_mutex;
6487 int hlist_refcount;
6488
	/* Recursion avoidance in each context */
6490 int recursion[PERF_NR_CONTEXTS];
6491};
6492
6493static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
6494
/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals.  This period value is kept in
 * the range [-sample_period, 0] so that its sign can be used as the
 * overflow trigger.
 */
6502u64 perf_swevent_set_period(struct perf_event *event)
6503{
6504 struct hw_perf_event *hwc = &event->hw;
6505 u64 period = hwc->last_period;
6506 u64 nr, offset;
6507 s64 old, val;
6508
6509 hwc->last_period = hwc->sample_period;
6510
6511again:
6512 old = val = local64_read(&hwc->period_left);
6513 if (val < 0)
6514 return 0;
6515
6516 nr = div64_u64(period + val, period);
6517 offset = nr * period;
6518 val -= offset;
6519 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
6520 goto again;
6521
6522 return nr;
6523}
6524
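/*
 * Handle @overflow expired periods for a software event, priming a fresh
 * period via perf_swevent_set_period() when @overflow is zero; stop early if
 * __perf_event_overflow() reports that the event got throttled or hit its
 * event limit.
 */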
6525static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
6526 struct perf_sample_data *data,
6527 struct pt_regs *regs)
6528{
6529 struct hw_perf_event *hwc = &event->hw;
6530 int throttle = 0;
6531
6532 if (!overflow)
6533 overflow = perf_swevent_set_period(event);
6534
6535 if (hwc->interrupts == MAX_INTERRUPTS)
6536 return;
6537
6538 for (; overflow; overflow--) {
6539 if (__perf_event_overflow(event, throttle,
6540 data, regs)) {
			/*
			 * Stop here: the event was throttled (or hit its
			 * limit); further overflows are inhibited while
			 * hwc->interrupts == MAX_INTERRUPTS.
			 */
6545 break;
6546 }
6547 throttle = 1;
6548 }
6549}
6550
6551static void perf_swevent_event(struct perf_event *event, u64 nr,
6552 struct perf_sample_data *data,
6553 struct pt_regs *regs)
6554{
6555 struct hw_perf_event *hwc = &event->hw;
6556
6557 local64_add(nr, &event->count);
6558
6559 if (!regs)
6560 return;
6561
6562 if (!is_sampling_event(event))
6563 return;
6564
6565 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
6566 data->period = nr;
6567 return perf_swevent_overflow(event, 1, data, regs);
6568 } else
6569 data->period = event->hw.last_period;
6570
6571 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
6572 return perf_swevent_overflow(event, 1, data, regs);
6573
6574 if (local64_add_negative(nr, &hwc->period_left))
6575 return;
6576
6577 perf_swevent_overflow(event, 0, data, regs);
6578}
6579
6580static int perf_exclude_event(struct perf_event *event,
6581 struct pt_regs *regs)
6582{
6583 if (event->hw.state & PERF_HES_STOPPED)
6584 return 1;
6585
6586 if (regs) {
6587 if (event->attr.exclude_user && user_mode(regs))
6588 return 1;
6589
6590 if (event->attr.exclude_kernel && !user_mode(regs))
6591 return 1;
6592 }
6593
6594 return 0;
6595}
6596
6597static int perf_swevent_match(struct perf_event *event,
6598 enum perf_type_id type,
6599 u32 event_id,
6600 struct perf_sample_data *data,
6601 struct pt_regs *regs)
6602{
6603 if (event->attr.type != type)
6604 return 0;
6605
6606 if (event->attr.config != event_id)
6607 return 0;
6608
6609 if (perf_exclude_event(event, regs))
6610 return 0;
6611
6612 return 1;
6613}
6614
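/*
 * Software events are hashed by (type, config) into a per-CPU hlist so that
 * do_perf_sw_event() can find all matching events quickly.
 */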
6615static inline u64 swevent_hash(u64 type, u32 event_id)
6616{
6617 u64 val = event_id | (type << 32);
6618
6619 return hash_64(val, SWEVENT_HLIST_BITS);
6620}
6621
6622static inline struct hlist_head *
6623__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
6624{
6625 u64 hash = swevent_hash(type, event_id);
6626
6627 return &hlist->heads[hash];
6628}
6629
/* For the read side: look up the hlist while only holding rcu_read_lock() */
6631static inline struct hlist_head *
6632find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
6633{
6634 struct swevent_hlist *hlist;
6635
6636 hlist = rcu_dereference(swhash->swevent_hlist);
6637 if (!hlist)
6638 return NULL;
6639
6640 return __find_swevent_head(hlist, type, event_id);
6641}
6642
/* For event insertion and removal in the hlist */
6644static inline struct hlist_head *
6645find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
6646{
6647 struct swevent_hlist *hlist;
6648 u32 event_id = event->attr.config;
6649 u64 type = event->attr.type;
6650
	/*
	 * Event scheduling is always serialized against hlist allocation and
	 * release, and the context lock guarantees that, so the "protected"
	 * dereference is sufficient here.
	 */
6656 hlist = rcu_dereference_protected(swhash->swevent_hlist,
6657 lockdep_is_held(&event->ctx->lock));
6658 if (!hlist)
6659 return NULL;
6660
6661 return __find_swevent_head(hlist, type, event_id);
6662}
6663
6664static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
6665 u64 nr,
6666 struct perf_sample_data *data,
6667 struct pt_regs *regs)
6668{
6669 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6670 struct perf_event *event;
6671 struct hlist_head *head;
6672
6673 rcu_read_lock();
6674 head = find_swevent_head_rcu(swhash, type, event_id);
6675 if (!head)
6676 goto end;
6677
6678 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6679 if (perf_swevent_match(event, type, event_id, data, regs))
6680 perf_swevent_event(event, nr, data, regs);
6681 }
6682end:
6683 rcu_read_unlock();
6684}
6685
6686DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6687
6688int perf_swevent_get_recursion_context(void)
6689{
6690 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6691
6692 return get_recursion_context(swhash->recursion);
6693}
6694EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
6695
6696inline void perf_swevent_put_recursion_context(int rctx)
6697{
6698 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6699
6700 put_recursion_context(swhash->recursion, rctx);
6701}
6702
6703void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6704{
6705 struct perf_sample_data data;
6706
6707 if (WARN_ON_ONCE(!regs))
6708 return;
6709
6710 perf_sample_data_init(&data, addr, 0);
6711 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6712}
6713
6714void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6715{
6716 int rctx;
6717
6718 preempt_disable_notrace();
6719 rctx = perf_swevent_get_recursion_context();
6720 if (unlikely(rctx < 0))
6721 goto fail;
6722
6723 ___perf_sw_event(event_id, nr, regs, addr);
6724
6725 perf_swevent_put_recursion_context(rctx);
6726fail:
6727 preempt_enable_notrace();
6728}
6729
6730static void perf_swevent_read(struct perf_event *event)
6731{
6732}
6733
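/*
 * pmu::add for software events: prime the sample period and hook the event
 * into the per-CPU hash list so it starts receiving hits.
 */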
6734static int perf_swevent_add(struct perf_event *event, int flags)
6735{
6736 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6737 struct hw_perf_event *hwc = &event->hw;
6738 struct hlist_head *head;
6739
6740 if (is_sampling_event(event)) {
6741 hwc->last_period = hwc->sample_period;
6742 perf_swevent_set_period(event);
6743 }
6744
6745 hwc->state = !(flags & PERF_EF_START);
6746
6747 head = find_swevent_head(swhash, event);
6748 if (WARN_ON_ONCE(!head))
6749 return -EINVAL;
6750
6751 hlist_add_head_rcu(&event->hlist_entry, head);
6752 perf_event_update_userpage(event);
6753
6754 return 0;
6755}
6756
6757static void perf_swevent_del(struct perf_event *event, int flags)
6758{
6759 hlist_del_rcu(&event->hlist_entry);
6760}
6761
6762static void perf_swevent_start(struct perf_event *event, int flags)
6763{
6764 event->hw.state = 0;
6765}
6766
6767static void perf_swevent_stop(struct perf_event *event, int flags)
6768{
6769 event->hw.state = PERF_HES_STOPPED;
6770}
6771
/* Dereference the hlist from the update side, under hlist_mutex */
6773static inline struct swevent_hlist *
6774swevent_hlist_deref(struct swevent_htable *swhash)
6775{
6776 return rcu_dereference_protected(swhash->swevent_hlist,
6777 lockdep_is_held(&swhash->hlist_mutex));
6778}
6779
6780static void swevent_hlist_release(struct swevent_htable *swhash)
6781{
6782 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
6783
6784 if (!hlist)
6785 return;
6786
6787 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
6788 kfree_rcu(hlist, rcu_head);
6789}
6790
6791static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
6792{
6793 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6794
6795 mutex_lock(&swhash->hlist_mutex);
6796
6797 if (!--swhash->hlist_refcount)
6798 swevent_hlist_release(swhash);
6799
6800 mutex_unlock(&swhash->hlist_mutex);
6801}
6802
6803static void swevent_hlist_put(struct perf_event *event)
6804{
6805 int cpu;
6806
6807 for_each_possible_cpu(cpu)
6808 swevent_hlist_put_cpu(event, cpu);
6809}
6810
6811static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
6812{
6813 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6814 int err = 0;
6815
6816 mutex_lock(&swhash->hlist_mutex);
6817 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
6818 struct swevent_hlist *hlist;
6819
6820 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
6821 if (!hlist) {
6822 err = -ENOMEM;
6823 goto exit;
6824 }
6825 rcu_assign_pointer(swhash->swevent_hlist, hlist);
6826 }
6827 swhash->hlist_refcount++;
6828exit:
6829 mutex_unlock(&swhash->hlist_mutex);
6830
6831 return err;
6832}
6833
6834static int swevent_hlist_get(struct perf_event *event)
6835{
6836 int err;
6837 int cpu, failed_cpu;
6838
6839 get_online_cpus();
6840 for_each_possible_cpu(cpu) {
6841 err = swevent_hlist_get_cpu(event, cpu);
6842 if (err) {
6843 failed_cpu = cpu;
6844 goto fail;
6845 }
6846 }
6847 put_online_cpus();
6848
6849 return 0;
6850fail:
6851 for_each_possible_cpu(cpu) {
6852 if (cpu == failed_cpu)
6853 break;
6854 swevent_hlist_put_cpu(event, cpu);
6855 }
6856
6857 put_online_cpus();
6858 return err;
6859}
6860
6861struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
6862
6863static void sw_perf_event_destroy(struct perf_event *event)
6864{
6865 u64 event_id = event->attr.config;
6866
6867 WARN_ON(event->parent);
6868
6869 static_key_slow_dec(&perf_swevent_enabled[event_id]);
6870 swevent_hlist_put(event);
6871}
6872
6873static int perf_swevent_init(struct perf_event *event)
6874{
6875 u64 event_id = event->attr.config;
6876
6877 if (event->attr.type != PERF_TYPE_SOFTWARE)
6878 return -ENOENT;
6879
	/*
	 * no branch sampling for software events
	 */
6883 if (has_branch_stack(event))
6884 return -EOPNOTSUPP;
6885
6886 switch (event_id) {
6887 case PERF_COUNT_SW_CPU_CLOCK:
6888 case PERF_COUNT_SW_TASK_CLOCK:
6889 return -ENOENT;
6890
6891 default:
6892 break;
6893 }
6894
6895 if (event_id >= PERF_COUNT_SW_MAX)
6896 return -ENOENT;
6897
6898 if (!event->parent) {
6899 int err;
6900
6901 err = swevent_hlist_get(event);
6902 if (err)
6903 return err;
6904
6905 static_key_slow_inc(&perf_swevent_enabled[event_id]);
6906 event->destroy = sw_perf_event_destroy;
6907 }
6908
6909 return 0;
6910}
6911
6912static struct pmu perf_swevent = {
6913 .task_ctx_nr = perf_sw_context,
6914
6915 .capabilities = PERF_PMU_CAP_NO_NMI,
6916
6917 .event_init = perf_swevent_init,
6918 .add = perf_swevent_add,
6919 .del = perf_swevent_del,
6920 .start = perf_swevent_start,
6921 .stop = perf_swevent_stop,
6922 .read = perf_swevent_read,
6923};
6924
6925#ifdef CONFIG_EVENT_TRACING
6926
6927static int perf_tp_filter_match(struct perf_event *event,
6928 struct perf_sample_data *data)
6929{
6930 void *record = data->raw->data;
6931
	/* only top-level events have filters set */
6933 if (event->parent)
6934 event = event->parent;
6935
6936 if (likely(!event->filter) || filter_match_preds(event->filter, record))
6937 return 1;
6938 return 0;
6939}
6940
6941static int perf_tp_event_match(struct perf_event *event,
6942 struct perf_sample_data *data,
6943 struct pt_regs *regs)
6944{
6945 if (event->hw.state & PERF_HES_STOPPED)
6946 return 0;
6947
	/*
	 * All tracepoints are from kernel space.
	 */
6950 if (event->attr.exclude_kernel)
6951 return 0;
6952
6953 if (!perf_tp_filter_match(event, data))
6954 return 0;
6955
6956 return 1;
6957}
6958
6959void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
6960 struct pt_regs *regs, struct hlist_head *head, int rctx,
6961 struct task_struct *task)
6962{
6963 struct perf_sample_data data;
6964 struct perf_event *event;
6965
6966 struct perf_raw_record raw = {
6967 .size = entry_size,
6968 .data = record,
6969 };
6970
6971 perf_sample_data_init(&data, addr, 0);
6972 data.raw = &raw;
6973
6974 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6975 if (perf_tp_event_match(event, &data, regs))
6976 perf_swevent_event(event, count, &data, regs);
6977 }
6978
	/*
	 * If a target task was specified, also walk its software context and
	 * deliver the event to matching tracepoint events there.
	 */
6983 if (task && task != current) {
6984 struct perf_event_context *ctx;
6985 struct trace_entry *entry = record;
6986
6987 rcu_read_lock();
6988 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
6989 if (!ctx)
6990 goto unlock;
6991
6992 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6993 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6994 continue;
6995 if (event->attr.config != entry->type)
6996 continue;
6997 if (perf_tp_event_match(event, &data, regs))
6998 perf_swevent_event(event, count, &data, regs);
6999 }
7000unlock:
7001 rcu_read_unlock();
7002 }
7003
7004 perf_swevent_put_recursion_context(rctx);
7005}
7006EXPORT_SYMBOL_GPL(perf_tp_event);
7007
7008static void tp_perf_event_destroy(struct perf_event *event)
7009{
7010 perf_trace_destroy(event);
7011}
7012
7013static int perf_tp_event_init(struct perf_event *event)
7014{
7015 int err;
7016
7017 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7018 return -ENOENT;
7019
	/*
	 * no branch sampling for tracepoint events
	 */
7023 if (has_branch_stack(event))
7024 return -EOPNOTSUPP;
7025
7026 err = perf_trace_init(event);
7027 if (err)
7028 return err;
7029
7030 event->destroy = tp_perf_event_destroy;
7031
7032 return 0;
7033}
7034
7035static struct pmu perf_tracepoint = {
7036 .task_ctx_nr = perf_sw_context,
7037
7038 .event_init = perf_tp_event_init,
7039 .add = perf_trace_add,
7040 .del = perf_trace_del,
7041 .start = perf_swevent_start,
7042 .stop = perf_swevent_stop,
7043 .read = perf_swevent_read,
7044};
7045
7046static inline void perf_tp_register(void)
7047{
7048 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
7049}
7050
7051static int perf_event_set_filter(struct perf_event *event, void __user *arg)
7052{
7053 char *filter_str;
7054 int ret;
7055
7056 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7057 return -EINVAL;
7058
7059 filter_str = strndup_user(arg, PAGE_SIZE);
7060 if (IS_ERR(filter_str))
7061 return PTR_ERR(filter_str);
7062
7063 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
7064
7065 kfree(filter_str);
7066 return ret;
7067}
7068
7069static void perf_event_free_filter(struct perf_event *event)
7070{
7071 ftrace_profile_free_filter(event);
7072}
7073
7074static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7075{
7076 struct bpf_prog *prog;
7077
7078 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7079 return -EINVAL;
7080
7081 if (event->tp_event->prog)
7082 return -EEXIST;
7083
7084 if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
		/* bpf programs can only be attached to u/kprobe tracepoints */
7086 return -EINVAL;
7087
7088 prog = bpf_prog_get(prog_fd);
7089 if (IS_ERR(prog))
7090 return PTR_ERR(prog);
7091
7092 if (prog->type != BPF_PROG_TYPE_KPROBE) {
		/* valid fd, but not a kprobe bpf program */
7094 bpf_prog_put(prog);
7095 return -EINVAL;
7096 }
7097
7098 event->tp_event->prog = prog;
7099
7100 return 0;
7101}
7102
7103static void perf_event_free_bpf_prog(struct perf_event *event)
7104{
7105 struct bpf_prog *prog;
7106
7107 if (!event->tp_event)
7108 return;
7109
7110 prog = event->tp_event->prog;
7111 if (prog) {
7112 event->tp_event->prog = NULL;
7113 bpf_prog_put(prog);
7114 }
7115}
7116
7117#else
7118
7119static inline void perf_tp_register(void)
7120{
7121}
7122
7123static int perf_event_set_filter(struct perf_event *event, void __user *arg)
7124{
7125 return -ENOENT;
7126}
7127
7128static void perf_event_free_filter(struct perf_event *event)
7129{
7130}
7131
7132static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7133{
7134 return -ENOENT;
7135}
7136
7137static void perf_event_free_bpf_prog(struct perf_event *event)
7138{
7139}
7140#endif
7141
7142#ifdef CONFIG_HAVE_HW_BREAKPOINT
7143void perf_bp_event(struct perf_event *bp, void *data)
7144{
7145 struct perf_sample_data sample;
7146 struct pt_regs *regs = data;
7147
7148 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
7149
7150 if (!bp->hw.state && !perf_exclude_event(bp, regs))
7151 perf_swevent_event(bp, 1, &sample, regs);
7152}
7153#endif
7154
/*
 * hrtimer based swevent callback
 */
7159static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
7160{
7161 enum hrtimer_restart ret = HRTIMER_RESTART;
7162 struct perf_sample_data data;
7163 struct pt_regs *regs;
7164 struct perf_event *event;
7165 u64 period;
7166
7167 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
7168
7169 if (event->state != PERF_EVENT_STATE_ACTIVE)
7170 return HRTIMER_NORESTART;
7171
7172 event->pmu->read(event);
7173
7174 perf_sample_data_init(&data, 0, event->hw.last_period);
7175 regs = get_irq_regs();
7176
7177 if (regs && !perf_exclude_event(event, regs)) {
7178 if (!(event->attr.exclude_idle && is_idle_task(current)))
7179 if (__perf_event_overflow(event, 1, &data, regs))
7180 ret = HRTIMER_NORESTART;
7181 }
7182
7183 period = max_t(u64, 10000, event->hw.sample_period);
7184 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
7185
7186 return ret;
7187}
7188
7189static void perf_swevent_start_hrtimer(struct perf_event *event)
7190{
7191 struct hw_perf_event *hwc = &event->hw;
7192 s64 period;
7193
7194 if (!is_sampling_event(event))
7195 return;
7196
7197 period = local64_read(&hwc->period_left);
7198 if (period) {
7199 if (period < 0)
7200 period = 10000;
7201
7202 local64_set(&hwc->period_left, 0);
7203 } else {
7204 period = max_t(u64, 10000, hwc->sample_period);
7205 }
7206 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
7207 HRTIMER_MODE_REL_PINNED);
7208}
7209
7210static void perf_swevent_cancel_hrtimer(struct perf_event *event)
7211{
7212 struct hw_perf_event *hwc = &event->hw;
7213
7214 if (is_sampling_event(event)) {
7215 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
7216 local64_set(&hwc->period_left, ktime_to_ns(remaining));
7217
7218 hrtimer_cancel(&hwc->hrtimer);
7219 }
7220}
7221
7222static void perf_swevent_init_hrtimer(struct perf_event *event)
7223{
7224 struct hw_perf_event *hwc = &event->hw;
7225
7226 if (!is_sampling_event(event))
7227 return;
7228
7229 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
7230 hwc->hrtimer.function = perf_swevent_hrtimer;
7231
	/*
	 * Since hrtimers have a fixed rate, we can do a static freq->period
	 * mapping and avoid the whole period adjustment feedback machinery.
	 */
7236 if (event->attr.freq) {
7237 long freq = event->attr.sample_freq;
7238
7239 event->attr.sample_period = NSEC_PER_SEC / freq;
7240 hwc->sample_period = event->attr.sample_period;
7241 local64_set(&hwc->period_left, hwc->sample_period);
7242 hwc->last_period = hwc->sample_period;
7243 event->attr.freq = 0;
7244 }
7245}
7246
/*
 * Software event: cpu wall time clock
 */
7251static void cpu_clock_event_update(struct perf_event *event)
7252{
7253 s64 prev;
7254 u64 now;
7255
7256 now = local_clock();
7257 prev = local64_xchg(&event->hw.prev_count, now);
7258 local64_add(now - prev, &event->count);
7259}
7260
7261static void cpu_clock_event_start(struct perf_event *event, int flags)
7262{
7263 local64_set(&event->hw.prev_count, local_clock());
7264 perf_swevent_start_hrtimer(event);
7265}
7266
7267static void cpu_clock_event_stop(struct perf_event *event, int flags)
7268{
7269 perf_swevent_cancel_hrtimer(event);
7270 cpu_clock_event_update(event);
7271}
7272
7273static int cpu_clock_event_add(struct perf_event *event, int flags)
7274{
7275 if (flags & PERF_EF_START)
7276 cpu_clock_event_start(event, flags);
7277 perf_event_update_userpage(event);
7278
7279 return 0;
7280}
7281
7282static void cpu_clock_event_del(struct perf_event *event, int flags)
7283{
7284 cpu_clock_event_stop(event, flags);
7285}
7286
7287static void cpu_clock_event_read(struct perf_event *event)
7288{
7289 cpu_clock_event_update(event);
7290}
7291
7292static int cpu_clock_event_init(struct perf_event *event)
7293{
7294 if (event->attr.type != PERF_TYPE_SOFTWARE)
7295 return -ENOENT;
7296
7297 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
7298 return -ENOENT;
7299
	/*
	 * no branch sampling for software events
	 */
7303 if (has_branch_stack(event))
7304 return -EOPNOTSUPP;
7305
7306 perf_swevent_init_hrtimer(event);
7307
7308 return 0;
7309}
7310
7311static struct pmu perf_cpu_clock = {
7312 .task_ctx_nr = perf_sw_context,
7313
7314 .capabilities = PERF_PMU_CAP_NO_NMI,
7315
7316 .event_init = cpu_clock_event_init,
7317 .add = cpu_clock_event_add,
7318 .del = cpu_clock_event_del,
7319 .start = cpu_clock_event_start,
7320 .stop = cpu_clock_event_stop,
7321 .read = cpu_clock_event_read,
7322};
7323
/*
 * Software event: task time clock
 */
7328static void task_clock_event_update(struct perf_event *event, u64 now)
7329{
7330 u64 prev;
7331 s64 delta;
7332
7333 prev = local64_xchg(&event->hw.prev_count, now);
7334 delta = now - prev;
7335 local64_add(delta, &event->count);
7336}
7337
7338static void task_clock_event_start(struct perf_event *event, int flags)
7339{
7340 local64_set(&event->hw.prev_count, event->ctx->time);
7341 perf_swevent_start_hrtimer(event);
7342}
7343
7344static void task_clock_event_stop(struct perf_event *event, int flags)
7345{
7346 perf_swevent_cancel_hrtimer(event);
7347 task_clock_event_update(event, event->ctx->time);
7348}
7349
7350static int task_clock_event_add(struct perf_event *event, int flags)
7351{
7352 if (flags & PERF_EF_START)
7353 task_clock_event_start(event, flags);
7354 perf_event_update_userpage(event);
7355
7356 return 0;
7357}
7358
7359static void task_clock_event_del(struct perf_event *event, int flags)
7360{
7361 task_clock_event_stop(event, PERF_EF_UPDATE);
7362}
7363
7364static void task_clock_event_read(struct perf_event *event)
7365{
7366 u64 now = perf_clock();
7367 u64 delta = now - event->ctx->timestamp;
7368 u64 time = event->ctx->time + delta;
7369
7370 task_clock_event_update(event, time);
7371}
7372
7373static int task_clock_event_init(struct perf_event *event)
7374{
7375 if (event->attr.type != PERF_TYPE_SOFTWARE)
7376 return -ENOENT;
7377
7378 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
7379 return -ENOENT;
7380
	/*
	 * no branch sampling for software events
	 */
7384 if (has_branch_stack(event))
7385 return -EOPNOTSUPP;
7386
7387 perf_swevent_init_hrtimer(event);
7388
7389 return 0;
7390}
7391
7392static struct pmu perf_task_clock = {
7393 .task_ctx_nr = perf_sw_context,
7394
7395 .capabilities = PERF_PMU_CAP_NO_NMI,
7396
7397 .event_init = task_clock_event_init,
7398 .add = task_clock_event_add,
7399 .del = task_clock_event_del,
7400 .start = task_clock_event_start,
7401 .stop = task_clock_event_stop,
7402 .read = task_clock_event_read,
7403};
7404
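/*
 * Default no-op callbacks, used by perf_pmu_register() for PMUs that do not
 * provide their own enable/disable or transaction handling.
 */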
7405static void perf_pmu_nop_void(struct pmu *pmu)
7406{
7407}
7408
7409static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
7410{
7411}
7412
7413static int perf_pmu_nop_int(struct pmu *pmu)
7414{
7415 return 0;
7416}
7417
7418static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
7419
7420static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
7421{
7422 __this_cpu_write(nop_txn_flags, flags);
7423
7424 if (flags & ~PERF_PMU_TXN_ADD)
7425 return;
7426
7427 perf_pmu_disable(pmu);
7428}
7429
7430static int perf_pmu_commit_txn(struct pmu *pmu)
7431{
7432 unsigned int flags = __this_cpu_read(nop_txn_flags);
7433
7434 __this_cpu_write(nop_txn_flags, 0);
7435
7436 if (flags & ~PERF_PMU_TXN_ADD)
7437 return 0;
7438
7439 perf_pmu_enable(pmu);
7440 return 0;
7441}
7442
7443static void perf_pmu_cancel_txn(struct pmu *pmu)
7444{
7445 unsigned int flags = __this_cpu_read(nop_txn_flags);
7446
7447 __this_cpu_write(nop_txn_flags, 0);
7448
7449 if (flags & ~PERF_PMU_TXN_ADD)
7450 return;
7451
7452 perf_pmu_enable(pmu);
7453}
7454
7455static int perf_event_idx_default(struct perf_event *event)
7456{
7457 return 0;
7458}
7459
/*
 * Ensure all contexts with the same task_ctx_nr share the same
 * pmu_cpu_context.
 */
7464static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
7465{
7466 struct pmu *pmu;
7467
7468 if (ctxn < 0)
7469 return NULL;
7470
7471 list_for_each_entry(pmu, &pmus, entry) {
7472 if (pmu->task_ctx_nr == ctxn)
7473 return pmu->pmu_cpu_context;
7474 }
7475
7476 return NULL;
7477}
7478
7479static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
7480{
7481 int cpu;
7482
7483 for_each_possible_cpu(cpu) {
7484 struct perf_cpu_context *cpuctx;
7485
7486 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7487
7488 if (cpuctx->unique_pmu == old_pmu)
7489 cpuctx->unique_pmu = pmu;
7490 }
7491}
7492
7493static void free_pmu_context(struct pmu *pmu)
7494{
7495 struct pmu *i;
7496
7497 mutex_lock(&pmus_lock);
7498
	/*
	 * Drop our reference: free the per-CPU context only if no other
	 * registered PMU still shares it.
	 */
7501 list_for_each_entry(i, &pmus, entry) {
7502 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
7503 update_pmu_context(i, pmu);
7504 goto out;
7505 }
7506 }
7507
7508 free_percpu(pmu->pmu_cpu_context);
7509out:
7510 mutex_unlock(&pmus_lock);
7511}
7512static struct idr pmu_idr;
7513
7514static ssize_t
7515type_show(struct device *dev, struct device_attribute *attr, char *page)
7516{
7517 struct pmu *pmu = dev_get_drvdata(dev);
7518
7519 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
7520}
7521static DEVICE_ATTR_RO(type);
7522
7523static ssize_t
7524perf_event_mux_interval_ms_show(struct device *dev,
7525 struct device_attribute *attr,
7526 char *page)
7527{
7528 struct pmu *pmu = dev_get_drvdata(dev);
7529
7530 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
7531}
7532
7533static DEFINE_MUTEX(mux_interval_mutex);
7534
7535static ssize_t
7536perf_event_mux_interval_ms_store(struct device *dev,
7537 struct device_attribute *attr,
7538 const char *buf, size_t count)
7539{
7540 struct pmu *pmu = dev_get_drvdata(dev);
7541 int timer, cpu, ret;
7542
7543 ret = kstrtoint(buf, 0, &timer);
7544 if (ret)
7545 return ret;
7546
7547 if (timer < 1)
7548 return -EINVAL;
7549
	/* nothing to do if the value is unchanged */
7551 if (timer == pmu->hrtimer_interval_ms)
7552 return count;
7553
7554 mutex_lock(&mux_interval_mutex);
7555 pmu->hrtimer_interval_ms = timer;
7556
	/* update all cpuctx for this PMU */
7558 get_online_cpus();
7559 for_each_online_cpu(cpu) {
7560 struct perf_cpu_context *cpuctx;
7561 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7562 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
7563
7564 cpu_function_call(cpu,
7565 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
7566 }
7567 put_online_cpus();
7568 mutex_unlock(&mux_interval_mutex);
7569
7570 return count;
7571}
7572static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
7573
7574static struct attribute *pmu_dev_attrs[] = {
7575 &dev_attr_type.attr,
7576 &dev_attr_perf_event_mux_interval_ms.attr,
7577 NULL,
7578};
7579ATTRIBUTE_GROUPS(pmu_dev);
7580
7581static int pmu_bus_running;
7582static struct bus_type pmu_bus = {
7583 .name = "event_source",
7584 .dev_groups = pmu_dev_groups,
7585};
7586
7587static void pmu_dev_release(struct device *dev)
7588{
7589 kfree(dev);
7590}
7591
7592static int pmu_dev_alloc(struct pmu *pmu)
7593{
7594 int ret = -ENOMEM;
7595
7596 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
7597 if (!pmu->dev)
7598 goto out;
7599
7600 pmu->dev->groups = pmu->attr_groups;
7601 device_initialize(pmu->dev);
7602 ret = dev_set_name(pmu->dev, "%s", pmu->name);
7603 if (ret)
7604 goto free_dev;
7605
7606 dev_set_drvdata(pmu->dev, pmu);
7607 pmu->dev->bus = &pmu_bus;
7608 pmu->dev->release = pmu_dev_release;
7609 ret = device_add(pmu->dev);
7610 if (ret)
7611 goto free_dev;
7612
7613out:
7614 return ret;
7615
7616free_dev:
7617 put_device(pmu->dev);
7618 goto out;
7619}
7620
7621static struct lock_class_key cpuctx_mutex;
7622static struct lock_class_key cpuctx_lock;
7623
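/*
 * Register a new PMU: allocate its per-CPU state, assign (or allocate) a
 * type number, create its sysfs device when it is named and the bus is up,
 * and add it to the global pmus list.
 */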
7624int perf_pmu_register(struct pmu *pmu, const char *name, int type)
7625{
7626 int cpu, ret;
7627
7628 mutex_lock(&pmus_lock);
7629 ret = -ENOMEM;
7630 pmu->pmu_disable_count = alloc_percpu(int);
7631 if (!pmu->pmu_disable_count)
7632 goto unlock;
7633
7634 pmu->type = -1;
7635 if (!name)
7636 goto skip_type;
7637 pmu->name = name;
7638
7639 if (type < 0) {
7640 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
7641 if (type < 0) {
7642 ret = type;
7643 goto free_pdc;
7644 }
7645 }
7646 pmu->type = type;
7647
7648 if (pmu_bus_running) {
7649 ret = pmu_dev_alloc(pmu);
7650 if (ret)
7651 goto free_idr;
7652 }
7653
7654skip_type:
7655 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
7656 if (pmu->pmu_cpu_context)
7657 goto got_cpu_context;
7658
7659 ret = -ENOMEM;
7660 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
7661 if (!pmu->pmu_cpu_context)
7662 goto free_dev;
7663
7664 for_each_possible_cpu(cpu) {
7665 struct perf_cpu_context *cpuctx;
7666
7667 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7668 __perf_event_init_context(&cpuctx->ctx);
7669 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
7670 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
7671 cpuctx->ctx.pmu = pmu;
7672
7673 __perf_mux_hrtimer_init(cpuctx, cpu);
7674
7675 cpuctx->unique_pmu = pmu;
7676 }
7677
7678got_cpu_context:
7679 if (!pmu->start_txn) {
7680 if (pmu->pmu_enable) {
			/*
			 * If the PMU has pmu_enable/pmu_disable, install
			 * transaction stubs that use them to try to batch
			 * hardware accesses.
			 */
7686 pmu->start_txn = perf_pmu_start_txn;
7687 pmu->commit_txn = perf_pmu_commit_txn;
7688 pmu->cancel_txn = perf_pmu_cancel_txn;
7689 } else {
7690 pmu->start_txn = perf_pmu_nop_txn;
7691 pmu->commit_txn = perf_pmu_nop_int;
7692 pmu->cancel_txn = perf_pmu_nop_void;
7693 }
7694 }
7695
7696 if (!pmu->pmu_enable) {
7697 pmu->pmu_enable = perf_pmu_nop_void;
7698 pmu->pmu_disable = perf_pmu_nop_void;
7699 }
7700
7701 if (!pmu->event_idx)
7702 pmu->event_idx = perf_event_idx_default;
7703
7704 list_add_rcu(&pmu->entry, &pmus);
7705 atomic_set(&pmu->exclusive_cnt, 0);
7706 ret = 0;
7707unlock:
7708 mutex_unlock(&pmus_lock);
7709
7710 return ret;
7711
7712free_dev:
7713 device_del(pmu->dev);
7714 put_device(pmu->dev);
7715
7716free_idr:
7717 if (pmu->type >= PERF_TYPE_MAX)
7718 idr_remove(&pmu_idr, pmu->type);
7719
7720free_pdc:
7721 free_percpu(pmu->pmu_disable_count);
7722 goto unlock;
7723}
7724EXPORT_SYMBOL_GPL(perf_pmu_register);
7725
7726void perf_pmu_unregister(struct pmu *pmu)
7727{
7728 mutex_lock(&pmus_lock);
7729 list_del_rcu(&pmu->entry);
7730 mutex_unlock(&pmus_lock);
7731
	/*
	 * The pmus list is walked under both SRCU and regular RCU, so
	 * synchronize against both before freeing.
	 */
7736 synchronize_srcu(&pmus_srcu);
7737 synchronize_rcu();
7738
7739 free_percpu(pmu->pmu_disable_count);
7740 if (pmu->type >= PERF_TYPE_MAX)
7741 idr_remove(&pmu_idr, pmu->type);
7742 device_del(pmu->dev);
7743 put_device(pmu->dev);
7744 free_pmu_context(pmu);
7745}
7746EXPORT_SYMBOL_GPL(perf_pmu_unregister);
7747
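/*
 * Try to initialize @event against @pmu; takes a module reference and drops
 * it again if the PMU's event_init() rejects the event.
 */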
7748static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7749{
7750 struct perf_event_context *ctx = NULL;
7751 int ret;
7752
7753 if (!try_module_get(pmu->module))
7754 return -ENODEV;
7755
7756 if (event->group_leader != event) {
		/*
		 * This ctx->mutex can nest when we're called through
		 * inheritance. See the perf_event_ctx_lock_nested() comment.
		 */
7761 ctx = perf_event_ctx_lock_nested(event->group_leader,
7762 SINGLE_DEPTH_NESTING);
7763 BUG_ON(!ctx);
7764 }
7765
7766 event->pmu = pmu;
7767 ret = pmu->event_init(event);
7768
7769 if (ctx)
7770 perf_event_ctx_unlock(event->group_leader, ctx);
7771
7772 if (ret)
7773 module_put(pmu->module);
7774
7775 return ret;
7776}
7777
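/*
 * Find a pmu willing to handle @event: first try a direct idr lookup on
 * attr.type, then offer the event to each registered pmu in turn until one
 * accepts it; any error other than -ENOENT ends the search.
 */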
7778static struct pmu *perf_init_event(struct perf_event *event)
7779{
7780 struct pmu *pmu = NULL;
7781 int idx;
7782 int ret;
7783
7784 idx = srcu_read_lock(&pmus_srcu);
7785
7786 rcu_read_lock();
7787 pmu = idr_find(&pmu_idr, event->attr.type);
7788 rcu_read_unlock();
7789 if (pmu) {
7790 ret = perf_try_init_event(pmu, event);
7791 if (ret)
7792 pmu = ERR_PTR(ret);
7793 goto unlock;
7794 }
7795
7796 list_for_each_entry_rcu(pmu, &pmus, entry) {
7797 ret = perf_try_init_event(pmu, event);
7798 if (!ret)
7799 goto unlock;
7800
7801 if (ret != -ENOENT) {
7802 pmu = ERR_PTR(ret);
7803 goto unlock;
7804 }
7805 }
7806 pmu = ERR_PTR(-ENOENT);
7807unlock:
7808 srcu_read_unlock(&pmus_srcu, idx);
7809
7810 return pmu;
7811}
7812
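/*
 * Account a freshly created (non-inherited) event in the global and per-cpu
 * bookkeeping: the static keys for the context-switch hooks and the nr_*
 * counters consulted on the side-band record paths.
 */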
7813static void account_event_cpu(struct perf_event *event, int cpu)
7814{
7815 if (event->parent)
7816 return;
7817
7818 if (is_cgroup_event(event))
7819 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7820}
7821
7822static void account_event(struct perf_event *event)
7823{
7824 if (event->parent)
7825 return;
7826
7827 if (event->attach_state & PERF_ATTACH_TASK)
7828 static_key_slow_inc(&perf_sched_events.key);
7829 if (event->attr.mmap || event->attr.mmap_data)
7830 atomic_inc(&nr_mmap_events);
7831 if (event->attr.comm)
7832 atomic_inc(&nr_comm_events);
7833 if (event->attr.task)
7834 atomic_inc(&nr_task_events);
7835 if (event->attr.freq) {
7836 if (atomic_inc_return(&nr_freq_events) == 1)
7837 tick_nohz_full_kick_all();
7838 }
7839 if (event->attr.context_switch) {
7840 atomic_inc(&nr_switch_events);
7841 static_key_slow_inc(&perf_sched_events.key);
7842 }
7843 if (has_branch_stack(event))
7844 static_key_slow_inc(&perf_sched_events.key);
7845 if (is_cgroup_event(event))
7846 static_key_slow_inc(&perf_sched_events.key);
7847
7848 account_event_cpu(event, event->cpu);
7849}
7850
/*
 * Allocate and initialize an event structure.
 */
7854static struct perf_event *
7855perf_event_alloc(struct perf_event_attr *attr, int cpu,
7856 struct task_struct *task,
7857 struct perf_event *group_leader,
7858 struct perf_event *parent_event,
7859 perf_overflow_handler_t overflow_handler,
7860 void *context, int cgroup_fd)
7861{
7862 struct pmu *pmu;
7863 struct perf_event *event;
7864 struct hw_perf_event *hwc;
7865 long err = -EINVAL;
7866
7867 if ((unsigned)cpu >= nr_cpu_ids) {
7868 if (!task || cpu != -1)
7869 return ERR_PTR(-EINVAL);
7870 }
7871
7872 event = kzalloc(sizeof(*event), GFP_KERNEL);
7873 if (!event)
7874 return ERR_PTR(-ENOMEM);
7875
	/*
	 * Single events are their own group leaders, with an
	 * empty sibling list:
	 */
7880 if (!group_leader)
7881 group_leader = event;
7882
7883 mutex_init(&event->child_mutex);
7884 INIT_LIST_HEAD(&event->child_list);
7885
7886 INIT_LIST_HEAD(&event->group_entry);
7887 INIT_LIST_HEAD(&event->event_entry);
7888 INIT_LIST_HEAD(&event->sibling_list);
7889 INIT_LIST_HEAD(&event->rb_entry);
7890 INIT_LIST_HEAD(&event->active_entry);
7891 INIT_HLIST_NODE(&event->hlist_entry);
7892
7893
7894 init_waitqueue_head(&event->waitq);
7895 init_irq_work(&event->pending, perf_pending_event);
7896
7897 mutex_init(&event->mmap_mutex);
7898
7899 atomic_long_set(&event->refcount, 1);
7900 event->cpu = cpu;
7901 event->attr = *attr;
7902 event->group_leader = group_leader;
7903 event->pmu = NULL;
7904 event->oncpu = -1;
7905
7906 event->parent = parent_event;
7907
7908 event->ns = get_pid_ns(task_active_pid_ns(current));
7909 event->id = atomic64_inc_return(&perf_event_id);
7910
7911 event->state = PERF_EVENT_STATE_INACTIVE;
7912
7913 if (task) {
7914 event->attach_state = PERF_ATTACH_TASK;
		/*
		 * pmu::event_init needs to know which task to account to,
		 * but at this point we do not have a ctx yet, so record the
		 * target task on the event itself.
		 */
7920 event->hw.target = task;
7921 }
7922
7923 event->clock = &local_clock;
7924 if (parent_event)
7925 event->clock = parent_event->clock;
7926
7927 if (!overflow_handler && parent_event) {
7928 overflow_handler = parent_event->overflow_handler;
7929 context = parent_event->overflow_handler_context;
7930 }
7931
7932 event->overflow_handler = overflow_handler;
7933 event->overflow_handler_context = context;
7934
7935 perf_event__state_init(event);
7936
7937 pmu = NULL;
7938
7939 hwc = &event->hw;
7940 hwc->sample_period = attr->sample_period;
7941 if (attr->freq && attr->sample_freq)
7942 hwc->sample_period = 1;
7943 hwc->last_period = hwc->sample_period;
7944
7945 local64_set(&hwc->period_left, hwc->sample_period);
7946
	/*
	 * We currently do not support PERF_FORMAT_GROUP on inherited events.
	 */
7950 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
7951 goto err_ns;
7952
7953 if (!has_branch_stack(event))
7954 event->attr.branch_sample_type = 0;
7955
7956 if (cgroup_fd != -1) {
7957 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
7958 if (err)
7959 goto err_ns;
7960 }
7961
7962 pmu = perf_init_event(event);
7963 if (!pmu)
7964 goto err_ns;
7965 else if (IS_ERR(pmu)) {
7966 err = PTR_ERR(pmu);
7967 goto err_ns;
7968 }
7969
7970 err = exclusive_event_init(event);
7971 if (err)
7972 goto err_pmu;
7973
7974 if (!event->parent) {
7975 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7976 err = get_callchain_buffers();
7977 if (err)
7978 goto err_per_task;
7979 }
7980 }
7981
7982 return event;
7983
7984err_per_task:
7985 exclusive_event_destroy(event);
7986
7987err_pmu:
7988 if (event->destroy)
7989 event->destroy(event);
7990 module_put(pmu->module);
7991err_ns:
7992 if (is_cgroup_event(event))
7993 perf_detach_cgroup(event);
7994 if (event->ns)
7995 put_pid_ns(event->ns);
7996 kfree(event);
7997
7998 return ERR_PTR(err);
7999}
8000
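/*
 * Copy a versioned perf_event_attr from userspace. A structure shorter than
 * ours is zero-extended; a larger one is only accepted if all the bytes we
 * do not know about are zero, so that new userspace cannot silently rely on
 * attributes this kernel does not implement.
 */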
8001static int perf_copy_attr(struct perf_event_attr __user *uattr,
8002 struct perf_event_attr *attr)
8003{
8004 u32 size;
8005 int ret;
8006
8007 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
8008 return -EFAULT;
8009
	/*
	 * Zero the full structure, so that a short copy will be nice.
	 */
8013 memset(attr, 0, sizeof(*attr));
8014
8015 ret = get_user(size, &uattr->size);
8016 if (ret)
8017 return ret;
8018
8019 if (size > PAGE_SIZE)
8020 goto err_size;
8021
8022 if (!size)
8023 size = PERF_ATTR_SIZE_VER0;
8024
8025 if (size < PERF_ATTR_SIZE_VER0)
8026 goto err_size;
8027
	/*
	 * If we're handed a bigger struct than we know of, ensure all the
	 * unknown bits are 0 - i.e. new user-space does not rely on any
	 * kernel feature extensions we don't know about yet.
	 */
8034 if (size > sizeof(*attr)) {
8035 unsigned char __user *addr;
8036 unsigned char __user *end;
8037 unsigned char val;
8038
8039 addr = (void __user *)uattr + sizeof(*attr);
8040 end = (void __user *)uattr + size;
8041
8042 for (; addr < end; addr++) {
8043 ret = get_user(val, addr);
8044 if (ret)
8045 return ret;
8046 if (val)
8047 goto err_size;
8048 }
8049 size = sizeof(*attr);
8050 }
8051
8052 ret = copy_from_user(attr, uattr, size);
8053 if (ret)
8054 return -EFAULT;
8055
8056 if (attr->__reserved_1)
8057 return -EINVAL;
8058
8059 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
8060 return -EINVAL;
8061
8062 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
8063 return -EINVAL;
8064
8065 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
8066 u64 mask = attr->branch_sample_type;
8067
		/* only using defined bits */
		if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
			return -EINVAL;

		/* at least one branch bit must be set */
		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
			return -EINVAL;

		/* propagate priv level, when not set for branch */
		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {

			/* exclude_kernel checked on syscall entry */
			if (!attr->exclude_kernel)
				mask |= PERF_SAMPLE_BRANCH_KERNEL;

			if (!attr->exclude_user)
				mask |= PERF_SAMPLE_BRANCH_USER;

			if (!attr->exclude_hv)
				mask |= PERF_SAMPLE_BRANCH_HV;

			/* adjust user setting (for HW filter setup) */
			attr->branch_sample_type = mask;
		}

		/* privileged levels capture (kernel, hv): check permissions */
		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
8095 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
8096 return -EACCES;
8097 }
8098
8099 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
8100 ret = perf_reg_validate(attr->sample_regs_user);
8101 if (ret)
8102 return ret;
8103 }
8104
8105 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
8106 if (!arch_perf_have_user_stack_dump())
8107 return -ENOSYS;
8108
		/*
		 * We have a __u32 type for the size, but so far we can only
		 * use __u16 as the maximum due to the __u16 sample size
		 * limit.
		 */
8114 if (attr->sample_stack_user >= USHRT_MAX)
8115 ret = -EINVAL;
8116 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
8117 ret = -EINVAL;
8118 }
8119
8120 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
8121 ret = perf_reg_validate(attr->sample_regs_intr);
8122out:
8123 return ret;
8124
8125err_size:
8126 put_user(sizeof(*attr), &uattr->size);
8127 ret = -E2BIG;
8128 goto out;
8129}
8130
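/*
 * Redirect @event's output into @output_event's ring buffer (or detach it
 * again when @output_event is NULL). The two events must be compatible:
 * same cpu or same task context, same clock, and the same PMU when both
 * generate AUX data.
 */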
8131static int
8132perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
8133{
8134 struct ring_buffer *rb = NULL;
8135 int ret = -EINVAL;
8136
8137 if (!output_event)
8138 goto set;
8139
	/* don't allow circular references */
	if (event == output_event)
		goto out;

	/*
	 * Don't allow cross-cpu buffers.
	 */
	if (output_event->cpu != event->cpu)
		goto out;

	/*
	 * If it's not a per-cpu ring buffer, it must be the same task.
	 */
	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
		goto out;

	/*
	 * Mixing clocks in the same buffer is trouble you don't need.
	 */
	if (output_event->clock != event->clock)
		goto out;

	/*
	 * If both events generate aux data, they must be on the same PMU.
	 */
	if (has_aux(event) && has_aux(output_event) &&
	    event->pmu != output_event->pmu)
		goto out;

set:
	mutex_lock(&event->mmap_mutex);
	/* can't redirect output if we've got an active mmap() */
	if (atomic_read(&event->mmap_count))
		goto unlock;

	if (output_event) {
		/* get the rb we want to redirect to */
		rb = ring_buffer_get(output_event);
		if (!rb)
			goto unlock;
	}
8181
8182 ring_buffer_attach(event, rb);
8183
8184 ret = 0;
8185unlock:
8186 mutex_unlock(&event->mmap_mutex);
8187
8188out:
8189 return ret;
8190}
8191
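/*
 * Lock two context mutexes in a fixed (address) order so that two callers
 * locking the same pair from opposite ends cannot deadlock.
 */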
8192static void mutex_lock_double(struct mutex *a, struct mutex *b)
8193{
8194 if (b < a)
8195 swap(a, b);
8196
8197 mutex_lock(a);
8198 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
8199}
8200
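/*
 * Select the clock used to timestamp this event's records. Clocks that are
 * not NMI safe are only allowed on PMUs that never deliver events from NMI
 * context (PERF_PMU_CAP_NO_NMI).
 */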
8201static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
8202{
8203 bool nmi_safe = false;
8204
8205 switch (clk_id) {
8206 case CLOCK_MONOTONIC:
8207 event->clock = &ktime_get_mono_fast_ns;
8208 nmi_safe = true;
8209 break;
8210
8211 case CLOCK_MONOTONIC_RAW:
8212 event->clock = &ktime_get_raw_fast_ns;
8213 nmi_safe = true;
8214 break;
8215
8216 case CLOCK_REALTIME:
8217 event->clock = &ktime_get_real_ns;
8218 break;
8219
8220 case CLOCK_BOOTTIME:
8221 event->clock = &ktime_get_boot_ns;
8222 break;
8223
8224 case CLOCK_TAI:
8225 event->clock = &ktime_get_tai_ns;
8226 break;
8227
8228 default:
8229 return -EINVAL;
8230 }
8231
8232 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
8233 return -EINVAL;
8234
8235 return 0;
8236}
8237
8238
/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader event fd
 */
8246SYSCALL_DEFINE5(perf_event_open,
8247 struct perf_event_attr __user *, attr_uptr,
8248 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
8249{
8250 struct perf_event *group_leader = NULL, *output_event = NULL;
8251 struct perf_event *event, *sibling;
8252 struct perf_event_attr attr;
8253 struct perf_event_context *ctx, *uninitialized_var(gctx);
8254 struct file *event_file = NULL;
8255 struct fd group = {NULL, 0};
8256 struct task_struct *task = NULL;
8257 struct pmu *pmu;
8258 int event_fd;
8259 int move_group = 0;
8260 int err;
8261 int f_flags = O_RDWR;
8262 int cgroup_fd = -1;
8263
	/* for future expandability... */
	if (flags & ~PERF_FLAG_ALL)
8266 return -EINVAL;
8267
8268 err = perf_copy_attr(attr_uptr, &attr);
8269 if (err)
8270 return err;
8271
8272 if (!attr.exclude_kernel) {
8273 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
8274 return -EACCES;
8275 }
8276
8277 if (attr.freq) {
8278 if (attr.sample_freq > sysctl_perf_event_sample_rate)
8279 return -EINVAL;
8280 } else {
8281 if (attr.sample_period & (1ULL << 63))
8282 return -EINVAL;
8283 }
8284
8285
	/*
	 * In cgroup mode, the pid argument is used to pass the fd
	 * opened to the cgroup directory in cgroupfs. The cpu argument
	 * designates the cpu on which to monitor threads from that
	 * cgroup.
	 */
8291 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
8292 return -EINVAL;
8293
8294 if (flags & PERF_FLAG_FD_CLOEXEC)
8295 f_flags |= O_CLOEXEC;
8296
8297 event_fd = get_unused_fd_flags(f_flags);
8298 if (event_fd < 0)
8299 return event_fd;
8300
8301 if (group_fd != -1) {
8302 err = perf_fget_light(group_fd, &group);
8303 if (err)
8304 goto err_fd;
8305 group_leader = group.file->private_data;
8306 if (flags & PERF_FLAG_FD_OUTPUT)
8307 output_event = group_leader;
8308 if (flags & PERF_FLAG_FD_NO_GROUP)
8309 group_leader = NULL;
8310 }
8311
8312 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
8313 task = find_lively_task_by_vpid(pid);
8314 if (IS_ERR(task)) {
8315 err = PTR_ERR(task);
8316 goto err_group_fd;
8317 }
8318 }
8319
8320 if (task && group_leader &&
8321 group_leader->attr.inherit != attr.inherit) {
8322 err = -EINVAL;
8323 goto err_task;
8324 }
8325
8326 get_online_cpus();
8327
8328 if (flags & PERF_FLAG_PID_CGROUP)
8329 cgroup_fd = pid;
8330
8331 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
8332 NULL, NULL, cgroup_fd);
8333 if (IS_ERR(event)) {
8334 err = PTR_ERR(event);
8335 goto err_cpus;
8336 }
8337
8338 if (is_sampling_event(event)) {
8339 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
8340 err = -ENOTSUPP;
8341 goto err_alloc;
8342 }
8343 }
8344
8345 account_event(event);
8346
	/*
	 * Special case software events and allow them to be part of
	 * any hardware group.
	 */
8351 pmu = event->pmu;
8352
8353 if (attr.use_clockid) {
8354 err = perf_event_set_clock(event, attr.clockid);
8355 if (err)
8356 goto err_alloc;
8357 }
8358
8359 if (group_leader &&
8360 (is_software_event(event) != is_software_event(group_leader))) {
8361 if (is_software_event(event)) {
			/*
			 * If event and group_leader are not both a software
			 * event, and event is, then the group leader is not.
			 *
			 * Allow the addition of software events to !software
			 * groups; this is safe because software events never
			 * fail to schedule.
			 */
8370 pmu = group_leader->pmu;
8371 } else if (is_software_event(group_leader) &&
8372 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
			/*
			 * In case the group is a pure software group, and we
			 * try to add a hardware event, move the whole group to
			 * the hardware context.
			 */
8378 move_group = 1;
8379 }
8380 }
8381
	/*
	 * Get the target context (task or percpu):
	 */
8385 ctx = find_get_context(pmu, task, event);
8386 if (IS_ERR(ctx)) {
8387 err = PTR_ERR(ctx);
8388 goto err_alloc;
8389 }
8390
8391 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
8392 err = -EBUSY;
8393 goto err_context;
8394 }
8395
8396 if (task) {
8397 put_task_struct(task);
8398 task = NULL;
8399 }
8400
8401
8402
8403
8404 if (group_leader) {
8405 err = -EINVAL;
8406
8407
8408
8409
8410
8411 if (group_leader->group_leader != group_leader)
8412 goto err_context;
8413
		/* All events in a group should have the same clock */
		if (group_leader->clock != event->clock)
8416 goto err_context;
8417
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (move_group) {
8423
8424
8425
8426
8427 if (group_leader->ctx->task != ctx->task)
8428 goto err_context;
8429
			/*
			 * Make sure we're both events for the same CPU;
			 * grouping events for different CPUs is broken, since
			 * you can never concurrently schedule them anyhow.
			 */
			if (group_leader->cpu != event->cpu)
8436 goto err_context;
8437 } else {
8438 if (group_leader->ctx != ctx)
8439 goto err_context;
8440 }
8441
		/*
		 * Only a group leader can be exclusive or pinned.
		 */
		if (attr.exclusive || attr.pinned)
8446 goto err_context;
8447 }
8448
8449 if (output_event) {
8450 err = perf_event_set_output(event, output_event);
8451 if (err)
8452 goto err_context;
8453 }
8454
8455 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
8456 f_flags);
8457 if (IS_ERR(event_file)) {
8458 err = PTR_ERR(event_file);
8459 goto err_context;
8460 }
8461
8462 if (move_group) {
8463 gctx = group_leader->ctx;
8464 mutex_lock_double(&gctx->mutex, &ctx->mutex);
8465 } else {
8466 mutex_lock(&ctx->mutex);
8467 }
8468
8469 if (!perf_event_validate_size(event)) {
8470 err = -E2BIG;
8471 goto err_locked;
8472 }
8473
8474
8475
8476
8477
8478 if (!exclusive_event_installable(event, ctx)) {
8479
8480 WARN_ON_ONCE(move_group);
8481
8482 err = -EBUSY;
8483 goto err_locked;
8484 }
8485
8486 WARN_ON_ONCE(ctx->parent_ctx);
8487
8488 if (move_group) {
		/*
		 * See perf_event_ctx_lock() for comments on the details
		 * of swizzling perf_event::ctx.
		 */
		perf_remove_from_context(group_leader, false);
8494
8495 list_for_each_entry(sibling, &group_leader->sibling_list,
8496 group_entry) {
8497 perf_remove_from_context(sibling, false);
8498 put_ctx(gctx);
8499 }
8500
8501
		/*
		 * Wait for everybody to stop referencing the events through
		 * the old lists, before installing them on the new lists.
		 */
		synchronize_rcu();
8506
8507
		/*
		 * Install the group siblings before the group leader.
		 *
		 * Because a group leader will try and install the entire group
		 * (through the sibling list, which is still intact), we can
		 * end up with siblings installed in the wrong context.
		 *
		 * Installing the siblings first avoids that.
		 */
8517 list_for_each_entry(sibling, &group_leader->sibling_list,
8518 group_entry) {
8519 perf_event__state_init(sibling);
8520 perf_install_in_context(ctx, sibling, sibling->cpu);
8521 get_ctx(ctx);
8522 }
8523
8524
		/*
		 * Removing from the context ends up with a disabled
		 * event. What we want here is an event in the initial
		 * startup state, ready to be added into the new context.
		 */
8529 perf_event__state_init(group_leader);
8530 perf_install_in_context(ctx, group_leader, group_leader->cpu);
8531 get_ctx(ctx);
8532
8533
8534
8535
8536
8537
8538 put_ctx(gctx);
8539 }
8540
8541
	/*
	 * Precalculate sample_data sizes; do this while holding ctx::mutex
	 * such that we're serialized against further additions and before
	 * perf_install_in_context(), which is the point the event becomes
	 * active and can use these values.
	 */
8547 perf_event__header_size(event);
8548 perf_event__id_header_size(event);
8549
8550 perf_install_in_context(ctx, event, event->cpu);
8551 perf_unpin_context(ctx);
8552
8553 if (move_group)
8554 mutex_unlock(&gctx->mutex);
8555 mutex_unlock(&ctx->mutex);
8556
8557 put_online_cpus();
8558
8559 event->owner = current;
8560
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);
8564
8565
8566
8567
8568
8569
8570
8571 fdput(group);
8572 fd_install(event_fd, event_file);
8573 return event_fd;
8574
8575err_locked:
8576 if (move_group)
8577 mutex_unlock(&gctx->mutex);
8578 mutex_unlock(&ctx->mutex);
8579
8580 fput(event_file);
8581err_context:
8582 perf_unpin_context(ctx);
8583 put_ctx(ctx);
8584err_alloc:
8585 free_event(event);
8586err_cpus:
8587 put_online_cpus();
8588err_task:
8589 if (task)
8590 put_task_struct(task);
8591err_group_fd:
8592 fdput(group);
8593err_fd:
8594 put_unused_fd(event_fd);
8595 return err;
8596}
8597
8598
/**
 * perf_event_create_kernel_counter
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu on which the counter is bound
 * @task: task to profile (NULL for percpu)
 */
8605struct perf_event *
8606perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
8607 struct task_struct *task,
8608 perf_overflow_handler_t overflow_handler,
8609 void *context)
8610{
8611 struct perf_event_context *ctx;
8612 struct perf_event *event;
8613 int err;
8614
8615
8616
8617
8618
8619 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
8620 overflow_handler, context, -1);
8621 if (IS_ERR(event)) {
8622 err = PTR_ERR(event);
8623 goto err;
8624 }
8625
	/* Mark owner so we could distinguish it from user events. */
	event->owner = EVENT_OWNER_KERNEL;
8628
8629 account_event(event);
8630
8631 ctx = find_get_context(event->pmu, task, event);
8632 if (IS_ERR(ctx)) {
8633 err = PTR_ERR(ctx);
8634 goto err_free;
8635 }
8636
8637 WARN_ON_ONCE(ctx->parent_ctx);
8638 mutex_lock(&ctx->mutex);
8639 if (!exclusive_event_installable(event, ctx)) {
8640 mutex_unlock(&ctx->mutex);
8641 perf_unpin_context(ctx);
8642 put_ctx(ctx);
8643 err = -EBUSY;
8644 goto err_free;
8645 }
8646
8647 perf_install_in_context(ctx, event, cpu);
8648 perf_unpin_context(ctx);
8649 mutex_unlock(&ctx->mutex);
8650
8651 return event;
8652
8653err_free:
8654 free_event(event);
8655err:
8656 return ERR_PTR(err);
8657}
8658EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
8659
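/*
 * Move every event of @pmu from @src_cpu's context over to @dst_cpu's
 * context. Events are detached first and re-installed in two passes,
 * siblings before group leaders, so a leader never schedules a group whose
 * siblings are still on the old CPU.
 */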
8660void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
8661{
8662 struct perf_event_context *src_ctx;
8663 struct perf_event_context *dst_ctx;
8664 struct perf_event *event, *tmp;
8665 LIST_HEAD(events);
8666
8667 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
8668 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
8669
	/*
	 * See perf_event_ctx_lock() for comments on the details
	 * of swizzling perf_event::ctx.
	 */
8674 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
8675 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
8676 event_entry) {
8677 perf_remove_from_context(event, false);
8678 unaccount_event_cpu(event, src_cpu);
8679 put_ctx(src_ctx);
8680 list_add(&event->migrate_entry, &events);
8681 }
8682
8683
	/*
	 * Wait for the events to quiesce before re-instating them.
	 */
	synchronize_rcu();
8687
	/*
	 * Re-instate events in 2 passes.
	 *
	 * Skip over group leaders and only install siblings on this first
	 * pass: siblings will not get enabled without a leader, however a
	 * leader will enable its siblings, even if those are still on the
	 * old context.
	 */
8696 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8697 if (event->group_leader == event)
8698 continue;
8699
8700 list_del(&event->migrate_entry);
8701 if (event->state >= PERF_EVENT_STATE_OFF)
8702 event->state = PERF_EVENT_STATE_INACTIVE;
8703 account_event_cpu(event, dst_cpu);
8704 perf_install_in_context(dst_ctx, event, dst_cpu);
8705 get_ctx(dst_ctx);
8706 }
8707
8708
	/*
	 * Once all the siblings are set up properly, install the group
	 * leaders to make it go.
	 */
8712 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8713 list_del(&event->migrate_entry);
8714 if (event->state >= PERF_EVENT_STATE_OFF)
8715 event->state = PERF_EVENT_STATE_INACTIVE;
8716 account_event_cpu(event, dst_cpu);
8717 perf_install_in_context(dst_ctx, event, dst_cpu);
8718 get_ctx(dst_ctx);
8719 }
8720 mutex_unlock(&dst_ctx->mutex);
8721 mutex_unlock(&src_ctx->mutex);
8722}
8723EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
8724
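/*
 * Fold a dying child event's count and times back into its parent and take
 * the child off the parent's child_list.
 */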
8725static void sync_child_event(struct perf_event *child_event,
8726 struct task_struct *child)
8727{
8728 struct perf_event *parent_event = child_event->parent;
8729 u64 child_val;
8730
8731 if (child_event->attr.inherit_stat)
8732 perf_event_read_event(child_event, child);
8733
8734 child_val = perf_event_count(child_event);
8735
	/*
	 * Add back the child's count to the parent's count:
	 */
8739 atomic64_add(child_val, &parent_event->child_count);
8740 atomic64_add(child_event->total_time_enabled,
8741 &parent_event->child_total_time_enabled);
8742 atomic64_add(child_event->total_time_running,
8743 &parent_event->child_total_time_running);
8744
8745
8746
8747
8748 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8749 mutex_lock(&parent_event->child_mutex);
8750 list_del_init(&child_event->child_list);
8751 mutex_unlock(&parent_event->child_mutex);
8752
8753
	/*
	 * Make sure user/parent get notified, that we just
	 * lost one event.
	 */
	perf_event_wakeup(parent_event);
8758
8759
	/*
	 * Release the parent event, if this was the last
	 * reference to it.
	 */
	put_event(parent_event);
8764}
8765
8766static void
8767__perf_event_exit_task(struct perf_event *child_event,
8768 struct perf_event_context *child_ctx,
8769 struct task_struct *child)
8770{
8771
	/*
	 * Do not destroy the 'original' grouping; because of the context
	 * switch optimization the original events could've ended up in a
	 * random child task.
	 *
	 * If we were to destroy the original group, all group related
	 * operations would cease to function properly after this random
	 * child dies.
	 *
	 * Do destroy all inherited groups, we don't care about events
	 * and such.
	 */
	perf_remove_from_context(child_event, !!child_event->parent);
8784
8785
	/*
	 * It can happen that the parent exits first, and has events
	 * that are still around due to the child reference. These
	 * events need to be zapped.
	 */
	if (child_event->parent) {
8791 sync_child_event(child_event, child);
8792 free_event(child_event);
8793 } else {
8794 child_event->state = PERF_EVENT_STATE_EXIT;
8795 perf_event_wakeup(child_event);
8796 }
8797}
8798
8799static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
8800{
8801 struct perf_event *child_event, *next;
8802 struct perf_event_context *child_ctx, *clone_ctx = NULL;
8803 unsigned long flags;
8804
8805 if (likely(!child->perf_event_ctxp[ctxn]))
8806 return;
8807
8808 local_irq_save(flags);
8809
8810
8811
8812
8813
8814
8815 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
8816
8817
8818
8819
8820
8821
8822 raw_spin_lock(&child_ctx->lock);
8823 task_ctx_sched_out(child_ctx);
8824 child->perf_event_ctxp[ctxn] = NULL;
8825
8826
8827
8828
8829
8830
8831 clone_ctx = unclone_ctx(child_ctx);
8832 update_context_time(child_ctx);
8833 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
8834
8835 if (clone_ctx)
8836 put_ctx(clone_ctx);
8837
8838
8839
8840
8841
8842
8843 perf_event_task(child, child_ctx, 0);
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855 mutex_lock(&child_ctx->mutex);
8856
8857 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
8858 __perf_event_exit_task(child_event, child_ctx, child);
8859
8860 mutex_unlock(&child_ctx->mutex);
8861
8862 put_ctx(child_ctx);
8863}
8864
/*
 * When a child task exits, feed back event values to parent events.
 */
8868void perf_event_exit_task(struct task_struct *child)
8869{
8870 struct perf_event *event, *tmp;
8871 int ctxn;
8872
8873 mutex_lock(&child->perf_event_mutex);
8874 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
8875 owner_entry) {
8876 list_del_init(&event->owner_entry);
8877
		/*
		 * Ensure the list deletion is visible before we clear
		 * the owner; this closes a race against perf_release()
		 * where we need to serialize on the owner->perf_event_mutex.
		 */
		smp_wmb();
8884 event->owner = NULL;
8885 }
8886 mutex_unlock(&child->perf_event_mutex);
8887
8888 for_each_task_context_nr(ctxn)
8889 perf_event_exit_task_context(child, ctxn);
8890
8891
	/*
	 * perf_event_exit_task_context() calls perf_event_task() with the
	 * child's task_ctx, which generates EXIT events for child contexts
	 * and sets child->perf_event_ctxp[] to NULL. At this point we still
	 * need to send EXIT events to the cpu contexts.
	 */
8897 perf_event_task(child, NULL, 0);
8898}
8899
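/*
 * Free an inherited event that was never exposed to userspace: unlink it
 * from its parent and from its context, then destroy it.
 */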
8900static void perf_free_event(struct perf_event *event,
8901 struct perf_event_context *ctx)
8902{
8903 struct perf_event *parent = event->parent;
8904
8905 if (WARN_ON_ONCE(!parent))
8906 return;
8907
8908 mutex_lock(&parent->child_mutex);
8909 list_del_init(&event->child_list);
8910 mutex_unlock(&parent->child_mutex);
8911
8912 put_event(parent);
8913
8914 raw_spin_lock_irq(&ctx->lock);
8915 perf_group_detach(event);
8916 list_del_event(event, ctx);
8917 raw_spin_unlock_irq(&ctx->lock);
8918 free_event(event);
8919}
8920
8921
/*
 * Free an unexposed, unused context as created by inheritance by
 * perf_event_init_task below, used by fork() in case of fail.
 *
 * Not all locks are strictly required, but take them anyway to be nice and
 * help out with the lockdep assertions.
 */
8928void perf_event_free_task(struct task_struct *task)
8929{
8930 struct perf_event_context *ctx;
8931 struct perf_event *event, *tmp;
8932 int ctxn;
8933
8934 for_each_task_context_nr(ctxn) {
8935 ctx = task->perf_event_ctxp[ctxn];
8936 if (!ctx)
8937 continue;
8938
8939 mutex_lock(&ctx->mutex);
8940again:
8941 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
8942 group_entry)
8943 perf_free_event(event, ctx);
8944
8945 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
8946 group_entry)
8947 perf_free_event(event, ctx);
8948
8949 if (!list_empty(&ctx->pinned_groups) ||
8950 !list_empty(&ctx->flexible_groups))
8951 goto again;
8952
8953 mutex_unlock(&ctx->mutex);
8954
8955 put_ctx(ctx);
8956 }
8957}
8958
8959void perf_event_delayed_put(struct task_struct *task)
8960{
8961 int ctxn;
8962
8963 for_each_task_context_nr(ctxn)
8964 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8965}
8966
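/*
 * Resolve @fd to its perf event and take a reference on it for an in-kernel
 * user; the caller is responsible for dropping the reference again.
 */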
8967struct perf_event *perf_event_get(unsigned int fd)
8968{
8969 int err;
8970 struct fd f;
8971 struct perf_event *event;
8972
8973 err = perf_fget_light(fd, &f);
8974 if (err)
8975 return ERR_PTR(err);
8976
8977 event = f.file->private_data;
8978 atomic_long_inc(&event->refcount);
8979 fdput(f);
8980
8981 return event;
8982}
8983
8984const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
8985{
8986 if (!event)
8987 return ERR_PTR(-EINVAL);
8988
8989 return &event->attr;
8990}
8991
/*
 * Inherit an event from parent task to child task.
 *
 * Returns NULL when the parent event is already gone (orphaned), and an
 * ERR_PTR() on allocation failure.
 */
8995static struct perf_event *
8996inherit_event(struct perf_event *parent_event,
8997 struct task_struct *parent,
8998 struct perf_event_context *parent_ctx,
8999 struct task_struct *child,
9000 struct perf_event *group_leader,
9001 struct perf_event_context *child_ctx)
9002{
9003 enum perf_event_active_state parent_state = parent_event->state;
9004 struct perf_event *child_event;
9005 unsigned long flags;
9006
	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_event->parent)
9014 parent_event = parent_event->parent;
9015
9016 child_event = perf_event_alloc(&parent_event->attr,
9017 parent_event->cpu,
9018 child,
9019 group_leader, parent_event,
9020 NULL, NULL, -1);
9021 if (IS_ERR(child_event))
9022 return child_event;
9023
9024 if (is_orphaned_event(parent_event) ||
9025 !atomic_long_inc_not_zero(&parent_event->refcount)) {
9026 free_event(child_event);
9027 return NULL;
9028 }
9029
9030 get_ctx(child_ctx);
9031
	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit. We hold the parent's mutex,
	 * so we won't race with perf_event_{en,dis}able_family.
	 */
	if (parent_state >= PERF_EVENT_STATE_INACTIVE)
9038 child_event->state = PERF_EVENT_STATE_INACTIVE;
9039 else
9040 child_event->state = PERF_EVENT_STATE_OFF;
9041
9042 if (parent_event->attr.freq) {
9043 u64 sample_period = parent_event->hw.sample_period;
9044 struct hw_perf_event *hwc = &child_event->hw;
9045
9046 hwc->sample_period = sample_period;
9047 hwc->last_period = sample_period;
9048
9049 local64_set(&hwc->period_left, sample_period);
9050 }
9051
9052 child_event->ctx = child_ctx;
9053 child_event->overflow_handler = parent_event->overflow_handler;
9054 child_event->overflow_handler_context
9055 = parent_event->overflow_handler_context;
9056
9057
9058
9059
9060 perf_event__header_size(child_event);
9061 perf_event__id_header_size(child_event);
9062
9063
9064
9065
9066 raw_spin_lock_irqsave(&child_ctx->lock, flags);
9067 add_event_to_ctx(child_event, child_ctx);
9068 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
9069
9070
9071
9072
9073 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
9074 mutex_lock(&parent_event->child_mutex);
9075 list_add_tail(&child_event->child_list, &parent_event->child_list);
9076 mutex_unlock(&parent_event->child_mutex);
9077
9078 return child_event;
9079}
9080
9081static int inherit_group(struct perf_event *parent_event,
9082 struct task_struct *parent,
9083 struct perf_event_context *parent_ctx,
9084 struct task_struct *child,
9085 struct perf_event_context *child_ctx)
9086{
9087 struct perf_event *leader;
9088 struct perf_event *sub;
9089 struct perf_event *child_ctr;
9090
9091 leader = inherit_event(parent_event, parent, parent_ctx,
9092 child, NULL, child_ctx);
9093 if (IS_ERR(leader))
9094 return PTR_ERR(leader);
9095 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
9096 child_ctr = inherit_event(sub, parent, parent_ctx,
9097 child, leader, child_ctx);
9098 if (IS_ERR(child_ctr))
9099 return PTR_ERR(child_ctr);
9100 }
9101 return 0;
9102}
9103
9104static int
9105inherit_task_group(struct perf_event *event, struct task_struct *parent,
9106 struct perf_event_context *parent_ctx,
9107 struct task_struct *child, int ctxn,
9108 int *inherited_all)
9109{
9110 int ret;
9111 struct perf_event_context *child_ctx;
9112
9113 if (!event->attr.inherit) {
9114 *inherited_all = 0;
9115 return 0;
9116 }
9117
9118 child_ctx = child->perf_event_ctxp[ctxn];
9119 if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
9127 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
9128 if (!child_ctx)
9129 return -ENOMEM;
9130
9131 child->perf_event_ctxp[ctxn] = child_ctx;
9132 }
9133
9134 ret = inherit_group(event, parent, parent_ctx,
9135 child, child_ctx);
9136
9137 if (ret)
9138 *inherited_all = 0;
9139
9140 return ret;
9141}
9142
/*
 * Initialize the perf_event context in task_struct.
 */
9146static int perf_event_init_context(struct task_struct *child, int ctxn)
9147{
9148 struct perf_event_context *child_ctx, *parent_ctx;
9149 struct perf_event_context *cloned_ctx;
9150 struct perf_event *event;
9151 struct task_struct *parent = current;
9152 int inherited_all = 1;
9153 unsigned long flags;
9154 int ret = 0;
9155
9156 if (likely(!parent->perf_event_ctxp[ctxn]))
9157 return 0;
9158
	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
9163 parent_ctx = perf_pin_task_context(parent, ctxn);
9164 if (!parent_ctx)
9165 return 0;
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178 mutex_lock(&parent_ctx->mutex);
9179
9180
9181
9182
9183
9184 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
9185 ret = inherit_task_group(event, parent, parent_ctx,
9186 child, ctxn, &inherited_all);
9187 if (ret)
9188 break;
9189 }
9190
9191
	/*
	 * We can't hold ctx->lock when iterating the ->flexible_groups list
	 * due to allocations, but we need to prevent rotation because
	 * rotate_ctx() will change the list from interrupt context.
	 */
9196 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
9197 parent_ctx->rotate_disable = 1;
9198 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
9199
9200 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
9201 ret = inherit_task_group(event, parent, parent_ctx,
9202 child, ctxn, &inherited_all);
9203 if (ret)
9204 break;
9205 }
9206
9207 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
9208 parent_ctx->rotate_disable = 0;
9209
9210 child_ctx = child->perf_event_ctxp[ctxn];
9211
9212 if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 *
		 * Note that if the parent is a clone, the holding of
		 * parent_ctx->lock avoids it from being uncloned.
		 */
9220 cloned_ctx = parent_ctx->parent_ctx;
9221 if (cloned_ctx) {
9222 child_ctx->parent_ctx = cloned_ctx;
9223 child_ctx->parent_gen = parent_ctx->parent_gen;
9224 } else {
9225 child_ctx->parent_ctx = parent_ctx;
9226 child_ctx->parent_gen = parent_ctx->generation;
9227 }
9228 get_ctx(child_ctx->parent_ctx);
9229 }
9230
9231 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
9232 mutex_unlock(&parent_ctx->mutex);
9233
9234 perf_unpin_context(parent_ctx);
9235 put_ctx(parent_ctx);
9236
9237 return ret;
9238}
9239
/*
 * Initialize the perf_event context in a task_struct:
 */
9243int perf_event_init_task(struct task_struct *child)
9244{
9245 int ctxn, ret;
9246
9247 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
9248 mutex_init(&child->perf_event_mutex);
9249 INIT_LIST_HEAD(&child->perf_event_list);
9250
9251 for_each_task_context_nr(ctxn) {
9252 ret = perf_event_init_context(child, ctxn);
9253 if (ret) {
9254 perf_event_free_task(child);
9255 return ret;
9256 }
9257 }
9258
9259 return 0;
9260}
9261
9262static void __init perf_event_init_all_cpus(void)
9263{
9264 struct swevent_htable *swhash;
9265 int cpu;
9266
9267 for_each_possible_cpu(cpu) {
9268 swhash = &per_cpu(swevent_htable, cpu);
9269 mutex_init(&swhash->hlist_mutex);
9270 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
9271 }
9272}
9273
9274static void perf_event_init_cpu(int cpu)
9275{
9276 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9277
9278 mutex_lock(&swhash->hlist_mutex);
9279 if (swhash->hlist_refcount > 0) {
9280 struct swevent_hlist *hlist;
9281
9282 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
9283 WARN_ON(!hlist);
9284 rcu_assign_pointer(swhash->swevent_hlist, hlist);
9285 }
9286 mutex_unlock(&swhash->hlist_mutex);
9287}
9288
9289#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
9290static void __perf_event_exit_context(void *__info)
9291{
9292 struct remove_event re = { .detach_group = true };
9293 struct perf_event_context *ctx = __info;
9294
9295 rcu_read_lock();
9296 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
9297 __perf_remove_from_context(&re);
9298 rcu_read_unlock();
9299}
9300
9301static void perf_event_exit_cpu_context(int cpu)
9302{
9303 struct perf_event_context *ctx;
9304 struct pmu *pmu;
9305 int idx;
9306
9307 idx = srcu_read_lock(&pmus_srcu);
9308 list_for_each_entry_rcu(pmu, &pmus, entry) {
9309 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
9310
9311 mutex_lock(&ctx->mutex);
9312 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
9313 mutex_unlock(&ctx->mutex);
9314 }
9315 srcu_read_unlock(&pmus_srcu, idx);
9316}
9317
9318static void perf_event_exit_cpu(int cpu)
9319{
9320 perf_event_exit_cpu_context(cpu);
9321}
9322#else
9323static inline void perf_event_exit_cpu(int cpu) { }
9324#endif
9325
9326static int
9327perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
9328{
9329 int cpu;
9330
9331 for_each_online_cpu(cpu)
9332 perf_event_exit_cpu(cpu);
9333
9334 return NOTIFY_OK;
9335}
9336
/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
9341static struct notifier_block perf_reboot_notifier = {
9342 .notifier_call = perf_reboot,
9343 .priority = INT_MIN,
9344};
9345
9346static int
9347perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
9348{
9349 unsigned int cpu = (long)hcpu;
9350
9351 switch (action & ~CPU_TASKS_FROZEN) {
9352
9353 case CPU_UP_PREPARE:
9354 case CPU_DOWN_FAILED:
9355 perf_event_init_cpu(cpu);
9356 break;
9357
9358 case CPU_UP_CANCELED:
9359 case CPU_DOWN_PREPARE:
9360 perf_event_exit_cpu(cpu);
9361 break;
9362 default:
9363 break;
9364 }
9365
9366 return NOTIFY_OK;
9367}
9368
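/*
 * Early init: register the built-in software and clock PMUs, the tracepoint
 * PMU, the cpu hotplug and reboot notifiers, and the hw_breakpoint layer.
 */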
9369void __init perf_event_init(void)
9370{
9371 int ret;
9372
9373 idr_init(&pmu_idr);
9374
9375 perf_event_init_all_cpus();
9376 init_srcu_struct(&pmus_srcu);
9377 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
9378 perf_pmu_register(&perf_cpu_clock, NULL, -1);
9379 perf_pmu_register(&perf_task_clock, NULL, -1);
9380 perf_tp_register();
9381 perf_cpu_notifier(perf_cpu_notify);
9382 register_reboot_notifier(&perf_reboot_notifier);
9383
9384 ret = init_hw_breakpoint();
9385 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
9386
	/* do not patch jump label more than once per second */
	jump_label_rate_limit(&perf_sched_events, HZ);
9389
	/*
	 * Build-time assertion that we keep the data_head at the intended
	 * location; i.e. validation that we got the __reserved[] size right.
	 */
9394 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
9395 != 1024);
9396}
9397
9398ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
9399 char *page)
9400{
9401 struct perf_pmu_events_attr *pmu_attr =
9402 container_of(attr, struct perf_pmu_events_attr, attr);
9403
9404 if (pmu_attr->event_str)
9405 return sprintf(page, "%s\n", pmu_attr->event_str);
9406
9407 return 0;
9408}
9409
9410static int __init perf_event_sysfs_init(void)
9411{
9412 struct pmu *pmu;
9413 int ret;
9414
9415 mutex_lock(&pmus_lock);
9416
9417 ret = bus_register(&pmu_bus);
9418 if (ret)
9419 goto unlock;
9420
9421 list_for_each_entry(pmu, &pmus, entry) {
9422 if (!pmu->name || pmu->type < 0)
9423 continue;
9424
9425 ret = pmu_dev_alloc(pmu);
9426 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
9427 }
9428 pmu_bus_running = 1;
9429 ret = 0;
9430
9431unlock:
9432 mutex_unlock(&pmus_lock);
9433
9434 return ret;
9435}
9436device_initcall(perf_event_sysfs_init);
9437
9438#ifdef CONFIG_CGROUP_PERF
9439static struct cgroup_subsys_state *
9440perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
9441{
9442 struct perf_cgroup *jc;
9443
9444 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
9445 if (!jc)
9446 return ERR_PTR(-ENOMEM);
9447
9448 jc->info = alloc_percpu(struct perf_cgroup_info);
9449 if (!jc->info) {
9450 kfree(jc);
9451 return ERR_PTR(-ENOMEM);
9452 }
9453
9454 return &jc->css;
9455}
9456
9457static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
9458{
9459 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
9460
9461 free_percpu(jc->info);
9462 kfree(jc);
9463}
9464
9465static int __perf_cgroup_move(void *info)
9466{
9467 struct task_struct *task = info;
9468 rcu_read_lock();
9469 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
9470 rcu_read_unlock();
9471 return 0;
9472}
9473
9474static void perf_cgroup_attach(struct cgroup_taskset *tset)
9475{
9476 struct task_struct *task;
9477 struct cgroup_subsys_state *css;
9478
9479 cgroup_taskset_for_each(task, css, tset)
9480 task_function_call(task, __perf_cgroup_move, task);
9481}
9482
9483struct cgroup_subsys perf_event_cgrp_subsys = {
9484 .css_alloc = perf_cgroup_css_alloc,
9485 .css_free = perf_cgroup_css_free,
9486 .attach = perf_cgroup_attach,
9487};
9488#endif
9489