/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>

#include "internal.h"

#include <asm/irq_regs.h>
static struct workqueue_struct *perf_wq;

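/*
 * A minimal map of the cross-CPU call machinery that follows: a
 * remote_function_call bundles a function, its argument and an optional
 * target task so remote_function() can report back through ->ret.  The
 * ->ret field starts out as an error code and is only overwritten once
 * the callback really ran on the intended CPU/task.
 */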
struct remote_function_call {
	struct task_struct	*p;
	int			(*func)(void *info);
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		tfc->ret = -EAGAIN;
		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.
 *
 * returns: @func return value, or
 *	    -ESRCH  - when the process isn't running
 *	    -EAGAIN - when the process moved away
 */
static int
task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -ESRCH, /* No such (running) process */
	};

	if (task_curr(p))
		smp_call_function_single(task_cpu(p), remote_function, &data, 1);

	return data.ret;
}

/**
 * cpu_function_call - call a function on the cpu
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

#define EVENT_OWNER_KERNEL ((void *) -1)

static bool is_kernel_event(struct perf_event *event)
{
	return event->owner == EVENT_OWNER_KERNEL;
}

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
		       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
	(PERF_SAMPLE_BRANCH_KERNEL |\
	 PERF_SAMPLE_BRANCH_HV)

enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;

static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS;

static int perf_sample_allowed_ns __read_mostly =
	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

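/*
 * update_perf_cpu_limits() derives the per-sample NMI-time budget from
 * the current sample period and kernel.perf_cpu_time_max_percent.  For
 * example, with the default 10us sample period (1e9 / 100000) and the
 * default 25%, a sample may burn at most 2500ns before
 * perf_sample_event_took() below starts halving the sample rate.
 */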
void update_perf_cpu_limits(void)
{
	u64 tmp = perf_sample_period_ns;

	tmp *= sysctl_perf_cpu_time_max_percent;
	do_div(tmp, 100);
	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
}

static int perf_rotate_context(struct perf_cpu_context *cpuctx);

int perf_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
	update_perf_cpu_limits();

	return 0;
}

int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp,
				loff_t *ppos)
{
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	update_perf_cpu_limits();

	return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);

static void perf_duration_warn(struct irq_work *w)
{
	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
	u64 avg_local_sample_len;
	u64 local_samples_len;

	local_samples_len = __this_cpu_read(running_sample_length);
	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;

	printk_ratelimited(KERN_WARNING
			"perf interrupt took too long (%lld > %lld), lowering "
			"kernel.perf_event_max_sample_rate to %d\n",
			avg_local_sample_len, allowed_ns >> 1,
			sysctl_perf_event_sample_rate);
}

static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

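/*
 * perf_sample_event_took() maintains a decaying average of recent
 * sample lengths: each new sample displaces 1/NR_ACCUMULATED_SAMPLES of
 * the accumulated length, so sustained overruns trigger throttling
 * while a single slow sample does not.
 */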
void perf_sample_event_took(u64 sample_len_ns)
{
	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
	u64 avg_local_sample_len;
	u64 local_samples_len;

	if (allowed_ns == 0)
		return;

	/* decay the counter by 1 average sample */
	local_samples_len = __this_cpu_read(running_sample_length);
	local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
	local_samples_len += sample_len_ns;
	__this_cpu_write(running_sample_length, local_samples_len);

	/*
	 * note: this will be biased artificially low until we have
	 * seen NR_ACCUMULATED_SAMPLES. Doing this the naive way
	 * keeps us from having to maintain a count.
	 */
	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;

	if (avg_local_sample_len <= allowed_ns)
		return;

	if (max_samples_per_tick <= 1)
		return;

	max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

	update_perf_cpu_limits();

	if (!irq_work_queue(&perf_duration_work)) {
		early_printk("perf interrupt took too long (%lld > %lld), lowering "
			     "kernel.perf_event_max_sample_rate to %d\n",
			     avg_local_sample_len, allowed_ns >> 1,
			     sysctl_perf_event_sample_rate);
	}
}

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type,
			     struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void) { }

extern __weak const char *perf_pmu_name(void)
{
	return "pmu";
}

static inline u64 perf_clock(void)
{
	return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
	return event->clock();
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx)
		raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		raw_spin_unlock(&ctx->lock);
	raw_spin_unlock(&cpuctx->ctx.lock);
}
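
/*
 * Note the nesting order above: the per-cpu context lock is always
 * taken first and the task context lock (when present) nests inside
 * it; perf_ctx_unlock() releases in the opposite order.
 */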

#ifdef CONFIG_CGROUP_PERF

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/* @event doesn't care about cgroup */
	if (!event->cgrp)
		return true;

	/* wants specific cgroup scope but @cpuctx isn't associated with any */
	if (!cpuctx->cgrp)
		return false;

	/*
	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
	 * also enabled for all its descendant cgroups.  If @cpuctx's
	 * cgroup is a descendant of @event's (the test covers identity
	 * case), it's a match.
	 */
	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
				    event->cgrp->css.cgroup);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
	css_put(&event->cgrp->css);
	event->cgrp = NULL;
}

static inline int is_cgroup_event(struct perf_event *event)
{
	return event->cgrp != NULL;
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	struct perf_cgroup_info *t;

	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	return t->time;
}

static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
	struct perf_cgroup_info *info;
	u64 now;

	now = perf_clock();

	info = this_cpu_ptr(cgrp->info);

	info->time += now - info->timestamp;
	info->timestamp = now;
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
	if (cgrp_out)
		__update_cgrp_time(cgrp_out);
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
	struct perf_cgroup *cgrp;

	/*
	 * ensure we access cgroup data only when needed and
	 * when we know the cgroup is pinned (css_get)
	 */
	if (!is_cgroup_event(event))
		return;

	cgrp = perf_cgroup_from_task(current);
	/*
	 * Do not update time when cgroup is not active
	 */
	if (cgrp == event->cgrp)
		__update_cgrp_time(event->cgrp);
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
	struct perf_cgroup *cgrp;
	struct perf_cgroup_info *info;

	/*
	 * ctx->lock held by caller
	 * ensure we do not access cgroup data
	 * unless we have the cgroup pinned (css_get)
	 */
	if (!task || !ctx->nr_cgroups)
		return;

	cgrp = perf_cgroup_from_task(task);
	info = this_cpu_ptr(cgrp->info);
	info->timestamp = ctx->timestamp;
}

#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN : schedule in based on cgroup for next
 */
void perf_cgroup_switch(struct task_struct *task, int mode)
{
	struct perf_cpu_context *cpuctx;
	struct pmu *pmu;
	unsigned long flags;

	/*
	 * disable interrupts to avoid getting nr_cgroup
	 * changes via __perf_event_task_sched_out. Also
	 * avoids preemption.
	 */
	local_irq_save(flags);

	/*
	 * we reschedule only in the presence of cgroup
	 * constrained events.
	 */
	rcu_read_lock();

	list_for_each_entry_rcu(pmu, &pmus, entry) {
		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
		if (cpuctx->unique_pmu != pmu)
			continue; /* ensure we process each cpuctx once */

		/*
		 * perf_cgroup_events says at least one
		 * context on this CPU has cgroup events.
		 *
		 * ctx->nr_cgroups reports the number of cgroup
		 * events for a context.
		 */
		if (cpuctx->ctx.nr_cgroups > 0) {
			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
			perf_pmu_disable(cpuctx->ctx.pmu);

			if (mode & PERF_CGROUP_SWOUT) {
				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
				/*
				 * must not be done before ctxswout due
				 * to event_filter_match() in event_sched_out()
				 */
				cpuctx->cgrp = NULL;
			}

			if (mode & PERF_CGROUP_SWIN) {
				WARN_ON_ONCE(cpuctx->cgrp);
				/*
				 * set cgrp before ctxsw in to allow
				 * event_filter_match() to not have to pass
				 * task around
				 */
				cpuctx->cgrp = perf_cgroup_from_task(task);
				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
			}
			perf_pmu_enable(cpuctx->ctx.pmu);
			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
		}
	}

	rcu_read_unlock();

	local_irq_restore(flags);
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	/*
	 * we come here when we know perf_cgroup_events > 0
	 */
	cgrp1 = perf_cgroup_from_task(task);

	/*
	 * next is NULL when called from perf_event_enable_on_exec()
	 * that will systematically cause a cgroup_switch()
	 */
	if (next)
		cgrp2 = perf_cgroup_from_task(next);

	/*
	 * only schedule out current cgroup events if we know
	 * that we are switching to a different cgroup. Otherwise,
	 * do not touch the cgroup events.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	/*
	 * we come here when we know perf_cgroup_events > 0
	 */
	cgrp1 = perf_cgroup_from_task(task);

	/* prev can never be NULL */
	cgrp2 = perf_cgroup_from_task(prev);

	/*
	 * only need to schedule in cgroup events if we are changing
	 * cgroup during ctxsw. Cgroup events were not scheduled
	 * out during the previous ctxsw if that was not the case.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
}

static inline int perf_cgroup_connect(int fd, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	struct perf_cgroup *cgrp;
	struct cgroup_subsys_state *css;
	struct fd f = fdget(fd);
	int ret = 0;

	if (!f.file)
		return -EBADF;

	css = css_tryget_online_from_dir(f.file->f_path.dentry,
					 &perf_event_cgrp_subsys);
	if (IS_ERR(css)) {
		ret = PTR_ERR(css);
		goto out;
	}

	cgrp = container_of(css, struct perf_cgroup, css);
	event->cgrp = cgrp;

	/*
	 * all events in a group must monitor
	 * the same cgroup because a task belongs
	 * to only one perf cgroup at a time
	 */
	if (group_leader && group_leader->cgrp != cgrp) {
		perf_detach_cgroup(event);
		ret = -EINVAL;
	}
out:
	fdput(f);
	return ret;
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
	struct perf_cgroup_info *t;
	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	event->shadow_ctx_time = now - t->timestamp;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
	/*
	 * when the current task's perf cgroup does not match
	 * the event's, we need to remember to call the
	 * perf_mark_enable() function the first time a task with
	 * a matching perf cgroup is scheduled in.
	 */
	if (is_cgroup_event(event) && !perf_cgroup_match(event))
		event->cgrp_defer_enabled = 1;
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
			 struct perf_event_context *ctx)
{
	struct perf_event *sub;
	u64 tstamp = perf_event_time(event);

	if (!event->cgrp_defer_enabled)
		return;

	event->cgrp_defer_enabled = 0;

	event->tstamp_enabled = tstamp - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry) {
		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
			sub->cgrp_defer_enabled = 0;
		}
	}
}
#else /* !CONFIG_CGROUP_PERF */

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

static inline int is_cgroup_event(struct perf_event *event)
{
	return 0;
}

static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
{
	return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
}

static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
}

void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	return 0;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
			 struct perf_event_context *ctx)
{
}
#endif

/*
 * set default to be dependent on timer tick just
 * like original code
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * function must be called with interrupts disabled
 */
static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
{
	struct perf_cpu_context *cpuctx;
	enum hrtimer_restart ret = HRTIMER_NORESTART;
	int rotations = 0;

	WARN_ON(!irqs_disabled());

	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);

	rotations = perf_rotate_context(cpuctx);

	/*
	 * arm timer if needed
	 */
	if (rotations) {
		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
		ret = HRTIMER_RESTART;
	}

	return ret;
}

/* CPU is going down */
void perf_cpu_hrtimer_cancel(int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct pmu *pmu;
	unsigned long flags;

	if (WARN_ON(cpu != smp_processor_id()))
		return;

	local_irq_save(flags);

	rcu_read_lock();

	list_for_each_entry_rcu(pmu, &pmus, entry) {
		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

		if (pmu->task_ctx_nr == perf_sw_context)
			continue;

		hrtimer_cancel(&cpuctx->hrtimer);
	}

	rcu_read_unlock();

	local_irq_restore(flags);
}

static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
	struct hrtimer *hr = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	int timer;

	/* no multiplexing needed for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return;

	/*
	 * check default is sane, if not set then force to
	 * default interval (1/tick)
	 */
	timer = pmu->hrtimer_interval_ms;
	if (timer < 1)
		timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;

	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	hr->function = perf_cpu_hrtimer_handler;
}

static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
	struct hrtimer *hr = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;

	/* not for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return;

	if (hrtimer_active(hr))
		return;

	if (!hrtimer_callback_running(hr))
		__hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
					 0, HRTIMER_MODE_REL_PINNED, 0);
}

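/*
 * perf_pmu_disable()/perf_pmu_enable() nest: the PMU is physically
 * disabled only when the per-cpu pmu_disable_count goes 0 -> 1 and
 * re-enabled when it drops back to zero, so callers may freely nest
 * disable/enable pairs.
 */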
void perf_pmu_disable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!(*count)++)
		pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!--(*count))
		pmu->pmu_enable(pmu);
}

static DEFINE_PER_CPU(struct list_head, active_ctx_list);

/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 * disabled, while perf_event_task_tick() is called from IRQ context.
 */
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
	struct list_head *head = this_cpu_ptr(&active_ctx_list);

	WARN_ON(!irqs_disabled());

	WARN_ON(!list_empty(&ctx->active_ctx_list));

	list_add(&ctx->active_ctx_list, head);
}

static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
	WARN_ON(!irqs_disabled());

	WARN_ON(list_empty(&ctx->active_ctx_list));

	list_del_init(&ctx->active_ctx_list);
}

static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx->task_ctx_data);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group we
 * need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address. There is one other site where
 * perf_event_context::mutex nests and that is put_event(). But remember that
 * that is a parent<->child context relation, and migration does not affect
 * children, therefore these two orderings should not interact.
 *
 * The change in perf_event::ctx does not affect children (as claimed above)
 * because the sys_perf_event_open() case will install a new event and break
 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 * concerned with cpuctx and that doesn't have children.
 *
 * The places that change perf_event::ctx will issue:
 *
 *   perf_remove_from_context();
 *   synchronize_rcu();
 *   perf_install_in_context();
 *
 * to affect the change. The remove_from_context() + synchronize_rcu() should
 * quiesce the event, after which we can install it in the new location. This
 * means that only external vectors (perf_fops, prctl) can perturb the event
 * while in transit. So we don't need a locking.
 *
 * However; because event->ctx can change while we're waiting to acquire
 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 * function.
 *
 * Lock order:
 *	task_struct::perf_event_mutex
 *	  perf_event_context::mutex
 *	    perf_event_context::lock
 *	    perf_event::child_mutex;
 *	    perf_event::mmap_mutex
 *	    mmap_sem
 */
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
	struct perf_event_context *ctx;

again:
	rcu_read_lock();
	ctx = ACCESS_ONCE(event->ctx);
	if (!atomic_inc_not_zero(&ctx->refcount)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	mutex_lock_nested(&ctx->mutex, nesting);
	if (event->ctx != ctx) {
		mutex_unlock(&ctx->mutex);
		put_ctx(ctx);
		goto again;
	}

	return ctx;
}

static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
	return perf_event_ctx_lock_nested(event, 0);
}

static void perf_event_ctx_unlock(struct perf_event *event,
				  struct perf_event_context *ctx)
{
	mutex_unlock(&ctx->mutex);
	put_ctx(ctx);
}

/*
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(), therefore we cannot call put_ctx() since that might end up
 * calling scheduler related locks and ctx->lock nests inside those.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
	struct perf_event_context *parent_ctx = ctx->parent_ctx;

	lockdep_assert_held(&ctx->lock);

	if (parent_ctx)
		ctx->parent_ctx = NULL;
	ctx->generation++;

	return parent_ctx;
}

static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	return task_tgid_nr_ns(p, event->ns);
}

static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	return task_pid_nr_ns(p, event->ns);
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
	struct perf_event_context *ctx;

retry:
	/*
	 * One of the few rules of preemptible RCU is that one cannot do
	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
	 * part of the read side critical section was irqs-enabled -- see
	 * rcu_read_unlock_special().
	 *
	 * Since ctx->lock nests under rq->lock we must ensure the entire read
	 * side critical section is non-preemptible.
	 */
	preempt_disable();
	rcu_read_lock();
	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * cannot get swapped on us any more.
		 */
		raw_spin_lock_irqsave(&ctx->lock, *flags);
		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
			rcu_read_unlock();
			preempt_enable();
			goto retry;
		}

		if (!atomic_inc_not_zero(&ctx->refcount)) {
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
			ctx = NULL;
		}
	}
	rcu_read_unlock();
	preempt_enable();
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, ctxn, &flags);
	if (ctx) {
		++ctx->pin_count;
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

static u64 perf_event_time(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	if (is_cgroup_event(event))
		return perf_cgroup_event_time(event);

	return ctx ? ctx->time : 0;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 * The caller of this function needs to hold the ctx->lock.
 */
static void update_event_times(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	u64 run_end;

	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
		return;
	/*
	 * in cgroup mode, time_enabled represents
	 * the time the event was enabled AND active
	 * tasks were in the monitored cgroup. This is
	 * independent of the activity of the context as
	 * there may be a mix of cgroup and non-cgroup events.
	 *
	 * That is why we treat cgroup events differently
	 * here.
	 */
	if (is_cgroup_event(event))
		run_end = perf_cgroup_event_time(event);
	else if (ctx->is_active)
		run_end = ctx->time;
	else
		run_end = event->tstamp_stopped;

	event->total_time_enabled = run_end - event->tstamp_enabled;

	if (event->state == PERF_EVENT_STATE_INACTIVE)
		run_end = event->tstamp_stopped;
	else
		run_end = perf_event_time(event);

	event->total_time_running = run_end - event->tstamp_running;
}

/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
	struct perf_event *event;

	update_event_times(leader);
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		update_event_times(event);
}

static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
	if (event->attr.pinned)
		return &ctx->pinned_groups;
	else
		return &ctx->flexible_groups;
}

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
	event->attach_state |= PERF_ATTACH_CONTEXT;

	/*
	 * If we're a stand alone event or group leader, we go to the context
	 * list, group events are kept attached to the group so that
	 * perf_group_detach can, at all times, locate all siblings.
	 */
	if (event->group_leader == event) {
		struct list_head *list;

		if (is_software_event(event))
			event->group_flags |= PERF_GROUP_SOFTWARE;

		list = ctx_group_list(event, ctx);
		list_add_tail(&event->group_entry, list);
	}

	if (is_cgroup_event(event))
		ctx->nr_cgroups++;

	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;

	ctx->generation++;
}

/*
 * Initialize event state based on the perf_event_attr::disabled.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
					      PERF_EVENT_STATE_INACTIVE;
}

/*
 * Called at perf event creation and when events are attached/detached from a
 * group.
 */
static void perf_event__read_size(struct perf_event *event)
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_ID)
		entry += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_GROUP) {
		nr += event->group_leader->nr_siblings;
		size += sizeof(u64);
	}

	size += entry * nr;
	event->read_size = size;
}
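
/*
 * For example, a group leader with two siblings and
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID gives nr = 3,
 * entry = 16 bytes and one u64 header for 'nr', so
 * read_size = 8 + 3 * 16 = 56 bytes.
 */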

static void perf_event__header_size(struct perf_event *event)
{
	struct perf_sample_data *data;
	u64 sample_type = event->attr.sample_type;
	u16 size = 0;

	perf_event__read_size(event);

	if (sample_type & PERF_SAMPLE_IP)
		size += sizeof(data->ip);

	if (sample_type & PERF_SAMPLE_ADDR)
		size += sizeof(data->addr);

	if (sample_type & PERF_SAMPLE_PERIOD)
		size += sizeof(data->period);

	if (sample_type & PERF_SAMPLE_WEIGHT)
		size += sizeof(data->weight);

	if (sample_type & PERF_SAMPLE_READ)
		size += event->read_size;

	if (sample_type & PERF_SAMPLE_DATA_SRC)
		size += sizeof(data->data_src.val);

	if (sample_type & PERF_SAMPLE_TRANSACTION)
		size += sizeof(data->txn);

	event->header_size = size;
}

static void perf_event__id_header_size(struct perf_event *event)
{
	struct perf_sample_data *data;
	u64 sample_type = event->attr.sample_type;
	u16 size = 0;

	if (sample_type & PERF_SAMPLE_TID)
		size += sizeof(data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		size += sizeof(data->time);

	if (sample_type & PERF_SAMPLE_IDENTIFIER)
		size += sizeof(data->id);

	if (sample_type & PERF_SAMPLE_ID)
		size += sizeof(data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		size += sizeof(data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		size += sizeof(data->cpu_entry);

	event->id_header_size = size;
}

static void perf_group_attach(struct perf_event *event)
{
	struct perf_event *group_leader = event->group_leader, *pos;

	/*
	 * We can have double attach due to group movement in perf_event_open.
	 */
	if (event->attach_state & PERF_ATTACH_GROUP)
		return;

	event->attach_state |= PERF_ATTACH_GROUP;

	if (group_leader == event)
		return;

	WARN_ON_ONCE(group_leader->ctx != event->ctx);

	if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
			!is_software_event(event))
		group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;

	list_add_tail(&event->group_entry, &group_leader->sibling_list);
	group_leader->nr_siblings++;

	perf_event__header_size(group_leader);

	list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
		perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_cpu_context *cpuctx;

	WARN_ON_ONCE(event->ctx != ctx);
	lockdep_assert_held(&ctx->lock);

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
		return;

	event->attach_state &= ~PERF_ATTACH_CONTEXT;

	if (is_cgroup_event(event)) {
		ctx->nr_cgroups--;
		cpuctx = __get_cpu_context(ctx);
		/*
		 * if there are no more cgroup events then
		 * clear cgrp to avoid a stale pointer
		 * in update_cgrp_time_from_cpuctx()
		 */
		if (!ctx->nr_cgroups)
			cpuctx->cgrp = NULL;
	}

	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_rcu(&event->event_entry);

	if (event->group_leader == event)
		list_del_init(&event->group_entry);

	update_group_times(event);

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF)
		event->state = PERF_EVENT_STATE_OFF;

	ctx->generation++;
}

static void perf_group_detach(struct perf_event *event)
{
	struct perf_event *sibling, *tmp;
	struct list_head *list = NULL;

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_GROUP))
		return;

	event->attach_state &= ~PERF_ATTACH_GROUP;

	/*
	 * If this is a sibling, remove it from its group.
	 */
	if (event->group_leader != event) {
		list_del_init(&event->group_entry);
		event->group_leader->nr_siblings--;
		goto out;
	}

	if (!list_empty(&event->group_entry))
		list = &event->group_entry;

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to whatever list we were on before; the fact that we were
	 * on that list makes them so.
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
		if (list)
			list_move_tail(&sibling->group_entry, list);
		sibling->group_leader = sibling;

		/* Inherit group flags from the previous leader */
		sibling->group_flags = event->group_flags;

		WARN_ON_ONCE(sibling->ctx != event->ctx);
	}

out:
	perf_event__header_size(event->group_leader);

	list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
		perf_event__header_size(tmp);
}

/*
 * User event without the task.
 */
static bool is_orphaned_event(struct perf_event *event)
{
	return event && !is_kernel_event(event) && !event->owner;
}

/*
 * Event has a parent but the parent's task finished and it's
 * alive only because of children holding a reference.
 */
static bool is_orphaned_child(struct perf_event *event)
{
	return is_orphaned_event(event->parent);
}

static void orphans_remove_work(struct work_struct *work);

static void schedule_orphans_remove(struct perf_event_context *ctx)
{
	if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
		return;

	if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
		get_ctx(ctx);
		ctx->orphans_remove_sched = true;
	}
}

static int __init perf_workqueue_init(void)
{
	perf_wq = create_singlethread_workqueue("perf");
	WARN(!perf_wq, "failed to create perf workqueue\n");
	return perf_wq ? 0 : -1;
}

core_initcall(perf_workqueue_init);
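
/*
 * Note that schedule_orphans_remove() above takes an extra context
 * reference before queueing the delayed work; orphans_remove_work()
 * (defined later in this file) is expected to drop it once the
 * orphaned children have been reaped.
 */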

static inline int
event_filter_match(struct perf_event *event)
{
	return (event->cpu == -1 || event->cpu == smp_processor_id())
	    && perf_cgroup_match(event);
}

static void
event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	u64 tstamp = perf_event_time(event);
	u64 delta;

	WARN_ON_ONCE(event->ctx != ctx);
	lockdep_assert_held(&ctx->lock);

	/*
	 * An event which could not be activated because of
	 * filter mismatch still needs to have its timings
	 * maintained, otherwise bogus information is returned
	 * via read() for time_enabled, time_running:
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE
	    && !event_filter_match(event)) {
		delta = tstamp - event->tstamp_stopped;
		event->tstamp_running += delta;
		event->tstamp_stopped = tstamp;
	}

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	perf_pmu_disable(event->pmu);

	event->state = PERF_EVENT_STATE_INACTIVE;
	if (event->pending_disable) {
		event->pending_disable = 0;
		event->state = PERF_EVENT_STATE_OFF;
	}
	event->tstamp_stopped = tstamp;
	event->pmu->del(event, 0);
	event->oncpu = -1;

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	if (!--ctx->nr_active)
		perf_event_ctx_deactivate(ctx);
	if (event->attr.freq && event->attr.sample_freq)
		ctx->nr_freq--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;

	if (is_orphaned_child(event))
		schedule_orphans_remove(ctx);

	perf_pmu_enable(event->pmu);
}

static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;
	int state = group_event->state;

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);

	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
		cpuctx->exclusive = 0;
}

struct remove_event {
	struct perf_event *event;
	bool detach_group;
};

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static int __perf_remove_from_context(void *info)
{
	struct remove_event *re = info;
	struct perf_event *event = re->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	raw_spin_lock(&ctx->lock);
	event_sched_out(event, cpuctx, ctx);
	if (re->detach_group)
		perf_group_detach(event);
	list_del_event(event, ctx);
	if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
		ctx->is_active = 0;
		cpuctx->task_ctx = NULL;
	}
	raw_spin_unlock(&ctx->lock);

	return 0;
}

/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * CPU events are removed with a smp call. For task events we only
 * call when the task is on a CPU.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_remove_from_context(struct perf_event *event, bool detach_group)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;
	struct remove_event re = {
		.event = event,
		.detach_group = detach_group,
	};

	lockdep_assert_held(&ctx->mutex);

	if (!task) {
		/*
		 * Per cpu events are removed via an smp call. The removal can
		 * fail if the CPU is currently offline, but in that case we
		 * already called __perf_remove_from_context from
		 * perf_event_exit_cpu.
		 */
		cpu_function_call(event->cpu, __perf_remove_from_context, &re);
		return;
	}

retry:
	if (!task_function_call(task, __perf_remove_from_context, &re))
		return;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If we failed to find a running task, but find the context active
	 * now that we've acquired the ctx->lock, retry.
	 */
	if (ctx->is_active) {
		raw_spin_unlock_irq(&ctx->lock);
		/*
		 * Reload the task pointer, it might have been changed by
		 * a concurrent perf_event_context_sched_out().
		 */
		task = ctx->task;
		goto retry;
	}

	/*
	 * Since the task isn't running, it's safe to remove the event, us
	 * holding the ctx->lock ensures the task won't get scheduled in.
	 */
	if (detach_group)
		perf_group_detach(event);
	list_del_event(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to disable a performance event
 */
int __perf_event_disable(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/*
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
	 *
	 * Can trigger due to concurrent perf_event_context_sched_out()
	 * flipping contexts around.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return -EINVAL;

	raw_spin_lock(&ctx->lock);

	/*
	 * If the event is on, turn it off.
	 * If it is in error state, leave it in error state.
	 */
	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
		update_context_time(ctx);
		update_cgrp_time_from_event(event);
		update_group_times(event);
		if (event == event->group_leader)
			group_sched_out(event, cpuctx, ctx);
		else
			event_sched_out(event, cpuctx, ctx);
		event->state = PERF_EVENT_STATE_OFF;
	}

	raw_spin_unlock(&ctx->lock);

	return 0;
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
static void _perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Disable the event on the cpu that it's on
		 */
		cpu_function_call(event->cpu, __perf_event_disable, event);
		return;
	}

retry:
	if (!task_function_call(task, __perf_event_disable, event))
		return;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If the event is still active, we need to retry the cross-call.
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		raw_spin_unlock_irq(&ctx->lock);
		/*
		 * Reload the task pointer, it might have been changed by
		 * a concurrent perf_event_context_sched_out().
		 */
		task = ctx->task;
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_group_times(event);
		event->state = PERF_EVENT_STATE_OFF;
	}
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Strictly speaking kernel users cannot create groups and therefore this
 * interface does not need the perf_event_ctx_lock() magic.
 */
void perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx;

	ctx = perf_event_ctx_lock(event);
	_perf_event_disable(event);
	perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);

static void perf_set_shadow_time(struct perf_event *event,
				 struct perf_event_context *ctx,
				 u64 tstamp)
{
	/*
	 * use the correct time source for the time snapshot
	 *
	 * We could get by without this by leveraging the
	 * fact that to get to this function, the caller
	 * has most likely already called update_context_time()
	 * and update_cgrp_time_xx() and thus both timestamps
	 * are identical (or very close). Given that tstamp is
	 * already adjusted for cgroup, we could say that:
	 *    tstamp - ctx->timestamp
	 * is equivalent to
	 *    tstamp - cgrp->timestamp.
	 *
	 * Then, in perf_output_read(), the calculation would
	 * work with no changes because:
	 * - event is guaranteed scheduled in
	 * - no scheduled out in between
	 * - thus the timestamp would be the same
	 *
	 * But this is a bit hairy.
	 *
	 * So instead, we have an explicit cgroup call to remain
	 * within the time source all along. We believe it
	 * is cleaner and simpler to understand.
	 */
	if (is_cgroup_event(event))
		perf_cgroup_set_shadow_time(event, tstamp);
	else
		event->shadow_ctx_time = tstamp - ctx->timestamp;
}

#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);

static int
event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx)
{
	u64 tstamp = perf_event_time(event);
	int ret = 0;

	lockdep_assert_held(&ctx->lock);

	if (event->state <= PERF_EVENT_STATE_OFF)
		return 0;

	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = smp_processor_id();

	/*
	 * Unthrottle events, since we scheduled we might have missed several
	 * ticks already, also for a heavily scheduling task there is little
	 * guarantee it'll get a tick, if the event needs 1000 ticks, it has no
	 * way to recover:
	 */
	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
		perf_log_throttle(event, 1);
		event->hw.interrupts = 0;
	}

	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

	perf_pmu_disable(event->pmu);

	event->tstamp_running += tstamp - event->tstamp_stopped;

	perf_set_shadow_time(event, ctx, tstamp);

	perf_log_itrace_start(event);

	if (event->pmu->add(event, PERF_EF_START)) {
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
		ret = -EAGAIN;
		goto out;
	}

	if (!is_software_event(event))
		cpuctx->active_oncpu++;
	if (!ctx->nr_active++)
		perf_event_ctx_activate(ctx);
	if (event->attr.freq && event->attr.sample_freq)
		ctx->nr_freq++;

	if (event->attr.exclusive)
		cpuctx->exclusive = 1;

	if (is_orphaned_child(event))
		schedule_orphans_remove(ctx);

out:
	perf_pmu_enable(event->pmu);

	return ret;
}

static int
group_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct perf_event *event, *partial_group = NULL;
	struct pmu *pmu = ctx->pmu;
	u64 now = ctx->time;
	bool simulate = false;

	if (group_event->state == PERF_EVENT_STATE_OFF)
		return 0;

	pmu->start_txn(pmu);

	if (event_sched_in(group_event, cpuctx, ctx)) {
		pmu->cancel_txn(pmu);
		perf_cpu_hrtimer_restart(cpuctx);
		return -EAGAIN;
	}

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event_sched_in(event, cpuctx, ctx)) {
			partial_group = event;
			goto group_error;
		}
	}

	if (!pmu->commit_txn(pmu))
		return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 * The events up to the failed event are scheduled out normally,
	 * tstamp_stopped will be updated.
	 *
	 * The failed events and the remaining siblings need to have
	 * their timings updated as if they had gone thru event_sched_in()
	 * and event_sched_out(). This is required to get consistent timings
	 * across the group. This also takes care of the case where the group
	 * could never be scheduled by ensuring tstamp_stopped is set to mark
	 * the time the event was actually stopped, such that time delta
	 * calculation in update_event_times() is correct.
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
			simulate = true;

		if (simulate) {
			event->tstamp_running += now - event->tstamp_stopped;
			event->tstamp_stopped = now;
		} else {
			event_sched_out(event, cpuctx, ctx);
		}
	}
	event_sched_out(group_event, cpuctx, ctx);

	pmu->cancel_txn(pmu);

	perf_cpu_hrtimer_restart(cpuctx);

	return -EAGAIN;
}
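
/*
 * The transactional pattern above is what lets a PMU reject an
 * over-committed group atomically: start_txn() batches the pmu->add()
 * calls, commit_txn() either accepts the whole group or fails, and
 * cancel_txn() rolls back the partially scheduled members.
 */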

/*
 * Work out whether we can put this event group on the CPU now.
 */
static int group_can_go_on(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
	 * Groups consisting entirely of software events can always go on.
	 */
	if (event->group_flags & PERF_GROUP_SOFTWARE)
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * events can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * events on the CPU, it can't go on.
	 */
	if (event->attr.exclusive && cpuctx->active_oncpu)
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}

static void add_event_to_ctx(struct perf_event *event,
			       struct perf_event_context *ctx)
{
	u64 tstamp = perf_event_time(event);

	list_add_event(event, ctx);
	perf_group_attach(event);
	event->tstamp_enabled = tstamp;
	event->tstamp_running = tstamp;
	event->tstamp_stopped = tstamp;
}

static void task_ctx_sched_out(struct perf_event_context *ctx);
static void
ctx_sched_in(struct perf_event_context *ctx,
	     struct perf_cpu_context *cpuctx,
	     enum event_type_t event_type,
	     struct task_struct *task);

static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
				struct perf_event_context *ctx,
				struct task_struct *task)
{
	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
	if (ctx)
		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
	if (ctx)
		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Must be called with ctx->mutex held
 */
static int __perf_install_in_context(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	struct task_struct *task = current;

	perf_ctx_lock(cpuctx, task_ctx);
	perf_pmu_disable(cpuctx->ctx.pmu);

	/*
	 * If there was an active task_ctx schedule it out.
	 */
	if (task_ctx)
		task_ctx_sched_out(task_ctx);

	/*
	 * If the context we're installing events in is not the
	 * active task_ctx, flip them.
	 */
	if (ctx->task && task_ctx != ctx) {
		if (task_ctx)
			raw_spin_unlock(&task_ctx->lock);
		raw_spin_lock(&ctx->lock);
		task_ctx = ctx;
	}

	if (task_ctx) {
		cpuctx->task_ctx = task_ctx;
		task = task_ctx->task;
	}

	cpu_ctx_sched_out(cpuctx, EVENT_ALL);

	update_context_time(ctx);
	/*
	 * update cgrp time only if current cgrp
	 * matches event->cgrp. Must be done before
	 * calling add_event_to_ctx()
	 */
	update_cgrp_time_from_event(event);

	add_event_to_ctx(event, ctx);

	/*
	 * Schedule everything back in
	 */
	perf_event_sched_in(cpuctx, task_ctx, task);

	perf_pmu_enable(cpuctx->ctx.pmu);
	perf_ctx_unlock(cpuctx, task_ctx);

	return 0;
}

/*
 * Attach a performance event to a context
 *
 * First we add the event to the list with the hardware enable bit
 * in event->hw_config cleared.
 *
 * If the event is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
			int cpu)
{
	struct task_struct *task = ctx->task;

	lockdep_assert_held(&ctx->mutex);

	event->ctx = ctx;
	if (event->cpu != -1)
		event->cpu = cpu;

	if (!task) {
		/*
		 * Per cpu events are installed via an smp call and
		 * the install is always successful.
		 */
		cpu_function_call(cpu, __perf_install_in_context, event);
		return;
	}

retry:
	if (!task_function_call(task, __perf_install_in_context, event))
		return;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If we failed to find a running task, but find the context active
	 * now that we've acquired the ctx->lock, retry.
	 */
	if (ctx->is_active) {
		raw_spin_unlock_irq(&ctx->lock);
		/*
		 * Reload the task pointer, it might have been changed by
		 * a concurrent perf_event_context_sched_out().
		 */
		task = ctx->task;
		goto retry;
	}

	/*
	 * Since the task isn't running, it's safe to add the event, us holding
	 * the ctx->lock ensures the task won't get scheduled in.
	 */
	add_event_to_ctx(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Put an event into inactive state and update time fields.
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
static void __perf_event_mark_enabled(struct perf_event *event)
{
	struct perf_event *sub;
	u64 tstamp = perf_event_time(event);

	event->state = PERF_EVENT_STATE_INACTIVE;
	event->tstamp_enabled = tstamp - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry) {
		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
	}
}

/*
 * Cross CPU call to enable a performance event
 */
static int __perf_event_enable(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	int err;

	/*
	 * There's a time window between 'ctx->is_active' check
	 * in perf_event_enable function and this place having:
	 *   - IRQs on
	 *   - ctx->lock unlocked
	 *
	 * where the task could be killed and 'ctx' deactivated
	 * by perf_event_exit_task.
	 */
	if (!ctx->is_active)
		return -EINVAL;

	raw_spin_lock(&ctx->lock);
	update_context_time(ctx);

	if (event->state >= PERF_EVENT_STATE_INACTIVE)
		goto unlock;

	/*
	 * set current task's cgroup time reference point
	 */
	perf_cgroup_set_timestamp(current, ctx);

	__perf_event_mark_enabled(event);

	if (!event_filter_match(event)) {
		if (is_cgroup_event(event))
			perf_cgroup_defer_enabled(event);
		goto unlock;
	}

	/*
	 * If the event is in a group and isn't the group leader,
	 * then don't put it on unless the group is on.
	 */
	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
		goto unlock;

	if (!group_can_go_on(event, cpuctx, 1)) {
		err = -EEXIST;
	} else {
		if (event == leader)
			err = group_sched_in(event, cpuctx, ctx);
		else
			err = event_sched_in(event, cpuctx, ctx);
	}

	if (err) {
		/*
		 * If this event can't go on and it's part of a
		 * group, then the whole group has to come off.
		 */
		if (leader != event) {
			group_sched_out(leader, cpuctx, ctx);
			perf_cpu_hrtimer_restart(cpuctx);
		}
		if (leader->attr.pinned) {
			update_group_times(leader);
			leader->state = PERF_EVENT_STATE_ERROR;
		}
	}

unlock:
	raw_spin_unlock(&ctx->lock);

	return 0;
}

/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
static void _perf_event_enable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Enable the event on the cpu that it's on
		 */
		cpu_function_call(event->cpu, __perf_event_enable, event);
		return;
	}

	raw_spin_lock_irq(&ctx->lock);
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
		goto out;

	/*
	 * If the event is in error state, clear that first.
	 * That way, if we see the event in error state below, we
	 * know that it has gone back into error state, as distinct
	 * from the task having been scheduled away before the
	 * cross-call arrived.
	 */
	if (event->state == PERF_EVENT_STATE_ERROR)
		event->state = PERF_EVENT_STATE_OFF;

retry:
	if (!ctx->is_active) {
		__perf_event_mark_enabled(event);
		goto out;
	}

	raw_spin_unlock_irq(&ctx->lock);

	if (!task_function_call(task, __perf_event_enable, event))
		return;

	raw_spin_lock_irq(&ctx->lock);

	/*
	 * If the context is active and the event is still off,
	 * we need to retry the cross-call.
	 */
	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
		/*
		 * task could have been flipped by a concurrent
		 * perf_event_context_sched_out()
		 */
		task = ctx->task;
		goto retry;
	}

out:
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * See perf_event_disable();
 */
void perf_event_enable(struct perf_event *event)
{
	struct perf_event_context *ctx;

	ctx = perf_event_ctx_lock(event);
	_perf_event_enable(event);
	perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_enable);

static int _perf_event_refresh(struct perf_event *event, int refresh)
{
	/*
	 * not supported on inherited events
	 */
	if (event->attr.inherit || !is_sampling_event(event))
		return -EINVAL;

	atomic_add(refresh, &event->event_limit);
	_perf_event_enable(event);

	return 0;
}

/*
 * See perf_event_{en,dis}able();
 */
int perf_event_refresh(struct perf_event *event, int refresh)
{
	struct perf_event_context *ctx;
	int ret;

	ctx = perf_event_ctx_lock(event);
	ret = _perf_event_refresh(event, refresh);
	perf_event_ctx_unlock(event, ctx);

	return ret;
}
EXPORT_SYMBOL_GPL(perf_event_refresh);

static void ctx_sched_out(struct perf_event_context *ctx,
			  struct perf_cpu_context *cpuctx,
			  enum event_type_t event_type)
{
	struct perf_event *event;
	int is_active = ctx->is_active;

	ctx->is_active &= ~event_type;
	if (likely(!ctx->nr_events))
		return;

	update_context_time(ctx);
	update_cgrp_time_from_cpuctx(cpuctx);
	if (!ctx->nr_active)
		return;

	perf_pmu_disable(ctx->pmu);
	if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
			group_sched_out(event, cpuctx, ctx);
	}

	if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
			group_sched_out(event, cpuctx, ctx);
	}
	perf_pmu_enable(ctx->pmu);
}
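
/*
 * ctx->is_active is a bitmask of event_type_t above: the pinned and
 * flexible classes are tracked (and torn down) independently, and a
 * class is only scheduled out when it was both active and requested.
 */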

/*
 * Test whether two contexts are equivalent, i.e. whether they have both been
 * cloned from the same version of the same context.
 *
 * Equivalence is measured using a generation number in the context that is
 * incremented on each modification to it; see unclone_ctx(), list_add_event()
 * and list_del_event().
 */
static int context_equiv(struct perf_event_context *ctx1,
			 struct perf_event_context *ctx2)
{
	lockdep_assert_held(&ctx1->lock);
	lockdep_assert_held(&ctx2->lock);

	/* Pinning disables the swap optimization */
	if (ctx1->pin_count || ctx2->pin_count)
		return 0;

	/* If ctx1 is the parent of ctx2 */
	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
		return 1;

	/* If ctx2 is the parent of ctx1 */
	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
		return 1;

	/*
	 * If ctx1 and ctx2 have the same parent; we flatten the parent
	 * hierarchy, see perf_event_init_context().
	 */
	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
			ctx1->parent_gen == ctx2->parent_gen)
		return 1;

	/* Unmatched */
	return 0;
}

static void __perf_event_sync_stat(struct perf_event *event,
				     struct perf_event *next_event)
{
	u64 value;

	if (!event->attr.inherit_stat)
		return;

	/*
	 * Update the event value, we cannot use perf_event_read()
	 * because we're in the middle of a context switch and have IRQs
	 * disabled, which upsets smp_call_function_single(), however
	 * we know the event must be on the current CPU, therefore we
	 * can do a local read.
	 */
	switch (event->state) {
	case PERF_EVENT_STATE_ACTIVE:
		event->pmu->read(event);
		/* fall-through */

	case PERF_EVENT_STATE_INACTIVE:
		update_event_times(event);
		break;

	default:
		break;
	}

	/*
	 * In order to keep per-task stats reliable we need to flip the event
	 * values when we flip the contexts.
	 */
	value = local64_read(&next_event->count);
	value = local64_xchg(&event->count, value);
	local64_set(&next_event->count, value);

	swap(event->total_time_enabled, next_event->total_time_enabled);
	swap(event->total_time_running, next_event->total_time_running);

	/*
	 * Since we swizzled the values, update the user visible data too.
	 */
	perf_event_update_userpage(event);
	perf_event_update_userpage(next_event);
}

static void perf_event_sync_stat(struct perf_event_context *ctx,
				   struct perf_event_context *next_ctx)
{
	struct perf_event *event, *next_event;

	if (!ctx->nr_stat)
		return;

	update_context_time(ctx);

	event = list_first_entry(&ctx->event_list,
				   struct perf_event, event_entry);

	next_event = list_first_entry(&next_ctx->event_list,
					struct perf_event, event_entry);

	while (&event->event_entry != &ctx->event_list &&
	       &next_event->event_entry != &next_ctx->event_list) {

		__perf_event_sync_stat(event, next_event);

		event = list_next_entry(event, event_entry);
		next_event = list_next_entry(next_event, event_entry);
	}
}
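
/*
 * The lockstep walk above relies on cloned contexts keeping their
 * inherited events in the same list order, so each event is paired
 * with its counterpart in the other context.
 */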

static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
					 struct task_struct *next)
{
	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
	struct perf_event_context *next_ctx;
	struct perf_event_context *parent, *next_parent;
	struct perf_cpu_context *cpuctx;
	int do_switch = 1;

	if (likely(!ctx))
		return;

	cpuctx = __get_cpu_context(ctx);
	if (!cpuctx->task_ctx)
		return;

	rcu_read_lock();
	next_ctx = next->perf_event_ctxp[ctxn];
	if (!next_ctx)
		goto unlock;

	parent = rcu_dereference(ctx->parent_ctx);
	next_parent = rcu_dereference(next_ctx->parent_ctx);

	/* If neither context have a parent context; they cannot be clones. */
	if (!parent && !next_parent)
		goto unlock;

	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
		/*
		 * Looks like the two contexts are clones, so we might be
		 * able to optimize the context switch.  We lock both
		 * contexts and check that they are clones of one another;
		 * the nested annotation below keeps lockdep happy since
		 * we hold two contexts' locks at once.
		 */
		raw_spin_lock(&ctx->lock);
		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
		if (context_equiv(ctx, next_ctx)) {
			/*
			 * XXX do we need a memory barrier of sorts
			 * wrt to rcu_dereference() of perf_event_ctxp
			 */
			task->perf_event_ctxp[ctxn] = next_ctx;
			next->perf_event_ctxp[ctxn] = ctx;
			ctx->task = next;
			next_ctx->task = task;

			swap(ctx->task_ctx_data, next_ctx->task_ctx_data);

			do_switch = 0;

			perf_event_sync_stat(ctx, next_ctx);
		}
		raw_spin_unlock(&next_ctx->lock);
		raw_spin_unlock(&ctx->lock);
	}
unlock:
	rcu_read_unlock();

	if (do_switch) {
		raw_spin_lock(&ctx->lock);
		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
		cpuctx->task_ctx = NULL;
		raw_spin_unlock(&ctx->lock);
	}
}

void perf_sched_cb_dec(struct pmu *pmu)
{
	this_cpu_dec(perf_sched_cb_usages);
}

void perf_sched_cb_inc(struct pmu *pmu)
{
	this_cpu_inc(perf_sched_cb_usages);
}

/*
 * This function provides the context switch callback to the lower code
 * layer. It is invoked ONLY when the context switch callback is enabled.
 */
static void perf_pmu_sched_task(struct task_struct *prev,
				struct task_struct *next,
				bool sched_in)
{
	struct perf_cpu_context *cpuctx;
	struct pmu *pmu;
	unsigned long flags;

	if (prev == next)
		return;

	local_irq_save(flags);

	rcu_read_lock();

	list_for_each_entry_rcu(pmu, &pmus, entry) {
		if (pmu->sched_task) {
			cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

			perf_ctx_lock(cpuctx, cpuctx->task_ctx);

			perf_pmu_disable(pmu);

			pmu->sched_task(cpuctx->task_ctx, sched_in);

			perf_pmu_enable(pmu);

			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
		}
	}

	rcu_read_unlock();

	local_irq_restore(flags);
}

#define for_each_task_context_nr(ctxn)				\
	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)

/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * we remove the event from the context list.
 */
void __perf_event_task_sched_out(struct task_struct *task,
				 struct task_struct *next)
{
	int ctxn;

	if (__this_cpu_read(perf_sched_cb_usages))
		perf_pmu_sched_task(task, next, false);

	for_each_task_context_nr(ctxn)
		perf_event_context_sched_out(task, ctxn, next);

	/*
	 * if cgroup events exist on this CPU, then we need
	 * to check if we have to switch out PMU state.
	 * cgroup events are system-wide mode only
	 */
	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
		perf_cgroup_sched_out(task, next);
}

static void task_ctx_sched_out(struct perf_event_context *ctx)
{
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	if (!cpuctx->task_ctx)
		return;

	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
		return;

	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
	cpuctx->task_ctx = NULL;
}

/*
 * Called with IRQs disabled
 */
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type)
{
	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}

static void
ctx_pinned_sched_in(struct perf_event_context *ctx,
		    struct perf_cpu_context *cpuctx)
{
	struct perf_event *event;

	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
		if (event->state <= PERF_EVENT_STATE_OFF)
			continue;
		if (!event_filter_match(event))
			continue;

		/* may need to reset tstamp_enabled */
		if (is_cgroup_event(event))
			perf_cgroup_mark_enabled(event, ctx);

		if (group_can_go_on(event, cpuctx, 1))
			group_sched_in(event, cpuctx, ctx);

		/*
		 * If this pinned group hasn't been scheduled,
		 * put it in error state.
		 */
		if (event->state == PERF_EVENT_STATE_INACTIVE) {
			update_group_times(event);
			event->state = PERF_EVENT_STATE_ERROR;
		}
	}
}

static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
		      struct perf_cpu_context *cpuctx)
{
	struct perf_event *event;
	int can_add_hw = 1;

	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
		/* Ignore events in OFF or ERROR state */
		if (event->state <= PERF_EVENT_STATE_OFF)
			continue;
		/*
		 * Listen to the 'cpu' scheduling filter constraint
		 * of events:
		 */
		if (!event_filter_match(event))
			continue;

		/* may need to reset tstamp_enabled */
		if (is_cgroup_event(event))
			perf_cgroup_mark_enabled(event, ctx);

		if (group_can_go_on(event, cpuctx, can_add_hw)) {
			if (group_sched_in(event, cpuctx, ctx))
				can_add_hw = 0;
		}
	}
}
2755
2756static void
2757ctx_sched_in(struct perf_event_context *ctx,
2758 struct perf_cpu_context *cpuctx,
2759 enum event_type_t event_type,
2760 struct task_struct *task)
2761{
2762 u64 now;
2763 int is_active = ctx->is_active;
2764
2765 ctx->is_active |= event_type;
2766 if (likely(!ctx->nr_events))
2767 return;
2768
2769 now = perf_clock();
2770 ctx->timestamp = now;
2771 perf_cgroup_set_timestamp(task, ctx);
2772
2773
2774
2775
2776 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2777 ctx_pinned_sched_in(ctx, cpuctx);
2778
2779
2780 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2781 ctx_flexible_sched_in(ctx, cpuctx);
2782}
2783
2784static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2785 enum event_type_t event_type,
2786 struct task_struct *task)
2787{
2788 struct perf_event_context *ctx = &cpuctx->ctx;
2789
2790 ctx_sched_in(ctx, cpuctx, event_type, task);
2791}
2792
2793static void perf_event_context_sched_in(struct perf_event_context *ctx,
2794 struct task_struct *task)
2795{
2796 struct perf_cpu_context *cpuctx;
2797
2798 cpuctx = __get_cpu_context(ctx);
2799 if (cpuctx->task_ctx == ctx)
2800 return;
2801
2802 perf_ctx_lock(cpuctx, ctx);
2803 perf_pmu_disable(ctx->pmu);
2804
	/*
	 * We want to keep the following priority order:
	 * cpu pinned (that don't need to move), task pinned,
	 * cpu flexible, task flexible.
	 */
2809 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2810
2811 if (ctx->nr_events)
2812 cpuctx->task_ctx = ctx;
2813
2814 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2815
2816 perf_pmu_enable(ctx->pmu);
2817 perf_ctx_unlock(cpuctx, ctx);
2818}
2819
/*
 * Called from the scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * must always be faster than that, so we can't miss an NMI hit!
 */
2831void __perf_event_task_sched_in(struct task_struct *prev,
2832 struct task_struct *task)
2833{
2834 struct perf_event_context *ctx;
2835 int ctxn;
2836
2837 for_each_task_context_nr(ctxn) {
2838 ctx = task->perf_event_ctxp[ctxn];
2839 if (likely(!ctx))
2840 continue;
2841
2842 perf_event_context_sched_in(ctx, task);
2843 }
2844
	/*
	 * If cgroup events exist on this CPU, then we need
	 * to check whether we have to switch in PMU state;
	 * cgroup events are system-wide mode only.
	 */
2849 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2850 perf_cgroup_sched_in(prev, task);
2851
2852 if (__this_cpu_read(perf_sched_cb_usages))
2853 perf_pmu_sched_task(prev, task, true);
2854}
2855
2856static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2857{
2858 u64 frequency = event->attr.sample_freq;
2859 u64 sec = NSEC_PER_SEC;
2860 u64 divisor, dividend;
2861
2862 int count_fls, nsec_fls, frequency_fls, sec_fls;
2863
2864 count_fls = fls64(count);
2865 nsec_fls = fls64(nsec);
2866 frequency_fls = fls64(frequency);
2867 sec_fls = 30;
2868
	/*
	 * We got @count in @nsec, with a target of sample_freq HZ;
	 * the target period becomes:
	 *
	 *             @count * 10^9
	 * period = -------------------
	 *          @nsec * sample_freq
	 */

	/*
	 * Reduce accuracy by one bit such that @a and @b converge
	 * to a similar magnitude.
	 */
2883#define REDUCE_FLS(a, b) \
2884do { \
2885 if (a##_fls > b##_fls) { \
2886 a >>= 1; \
2887 a##_fls--; \
2888 } else { \
2889 b >>= 1; \
2890 b##_fls--; \
2891 } \
2892} while (0)
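
	/*
	 * For example, with count_fls == 40 and sec_fls == 30,
	 * REDUCE_FLS(count, sec) halves @count (the operand with more
	 * significant bits), so each use trims one bit off the larger
	 * operand and loses as little precision as possible.
	 */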
2893
	/*
	 * Reduce accuracy until either term fits in a u64, then proceed with
	 * the other, so that finally we can do a u64/u64 division.
	 */
2898 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2899 REDUCE_FLS(nsec, frequency);
2900 REDUCE_FLS(sec, count);
2901 }
2902
2903 if (count_fls + sec_fls > 64) {
2904 divisor = nsec * frequency;
2905
2906 while (count_fls + sec_fls > 64) {
2907 REDUCE_FLS(count, sec);
2908 divisor >>= 1;
2909 }
2910
2911 dividend = count * sec;
2912 } else {
2913 dividend = count * sec;
2914
2915 while (nsec_fls + frequency_fls > 64) {
2916 REDUCE_FLS(nsec, frequency);
2917 dividend >>= 1;
2918 }
2919
2920 divisor = nsec * frequency;
2921 }
2922
2923 if (!divisor)
2924 return dividend;
2925
2926 return div64_u64(dividend, divisor);
2927}
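
/*
 * Worked example (illustrative numbers): with sample_freq = 1000 Hz and
 * @count = 1000000 events over @nsec = 10^9 ns, the result is
 * period = 1000000 * 10^9 / (10^9 * 1000) = 1000 events per sample.
 */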
2928
2929static DEFINE_PER_CPU(int, perf_throttled_count);
2930static DEFINE_PER_CPU(u64, perf_throttled_seq);
2931
2932static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2933{
2934 struct hw_perf_event *hwc = &event->hw;
2935 s64 period, sample_period;
2936 s64 delta;
2937
2938 period = perf_calculate_period(event, nsec, count);
2939
2940 delta = (s64)(period - hwc->sample_period);
2941 delta = (delta + 7) / 8;
2942
2943 sample_period = hwc->sample_period + delta;
2944
2945 if (!sample_period)
2946 sample_period = 1;
2947
2948 hwc->sample_period = sample_period;
2949
2950 if (local64_read(&hwc->period_left) > 8*sample_period) {
2951 if (disable)
2952 event->pmu->stop(event, PERF_EF_UPDATE);
2953
2954 local64_set(&hwc->period_left, 0);
2955
2956 if (disable)
2957 event->pmu->start(event, PERF_EF_RELOAD);
2958 }
2959}
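
/*
 * Note: only 1/8th of the distance to the computed target period is
 * applied per call, so the sample period converges smoothly instead of
 * oscillating when the event rate fluctuates.
 */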
2960
/*
 * Combine freq adjustment with unthrottling to avoid two passes over the
 * events. At the same time, make sure that having freq events does not change
 * the rate of unthrottling, as that would introduce bias.
 */
2966static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2967 int needs_unthr)
2968{
2969 struct perf_event *event;
2970 struct hw_perf_event *hwc;
2971 u64 now, period = TICK_NSEC;
2972 s64 delta;
2973
	/*
	 * We only need to iterate over all events iff:
	 * - the context has events in frequency mode (needs freq adjust)
	 * - there are events to unthrottle on this cpu
	 */
2979 if (!(ctx->nr_freq || needs_unthr))
2980 return;
2981
2982 raw_spin_lock(&ctx->lock);
2983 perf_pmu_disable(ctx->pmu);
2984
2985 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2986 if (event->state != PERF_EVENT_STATE_ACTIVE)
2987 continue;
2988
2989 if (!event_filter_match(event))
2990 continue;
2991
2992 perf_pmu_disable(event->pmu);
2993
2994 hwc = &event->hw;
2995
2996 if (hwc->interrupts == MAX_INTERRUPTS) {
2997 hwc->interrupts = 0;
2998 perf_log_throttle(event, 1);
2999 event->pmu->start(event, 0);
3000 }
3001
3002 if (!event->attr.freq || !event->attr.sample_freq)
3003 goto next;
3004
3005
3006
3007
3008 event->pmu->stop(event, PERF_EF_UPDATE);
3009
3010 now = local64_read(&event->count);
3011 delta = now - hwc->freq_count_stamp;
3012 hwc->freq_count_stamp = now;
3013
		/*
		 * Restart the event; reload only if the value has changed.
		 * We have already stopped the event, so tell
		 * perf_adjust_period() not to stop/start it again
		 * (disable == false).
		 */
3021 if (delta > 0)
3022 perf_adjust_period(event, period, delta, false);
3023
3024 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3025 next:
3026 perf_pmu_enable(event->pmu);
3027 }
3028
3029 perf_pmu_enable(ctx->pmu);
3030 raw_spin_unlock(&ctx->lock);
3031}
3032
/*
 * Round-robin a context's events:
 */
3036static void rotate_ctx(struct perf_event_context *ctx)
3037{
	/*
	 * Rotate the first entry last of non-pinned groups. Rotation might be
	 * disabled by the inheritance code.
	 */
3042 if (!ctx->rotate_disable)
3043 list_rotate_left(&ctx->flexible_groups);
3044}
3045
3046static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3047{
3048 struct perf_event_context *ctx = NULL;
3049 int rotate = 0;
3050
3051 if (cpuctx->ctx.nr_events) {
3052 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3053 rotate = 1;
3054 }
3055
3056 ctx = cpuctx->task_ctx;
3057 if (ctx && ctx->nr_events) {
3058 if (ctx->nr_events != ctx->nr_active)
3059 rotate = 1;
3060 }
3061
3062 if (!rotate)
3063 goto done;
3064
3065 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3066 perf_pmu_disable(cpuctx->ctx.pmu);
3067
3068 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3069 if (ctx)
3070 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3071
3072 rotate_ctx(&cpuctx->ctx);
3073 if (ctx)
3074 rotate_ctx(ctx);
3075
3076 perf_event_sched_in(cpuctx, ctx, current);
3077
3078 perf_pmu_enable(cpuctx->ctx.pmu);
3079 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3080done:
3081
3082 return rotate;
3083}
3084
3085#ifdef CONFIG_NO_HZ_FULL
3086bool perf_event_can_stop_tick(void)
3087{
3088 if (atomic_read(&nr_freq_events) ||
3089 __this_cpu_read(perf_throttled_count))
3090 return false;
3091 else
3092 return true;
3093}
3094#endif
3095
3096void perf_event_task_tick(void)
3097{
3098 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3099 struct perf_event_context *ctx, *tmp;
3100 int throttled;
3101
3102 WARN_ON(!irqs_disabled());
3103
3104 __this_cpu_inc(perf_throttled_seq);
3105 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3106
3107 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3108 perf_adjust_freq_unthr_context(ctx, throttled);
3109}
3110
3111static int event_enable_on_exec(struct perf_event *event,
3112 struct perf_event_context *ctx)
3113{
3114 if (!event->attr.enable_on_exec)
3115 return 0;
3116
3117 event->attr.enable_on_exec = 0;
3118 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3119 return 0;
3120
3121 __perf_event_mark_enabled(event);
3122
3123 return 1;
3124}
3125
/*
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 */
3130static void perf_event_enable_on_exec(struct perf_event_context *ctx)
3131{
3132 struct perf_event_context *clone_ctx = NULL;
3133 struct perf_event *event;
3134 unsigned long flags;
3135 int enabled = 0;
3136 int ret;
3137
3138 local_irq_save(flags);
3139 if (!ctx || !ctx->nr_events)
3140 goto out;
3141
	/*
	 * We must ctxsw out cgroup events to avoid conflict
	 * when invoking perf_task_event_sched_in() later on
	 * in this function. Otherwise we end up trying to
	 * ctxswin cgroup events which are already scheduled
	 * in.
	 */
3149 perf_cgroup_sched_out(current, NULL);
3150
3151 raw_spin_lock(&ctx->lock);
3152 task_ctx_sched_out(ctx);
3153
3154 list_for_each_entry(event, &ctx->event_list, event_entry) {
3155 ret = event_enable_on_exec(event, ctx);
3156 if (ret)
3157 enabled = 1;
3158 }
3159
3160
3161
3162
3163 if (enabled)
3164 clone_ctx = unclone_ctx(ctx);
3165
3166 raw_spin_unlock(&ctx->lock);
3167
3168
3169
3170
3171 perf_event_context_sched_in(ctx, ctx->task);
3172out:
3173 local_irq_restore(flags);
3174
3175 if (clone_ctx)
3176 put_ctx(clone_ctx);
3177}
3178
3179void perf_event_exec(void)
3180{
3181 struct perf_event_context *ctx;
3182 int ctxn;
3183
3184 rcu_read_lock();
3185 for_each_task_context_nr(ctxn) {
3186 ctx = current->perf_event_ctxp[ctxn];
3187 if (!ctx)
3188 continue;
3189
3190 perf_event_enable_on_exec(ctx);
3191 }
3192 rcu_read_unlock();
3193}
3194
/*
 * Cross CPU call to read the hardware event.
 */
3198static void __perf_event_read(void *info)
3199{
3200 struct perf_event *event = info;
3201 struct perf_event_context *ctx = event->ctx;
3202 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3203
	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu.  If not, it has been
	 * scheduled out before the smp call arrived.  In that case
	 * event->count would have been updated to a recent sample
	 * when the event was scheduled out.
	 */
3211 if (ctx->task && cpuctx->task_ctx != ctx)
3212 return;
3213
3214 raw_spin_lock(&ctx->lock);
3215 if (ctx->is_active) {
3216 update_context_time(ctx);
3217 update_cgrp_time_from_event(event);
3218 }
3219 update_event_times(event);
3220 if (event->state == PERF_EVENT_STATE_ACTIVE)
3221 event->pmu->read(event);
3222 raw_spin_unlock(&ctx->lock);
3223}
3224
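/*
 * A PMU may supply its own count (e.g. when the value lives in hardware
 * or a shared buffer); otherwise fall back to the generic software count.
 */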
3225static inline u64 perf_event_count(struct perf_event *event)
3226{
3227 if (event->pmu->count)
3228 return event->pmu->count(event);
3229
3230 return __perf_event_count(event);
3231}
3232
3233static u64 perf_event_read(struct perf_event *event)
3234{
3235
3236
3237
3238
3239 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3240 smp_call_function_single(event->oncpu,
3241 __perf_event_read, event, 1);
3242 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3243 struct perf_event_context *ctx = event->ctx;
3244 unsigned long flags;
3245
3246 raw_spin_lock_irqsave(&ctx->lock, flags);
3247
3248
3249
3250
3251
3252 if (ctx->is_active) {
3253 update_context_time(ctx);
3254 update_cgrp_time_from_event(event);
3255 }
3256 update_event_times(event);
3257 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3258 }
3259
3260 return perf_event_count(event);
3261}
3262
3263
3264
3265
3266static void __perf_event_init_context(struct perf_event_context *ctx)
3267{
3268 raw_spin_lock_init(&ctx->lock);
3269 mutex_init(&ctx->mutex);
3270 INIT_LIST_HEAD(&ctx->active_ctx_list);
3271 INIT_LIST_HEAD(&ctx->pinned_groups);
3272 INIT_LIST_HEAD(&ctx->flexible_groups);
3273 INIT_LIST_HEAD(&ctx->event_list);
3274 atomic_set(&ctx->refcount, 1);
3275 INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3276}
3277
3278static struct perf_event_context *
3279alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3280{
3281 struct perf_event_context *ctx;
3282
3283 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3284 if (!ctx)
3285 return NULL;
3286
3287 __perf_event_init_context(ctx);
3288 if (task) {
3289 ctx->task = task;
3290 get_task_struct(task);
3291 }
3292 ctx->pmu = pmu;
3293
3294 return ctx;
3295}
3296
3297static struct task_struct *
3298find_lively_task_by_vpid(pid_t vpid)
3299{
3300 struct task_struct *task;
3301 int err;
3302
3303 rcu_read_lock();
3304 if (!vpid)
3305 task = current;
3306 else
3307 task = find_task_by_vpid(vpid);
3308 if (task)
3309 get_task_struct(task);
3310 rcu_read_unlock();
3311
3312 if (!task)
3313 return ERR_PTR(-ESRCH);
3314
3315
3316 err = -EACCES;
3317 if (!ptrace_may_access(task, PTRACE_MODE_READ))
3318 goto errout;
3319
3320 return task;
3321errout:
3322 put_task_struct(task);
3323 return ERR_PTR(err);
3324
3325}
3326
/*
 * Returns a matching context with refcount and pincount.
 */
3330static struct perf_event_context *
3331find_get_context(struct pmu *pmu, struct task_struct *task,
3332 struct perf_event *event)
3333{
3334 struct perf_event_context *ctx, *clone_ctx = NULL;
3335 struct perf_cpu_context *cpuctx;
3336 void *task_ctx_data = NULL;
3337 unsigned long flags;
3338 int ctxn, err;
3339 int cpu = event->cpu;
3340
3341 if (!task) {
3342
3343 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3344 return ERR_PTR(-EACCES);
3345
		/*
		 * We could be clever and allow to attach an event to an
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
3351 if (!cpu_online(cpu))
3352 return ERR_PTR(-ENODEV);
3353
3354 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3355 ctx = &cpuctx->ctx;
3356 get_ctx(ctx);
3357 ++ctx->pin_count;
3358
3359 return ctx;
3360 }
3361
3362 err = -EINVAL;
3363 ctxn = pmu->task_ctx_nr;
3364 if (ctxn < 0)
3365 goto errout;
3366
3367 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3368 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3369 if (!task_ctx_data) {
3370 err = -ENOMEM;
3371 goto errout;
3372 }
3373 }
3374
3375retry:
3376 ctx = perf_lock_task_context(task, ctxn, &flags);
3377 if (ctx) {
3378 clone_ctx = unclone_ctx(ctx);
3379 ++ctx->pin_count;
3380
3381 if (task_ctx_data && !ctx->task_ctx_data) {
3382 ctx->task_ctx_data = task_ctx_data;
3383 task_ctx_data = NULL;
3384 }
3385 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3386
3387 if (clone_ctx)
3388 put_ctx(clone_ctx);
3389 } else {
3390 ctx = alloc_perf_context(pmu, task);
3391 err = -ENOMEM;
3392 if (!ctx)
3393 goto errout;
3394
3395 if (task_ctx_data) {
3396 ctx->task_ctx_data = task_ctx_data;
3397 task_ctx_data = NULL;
3398 }
3399
3400 err = 0;
3401 mutex_lock(&task->perf_event_mutex);
3402
3403
3404
3405
3406 if (task->flags & PF_EXITING)
3407 err = -ESRCH;
3408 else if (task->perf_event_ctxp[ctxn])
3409 err = -EAGAIN;
3410 else {
3411 get_ctx(ctx);
3412 ++ctx->pin_count;
3413 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3414 }
3415 mutex_unlock(&task->perf_event_mutex);
3416
3417 if (unlikely(err)) {
3418 put_ctx(ctx);
3419
3420 if (err == -EAGAIN)
3421 goto retry;
3422 goto errout;
3423 }
3424 }
3425
3426 kfree(task_ctx_data);
3427 return ctx;
3428
3429errout:
3430 kfree(task_ctx_data);
3431 return ERR_PTR(err);
3432}
3433
3434static void perf_event_free_filter(struct perf_event *event);
3435static void perf_event_free_bpf_prog(struct perf_event *event);
3436
3437static void free_event_rcu(struct rcu_head *head)
3438{
3439 struct perf_event *event;
3440
3441 event = container_of(head, struct perf_event, rcu_head);
3442 if (event->ns)
3443 put_pid_ns(event->ns);
3444 perf_event_free_filter(event);
3445 kfree(event);
3446}
3447
3448static void ring_buffer_attach(struct perf_event *event,
3449 struct ring_buffer *rb);
3450
3451static void unaccount_event_cpu(struct perf_event *event, int cpu)
3452{
3453 if (event->parent)
3454 return;
3455
3456 if (is_cgroup_event(event))
3457 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3458}
3459
3460static void unaccount_event(struct perf_event *event)
3461{
3462 if (event->parent)
3463 return;
3464
3465 if (event->attach_state & PERF_ATTACH_TASK)
3466 static_key_slow_dec_deferred(&perf_sched_events);
3467 if (event->attr.mmap || event->attr.mmap_data)
3468 atomic_dec(&nr_mmap_events);
3469 if (event->attr.comm)
3470 atomic_dec(&nr_comm_events);
3471 if (event->attr.task)
3472 atomic_dec(&nr_task_events);
3473 if (event->attr.freq)
3474 atomic_dec(&nr_freq_events);
3475 if (is_cgroup_event(event))
3476 static_key_slow_dec_deferred(&perf_sched_events);
3477 if (has_branch_stack(event))
3478 static_key_slow_dec_deferred(&perf_sched_events);
3479
3480 unaccount_event_cpu(event, event->cpu);
3481}
3482
/*
 * The following implement mutual exclusion of events on "exclusive" pmus
 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
 * at a time, so we disallow creating events that might conflict, namely:
 *
 *  1) cpu-wide events in the presence of per-task events,
 *  2) per-task events in the presence of cpu-wide events,
 *  3) two matching events on the same context.
 *
 * The former two cases are handled in the allocation path (perf_event_alloc(),
 * __free_event()), the latter -- before the first perf_install_in_context().
 */
3495static int exclusive_event_init(struct perf_event *event)
3496{
3497 struct pmu *pmu = event->pmu;
3498
3499 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3500 return 0;
3501
	/*
	 * Prevent co-existence of per-task and cpu-wide events on the
	 * same exclusive pmu.
	 *
	 * Negative pmu::exclusive_cnt means there are cpu-wide
	 * events on this "exclusive" pmu, positive means there are
	 * per-task events.
	 *
	 * Since this is called in the perf_event_alloc() path, event::ctx
	 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
	 * to mean "per-task event", because unlike other attach states it
	 * never gets set in a standalone manner.
	 */
3515 if (event->attach_state & PERF_ATTACH_TASK) {
3516 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3517 return -EBUSY;
3518 } else {
3519 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3520 return -EBUSY;
3521 }
3522
3523 return 0;
3524}
3525
3526static void exclusive_event_destroy(struct perf_event *event)
3527{
3528 struct pmu *pmu = event->pmu;
3529
3530 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3531 return;
3532
3533
3534 if (event->attach_state & PERF_ATTACH_TASK)
3535 atomic_dec(&pmu->exclusive_cnt);
3536 else
3537 atomic_inc(&pmu->exclusive_cnt);
3538}
3539
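/*
 * Two events conflict only when they are on the same pmu and their CPU
 * filters overlap; cpu == -1 (any CPU) overlaps with everything.
 */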
3540static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3541{
	if ((e1->pmu == e2->pmu) &&
	    (e1->cpu == e2->cpu ||
	     e1->cpu == -1 ||
	     e2->cpu == -1))
3546 return true;
3547 return false;
3548}
3549
3550
3551static bool exclusive_event_installable(struct perf_event *event,
3552 struct perf_event_context *ctx)
3553{
3554 struct perf_event *iter_event;
3555 struct pmu *pmu = event->pmu;
3556
3557 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3558 return true;
3559
3560 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3561 if (exclusive_event_match(iter_event, event))
3562 return false;
3563 }
3564
3565 return true;
3566}
3567
3568static void __free_event(struct perf_event *event)
3569{
3570 if (!event->parent) {
3571 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3572 put_callchain_buffers();
3573 }
3574
3575 perf_event_free_bpf_prog(event);
3576
3577 if (event->destroy)
3578 event->destroy(event);
3579
3580 if (event->ctx)
3581 put_ctx(event->ctx);
3582
3583 if (event->pmu) {
3584 exclusive_event_destroy(event);
3585 module_put(event->pmu->module);
3586 }
3587
3588 call_rcu(&event->rcu_head, free_event_rcu);
3589}
3590
3591static void _free_event(struct perf_event *event)
3592{
3593 irq_work_sync(&event->pending);
3594
3595 unaccount_event(event);
3596
3597 if (event->rb) {
		/*
		 * Can happen when we close an event with re-directed output.
		 *
		 * Since we have a 0 refcount, perf_mmap_close() will skip
		 * over us; possibly making our ring_buffer_put() the last.
		 */
3604 mutex_lock(&event->mmap_mutex);
3605 ring_buffer_attach(event, NULL);
3606 mutex_unlock(&event->mmap_mutex);
3607 }
3608
3609 if (is_cgroup_event(event))
3610 perf_detach_cgroup(event);
3611
3612 __free_event(event);
3613}
3614
/*
 * Used to free events which have a known refcount of 1, such as in error paths
 * where the event isn't exposed yet and inherited events.
 */
3619static void free_event(struct perf_event *event)
3620{
3621 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3622 "unexpected event refcount: %ld; ptr=%p\n",
3623 atomic_long_read(&event->refcount), event)) {
		/* Leak the event to avoid a use-after-free. */
3625 return;
3626 }
3627
3628 _free_event(event);
3629}
3630
3631
3632
3633
3634static void perf_remove_from_owner(struct perf_event *event)
3635{
3636 struct task_struct *owner;
3637
3638 rcu_read_lock();
3639 owner = ACCESS_ONCE(event->owner);
	/*
	 * Matches the smp_store_release() in perf_event_exit_task(). If we
	 * observe !owner it means the list deletion is complete and we can
	 * indeed free this event; otherwise we need to serialize on
	 * owner->perf_event_mutex.
	 */
3646 smp_read_barrier_depends();
3647 if (owner) {
3648
3649
3650
3651
3652
3653 get_task_struct(owner);
3654 }
3655 rcu_read_unlock();
3656
3657 if (owner) {
		/*
		 * If we're here through perf_event_exit_task() we're already
		 * holding ctx->mutex, which would be an inversion wrt. the
		 * normal lock order.
		 *
		 * However we can safely take this lock because it's the
		 * child ctx->mutex.
		 */
3666 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3667
3668
3669
3670
3671
3672
3673
3674 if (event->owner)
3675 list_del_init(&event->owner_entry);
3676 mutex_unlock(&owner->perf_event_mutex);
3677 put_task_struct(owner);
3678 }
3679}
3680
3681static void put_event(struct perf_event *event)
3682{
3683 struct perf_event_context *ctx;
3684
3685 if (!atomic_long_dec_and_test(&event->refcount))
3686 return;
3687
3688 if (!is_kernel_event(event))
3689 perf_remove_from_owner(event);
3690
	/*
	 * There are two ways this annotation is useful:
	 *
	 *  1) there is a lock recursion from perf_event_exit_task
	 *     (see the comment there);
	 *
	 *  2) there is a lock-inversion with mmap_sem through
	 *     perf_event_read_group(), which takes faults while
	 *     holding ctx->mutex; however this is called after
	 *     the last filedesc died, so there is no possibility
	 *     to trigger the AB-BA case.
	 */
3703 ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
3704 WARN_ON_ONCE(ctx->parent_ctx);
3705 perf_remove_from_context(event, true);
3706 perf_event_ctx_unlock(event, ctx);
3707
3708 _free_event(event);
3709}
3710
3711int perf_event_release_kernel(struct perf_event *event)
3712{
3713 put_event(event);
3714 return 0;
3715}
3716EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3717
3718
3719
3720
3721static int perf_release(struct inode *inode, struct file *file)
3722{
3723 put_event(file->private_data);
3724 return 0;
3725}
3726
3727
3728
3729
3730static void orphans_remove_work(struct work_struct *work)
3731{
3732 struct perf_event_context *ctx;
3733 struct perf_event *event, *tmp;
3734
3735 ctx = container_of(work, struct perf_event_context,
3736 orphans_remove.work);
3737
3738 mutex_lock(&ctx->mutex);
3739 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3740 struct perf_event *parent_event = event->parent;
3741
3742 if (!is_orphaned_child(event))
3743 continue;
3744
3745 perf_remove_from_context(event, true);
3746
3747 mutex_lock(&parent_event->child_mutex);
3748 list_del_init(&event->child_list);
3749 mutex_unlock(&parent_event->child_mutex);
3750
3751 free_event(event);
3752 put_event(parent_event);
3753 }
3754
3755 raw_spin_lock_irq(&ctx->lock);
3756 ctx->orphans_remove_sched = false;
3757 raw_spin_unlock_irq(&ctx->lock);
3758 mutex_unlock(&ctx->mutex);
3759
3760 put_ctx(ctx);
3761}
3762
3763u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3764{
3765 struct perf_event *child;
3766 u64 total = 0;
3767
3768 *enabled = 0;
3769 *running = 0;
3770
3771 mutex_lock(&event->child_mutex);
3772 total += perf_event_read(event);
3773 *enabled += event->total_time_enabled +
3774 atomic64_read(&event->child_total_time_enabled);
3775 *running += event->total_time_running +
3776 atomic64_read(&event->child_total_time_running);
3777
3778 list_for_each_entry(child, &event->child_list, child_list) {
3779 total += perf_event_read(child);
3780 *enabled += child->total_time_enabled;
3781 *running += child->total_time_running;
3782 }
3783 mutex_unlock(&event->child_mutex);
3784
3785 return total;
3786}
3787EXPORT_SYMBOL_GPL(perf_event_read_value);
3788
3789static int perf_event_read_group(struct perf_event *event,
3790 u64 read_format, char __user *buf)
3791{
3792 struct perf_event *leader = event->group_leader, *sub;
3793 struct perf_event_context *ctx = leader->ctx;
3794 int n = 0, size = 0, ret;
3795 u64 count, enabled, running;
3796 u64 values[5];
3797
3798 lockdep_assert_held(&ctx->mutex);
3799
3800 count = perf_event_read_value(leader, &enabled, &running);
3801
3802 values[n++] = 1 + leader->nr_siblings;
3803 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3804 values[n++] = enabled;
3805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3806 values[n++] = running;
3807 values[n++] = count;
3808 if (read_format & PERF_FORMAT_ID)
3809 values[n++] = primary_event_id(leader);
3810
3811 size = n * sizeof(u64);
3812
3813 if (copy_to_user(buf, values, size))
3814 return -EFAULT;
3815
3816 ret = size;
3817
3818 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3819 n = 0;
3820
3821 values[n++] = perf_event_read_value(sub, &enabled, &running);
3822 if (read_format & PERF_FORMAT_ID)
3823 values[n++] = primary_event_id(sub);
3824
3825 size = n * sizeof(u64);
3826
3827 if (copy_to_user(buf + ret, values, size)) {
3828 return -EFAULT;
3829 }
3830
3831 ret += size;
3832 }
3833
3834 return ret;
3835}
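
/*
 * Roughly, the buffer layout written above (with both
 * PERF_FORMAT_TOTAL_TIME_* flags and PERF_FORMAT_ID set) is:
 *
 *	{ u64 nr;
 *	  u64 time_enabled; u64 time_running;
 *	  { u64 value; u64 id; } cntr[nr]; }
 *
 * where cntr[0] is the group leader.
 */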
3836
3837static int perf_event_read_one(struct perf_event *event,
3838 u64 read_format, char __user *buf)
3839{
3840 u64 enabled, running;
3841 u64 values[4];
3842 int n = 0;
3843
3844 values[n++] = perf_event_read_value(event, &enabled, &running);
3845 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3846 values[n++] = enabled;
3847 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3848 values[n++] = running;
3849 if (read_format & PERF_FORMAT_ID)
3850 values[n++] = primary_event_id(event);
3851
3852 if (copy_to_user(buf, values, n * sizeof(u64)))
3853 return -EFAULT;
3854
3855 return n * sizeof(u64);
3856}
3857
3858static bool is_event_hup(struct perf_event *event)
3859{
3860 bool no_children;
3861
3862 if (event->state != PERF_EVENT_STATE_EXIT)
3863 return false;
3864
3865 mutex_lock(&event->child_mutex);
3866 no_children = list_empty(&event->child_list);
3867 mutex_unlock(&event->child_mutex);
3868 return no_children;
3869}
3870
3871
3872
3873
3874static ssize_t
3875perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3876{
3877 u64 read_format = event->attr.read_format;
3878 int ret;
3879
3880
3881
3882
3883
3884
3885 if (event->state == PERF_EVENT_STATE_ERROR)
3886 return 0;
3887
3888 if (count < event->read_size)
3889 return -ENOSPC;
3890
3891 WARN_ON_ONCE(event->ctx->parent_ctx);
3892 if (read_format & PERF_FORMAT_GROUP)
3893 ret = perf_event_read_group(event, read_format, buf);
3894 else
3895 ret = perf_event_read_one(event, read_format, buf);
3896
3897 return ret;
3898}
3899
3900static ssize_t
3901perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3902{
3903 struct perf_event *event = file->private_data;
3904 struct perf_event_context *ctx;
3905 int ret;
3906
3907 ctx = perf_event_ctx_lock(event);
3908 ret = perf_read_hw(event, buf, count);
3909 perf_event_ctx_unlock(event, ctx);
3910
3911 return ret;
3912}
3913
3914static unsigned int perf_poll(struct file *file, poll_table *wait)
3915{
3916 struct perf_event *event = file->private_data;
3917 struct ring_buffer *rb;
3918 unsigned int events = POLLHUP;
3919
3920 poll_wait(file, &event->waitq, wait);
3921
3922 if (is_event_hup(event))
3923 return events;
3924
	/*
	 * Pin the event->rb by taking event->mmap_mutex; otherwise
	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
	 */
3929 mutex_lock(&event->mmap_mutex);
3930 rb = event->rb;
3931 if (rb)
3932 events = atomic_xchg(&rb->poll, 0);
3933 mutex_unlock(&event->mmap_mutex);
3934 return events;
3935}
3936
3937static void _perf_event_reset(struct perf_event *event)
3938{
3939 (void)perf_event_read(event);
3940 local64_set(&event->count, 0);
3941 perf_event_update_userpage(event);
3942}
3943
/*
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in sync_child_event if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
 */
3950static void perf_event_for_each_child(struct perf_event *event,
3951 void (*func)(struct perf_event *))
3952{
3953 struct perf_event *child;
3954
3955 WARN_ON_ONCE(event->ctx->parent_ctx);
3956
3957 mutex_lock(&event->child_mutex);
3958 func(event);
3959 list_for_each_entry(child, &event->child_list, child_list)
3960 func(child);
3961 mutex_unlock(&event->child_mutex);
3962}
3963
3964static void perf_event_for_each(struct perf_event *event,
3965 void (*func)(struct perf_event *))
3966{
3967 struct perf_event_context *ctx = event->ctx;
3968 struct perf_event *sibling;
3969
3970 lockdep_assert_held(&ctx->mutex);
3971
3972 event = event->group_leader;
3973
3974 perf_event_for_each_child(event, func);
3975 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3976 perf_event_for_each_child(sibling, func);
3977}
3978
3979static int perf_event_period(struct perf_event *event, u64 __user *arg)
3980{
3981 struct perf_event_context *ctx = event->ctx;
3982 int ret = 0, active;
3983 u64 value;
3984
3985 if (!is_sampling_event(event))
3986 return -EINVAL;
3987
3988 if (copy_from_user(&value, arg, sizeof(value)))
3989 return -EFAULT;
3990
3991 if (!value)
3992 return -EINVAL;
3993
3994 raw_spin_lock_irq(&ctx->lock);
3995 if (event->attr.freq) {
3996 if (value > sysctl_perf_event_sample_rate) {
3997 ret = -EINVAL;
3998 goto unlock;
3999 }
4000
4001 event->attr.sample_freq = value;
4002 } else {
4003 event->attr.sample_period = value;
4004 event->hw.sample_period = value;
4005 }
4006
4007 active = (event->state == PERF_EVENT_STATE_ACTIVE);
4008 if (active) {
4009 perf_pmu_disable(ctx->pmu);
4010 event->pmu->stop(event, PERF_EF_UPDATE);
4011 }
4012
4013 local64_set(&event->hw.period_left, 0);
4014
4015 if (active) {
4016 event->pmu->start(event, PERF_EF_RELOAD);
4017 perf_pmu_enable(ctx->pmu);
4018 }
4019
4020unlock:
4021 raw_spin_unlock_irq(&ctx->lock);
4022
4023 return ret;
4024}
4025
4026static const struct file_operations perf_fops;
4027
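/*
 * Resolve a user-supplied fd to a perf event file, rejecting any fd
 * that does not refer to a perf event.
 */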
4028static inline int perf_fget_light(int fd, struct fd *p)
4029{
4030 struct fd f = fdget(fd);
4031 if (!f.file)
4032 return -EBADF;
4033
4034 if (f.file->f_op != &perf_fops) {
4035 fdput(f);
4036 return -EBADF;
4037 }
4038 *p = f;
4039 return 0;
4040}
4041
4042static int perf_event_set_output(struct perf_event *event,
4043 struct perf_event *output_event);
4044static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4045static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4046
4047static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4048{
4049 void (*func)(struct perf_event *);
4050 u32 flags = arg;
4051
4052 switch (cmd) {
4053 case PERF_EVENT_IOC_ENABLE:
4054 func = _perf_event_enable;
4055 break;
4056 case PERF_EVENT_IOC_DISABLE:
4057 func = _perf_event_disable;
4058 break;
4059 case PERF_EVENT_IOC_RESET:
4060 func = _perf_event_reset;
4061 break;
4062
4063 case PERF_EVENT_IOC_REFRESH:
4064 return _perf_event_refresh(event, arg);
4065
4066 case PERF_EVENT_IOC_PERIOD:
4067 return perf_event_period(event, (u64 __user *)arg);
4068
4069 case PERF_EVENT_IOC_ID:
4070 {
4071 u64 id = primary_event_id(event);
4072
4073 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4074 return -EFAULT;
4075 return 0;
4076 }
4077
4078 case PERF_EVENT_IOC_SET_OUTPUT:
4079 {
4080 int ret;
4081 if (arg != -1) {
4082 struct perf_event *output_event;
4083 struct fd output;
4084 ret = perf_fget_light(arg, &output);
4085 if (ret)
4086 return ret;
4087 output_event = output.file->private_data;
4088 ret = perf_event_set_output(event, output_event);
4089 fdput(output);
4090 } else {
4091 ret = perf_event_set_output(event, NULL);
4092 }
4093 return ret;
4094 }
4095
4096 case PERF_EVENT_IOC_SET_FILTER:
4097 return perf_event_set_filter(event, (void __user *)arg);
4098
4099 case PERF_EVENT_IOC_SET_BPF:
4100 return perf_event_set_bpf_prog(event, arg);
4101
4102 default:
4103 return -ENOTTY;
4104 }
4105
4106 if (flags & PERF_IOC_FLAG_GROUP)
4107 perf_event_for_each(event, func);
4108 else
4109 perf_event_for_each_child(event, func);
4110
4111 return 0;
4112}
4113
4114static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4115{
4116 struct perf_event *event = file->private_data;
4117 struct perf_event_context *ctx;
4118 long ret;
4119
4120 ctx = perf_event_ctx_lock(event);
4121 ret = _perf_ioctl(event, cmd, arg);
4122 perf_event_ctx_unlock(event, ctx);
4123
4124 return ret;
4125}
4126
4127#ifdef CONFIG_COMPAT
4128static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4129 unsigned long arg)
4130{
4131 switch (_IOC_NR(cmd)) {
4132 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4133 case _IOC_NR(PERF_EVENT_IOC_ID):
4134
4135 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4136 cmd &= ~IOCSIZE_MASK;
4137 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4138 }
4139 break;
4140 }
4141 return perf_ioctl(file, cmd, arg);
4142}
4143#else
4144# define perf_compat_ioctl NULL
4145#endif
4146
4147int perf_event_task_enable(void)
4148{
4149 struct perf_event_context *ctx;
4150 struct perf_event *event;
4151
	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4154 ctx = perf_event_ctx_lock(event);
4155 perf_event_for_each_child(event, _perf_event_enable);
4156 perf_event_ctx_unlock(event, ctx);
4157 }
	mutex_unlock(&current->perf_event_mutex);
4159
4160 return 0;
4161}
4162
4163int perf_event_task_disable(void)
4164{
4165 struct perf_event_context *ctx;
4166 struct perf_event *event;
4167
	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4170 ctx = perf_event_ctx_lock(event);
4171 perf_event_for_each_child(event, _perf_event_disable);
4172 perf_event_ctx_unlock(event, ctx);
4173 }
	mutex_unlock(&current->perf_event_mutex);
4175
4176 return 0;
4177}
4178
4179static int perf_event_index(struct perf_event *event)
4180{
4181 if (event->hw.state & PERF_HES_STOPPED)
4182 return 0;
4183
4184 if (event->state != PERF_EVENT_STATE_ACTIVE)
4185 return 0;
4186
4187 return event->pmu->event_idx(event);
4188}
4189
4190static void calc_timer_values(struct perf_event *event,
4191 u64 *now,
4192 u64 *enabled,
4193 u64 *running)
4194{
4195 u64 ctx_time;
4196
4197 *now = perf_clock();
4198 ctx_time = event->shadow_ctx_time + *now;
4199 *enabled = ctx_time - event->tstamp_enabled;
4200 *running = ctx_time - event->tstamp_running;
4201}
4202
4203static void perf_event_init_userpage(struct perf_event *event)
4204{
4205 struct perf_event_mmap_page *userpg;
4206 struct ring_buffer *rb;
4207
4208 rcu_read_lock();
4209 rb = rcu_dereference(event->rb);
4210 if (!rb)
4211 goto unlock;
4212
4213 userpg = rb->user_page;
4214
4215
4216 userpg->cap_bit0_is_deprecated = 1;
4217 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4218 userpg->data_offset = PAGE_SIZE;
4219 userpg->data_size = perf_data_size(rb);
4220
4221unlock:
4222 rcu_read_unlock();
4223}
4224
4225void __weak arch_perf_update_userpage(
4226 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4227{
4228}
4229
/*
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
4235void perf_event_update_userpage(struct perf_event *event)
4236{
4237 struct perf_event_mmap_page *userpg;
4238 struct ring_buffer *rb;
4239 u64 enabled, running, now;
4240
4241 rcu_read_lock();
4242 rb = rcu_dereference(event->rb);
4243 if (!rb)
4244 goto unlock;
4245
	/*
	 * Compute total_time_enabled, total_time_running
	 * based on snapshot values taken when the event
	 * was last scheduled in.
	 *
	 * We cannot simply call update_context_time()
	 * because of locking issues, as we can be called in
	 * NMI context.
	 */
4255 calc_timer_values(event, &now, &enabled, &running);
4256
4257 userpg = rb->user_page;
4258
4259
4260
4261
4262 preempt_disable();
4263 ++userpg->lock;
4264 barrier();
4265 userpg->index = perf_event_index(event);
4266 userpg->offset = perf_event_count(event);
4267 if (userpg->index)
4268 userpg->offset -= local64_read(&event->hw.prev_count);
4269
4270 userpg->time_enabled = enabled +
4271 atomic64_read(&event->child_total_time_enabled);
4272
4273 userpg->time_running = running +
4274 atomic64_read(&event->child_total_time_running);
4275
4276 arch_perf_update_userpage(event, userpg, now);
4277
4278 barrier();
4279 ++userpg->lock;
4280 preempt_enable();
4281unlock:
4282 rcu_read_unlock();
4283}
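
/*
 * Userspace is expected to pair with the ->lock sequence count above,
 * roughly:
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		... read index, offset, time fields ...
 *		barrier();
 *	} while (pc->lock != seq);
 *
 * i.e. retry whenever the count changed during the read.
 */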
4284
4285static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4286{
4287 struct perf_event *event = vma->vm_file->private_data;
4288 struct ring_buffer *rb;
4289 int ret = VM_FAULT_SIGBUS;
4290
4291 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4292 if (vmf->pgoff == 0)
4293 ret = 0;
4294 return ret;
4295 }
4296
4297 rcu_read_lock();
4298 rb = rcu_dereference(event->rb);
4299 if (!rb)
4300 goto unlock;
4301
4302 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4303 goto unlock;
4304
4305 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4306 if (!vmf->page)
4307 goto unlock;
4308
4309 get_page(vmf->page);
4310 vmf->page->mapping = vma->vm_file->f_mapping;
4311 vmf->page->index = vmf->pgoff;
4312
4313 ret = 0;
4314unlock:
4315 rcu_read_unlock();
4316
4317 return ret;
4318}
4319
4320static void ring_buffer_attach(struct perf_event *event,
4321 struct ring_buffer *rb)
4322{
4323 struct ring_buffer *old_rb = NULL;
4324 unsigned long flags;
4325
4326 if (event->rb) {
4327
4328
4329
4330
4331 WARN_ON_ONCE(event->rcu_pending);
4332
4333 old_rb = event->rb;
4334 event->rcu_batches = get_state_synchronize_rcu();
4335 event->rcu_pending = 1;
4336
4337 spin_lock_irqsave(&old_rb->event_lock, flags);
4338 list_del_rcu(&event->rb_entry);
4339 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4340 }
4341
4342 if (event->rcu_pending && rb) {
4343 cond_synchronize_rcu(event->rcu_batches);
4344 event->rcu_pending = 0;
4345 }
4346
4347 if (rb) {
4348 spin_lock_irqsave(&rb->event_lock, flags);
4349 list_add_rcu(&event->rb_entry, &rb->event_list);
4350 spin_unlock_irqrestore(&rb->event_lock, flags);
4351 }
4352
4353 rcu_assign_pointer(event->rb, rb);
4354
4355 if (old_rb) {
4356 ring_buffer_put(old_rb);
		/*
		 * Since we detached before setting the new rb, so that
		 * we could attach the new rb, we could have missed a
		 * wakeup. Provide it now.
		 */
4362 wake_up_all(&event->waitq);
4363 }
4364}
4365
4366static void ring_buffer_wakeup(struct perf_event *event)
4367{
4368 struct ring_buffer *rb;
4369
4370 rcu_read_lock();
4371 rb = rcu_dereference(event->rb);
4372 if (rb) {
4373 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4374 wake_up_all(&event->waitq);
4375 }
4376 rcu_read_unlock();
4377}
4378
4379static void rb_free_rcu(struct rcu_head *rcu_head)
4380{
4381 struct ring_buffer *rb;
4382
4383 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
4384 rb_free(rb);
4385}
4386
4387struct ring_buffer *ring_buffer_get(struct perf_event *event)
4388{
4389 struct ring_buffer *rb;
4390
4391 rcu_read_lock();
4392 rb = rcu_dereference(event->rb);
4393 if (rb) {
4394 if (!atomic_inc_not_zero(&rb->refcount))
4395 rb = NULL;
4396 }
4397 rcu_read_unlock();
4398
4399 return rb;
4400}
4401
4402void ring_buffer_put(struct ring_buffer *rb)
4403{
4404 if (!atomic_dec_and_test(&rb->refcount))
4405 return;
4406
4407 WARN_ON_ONCE(!list_empty(&rb->event_list));
4408
4409 call_rcu(&rb->rcu_head, rb_free_rcu);
4410}
4411
4412static void perf_mmap_open(struct vm_area_struct *vma)
4413{
4414 struct perf_event *event = vma->vm_file->private_data;
4415
4416 atomic_inc(&event->mmap_count);
4417 atomic_inc(&event->rb->mmap_count);
4418
4419 if (vma->vm_pgoff)
4420 atomic_inc(&event->rb->aux_mmap_count);
4421
4422 if (event->pmu->event_mapped)
4423 event->pmu->event_mapped(event);
4424}
4425
/*
 * A buffer can be mmap()ed multiple times; either directly through the same
 * event, or through other events by use of perf_event_set_output().
 *
 * In order to undo the VM accounting done by perf_mmap() we need to destroy
 * the rb if we 'wish to'; that is, the last perf_mmap_close() on a buffer
 * must detach every event and undo the accounting.
 */
4434static void perf_mmap_close(struct vm_area_struct *vma)
4435{
4436 struct perf_event *event = vma->vm_file->private_data;
4437
4438 struct ring_buffer *rb = ring_buffer_get(event);
4439 struct user_struct *mmap_user = rb->mmap_user;
4440 int mmap_locked = rb->mmap_locked;
4441 unsigned long size = perf_data_size(rb);
4442
4443 if (event->pmu->event_unmapped)
4444 event->pmu->event_unmapped(event);
	/*
	 * rb->aux_mmap_count will always drop before rb->mmap_count and
	 * event->mmap_count, so it is ok to use event->mmap_mutex to
	 * serialize with perf_mmap here.
	 */
4451 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4452 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4453 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4454 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4455
4456 rb_free_aux(rb);
4457 mutex_unlock(&event->mmap_mutex);
4458 }
4459
4460 atomic_dec(&rb->mmap_count);
4461
4462 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
4463 goto out_put;
4464
4465 ring_buffer_attach(event, NULL);
4466 mutex_unlock(&event->mmap_mutex);
4467
4468
4469 if (atomic_read(&rb->mmap_count))
4470 goto out_put;
	/*
	 * No other mmap()s: detach from all other events that might redirect
	 * into the now unreachable buffer. Somewhat complicated by the
	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
	 */
4477again:
4478 rcu_read_lock();
4479 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4480 if (!atomic_long_inc_not_zero(&event->refcount)) {
4481
4482
4483
4484
4485 continue;
4486 }
4487 rcu_read_unlock();
4488
4489 mutex_lock(&event->mmap_mutex);
		/*
		 * Check that we didn't race with perf_event_set_output(),
		 * which can swizzle the rb from under us while we were
		 * waiting to acquire mmap_mutex.
		 *
		 * If we find a different rb, ignore this event; a following
		 * iteration will no longer find it on the list. We have to
		 * still restart the iteration to make sure we're not now
		 * iterating the wrong list.
		 */
4500 if (event->rb == rb)
4501 ring_buffer_attach(event, NULL);
4502
4503 mutex_unlock(&event->mmap_mutex);
4504 put_event(event);
4505
4506
4507
4508
4509
4510 goto again;
4511 }
4512 rcu_read_unlock();
4513
	/*
	 * It could be there's still a few 0-ref events on the list; they'll
	 * get cleaned up by free_event() -- they'll also still have their
	 * ref on the rb and will free it whenever they are done with it.
	 *
	 * Aside from that, this buffer is 'fully' detached and unmapped,
	 * undo the VM accounting.
	 */
4523 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4524 vma->vm_mm->pinned_vm -= mmap_locked;
4525 free_uid(mmap_user);
4526
4527out_put:
4528 ring_buffer_put(rb);
4529}
4530
4531static const struct vm_operations_struct perf_mmap_vmops = {
4532 .open = perf_mmap_open,
4533 .close = perf_mmap_close,
4534 .fault = perf_mmap_fault,
4535 .page_mkwrite = perf_mmap_fault,
4536};
4537
4538static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4539{
4540 struct perf_event *event = file->private_data;
4541 unsigned long user_locked, user_lock_limit;
4542 struct user_struct *user = current_user();
4543 unsigned long locked, lock_limit;
4544 struct ring_buffer *rb = NULL;
4545 unsigned long vma_size;
4546 unsigned long nr_pages;
4547 long user_extra = 0, extra = 0;
4548 int ret = 0, flags = 0;
4549
	/*
	 * Don't allow mmap() of inherited per-task counters. This would
	 * create a performance issue due to all children writing to the
	 * same rb.
	 */
4555 if (event->cpu == -1 && event->attr.inherit)
4556 return -EINVAL;
4557
4558 if (!(vma->vm_flags & VM_SHARED))
4559 return -EINVAL;
4560
4561 vma_size = vma->vm_end - vma->vm_start;
4562
4563 if (vma->vm_pgoff == 0) {
4564 nr_pages = (vma_size / PAGE_SIZE) - 1;
4565 } else {
4566
4567
4568
4569
4570
4571 u64 aux_offset, aux_size;
4572
4573 if (!event->rb)
4574 return -EINVAL;
4575
4576 nr_pages = vma_size / PAGE_SIZE;
4577
4578 mutex_lock(&event->mmap_mutex);
4579 ret = -EINVAL;
4580
4581 rb = event->rb;
4582 if (!rb)
4583 goto aux_unlock;
4584
4585 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4586 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4587
4588 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4589 goto aux_unlock;
4590
4591 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4592 goto aux_unlock;
4593
4594
4595 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4596 goto aux_unlock;
4597
4598 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4599 goto aux_unlock;
4600
4601
4602 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4603 goto aux_unlock;
4604
4605 if (!is_power_of_2(nr_pages))
4606 goto aux_unlock;
4607
4608 if (!atomic_inc_not_zero(&rb->mmap_count))
4609 goto aux_unlock;
4610
4611 if (rb_has_aux(rb)) {
4612 atomic_inc(&rb->aux_mmap_count);
4613 ret = 0;
4614 goto unlock;
4615 }
4616
4617 atomic_set(&rb->aux_mmap_count, 1);
4618 user_extra = nr_pages;
4619
4620 goto accounting;
4621 }
4622
4623
4624
4625
4626
4627 if (nr_pages != 0 && !is_power_of_2(nr_pages))
4628 return -EINVAL;
4629
4630 if (vma_size != PAGE_SIZE * (1 + nr_pages))
4631 return -EINVAL;
4632
4633 WARN_ON_ONCE(event->ctx->parent_ctx);
4634again:
4635 mutex_lock(&event->mmap_mutex);
4636 if (event->rb) {
4637 if (event->rb->nr_pages != nr_pages) {
4638 ret = -EINVAL;
4639 goto unlock;
4640 }
4641
4642 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4643
4644
4645
4646
4647
4648 mutex_unlock(&event->mmap_mutex);
4649 goto again;
4650 }
4651
4652 goto unlock;
4653 }
4654
4655 user_extra = nr_pages + 1;
4656
4657accounting:
4658 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4659
4660
4661
4662
4663 user_lock_limit *= num_online_cpus();
4664
4665 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4666
4667 if (user_locked > user_lock_limit)
4668 extra = user_locked - user_lock_limit;
4669
4670 lock_limit = rlimit(RLIMIT_MEMLOCK);
4671 lock_limit >>= PAGE_SHIFT;
4672 locked = vma->vm_mm->pinned_vm + extra;
4673
4674 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4675 !capable(CAP_IPC_LOCK)) {
4676 ret = -EPERM;
4677 goto unlock;
4678 }
4679
4680 WARN_ON(!rb && event->rb);
4681
4682 if (vma->vm_flags & VM_WRITE)
4683 flags |= RING_BUFFER_WRITABLE;
4684
4685 if (!rb) {
4686 rb = rb_alloc(nr_pages,
4687 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4688 event->cpu, flags);
4689
4690 if (!rb) {
4691 ret = -ENOMEM;
4692 goto unlock;
4693 }
4694
4695 atomic_set(&rb->mmap_count, 1);
4696 rb->mmap_user = get_current_user();
4697 rb->mmap_locked = extra;
4698
4699 ring_buffer_attach(event, rb);
4700
4701 perf_event_init_userpage(event);
4702 perf_event_update_userpage(event);
4703 } else {
4704 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4705 event->attr.aux_watermark, flags);
4706 if (!ret)
4707 rb->aux_mmap_locked = extra;
4708 }
4709
4710unlock:
4711 if (!ret) {
4712 atomic_long_add(user_extra, &user->locked_vm);
4713 vma->vm_mm->pinned_vm += extra;
4714
4715 atomic_inc(&event->mmap_count);
4716 } else if (rb) {
4717 atomic_dec(&rb->mmap_count);
4718 }
4719aux_unlock:
4720 mutex_unlock(&event->mmap_mutex);
4721
4722
4723
4724
4725
4726 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4727 vma->vm_ops = &perf_mmap_vmops;
4728
4729 if (event->pmu->event_mapped)
4730 event->pmu->event_mapped(event);
4731
4732 return ret;
4733}
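
/*
 * The resulting mapping layout: page 0 is the control page
 * (struct perf_event_mmap_page), followed by 2^n data pages, plus an
 * optional AUX area mapped at user_page->aux_offset.
 */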
4734
4735static int perf_fasync(int fd, struct file *filp, int on)
4736{
4737 struct inode *inode = file_inode(filp);
4738 struct perf_event *event = filp->private_data;
4739 int retval;
4740
4741 mutex_lock(&inode->i_mutex);
4742 retval = fasync_helper(fd, filp, on, &event->fasync);
4743 mutex_unlock(&inode->i_mutex);
4744
4745 if (retval < 0)
4746 return retval;
4747
4748 return 0;
4749}
4750
4751static const struct file_operations perf_fops = {
4752 .llseek = no_llseek,
4753 .release = perf_release,
4754 .read = perf_read,
4755 .poll = perf_poll,
4756 .unlocked_ioctl = perf_ioctl,
4757 .compat_ioctl = perf_compat_ioctl,
4758 .mmap = perf_mmap,
4759 .fasync = perf_fasync,
4760};
4761
/*
 * Perf event wakeup
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */
4769void perf_event_wakeup(struct perf_event *event)
4770{
4771 ring_buffer_wakeup(event);
4772
4773 if (event->pending_kill) {
4774 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
4775 event->pending_kill = 0;
4776 }
4777}
4778
4779static void perf_pending_event(struct irq_work *entry)
4780{
4781 struct perf_event *event = container_of(entry,
4782 struct perf_event, pending);
4783 int rctx;
4784
4785 rctx = perf_swevent_get_recursion_context();
4786
4787
4788
4789
4790
4791 if (event->pending_disable) {
4792 event->pending_disable = 0;
4793 __perf_event_disable(event);
4794 }
4795
4796 if (event->pending_wakeup) {
4797 event->pending_wakeup = 0;
4798 perf_event_wakeup(event);
4799 }
4800
4801 if (rctx >= 0)
4802 perf_swevent_put_recursion_context(rctx);
4803}
4804
/*
 * We assume there is only KVM supporting the callbacks.
 * Later on, we might change it to a list if there is
 * another virtualization implementation supporting the callbacks.
 */
4810struct perf_guest_info_callbacks *perf_guest_cbs;
4811
4812int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4813{
4814 perf_guest_cbs = cbs;
4815 return 0;
4816}
4817EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4818
4819int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4820{
4821 perf_guest_cbs = NULL;
4822 return 0;
4823}
4824EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
4825
4826static void
4827perf_output_sample_regs(struct perf_output_handle *handle,
4828 struct pt_regs *regs, u64 mask)
4829{
4830 int bit;
4831
4832 for_each_set_bit(bit, (const unsigned long *) &mask,
4833 sizeof(mask) * BITS_PER_BYTE) {
4834 u64 val;
4835
4836 val = perf_reg_value(regs, bit);
4837 perf_output_put(handle, val);
4838 }
4839}
4840
4841static void perf_sample_regs_user(struct perf_regs *regs_user,
4842 struct pt_regs *regs,
4843 struct pt_regs *regs_user_copy)
4844{
4845 if (user_mode(regs)) {
4846 regs_user->abi = perf_reg_abi(current);
4847 regs_user->regs = regs;
4848 } else if (current->mm) {
4849 perf_get_regs_user(regs_user, regs, regs_user_copy);
4850 } else {
4851 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4852 regs_user->regs = NULL;
4853 }
4854}
4855
4856static void perf_sample_regs_intr(struct perf_regs *regs_intr,
4857 struct pt_regs *regs)
4858{
4859 regs_intr->regs = regs;
4860 regs_intr->abi = perf_reg_abi(current);
4861}
4862
/*
 * Get remaining task size from user stack pointer.
 *
 * It'd be better to take the stack vma map and limit this more
 * precisely, but there's no way to get it safely under interrupt,
 * so using TASK_SIZE as the limit.
 */
4871static u64 perf_ustack_task_size(struct pt_regs *regs)
4872{
4873 unsigned long addr = perf_user_stack_pointer(regs);
4874
4875 if (!addr || addr >= TASK_SIZE)
4876 return 0;
4877
4878 return TASK_SIZE - addr;
4879}
4880
4881static u16
4882perf_sample_ustack_size(u16 stack_size, u16 header_size,
4883 struct pt_regs *regs)
4884{
4885 u64 task_size;
4886
4887
4888 if (!regs)
4889 return 0;
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
4902 stack_size = min(stack_size, (u16) task_size);
4903
4904
4905 header_size += 2 * sizeof(u64);
4906
4907
4908 if ((u16) (header_size + stack_size) < header_size) {
4909
4910
4911
4912
4913 stack_size = USHRT_MAX - header_size - sizeof(u64);
4914 stack_size = round_up(stack_size, sizeof(u64));
4915 }
4916
4917 return stack_size;
4918}
4919
4920static void
4921perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
4922 struct pt_regs *regs)
4923{
4924
4925 if (!regs) {
4926 u64 size = 0;
4927 perf_output_put(handle, size);
4928 } else {
4929 unsigned long sp;
4930 unsigned int rem;
4931 u64 dyn_size;
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945 perf_output_put(handle, dump_size);
4946
4947
4948 sp = perf_user_stack_pointer(regs);
4949 rem = __output_copy_user(handle, (void *) sp, dump_size);
4950 dyn_size = dump_size - rem;
4951
4952 perf_output_skip(handle, rem);
4953
4954
4955 perf_output_put(handle, dyn_size);
4956 }
4957}
4958
4959static void __perf_event_header__init_id(struct perf_event_header *header,
4960 struct perf_sample_data *data,
4961 struct perf_event *event)
4962{
4963 u64 sample_type = event->attr.sample_type;
4964
4965 data->type = sample_type;
4966 header->size += event->id_header_size;
4967
4968 if (sample_type & PERF_SAMPLE_TID) {
4969
4970 data->tid_entry.pid = perf_event_pid(event, current);
4971 data->tid_entry.tid = perf_event_tid(event, current);
4972 }
4973
4974 if (sample_type & PERF_SAMPLE_TIME)
4975 data->time = perf_event_clock(event);
4976
4977 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4978 data->id = primary_event_id(event);
4979
4980 if (sample_type & PERF_SAMPLE_STREAM_ID)
4981 data->stream_id = event->id;
4982
4983 if (sample_type & PERF_SAMPLE_CPU) {
4984 data->cpu_entry.cpu = raw_smp_processor_id();
4985 data->cpu_entry.reserved = 0;
4986 }
4987}
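
/*
 * The field order filled in above mirrors __perf_event__output_id_sample()
 * below; the two must stay in sync for the sample_id fields that
 * userspace parses.
 */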
4988
4989void perf_event_header__init_id(struct perf_event_header *header,
4990 struct perf_sample_data *data,
4991 struct perf_event *event)
4992{
4993 if (event->attr.sample_id_all)
4994 __perf_event_header__init_id(header, data, event);
4995}
4996
4997static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4998 struct perf_sample_data *data)
4999{
5000 u64 sample_type = data->type;
5001
5002 if (sample_type & PERF_SAMPLE_TID)
5003 perf_output_put(handle, data->tid_entry);
5004
5005 if (sample_type & PERF_SAMPLE_TIME)
5006 perf_output_put(handle, data->time);
5007
5008 if (sample_type & PERF_SAMPLE_ID)
5009 perf_output_put(handle, data->id);
5010
5011 if (sample_type & PERF_SAMPLE_STREAM_ID)
5012 perf_output_put(handle, data->stream_id);
5013
5014 if (sample_type & PERF_SAMPLE_CPU)
5015 perf_output_put(handle, data->cpu_entry);
5016
5017 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5018 perf_output_put(handle, data->id);
5019}
5020
5021void perf_event__output_id_sample(struct perf_event *event,
5022 struct perf_output_handle *handle,
5023 struct perf_sample_data *sample)
5024{
5025 if (event->attr.sample_id_all)
5026 __perf_event__output_id_sample(handle, sample);
5027}
5028
5029static void perf_output_read_one(struct perf_output_handle *handle,
5030 struct perf_event *event,
5031 u64 enabled, u64 running)
5032{
5033 u64 read_format = event->attr.read_format;
5034 u64 values[4];
5035 int n = 0;
5036
5037 values[n++] = perf_event_count(event);
5038 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5039 values[n++] = enabled +
5040 atomic64_read(&event->child_total_time_enabled);
5041 }
5042 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5043 values[n++] = running +
5044 atomic64_read(&event->child_total_time_running);
5045 }
5046 if (read_format & PERF_FORMAT_ID)
5047 values[n++] = primary_event_id(event);
5048
5049 __output_copy(handle, values, n * sizeof(u64));
5050}
5051
/*
 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
 */
5055static void perf_output_read_group(struct perf_output_handle *handle,
5056 struct perf_event *event,
5057 u64 enabled, u64 running)
5058{
5059 struct perf_event *leader = event->group_leader, *sub;
5060 u64 read_format = event->attr.read_format;
5061 u64 values[5];
5062 int n = 0;
5063
5064 values[n++] = 1 + leader->nr_siblings;
5065
5066 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5067 values[n++] = enabled;
5068
5069 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5070 values[n++] = running;
5071
5072 if (leader != event)
5073 leader->pmu->read(leader);
5074
5075 values[n++] = perf_event_count(leader);
5076 if (read_format & PERF_FORMAT_ID)
5077 values[n++] = primary_event_id(leader);
5078
5079 __output_copy(handle, values, n * sizeof(u64));
5080
5081 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5082 n = 0;
5083
5084 if ((sub != event) &&
5085 (sub->state == PERF_EVENT_STATE_ACTIVE))
5086 sub->pmu->read(sub);
5087
5088 values[n++] = perf_event_count(sub);
5089 if (read_format & PERF_FORMAT_ID)
5090 values[n++] = primary_event_id(sub);
5091
5092 __output_copy(handle, values, n * sizeof(u64));
5093 }
5094}
5095
5096#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5097 PERF_FORMAT_TOTAL_TIME_RUNNING)
5098
5099static void perf_output_read(struct perf_output_handle *handle,
5100 struct perf_event *event)
5101{
5102 u64 enabled = 0, running = 0, now;
5103 u64 read_format = event->attr.read_format;
5104
	/*
	 * Compute total_time_enabled, total_time_running
	 * based on snapshot values taken when the event
	 * was last scheduled in.
	 *
	 * We cannot simply call update_context_time()
	 * because of locking issues, as we are called in
	 * NMI context.
	 */
5114 if (read_format & PERF_FORMAT_TOTAL_TIMES)
5115 calc_timer_values(event, &now, &enabled, &running);
5116
5117 if (event->attr.read_format & PERF_FORMAT_GROUP)
5118 perf_output_read_group(handle, event, enabled, running);
5119 else
5120 perf_output_read_one(handle, event, enabled, running);
5121}
5122
5123void perf_output_sample(struct perf_output_handle *handle,
5124 struct perf_event_header *header,
5125 struct perf_sample_data *data,
5126 struct perf_event *event)
5127{
5128 u64 sample_type = data->type;
5129
5130 perf_output_put(handle, *header);
5131
5132 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5133 perf_output_put(handle, data->id);
5134
5135 if (sample_type & PERF_SAMPLE_IP)
5136 perf_output_put(handle, data->ip);
5137
5138 if (sample_type & PERF_SAMPLE_TID)
5139 perf_output_put(handle, data->tid_entry);
5140
5141 if (sample_type & PERF_SAMPLE_TIME)
5142 perf_output_put(handle, data->time);
5143
5144 if (sample_type & PERF_SAMPLE_ADDR)
5145 perf_output_put(handle, data->addr);
5146
5147 if (sample_type & PERF_SAMPLE_ID)
5148 perf_output_put(handle, data->id);
5149
5150 if (sample_type & PERF_SAMPLE_STREAM_ID)
5151 perf_output_put(handle, data->stream_id);
5152
5153 if (sample_type & PERF_SAMPLE_CPU)
5154 perf_output_put(handle, data->cpu_entry);
5155
5156 if (sample_type & PERF_SAMPLE_PERIOD)
5157 perf_output_put(handle, data->period);
5158
5159 if (sample_type & PERF_SAMPLE_READ)
5160 perf_output_read(handle, event);
5161
5162 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5163 if (data->callchain) {
5164 int size = 1;
5165
5166 if (data->callchain)
5167 size += data->callchain->nr;
5168
5169 size *= sizeof(u64);
5170
5171 __output_copy(handle, data->callchain, size);
5172 } else {
5173 u64 nr = 0;
5174 perf_output_put(handle, nr);
5175 }
5176 }
5177
5178 if (sample_type & PERF_SAMPLE_RAW) {
5179 if (data->raw) {
5180 perf_output_put(handle, data->raw->size);
5181 __output_copy(handle, data->raw->data,
5182 data->raw->size);
5183 } else {
5184 struct {
5185 u32 size;
5186 u32 data;
5187 } raw = {
5188 .size = sizeof(u32),
5189 .data = 0,
5190 };
5191 perf_output_put(handle, raw);
5192 }
5193 }
5194
5195 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5196 if (data->br_stack) {
5197 size_t size;
5198
5199 size = data->br_stack->nr
5200 * sizeof(struct perf_branch_entry);
5201
5202 perf_output_put(handle, data->br_stack->nr);
5203 perf_output_copy(handle, data->br_stack->entries, size);
5204 } else {
5205
5206
5207
5208 u64 nr = 0;
5209 perf_output_put(handle, nr);
5210 }
5211 }
5212
5213 if (sample_type & PERF_SAMPLE_REGS_USER) {
5214 u64 abi = data->regs_user.abi;
5215
5216
5217
5218
5219
5220 perf_output_put(handle, abi);
5221
5222 if (abi) {
5223 u64 mask = event->attr.sample_regs_user;
5224 perf_output_sample_regs(handle,
5225 data->regs_user.regs,
5226 mask);
5227 }
5228 }
5229
5230 if (sample_type & PERF_SAMPLE_STACK_USER) {
5231 perf_output_sample_ustack(handle,
5232 data->stack_user_size,
5233 data->regs_user.regs);
5234 }
5235
5236 if (sample_type & PERF_SAMPLE_WEIGHT)
5237 perf_output_put(handle, data->weight);
5238
5239 if (sample_type & PERF_SAMPLE_DATA_SRC)
5240 perf_output_put(handle, data->data_src.val);
5241
5242 if (sample_type & PERF_SAMPLE_TRANSACTION)
5243 perf_output_put(handle, data->txn);
5244
5245 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5246 u64 abi = data->regs_intr.abi;
5247
5248
5249
5250
5251 perf_output_put(handle, abi);
5252
5253 if (abi) {
5254 u64 mask = event->attr.sample_regs_intr;
5255
5256 perf_output_sample_regs(handle,
5257 data->regs_intr.regs,
5258 mask);
5259 }
5260 }
5261
5262 if (!event->attr.watermark) {
5263 int wakeup_events = event->attr.wakeup_events;
5264
5265 if (wakeup_events) {
5266 struct ring_buffer *rb = handle->rb;
5267 int events = local_inc_return(&rb->events);
5268
5269 if (events >= wakeup_events) {
5270 local_sub(wakeup_events, &rb->events);
5271 local_inc(&rb->wakeup);
5272 }
5273 }
5274 }
5275}
5276
5277void perf_prepare_sample(struct perf_event_header *header,
5278 struct perf_sample_data *data,
5279 struct perf_event *event,
5280 struct pt_regs *regs)
5281{
5282 u64 sample_type = event->attr.sample_type;
5283
5284 header->type = PERF_RECORD_SAMPLE;
5285 header->size = sizeof(*header) + event->header_size;
5286
5287 header->misc = 0;
5288 header->misc |= perf_misc_flags(regs);
5289
5290 __perf_event_header__init_id(header, data, event);
5291
5292 if (sample_type & PERF_SAMPLE_IP)
5293 data->ip = perf_instruction_pointer(regs);
5294
5295 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5296 int size = 1;
5297
5298 data->callchain = perf_callchain(event, regs);
5299
5300 if (data->callchain)
5301 size += data->callchain->nr;
5302
5303 header->size += size * sizeof(u64);
5304 }
5305
5306 if (sample_type & PERF_SAMPLE_RAW) {
5307 int size = sizeof(u32);
5308
5309 if (data->raw)
5310 size += data->raw->size;
5311 else
5312 size += sizeof(u32);
5313
5314 WARN_ON_ONCE(size & (sizeof(u64)-1));
5315 header->size += size;
5316 }
5317
5318 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5319 int size = sizeof(u64);
5320 if (data->br_stack) {
5321 size += data->br_stack->nr
5322 * sizeof(struct perf_branch_entry);
5323 }
5324 header->size += size;
5325 }
5326
5327 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5328 perf_sample_regs_user(&data->regs_user, regs,
5329 &data->regs_user_copy);
5330
	if (sample_type & PERF_SAMPLE_REGS_USER) {
		/* regs dump ABI info */
5333 int size = sizeof(u64);
5334
5335 if (data->regs_user.regs) {
5336 u64 mask = event->attr.sample_regs_user;
5337 size += hweight64(mask) * sizeof(u64);
5338 }
5339
5340 header->size += size;
5341 }
5342
	if (sample_type & PERF_SAMPLE_STACK_USER) {
		/*
		 * Either we need PERF_SAMPLE_STACK_USER bit to be always
		 * processed as the last one or have additional check added
		 * in case new sample type is added, because we could eat
		 * up the rest of the sample size.
		 */
5350 u16 stack_size = event->attr.sample_stack_user;
5351 u16 size = sizeof(u64);
5352
5353 stack_size = perf_sample_ustack_size(stack_size, header->size,
						      data->regs_user.regs);

		/*
		 * If there is something to dump, add space for the dump
		 * itself and for the field that tells the dynamic size,
		 * which is how many have been actually dumped.
		 */
5361 if (stack_size)
5362 size += sizeof(u64) + stack_size;
5363
5364 data->stack_user_size = stack_size;
5365 header->size += size;
5366 }
5367
	if (sample_type & PERF_SAMPLE_REGS_INTR) {
		/* regs dump ABI info */
5370 int size = sizeof(u64);
5371
5372 perf_sample_regs_intr(&data->regs_intr, regs);
5373
5374 if (data->regs_intr.regs) {
5375 u64 mask = event->attr.sample_regs_intr;
5376
5377 size += hweight64(mask) * sizeof(u64);
5378 }
5379
5380 header->size += size;
5381 }
5382}
5383
5384static void perf_event_output(struct perf_event *event,
5385 struct perf_sample_data *data,
5386 struct pt_regs *regs)
5387{
5388 struct perf_output_handle handle;
5389 struct perf_event_header header;
5390
5391
	/* protect the callchain buffers */
	rcu_read_lock();
5393
5394 perf_prepare_sample(&header, data, event, regs);
5395
5396 if (perf_output_begin(&handle, event, header.size))
5397 goto exit;
5398
5399 perf_output_sample(&handle, &header, data, event);
5400
5401 perf_output_end(&handle);
5402
5403exit:
5404 rcu_read_unlock();
}

/*
 * read event_id
 */
5411struct perf_read_event {
5412 struct perf_event_header header;
5413
5414 u32 pid;
5415 u32 tid;
5416};
5417
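/*
 * Emit a PERF_RECORD_READ event carrying the current counter value(s)
 * for @task.
 */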
5418static void
5419perf_event_read_event(struct perf_event *event,
5420 struct task_struct *task)
5421{
5422 struct perf_output_handle handle;
5423 struct perf_sample_data sample;
5424 struct perf_read_event read_event = {
5425 .header = {
5426 .type = PERF_RECORD_READ,
5427 .misc = 0,
5428 .size = sizeof(read_event) + event->read_size,
5429 },
5430 .pid = perf_event_pid(event, task),
5431 .tid = perf_event_tid(event, task),
5432 };
5433 int ret;
5434
5435 perf_event_header__init_id(&read_event.header, &sample, event);
5436 ret = perf_output_begin(&handle, event, read_event.header.size);
5437 if (ret)
5438 return;
5439
5440 perf_output_put(&handle, read_event);
5441 perf_output_read(&handle, event);
5442 perf_event__output_id_sample(event, &handle, &sample);
5443
5444 perf_output_end(&handle);
5445}
5446
5447typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5448
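/*
 * Feed every active, matching event in @ctx to the @output callback.
 * Runs under rcu_read_lock().
 */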
5449static void
5450perf_event_aux_ctx(struct perf_event_context *ctx,
5451 perf_event_aux_output_cb output,
5452 void *data)
5453{
5454 struct perf_event *event;
5455
5456 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5457 if (event->state < PERF_EVENT_STATE_INACTIVE)
5458 continue;
5459 if (!event_filter_match(event))
5460 continue;
5461 output(event, data);
5462 }
5463}
5464
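/*
 * Deliver a side-band record to all contexts that might care: every PMU's
 * CPU context, plus either the current task's contexts or the explicitly
 * supplied @task_ctx.
 */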
5465static void
5466perf_event_aux(perf_event_aux_output_cb output, void *data,
5467 struct perf_event_context *task_ctx)
5468{
5469 struct perf_cpu_context *cpuctx;
5470 struct perf_event_context *ctx;
5471 struct pmu *pmu;
5472 int ctxn;
5473
5474 rcu_read_lock();
5475 list_for_each_entry_rcu(pmu, &pmus, entry) {
5476 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5477 if (cpuctx->unique_pmu != pmu)
5478 goto next;
5479 perf_event_aux_ctx(&cpuctx->ctx, output, data);
5480 if (task_ctx)
5481 goto next;
5482 ctxn = pmu->task_ctx_nr;
5483 if (ctxn < 0)
5484 goto next;
5485 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5486 if (ctx)
5487 perf_event_aux_ctx(ctx, output, data);
5488next:
5489 put_cpu_ptr(pmu->pmu_cpu_context);
5490 }
5491
5492 if (task_ctx) {
5493 preempt_disable();
5494 perf_event_aux_ctx(task_ctx, output, data);
5495 preempt_enable();
5496 }
5497 rcu_read_unlock();
}

/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
 */
5506struct perf_task_event {
5507 struct task_struct *task;
5508 struct perf_event_context *task_ctx;
5509
5510 struct {
5511 struct perf_event_header header;
5512
5513 u32 pid;
5514 u32 ppid;
5515 u32 tid;
5516 u32 ptid;
5517 u64 time;
5518 } event_id;
5519};
5520
5521static int perf_event_task_match(struct perf_event *event)
5522{
5523 return event->attr.comm || event->attr.mmap ||
5524 event->attr.mmap2 || event->attr.mmap_data ||
5525 event->attr.task;
5526}
5527
5528static void perf_event_task_output(struct perf_event *event,
5529 void *data)
5530{
5531 struct perf_task_event *task_event = data;
5532 struct perf_output_handle handle;
5533 struct perf_sample_data sample;
5534 struct task_struct *task = task_event->task;
5535 int ret, size = task_event->event_id.header.size;
5536
5537 if (!perf_event_task_match(event))
5538 return;
5539
5540 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
5541
5542 ret = perf_output_begin(&handle, event,
5543 task_event->event_id.header.size);
5544 if (ret)
5545 goto out;
5546
5547 task_event->event_id.pid = perf_event_pid(event, task);
5548 task_event->event_id.ppid = perf_event_pid(event, current);
5549
5550 task_event->event_id.tid = perf_event_tid(event, task);
5551 task_event->event_id.ptid = perf_event_tid(event, current);
5552
5553 task_event->event_id.time = perf_event_clock(event);
5554
5555 perf_output_put(&handle, task_event->event_id);
5556
5557 perf_event__output_id_sample(event, &handle, &sample);
5558
5559 perf_output_end(&handle);
5560out:
5561 task_event->event_id.header.size = size;
5562}
5563
5564static void perf_event_task(struct task_struct *task,
5565 struct perf_event_context *task_ctx,
5566 int new)
5567{
5568 struct perf_task_event task_event;
5569
5570 if (!atomic_read(&nr_comm_events) &&
5571 !atomic_read(&nr_mmap_events) &&
5572 !atomic_read(&nr_task_events))
5573 return;
5574
5575 task_event = (struct perf_task_event){
5576 .task = task,
5577 .task_ctx = task_ctx,
5578 .event_id = {
5579 .header = {
5580 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
5581 .misc = 0,
5582 .size = sizeof(task_event.event_id),
			},
			/* .pid  */
			/* .ppid */
			/* .tid  */
			/* .ptid */
			/* .time */
5589 },
5590 };
5591
5592 perf_event_aux(perf_event_task_output,
5593 &task_event,
5594 task_ctx);
5595}
5596
5597void perf_event_fork(struct task_struct *task)
5598{
5599 perf_event_task(task, NULL, 1);
}

/*
 * comm tracking
 */
5606struct perf_comm_event {
5607 struct task_struct *task;
5608 char *comm;
5609 int comm_size;
5610
5611 struct {
5612 struct perf_event_header header;
5613
5614 u32 pid;
5615 u32 tid;
5616 } event_id;
5617};
5618
5619static int perf_event_comm_match(struct perf_event *event)
5620{
5621 return event->attr.comm;
5622}
5623
5624static void perf_event_comm_output(struct perf_event *event,
5625 void *data)
5626{
5627 struct perf_comm_event *comm_event = data;
5628 struct perf_output_handle handle;
5629 struct perf_sample_data sample;
5630 int size = comm_event->event_id.header.size;
5631 int ret;
5632
5633 if (!perf_event_comm_match(event))
5634 return;
5635
5636 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5637 ret = perf_output_begin(&handle, event,
5638 comm_event->event_id.header.size);
5639
5640 if (ret)
5641 goto out;
5642
5643 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5644 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5645
5646 perf_output_put(&handle, comm_event->event_id);
5647 __output_copy(&handle, comm_event->comm,
5648 comm_event->comm_size);
5649
5650 perf_event__output_id_sample(event, &handle, &sample);
5651
5652 perf_output_end(&handle);
5653out:
5654 comm_event->event_id.header.size = size;
5655}
5656
5657static void perf_event_comm_event(struct perf_comm_event *comm_event)
5658{
5659 char comm[TASK_COMM_LEN];
5660 unsigned int size;
5661
5662 memset(comm, 0, sizeof(comm));
5663 strlcpy(comm, comm_event->task->comm, sizeof(comm));
5664 size = ALIGN(strlen(comm)+1, sizeof(u64));
5665
5666 comm_event->comm = comm;
5667 comm_event->comm_size = size;
5668
5669 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
5670
5671 perf_event_aux(perf_event_comm_output,
5672 comm_event,
5673 NULL);
5674}
5675
5676void perf_event_comm(struct task_struct *task, bool exec)
5677{
5678 struct perf_comm_event comm_event;
5679
5680 if (!atomic_read(&nr_comm_events))
5681 return;
5682
5683 comm_event = (struct perf_comm_event){
		.task	= task,
		/* .comm      */
		/* .comm_size */
5687 .event_id = {
5688 .header = {
5689 .type = PERF_RECORD_COMM,
				.misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
				/* .size */
			},
			/* .pid */
			/* .tid */
5695 },
5696 };
5697
5698 perf_event_comm_event(&comm_event);
}

/*
 * mmap tracking
 */
5705struct perf_mmap_event {
5706 struct vm_area_struct *vma;
5707
5708 const char *file_name;
5709 int file_size;
5710 int maj, min;
5711 u64 ino;
5712 u64 ino_generation;
5713 u32 prot, flags;
5714
5715 struct {
5716 struct perf_event_header header;
5717
5718 u32 pid;
5719 u32 tid;
5720 u64 start;
5721 u64 len;
5722 u64 pgoff;
5723 } event_id;
5724};
5725
5726static int perf_event_mmap_match(struct perf_event *event,
5727 void *data)
5728{
5729 struct perf_mmap_event *mmap_event = data;
5730 struct vm_area_struct *vma = mmap_event->vma;
5731 int executable = vma->vm_flags & VM_EXEC;
5732
5733 return (!executable && event->attr.mmap_data) ||
5734 (executable && (event->attr.mmap || event->attr.mmap2));
5735}
5736
5737static void perf_event_mmap_output(struct perf_event *event,
5738 void *data)
5739{
5740 struct perf_mmap_event *mmap_event = data;
5741 struct perf_output_handle handle;
5742 struct perf_sample_data sample;
5743 int size = mmap_event->event_id.header.size;
5744 int ret;
5745
5746 if (!perf_event_mmap_match(event, data))
5747 return;
5748
5749 if (event->attr.mmap2) {
5750 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5751 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5752 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5753 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5754 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5755 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5756 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5757 }
5758
5759 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5760 ret = perf_output_begin(&handle, event,
5761 mmap_event->event_id.header.size);
5762 if (ret)
5763 goto out;
5764
5765 mmap_event->event_id.pid = perf_event_pid(event, current);
5766 mmap_event->event_id.tid = perf_event_tid(event, current);
5767
5768 perf_output_put(&handle, mmap_event->event_id);
5769
5770 if (event->attr.mmap2) {
5771 perf_output_put(&handle, mmap_event->maj);
5772 perf_output_put(&handle, mmap_event->min);
5773 perf_output_put(&handle, mmap_event->ino);
5774 perf_output_put(&handle, mmap_event->ino_generation);
5775 perf_output_put(&handle, mmap_event->prot);
5776 perf_output_put(&handle, mmap_event->flags);
5777 }
5778
5779 __output_copy(&handle, mmap_event->file_name,
5780 mmap_event->file_size);
5781
5782 perf_event__output_id_sample(event, &handle, &sample);
5783
5784 perf_output_end(&handle);
5785out:
5786 mmap_event->event_id.header.size = size;
5787}
5788
5789static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5790{
5791 struct vm_area_struct *vma = mmap_event->vma;
5792 struct file *file = vma->vm_file;
5793 int maj = 0, min = 0;
5794 u64 ino = 0, gen = 0;
5795 u32 prot = 0, flags = 0;
5796 unsigned int size;
5797 char tmp[16];
5798 char *buf = NULL;
5799 char *name;
5800
5801 if (file) {
5802 struct inode *inode;
5803 dev_t dev;
5804
5805 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5806 if (!buf) {
5807 name = "//enomem";
5808 goto cpy_name;
		}

		/*
		 * d_path() works from the end of the rb backwards, so we
		 * need to add enough zero bytes after the string to handle
		 * the 64bit alignment we do later.
		 */
5815 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5816 if (IS_ERR(name)) {
5817 name = "//toolong";
5818 goto cpy_name;
5819 }
5820 inode = file_inode(vma->vm_file);
5821 dev = inode->i_sb->s_dev;
5822 ino = inode->i_ino;
5823 gen = inode->i_generation;
5824 maj = MAJOR(dev);
5825 min = MINOR(dev);
5826
5827 if (vma->vm_flags & VM_READ)
5828 prot |= PROT_READ;
5829 if (vma->vm_flags & VM_WRITE)
5830 prot |= PROT_WRITE;
5831 if (vma->vm_flags & VM_EXEC)
5832 prot |= PROT_EXEC;
5833
5834 if (vma->vm_flags & VM_MAYSHARE)
5835 flags = MAP_SHARED;
5836 else
5837 flags = MAP_PRIVATE;
5838
5839 if (vma->vm_flags & VM_DENYWRITE)
5840 flags |= MAP_DENYWRITE;
5841 if (vma->vm_flags & VM_MAYEXEC)
5842 flags |= MAP_EXECUTABLE;
5843 if (vma->vm_flags & VM_LOCKED)
5844 flags |= MAP_LOCKED;
5845 if (vma->vm_flags & VM_HUGETLB)
5846 flags |= MAP_HUGETLB;
5847
5848 goto got_name;
5849 } else {
5850 if (vma->vm_ops && vma->vm_ops->name) {
5851 name = (char *) vma->vm_ops->name(vma);
5852 if (name)
5853 goto cpy_name;
5854 }
5855
5856 name = (char *)arch_vma_name(vma);
5857 if (name)
5858 goto cpy_name;
5859
5860 if (vma->vm_start <= vma->vm_mm->start_brk &&
5861 vma->vm_end >= vma->vm_mm->brk) {
5862 name = "[heap]";
5863 goto cpy_name;
5864 }
5865 if (vma->vm_start <= vma->vm_mm->start_stack &&
5866 vma->vm_end >= vma->vm_mm->start_stack) {
5867 name = "[stack]";
5868 goto cpy_name;
5869 }
5870
5871 name = "//anon";
5872 goto cpy_name;
5873 }
5874
5875cpy_name:
5876 strlcpy(tmp, name, sizeof(tmp));
5877 name = tmp;
got_name:
	/*
	 * Since our buffer works in 8 byte units we need to align our string
	 * size to a multiple of 8. However, we must guarantee the tail end is
	 * zero'd out to avoid leaking random bits to userspace.
	 */
5884 size = strlen(name)+1;
5885 while (!IS_ALIGNED(size, sizeof(u64)))
5886 name[size++] = '\0';
5887
5888 mmap_event->file_name = name;
5889 mmap_event->file_size = size;
5890 mmap_event->maj = maj;
5891 mmap_event->min = min;
5892 mmap_event->ino = ino;
5893 mmap_event->ino_generation = gen;
5894 mmap_event->prot = prot;
5895 mmap_event->flags = flags;
5896
5897 if (!(vma->vm_flags & VM_EXEC))
5898 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5899
5900 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5901
5902 perf_event_aux(perf_event_mmap_output,
5903 mmap_event,
5904 NULL);
5905
5906 kfree(buf);
5907}
5908
5909void perf_event_mmap(struct vm_area_struct *vma)
5910{
5911 struct perf_mmap_event mmap_event;
5912
5913 if (!atomic_read(&nr_mmap_events))
5914 return;
5915
5916 mmap_event = (struct perf_mmap_event){
		.vma	= vma,
		/* .file_name */
		/* .file_size */
5920 .event_id = {
5921 .header = {
5922 .type = PERF_RECORD_MMAP,
				.misc = PERF_RECORD_MISC_USER,
				/* .size */
			},
			/* .pid */
			/* .tid */
5928 .start = vma->vm_start,
5929 .len = vma->vm_end - vma->vm_start,
5930 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
		},
		/* .maj (attr_mmap2 only) */
		/* .min (attr_mmap2 only) */
		/* .ino (attr_mmap2 only) */
		/* .ino_generation (attr_mmap2 only) */
		/* .prot (attr_mmap2 only) */
		/* .flags (attr_mmap2 only) */
5938 };
5939
5940 perf_event_mmap_event(&mmap_event);
5941}
5942
5943void perf_event_aux_event(struct perf_event *event, unsigned long head,
5944 unsigned long size, u64 flags)
5945{
5946 struct perf_output_handle handle;
5947 struct perf_sample_data sample;
5948 struct perf_aux_event {
5949 struct perf_event_header header;
5950 u64 offset;
5951 u64 size;
5952 u64 flags;
5953 } rec = {
5954 .header = {
5955 .type = PERF_RECORD_AUX,
5956 .misc = 0,
5957 .size = sizeof(rec),
5958 },
5959 .offset = head,
5960 .size = size,
5961 .flags = flags,
5962 };
5963 int ret;
5964
5965 perf_event_header__init_id(&rec.header, &sample, event);
5966 ret = perf_output_begin(&handle, event, rec.header.size);
5967
5968 if (ret)
5969 return;
5970
5971 perf_output_put(&handle, rec);
5972 perf_event__output_id_sample(event, &handle, &sample);
5973
5974 perf_output_end(&handle);
}

/*
 * IRQ throttle logging
 */
5981static void perf_log_throttle(struct perf_event *event, int enable)
5982{
5983 struct perf_output_handle handle;
5984 struct perf_sample_data sample;
5985 int ret;
5986
5987 struct {
5988 struct perf_event_header header;
5989 u64 time;
5990 u64 id;
5991 u64 stream_id;
5992 } throttle_event = {
5993 .header = {
5994 .type = PERF_RECORD_THROTTLE,
5995 .misc = 0,
5996 .size = sizeof(throttle_event),
5997 },
5998 .time = perf_event_clock(event),
5999 .id = primary_event_id(event),
6000 .stream_id = event->id,
6001 };
6002
6003 if (enable)
6004 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
6005
6006 perf_event_header__init_id(&throttle_event.header, &sample, event);
6007
6008 ret = perf_output_begin(&handle, event,
6009 throttle_event.header.size);
6010 if (ret)
6011 return;
6012
6013 perf_output_put(&handle, throttle_event);
6014 perf_event__output_id_sample(event, &handle, &sample);
6015 perf_output_end(&handle);
6016}
6017
6018static void perf_log_itrace_start(struct perf_event *event)
6019{
6020 struct perf_output_handle handle;
6021 struct perf_sample_data sample;
6022 struct perf_aux_event {
6023 struct perf_event_header header;
6024 u32 pid;
6025 u32 tid;
6026 } rec;
6027 int ret;
6028
6029 if (event->parent)
6030 event = event->parent;
6031
6032 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
6033 event->hw.itrace_started)
6034 return;
6035
6036 event->hw.itrace_started = 1;
6037
6038 rec.header.type = PERF_RECORD_ITRACE_START;
6039 rec.header.misc = 0;
6040 rec.header.size = sizeof(rec);
6041 rec.pid = perf_event_pid(event, current);
6042 rec.tid = perf_event_tid(event, current);
6043
6044 perf_event_header__init_id(&rec.header, &sample, event);
6045 ret = perf_output_begin(&handle, event, rec.header.size);
6046
6047 if (ret)
6048 return;
6049
6050 perf_output_put(&handle, rec);
6051 perf_event__output_id_sample(event, &handle, &sample);
6052
6053 perf_output_end(&handle);
}

/*
 * Generic event overflow handling, sampling.
 */
6060static int __perf_event_overflow(struct perf_event *event,
6061 int throttle, struct perf_sample_data *data,
6062 struct pt_regs *regs)
6063{
6064 int events = atomic_read(&event->event_limit);
6065 struct hw_perf_event *hwc = &event->hw;
6066 u64 seq;
	int ret = 0;

	/*
	 * Non-sampling counters might still use the PMI
	 * to fold short hardware counters, ignore those.
	 */
6073 if (unlikely(!is_sampling_event(event)))
6074 return 0;
6075
6076 seq = __this_cpu_read(perf_throttled_seq);
6077 if (seq != hwc->interrupts_seq) {
6078 hwc->interrupts_seq = seq;
6079 hwc->interrupts = 1;
6080 } else {
6081 hwc->interrupts++;
6082 if (unlikely(throttle
6083 && hwc->interrupts >= max_samples_per_tick)) {
6084 __this_cpu_inc(perf_throttled_count);
6085 hwc->interrupts = MAX_INTERRUPTS;
6086 perf_log_throttle(event, 0);
6087 tick_nohz_full_kick();
6088 ret = 1;
6089 }
6090 }
6091
6092 if (event->attr.freq) {
6093 u64 now = perf_clock();
6094 s64 delta = now - hwc->freq_time_stamp;
6095
6096 hwc->freq_time_stamp = now;
6097
6098 if (delta > 0 && delta < 2*TICK_NSEC)
6099 perf_adjust_period(event, delta, hwc->last_period, true);
	}

	/*
	 * XXX event_limit might not quite work as expected on inherited
	 * events
	 */

6107 event->pending_kill = POLL_IN;
6108 if (events && atomic_dec_and_test(&event->event_limit)) {
6109 ret = 1;
6110 event->pending_kill = POLL_HUP;
6111 event->pending_disable = 1;
6112 irq_work_queue(&event->pending);
6113 }
6114
6115 if (event->overflow_handler)
6116 event->overflow_handler(event, data, regs);
6117 else
6118 perf_event_output(event, data, regs);
6119
6120 if (event->fasync && event->pending_kill) {
6121 event->pending_wakeup = 1;
6122 irq_work_queue(&event->pending);
6123 }
6124
6125 return ret;
6126}
6127
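/*
 * Overflow entry point for PMU drivers; unlike __perf_event_overflow()
 * this one always allows throttling.
 */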
6128int perf_event_overflow(struct perf_event *event,
6129 struct perf_sample_data *data,
6130 struct pt_regs *regs)
6131{
6132 return __perf_event_overflow(event, 1, data, regs);
}

/*
 * Generic software event infrastructure
 */
6139struct swevent_htable {
6140 struct swevent_hlist *swevent_hlist;
6141 struct mutex hlist_mutex;
	int				hlist_refcount;

	/* Recursion avoidance in each contexts */
	int				recursion[PERF_NR_CONTEXTS];

	/* Keeps track of cpu being initialized/exited */
6148 bool online;
6149};
6150
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);

/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
 * is 'stopped' until the next interrupt has happened, which means
 * event->hw.period_left <= 0.
 */
6160u64 perf_swevent_set_period(struct perf_event *event)
6161{
6162 struct hw_perf_event *hwc = &event->hw;
6163 u64 period = hwc->last_period;
6164 u64 nr, offset;
6165 s64 old, val;
6166
6167 hwc->last_period = hwc->sample_period;
6168
6169again:
6170 old = val = local64_read(&hwc->period_left);
6171 if (val < 0)
6172 return 0;
6173
6174 nr = div64_u64(period + val, period);
6175 offset = nr * period;
6176 val -= offset;
6177 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
6178 goto again;
6179
6180 return nr;
6181}
6182
6183static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
6184 struct perf_sample_data *data,
6185 struct pt_regs *regs)
6186{
6187 struct hw_perf_event *hwc = &event->hw;
6188 int throttle = 0;
6189
6190 if (!overflow)
6191 overflow = perf_swevent_set_period(event);
6192
6193 if (hwc->interrupts == MAX_INTERRUPTS)
6194 return;
6195
6196 for (; overflow; overflow--) {
6197 if (__perf_event_overflow(event, throttle,
					    data, regs)) {
			/*
			 * We inhibit the overflow from happening when
			 * hwc->interrupts == MAX_INTERRUPTS.
			 */
6203 break;
6204 }
6205 throttle = 1;
6206 }
6207}
6208
6209static void perf_swevent_event(struct perf_event *event, u64 nr,
6210 struct perf_sample_data *data,
6211 struct pt_regs *regs)
6212{
6213 struct hw_perf_event *hwc = &event->hw;
6214
6215 local64_add(nr, &event->count);
6216
6217 if (!regs)
6218 return;
6219
6220 if (!is_sampling_event(event))
6221 return;
6222
6223 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
6224 data->period = nr;
6225 return perf_swevent_overflow(event, 1, data, regs);
6226 } else
6227 data->period = event->hw.last_period;
6228
6229 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
6230 return perf_swevent_overflow(event, 1, data, regs);
6231
6232 if (local64_add_negative(nr, &hwc->period_left))
6233 return;
6234
6235 perf_swevent_overflow(event, 0, data, regs);
6236}
6237
6238static int perf_exclude_event(struct perf_event *event,
6239 struct pt_regs *regs)
6240{
6241 if (event->hw.state & PERF_HES_STOPPED)
6242 return 1;
6243
6244 if (regs) {
6245 if (event->attr.exclude_user && user_mode(regs))
6246 return 1;
6247
6248 if (event->attr.exclude_kernel && !user_mode(regs))
6249 return 1;
6250 }
6251
6252 return 0;
6253}
6254
6255static int perf_swevent_match(struct perf_event *event,
6256 enum perf_type_id type,
6257 u32 event_id,
6258 struct perf_sample_data *data,
6259 struct pt_regs *regs)
6260{
6261 if (event->attr.type != type)
6262 return 0;
6263
6264 if (event->attr.config != event_id)
6265 return 0;
6266
6267 if (perf_exclude_event(event, regs))
6268 return 0;
6269
6270 return 1;
6271}
6272
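/* Hash a (type, event_id) pair into the software-event hash table. */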
6273static inline u64 swevent_hash(u64 type, u32 event_id)
6274{
6275 u64 val = event_id | (type << 32);
6276
6277 return hash_64(val, SWEVENT_HLIST_BITS);
6278}
6279
6280static inline struct hlist_head *
6281__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
6282{
6283 u64 hash = swevent_hash(type, event_id);
6284
6285 return &hlist->heads[hash];
}

/* For the read side: events when they trigger */
6289static inline struct hlist_head *
6290find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
6291{
6292 struct swevent_hlist *hlist;
6293
6294 hlist = rcu_dereference(swhash->swevent_hlist);
6295 if (!hlist)
6296 return NULL;
6297
6298 return __find_swevent_head(hlist, type, event_id);
}

/* For the event head insertion and removal in the hlist */
6302static inline struct hlist_head *
6303find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
6304{
6305 struct swevent_hlist *hlist;
6306 u32 event_id = event->attr.config;
	u64 type = event->attr.type;

	/*
	 * Event scheduling is always serialized against hlist allocation
	 * and release. Which makes the protected version suitable here.
	 * The context lock guarantees that.
	 */
6314 hlist = rcu_dereference_protected(swhash->swevent_hlist,
6315 lockdep_is_held(&event->ctx->lock));
6316 if (!hlist)
6317 return NULL;
6318
6319 return __find_swevent_head(hlist, type, event_id);
6320}
6321
6322static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
6323 u64 nr,
6324 struct perf_sample_data *data,
6325 struct pt_regs *regs)
6326{
6327 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6328 struct perf_event *event;
6329 struct hlist_head *head;
6330
6331 rcu_read_lock();
6332 head = find_swevent_head_rcu(swhash, type, event_id);
6333 if (!head)
6334 goto end;
6335
6336 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6337 if (perf_swevent_match(event, type, event_id, data, regs))
6338 perf_swevent_event(event, nr, data, regs);
6339 }
6340end:
6341 rcu_read_unlock();
6342}
6343
6344DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6345
6346int perf_swevent_get_recursion_context(void)
6347{
6348 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6349
6350 return get_recursion_context(swhash->recursion);
6351}
6352EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
6353
6354inline void perf_swevent_put_recursion_context(int rctx)
6355{
6356 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6357
6358 put_recursion_context(swhash->recursion, rctx);
6359}
6360
6361void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6362{
6363 struct perf_sample_data data;
6364
6365 if (WARN_ON_ONCE(!regs))
6366 return;
6367
6368 perf_sample_data_init(&data, addr, 0);
6369 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6370}
6371
6372void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6373{
6374 int rctx;
6375
6376 preempt_disable_notrace();
6377 rctx = perf_swevent_get_recursion_context();
6378 if (unlikely(rctx < 0))
6379 goto fail;
6380
6381 ___perf_sw_event(event_id, nr, regs, addr);
6382
6383 perf_swevent_put_recursion_context(rctx);
6384fail:
6385 preempt_enable_notrace();
6386}
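
/*
 * A minimal usage sketch (hypothetical call site, not part of this file):
 * callers normally go through the perf_sw_event() wrapper declared in
 * <linux/perf_event.h>, which lands here when software events are armed,
 * e.g. from an architecture's page-fault path:
 *
 *	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 */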
6387
6388static void perf_swevent_read(struct perf_event *event)
6389{
6390}
6391
6392static int perf_swevent_add(struct perf_event *event, int flags)
6393{
6394 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6395 struct hw_perf_event *hwc = &event->hw;
6396 struct hlist_head *head;
6397
6398 if (is_sampling_event(event)) {
6399 hwc->last_period = hwc->sample_period;
6400 perf_swevent_set_period(event);
6401 }
6402
6403 hwc->state = !(flags & PERF_EF_START);
6404
6405 head = find_swevent_head(swhash, event);
	if (!head) {
		/*
		 * We can race with cpu hotplug code: the hash list only
		 * exists while the cpu is online, see swevent_hlist_get().
		 */
6411 WARN_ON_ONCE(swhash->online);
6412 return -EINVAL;
6413 }
6414
6415 hlist_add_head_rcu(&event->hlist_entry, head);
6416 perf_event_update_userpage(event);
6417
6418 return 0;
6419}
6420
6421static void perf_swevent_del(struct perf_event *event, int flags)
6422{
6423 hlist_del_rcu(&event->hlist_entry);
6424}
6425
6426static void perf_swevent_start(struct perf_event *event, int flags)
6427{
6428 event->hw.state = 0;
6429}
6430
6431static void perf_swevent_stop(struct perf_event *event, int flags)
6432{
6433 event->hw.state = PERF_HES_STOPPED;
}

/* Deref the hlist from the update side */
6437static inline struct swevent_hlist *
6438swevent_hlist_deref(struct swevent_htable *swhash)
6439{
6440 return rcu_dereference_protected(swhash->swevent_hlist,
6441 lockdep_is_held(&swhash->hlist_mutex));
6442}
6443
6444static void swevent_hlist_release(struct swevent_htable *swhash)
6445{
6446 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
6447
6448 if (!hlist)
6449 return;
6450
6451 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
6452 kfree_rcu(hlist, rcu_head);
6453}
6454
6455static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
6456{
6457 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6458
6459 mutex_lock(&swhash->hlist_mutex);
6460
6461 if (!--swhash->hlist_refcount)
6462 swevent_hlist_release(swhash);
6463
6464 mutex_unlock(&swhash->hlist_mutex);
6465}
6466
6467static void swevent_hlist_put(struct perf_event *event)
6468{
6469 int cpu;
6470
6471 for_each_possible_cpu(cpu)
6472 swevent_hlist_put_cpu(event, cpu);
6473}
6474
6475static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
6476{
6477 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6478 int err = 0;
6479
6480 mutex_lock(&swhash->hlist_mutex);
6481
6482 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
6483 struct swevent_hlist *hlist;
6484
6485 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
6486 if (!hlist) {
6487 err = -ENOMEM;
6488 goto exit;
6489 }
6490 rcu_assign_pointer(swhash->swevent_hlist, hlist);
6491 }
6492 swhash->hlist_refcount++;
6493exit:
6494 mutex_unlock(&swhash->hlist_mutex);
6495
6496 return err;
6497}
6498
6499static int swevent_hlist_get(struct perf_event *event)
6500{
6501 int err;
6502 int cpu, failed_cpu;
6503
6504 get_online_cpus();
6505 for_each_possible_cpu(cpu) {
6506 err = swevent_hlist_get_cpu(event, cpu);
6507 if (err) {
6508 failed_cpu = cpu;
6509 goto fail;
6510 }
6511 }
6512 put_online_cpus();
6513
6514 return 0;
6515fail:
6516 for_each_possible_cpu(cpu) {
6517 if (cpu == failed_cpu)
6518 break;
6519 swevent_hlist_put_cpu(event, cpu);
6520 }
6521
6522 put_online_cpus();
6523 return err;
6524}
6525
6526struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
6527
6528static void sw_perf_event_destroy(struct perf_event *event)
6529{
6530 u64 event_id = event->attr.config;
6531
6532 WARN_ON(event->parent);
6533
6534 static_key_slow_dec(&perf_swevent_enabled[event_id]);
6535 swevent_hlist_put(event);
6536}
6537
6538static int perf_swevent_init(struct perf_event *event)
6539{
6540 u64 event_id = event->attr.config;
6541
6542 if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;

	/*
	 * no branch sampling for software events
	 */
6548 if (has_branch_stack(event))
6549 return -EOPNOTSUPP;
6550
6551 switch (event_id) {
6552 case PERF_COUNT_SW_CPU_CLOCK:
6553 case PERF_COUNT_SW_TASK_CLOCK:
6554 return -ENOENT;
6555
6556 default:
6557 break;
6558 }
6559
6560 if (event_id >= PERF_COUNT_SW_MAX)
6561 return -ENOENT;
6562
6563 if (!event->parent) {
6564 int err;
6565
6566 err = swevent_hlist_get(event);
6567 if (err)
6568 return err;
6569
6570 static_key_slow_inc(&perf_swevent_enabled[event_id]);
6571 event->destroy = sw_perf_event_destroy;
6572 }
6573
6574 return 0;
6575}
6576
6577static struct pmu perf_swevent = {
6578 .task_ctx_nr = perf_sw_context,
6579
6580 .capabilities = PERF_PMU_CAP_NO_NMI,
6581
6582 .event_init = perf_swevent_init,
6583 .add = perf_swevent_add,
6584 .del = perf_swevent_del,
6585 .start = perf_swevent_start,
6586 .stop = perf_swevent_stop,
6587 .read = perf_swevent_read,
6588};
6589
6590#ifdef CONFIG_EVENT_TRACING
6591
6592static int perf_tp_filter_match(struct perf_event *event,
6593 struct perf_sample_data *data)
6594{
6595 void *record = data->raw->data;
6596
6597 if (likely(!event->filter) || filter_match_preds(event->filter, record))
6598 return 1;
6599 return 0;
6600}
6601
6602static int perf_tp_event_match(struct perf_event *event,
6603 struct perf_sample_data *data,
6604 struct pt_regs *regs)
6605{
6606 if (event->hw.state & PERF_HES_STOPPED)
		return 0;

	/*
	 * All tracepoints are from kernel-space.
	 */
6611 if (event->attr.exclude_kernel)
6612 return 0;
6613
6614 if (!perf_tp_filter_match(event, data))
6615 return 0;
6616
6617 return 1;
6618}
6619
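/*
 * Called from the tracepoint glue with a recursion context held: deliver
 * one raw trace record to every event hashed on @head, optionally also to
 * tracepoint events in @task's context, then drop the recursion context.
 */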
6620void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
6621 struct pt_regs *regs, struct hlist_head *head, int rctx,
6622 struct task_struct *task)
6623{
6624 struct perf_sample_data data;
6625 struct perf_event *event;
6626
6627 struct perf_raw_record raw = {
6628 .size = entry_size,
6629 .data = record,
6630 };
6631
6632 perf_sample_data_init(&data, addr, 0);
6633 data.raw = &raw;
6634
6635 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6636 if (perf_tp_event_match(event, &data, regs))
6637 perf_swevent_event(event, count, &data, regs);
	}

	/*
	 * If we got specified a target task, also iterate its context and
	 * deliver this event there too.
	 */
6644 if (task && task != current) {
6645 struct perf_event_context *ctx;
6646 struct trace_entry *entry = record;
6647
6648 rcu_read_lock();
6649 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
6650 if (!ctx)
6651 goto unlock;
6652
6653 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6654 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6655 continue;
6656 if (event->attr.config != entry->type)
6657 continue;
6658 if (perf_tp_event_match(event, &data, regs))
6659 perf_swevent_event(event, count, &data, regs);
6660 }
6661unlock:
6662 rcu_read_unlock();
6663 }
6664
6665 perf_swevent_put_recursion_context(rctx);
6666}
6667EXPORT_SYMBOL_GPL(perf_tp_event);
6668
6669static void tp_perf_event_destroy(struct perf_event *event)
6670{
6671 perf_trace_destroy(event);
6672}
6673
6674static int perf_tp_event_init(struct perf_event *event)
6675{
6676 int err;
6677
6678 if (event->attr.type != PERF_TYPE_TRACEPOINT)
		return -ENOENT;

	/*
	 * no branch sampling for tracepoint events
	 */
6684 if (has_branch_stack(event))
6685 return -EOPNOTSUPP;
6686
6687 err = perf_trace_init(event);
6688 if (err)
6689 return err;
6690
6691 event->destroy = tp_perf_event_destroy;
6692
6693 return 0;
6694}
6695
6696static struct pmu perf_tracepoint = {
6697 .task_ctx_nr = perf_sw_context,
6698
6699 .event_init = perf_tp_event_init,
6700 .add = perf_trace_add,
6701 .del = perf_trace_del,
6702 .start = perf_swevent_start,
6703 .stop = perf_swevent_stop,
6704 .read = perf_swevent_read,
6705};
6706
6707static inline void perf_tp_register(void)
6708{
6709 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
6710}
6711
6712static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6713{
6714 char *filter_str;
6715 int ret;
6716
6717 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6718 return -EINVAL;
6719
6720 filter_str = strndup_user(arg, PAGE_SIZE);
6721 if (IS_ERR(filter_str))
6722 return PTR_ERR(filter_str);
6723
6724 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
6725
6726 kfree(filter_str);
6727 return ret;
6728}
6729
6730static void perf_event_free_filter(struct perf_event *event)
6731{
6732 ftrace_profile_free_filter(event);
6733}
6734
6735static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6736{
6737 struct bpf_prog *prog;
6738
6739 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6740 return -EINVAL;
6741
6742 if (event->tp_event->prog)
6743 return -EEXIST;
6744
	if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
		/* bpf programs can only be attached to kprobes */
		return -EINVAL;
6748
6749 prog = bpf_prog_get(prog_fd);
6750 if (IS_ERR(prog))
6751 return PTR_ERR(prog);
6752
	if (prog->type != BPF_PROG_TYPE_KPROBE) {
		/* valid fd, but invalid bpf program type */
6755 bpf_prog_put(prog);
6756 return -EINVAL;
6757 }
6758
6759 event->tp_event->prog = prog;
6760
6761 return 0;
6762}
6763
6764static void perf_event_free_bpf_prog(struct perf_event *event)
6765{
6766 struct bpf_prog *prog;
6767
6768 if (!event->tp_event)
6769 return;
6770
6771 prog = event->tp_event->prog;
6772 if (prog) {
6773 event->tp_event->prog = NULL;
6774 bpf_prog_put(prog);
6775 }
6776}
6777
6778#else
6779
6780static inline void perf_tp_register(void)
6781{
6782}
6783
6784static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6785{
6786 return -ENOENT;
6787}
6788
6789static void perf_event_free_filter(struct perf_event *event)
6790{
6791}
6792
6793static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6794{
6795 return -ENOENT;
6796}
6797
6798static void perf_event_free_bpf_prog(struct perf_event *event)
6799{
6800}
6801#endif
6802
6803#ifdef CONFIG_HAVE_HW_BREAKPOINT
6804void perf_bp_event(struct perf_event *bp, void *data)
6805{
6806 struct perf_sample_data sample;
6807 struct pt_regs *regs = data;
6808
6809 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
6810
6811 if (!bp->hw.state && !perf_exclude_event(bp, regs))
6812 perf_swevent_event(bp, 1, &sample, regs);
6813}
#endif

/*
 * hrtimer based swevent callback
 */
6820static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
6821{
6822 enum hrtimer_restart ret = HRTIMER_RESTART;
6823 struct perf_sample_data data;
6824 struct pt_regs *regs;
6825 struct perf_event *event;
6826 u64 period;
6827
6828 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
6829
6830 if (event->state != PERF_EVENT_STATE_ACTIVE)
6831 return HRTIMER_NORESTART;
6832
6833 event->pmu->read(event);
6834
6835 perf_sample_data_init(&data, 0, event->hw.last_period);
6836 regs = get_irq_regs();
6837
6838 if (regs && !perf_exclude_event(event, regs)) {
6839 if (!(event->attr.exclude_idle && is_idle_task(current)))
6840 if (__perf_event_overflow(event, 1, &data, regs))
6841 ret = HRTIMER_NORESTART;
6842 }
6843
6844 period = max_t(u64, 10000, event->hw.sample_period);
6845 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
6846
6847 return ret;
6848}
6849
6850static void perf_swevent_start_hrtimer(struct perf_event *event)
6851{
6852 struct hw_perf_event *hwc = &event->hw;
6853 s64 period;
6854
6855 if (!is_sampling_event(event))
6856 return;
6857
6858 period = local64_read(&hwc->period_left);
6859 if (period) {
6860 if (period < 0)
6861 period = 10000;
6862
6863 local64_set(&hwc->period_left, 0);
6864 } else {
6865 period = max_t(u64, 10000, hwc->sample_period);
6866 }
6867 __hrtimer_start_range_ns(&hwc->hrtimer,
6868 ns_to_ktime(period), 0,
6869 HRTIMER_MODE_REL_PINNED, 0);
6870}
6871
6872static void perf_swevent_cancel_hrtimer(struct perf_event *event)
6873{
6874 struct hw_perf_event *hwc = &event->hw;
6875
6876 if (is_sampling_event(event)) {
6877 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
6878 local64_set(&hwc->period_left, ktime_to_ns(remaining));
6879
6880 hrtimer_cancel(&hwc->hrtimer);
6881 }
6882}
6883
6884static void perf_swevent_init_hrtimer(struct perf_event *event)
6885{
6886 struct hw_perf_event *hwc = &event->hw;
6887
6888 if (!is_sampling_event(event))
6889 return;
6890
6891 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swevent_hrtimer;

	/*
	 * Since hrtimers have a fixed rate, we can do a static freq->period
	 * mapping and avoid the whole period adjust feedback stuff.
	 */
6898 if (event->attr.freq) {
6899 long freq = event->attr.sample_freq;
6900
6901 event->attr.sample_period = NSEC_PER_SEC / freq;
6902 hwc->sample_period = event->attr.sample_period;
6903 local64_set(&hwc->period_left, hwc->sample_period);
6904 hwc->last_period = hwc->sample_period;
6905 event->attr.freq = 0;
6906 }
}

/*
 * Software event: cpu wall time clock
 */
6913static void cpu_clock_event_update(struct perf_event *event)
6914{
6915 s64 prev;
6916 u64 now;
6917
6918 now = local_clock();
6919 prev = local64_xchg(&event->hw.prev_count, now);
6920 local64_add(now - prev, &event->count);
6921}
6922
6923static void cpu_clock_event_start(struct perf_event *event, int flags)
6924{
6925 local64_set(&event->hw.prev_count, local_clock());
6926 perf_swevent_start_hrtimer(event);
6927}
6928
6929static void cpu_clock_event_stop(struct perf_event *event, int flags)
6930{
6931 perf_swevent_cancel_hrtimer(event);
6932 cpu_clock_event_update(event);
6933}
6934
6935static int cpu_clock_event_add(struct perf_event *event, int flags)
6936{
6937 if (flags & PERF_EF_START)
6938 cpu_clock_event_start(event, flags);
6939 perf_event_update_userpage(event);
6940
6941 return 0;
6942}
6943
6944static void cpu_clock_event_del(struct perf_event *event, int flags)
6945{
6946 cpu_clock_event_stop(event, flags);
6947}
6948
6949static void cpu_clock_event_read(struct perf_event *event)
6950{
6951 cpu_clock_event_update(event);
6952}
6953
6954static int cpu_clock_event_init(struct perf_event *event)
6955{
6956 if (event->attr.type != PERF_TYPE_SOFTWARE)
6957 return -ENOENT;
6958
6959 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
		return -ENOENT;

	/*
	 * no branch sampling for software events
	 */
6965 if (has_branch_stack(event))
6966 return -EOPNOTSUPP;
6967
6968 perf_swevent_init_hrtimer(event);
6969
6970 return 0;
6971}
6972
6973static struct pmu perf_cpu_clock = {
6974 .task_ctx_nr = perf_sw_context,
6975
6976 .capabilities = PERF_PMU_CAP_NO_NMI,
6977
6978 .event_init = cpu_clock_event_init,
6979 .add = cpu_clock_event_add,
6980 .del = cpu_clock_event_del,
6981 .start = cpu_clock_event_start,
6982 .stop = cpu_clock_event_stop,
6983 .read = cpu_clock_event_read,
};

/*
 * Software event: task time clock
 */
6990static void task_clock_event_update(struct perf_event *event, u64 now)
6991{
6992 u64 prev;
6993 s64 delta;
6994
6995 prev = local64_xchg(&event->hw.prev_count, now);
6996 delta = now - prev;
6997 local64_add(delta, &event->count);
6998}
6999
7000static void task_clock_event_start(struct perf_event *event, int flags)
7001{
7002 local64_set(&event->hw.prev_count, event->ctx->time);
7003 perf_swevent_start_hrtimer(event);
7004}
7005
7006static void task_clock_event_stop(struct perf_event *event, int flags)
7007{
7008 perf_swevent_cancel_hrtimer(event);
7009 task_clock_event_update(event, event->ctx->time);
7010}
7011
7012static int task_clock_event_add(struct perf_event *event, int flags)
7013{
7014 if (flags & PERF_EF_START)
7015 task_clock_event_start(event, flags);
7016 perf_event_update_userpage(event);
7017
7018 return 0;
7019}
7020
7021static void task_clock_event_del(struct perf_event *event, int flags)
7022{
7023 task_clock_event_stop(event, PERF_EF_UPDATE);
7024}
7025
7026static void task_clock_event_read(struct perf_event *event)
7027{
7028 u64 now = perf_clock();
7029 u64 delta = now - event->ctx->timestamp;
7030 u64 time = event->ctx->time + delta;
7031
7032 task_clock_event_update(event, time);
7033}
7034
7035static int task_clock_event_init(struct perf_event *event)
7036{
7037 if (event->attr.type != PERF_TYPE_SOFTWARE)
7038 return -ENOENT;
7039
7040 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
		return -ENOENT;

	/*
	 * no branch sampling for software events
	 */
7046 if (has_branch_stack(event))
7047 return -EOPNOTSUPP;
7048
7049 perf_swevent_init_hrtimer(event);
7050
7051 return 0;
7052}
7053
7054static struct pmu perf_task_clock = {
7055 .task_ctx_nr = perf_sw_context,
7056
7057 .capabilities = PERF_PMU_CAP_NO_NMI,
7058
7059 .event_init = task_clock_event_init,
7060 .add = task_clock_event_add,
7061 .del = task_clock_event_del,
7062 .start = task_clock_event_start,
7063 .stop = task_clock_event_stop,
7064 .read = task_clock_event_read,
7065};
7066
7067static void perf_pmu_nop_void(struct pmu *pmu)
7068{
7069}
7070
7071static int perf_pmu_nop_int(struct pmu *pmu)
7072{
7073 return 0;
7074}
7075
7076static void perf_pmu_start_txn(struct pmu *pmu)
7077{
7078 perf_pmu_disable(pmu);
7079}
7080
7081static int perf_pmu_commit_txn(struct pmu *pmu)
7082{
7083 perf_pmu_enable(pmu);
7084 return 0;
7085}
7086
7087static void perf_pmu_cancel_txn(struct pmu *pmu)
7088{
7089 perf_pmu_enable(pmu);
7090}
7091
7092static int perf_event_idx_default(struct perf_event *event)
7093{
7094 return 0;
}

/*
 * Ensures all contexts with the same task_ctx_nr have the same
 * pmu_cpu_context too.
 */
7101static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
7102{
7103 struct pmu *pmu;
7104
7105 if (ctxn < 0)
7106 return NULL;
7107
7108 list_for_each_entry(pmu, &pmus, entry) {
7109 if (pmu->task_ctx_nr == ctxn)
7110 return pmu->pmu_cpu_context;
7111 }
7112
7113 return NULL;
7114}
7115
7116static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
7117{
7118 int cpu;
7119
7120 for_each_possible_cpu(cpu) {
7121 struct perf_cpu_context *cpuctx;
7122
7123 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7124
7125 if (cpuctx->unique_pmu == old_pmu)
7126 cpuctx->unique_pmu = pmu;
7127 }
7128}
7129
7130static void free_pmu_context(struct pmu *pmu)
7131{
7132 struct pmu *i;
7133
	mutex_lock(&pmus_lock);
	/*
	 * Like a real lame refcount.
	 */
7138 list_for_each_entry(i, &pmus, entry) {
7139 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
7140 update_pmu_context(i, pmu);
7141 goto out;
7142 }
7143 }
7144
7145 free_percpu(pmu->pmu_cpu_context);
7146out:
7147 mutex_unlock(&pmus_lock);
}

static struct idr pmu_idr;
7150
7151static ssize_t
7152type_show(struct device *dev, struct device_attribute *attr, char *page)
7153{
7154 struct pmu *pmu = dev_get_drvdata(dev);
7155
7156 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
7157}
7158static DEVICE_ATTR_RO(type);
7159
7160static ssize_t
7161perf_event_mux_interval_ms_show(struct device *dev,
7162 struct device_attribute *attr,
7163 char *page)
7164{
7165 struct pmu *pmu = dev_get_drvdata(dev);
7166
7167 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
7168}
7169
7170static ssize_t
7171perf_event_mux_interval_ms_store(struct device *dev,
7172 struct device_attribute *attr,
7173 const char *buf, size_t count)
7174{
7175 struct pmu *pmu = dev_get_drvdata(dev);
7176 int timer, cpu, ret;
7177
7178 ret = kstrtoint(buf, 0, &timer);
7179 if (ret)
7180 return ret;
7181
7182 if (timer < 1)
		return -EINVAL;

	/* same value, nothing to do */
	if (timer == pmu->hrtimer_interval_ms)
7187 return count;
7188
	pmu->hrtimer_interval_ms = timer;

	/* update all cpuctx for this PMU */
7192 for_each_possible_cpu(cpu) {
7193 struct perf_cpu_context *cpuctx;
7194 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7195 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
7196
7197 if (hrtimer_active(&cpuctx->hrtimer))
7198 hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
7199 }
7200
7201 return count;
7202}
7203static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
7204
7205static struct attribute *pmu_dev_attrs[] = {
7206 &dev_attr_type.attr,
7207 &dev_attr_perf_event_mux_interval_ms.attr,
7208 NULL,
7209};
7210ATTRIBUTE_GROUPS(pmu_dev);
7211
7212static int pmu_bus_running;
7213static struct bus_type pmu_bus = {
7214 .name = "event_source",
7215 .dev_groups = pmu_dev_groups,
7216};
7217
7218static void pmu_dev_release(struct device *dev)
7219{
7220 kfree(dev);
7221}
7222
7223static int pmu_dev_alloc(struct pmu *pmu)
7224{
7225 int ret = -ENOMEM;
7226
7227 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
7228 if (!pmu->dev)
7229 goto out;
7230
7231 pmu->dev->groups = pmu->attr_groups;
7232 device_initialize(pmu->dev);
7233 ret = dev_set_name(pmu->dev, "%s", pmu->name);
7234 if (ret)
7235 goto free_dev;
7236
7237 dev_set_drvdata(pmu->dev, pmu);
7238 pmu->dev->bus = &pmu_bus;
7239 pmu->dev->release = pmu_dev_release;
7240 ret = device_add(pmu->dev);
7241 if (ret)
7242 goto free_dev;
7243
7244out:
7245 return ret;
7246
7247free_dev:
7248 put_device(pmu->dev);
7249 goto out;
7250}
7251
7252static struct lock_class_key cpuctx_mutex;
7253static struct lock_class_key cpuctx_lock;
7254
7255int perf_pmu_register(struct pmu *pmu, const char *name, int type)
7256{
7257 int cpu, ret;
7258
7259 mutex_lock(&pmus_lock);
7260 ret = -ENOMEM;
7261 pmu->pmu_disable_count = alloc_percpu(int);
7262 if (!pmu->pmu_disable_count)
7263 goto unlock;
7264
7265 pmu->type = -1;
7266 if (!name)
7267 goto skip_type;
7268 pmu->name = name;
7269
7270 if (type < 0) {
7271 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
7272 if (type < 0) {
7273 ret = type;
7274 goto free_pdc;
7275 }
7276 }
7277 pmu->type = type;
7278
7279 if (pmu_bus_running) {
7280 ret = pmu_dev_alloc(pmu);
7281 if (ret)
7282 goto free_idr;
7283 }
7284
7285skip_type:
7286 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
7287 if (pmu->pmu_cpu_context)
7288 goto got_cpu_context;
7289
7290 ret = -ENOMEM;
7291 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
7292 if (!pmu->pmu_cpu_context)
7293 goto free_dev;
7294
7295 for_each_possible_cpu(cpu) {
7296 struct perf_cpu_context *cpuctx;
7297
7298 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7299 __perf_event_init_context(&cpuctx->ctx);
7300 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
7301 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
7302 cpuctx->ctx.pmu = pmu;
7303
7304 __perf_cpu_hrtimer_init(cpuctx, cpu);
7305
7306 cpuctx->unique_pmu = pmu;
7307 }
7308
7309got_cpu_context:
7310 if (!pmu->start_txn) {
		if (pmu->pmu_enable) {
			/*
			 * If we have pmu_enable/pmu_disable calls, install
			 * transaction stubs that use that to try and batch
			 * hardware accesses.
			 */
7317 pmu->start_txn = perf_pmu_start_txn;
7318 pmu->commit_txn = perf_pmu_commit_txn;
7319 pmu->cancel_txn = perf_pmu_cancel_txn;
7320 } else {
7321 pmu->start_txn = perf_pmu_nop_void;
7322 pmu->commit_txn = perf_pmu_nop_int;
7323 pmu->cancel_txn = perf_pmu_nop_void;
7324 }
7325 }
7326
7327 if (!pmu->pmu_enable) {
7328 pmu->pmu_enable = perf_pmu_nop_void;
7329 pmu->pmu_disable = perf_pmu_nop_void;
7330 }
7331
7332 if (!pmu->event_idx)
7333 pmu->event_idx = perf_event_idx_default;
7334
7335 list_add_rcu(&pmu->entry, &pmus);
7336 atomic_set(&pmu->exclusive_cnt, 0);
7337 ret = 0;
7338unlock:
7339 mutex_unlock(&pmus_lock);
7340
7341 return ret;
7342
7343free_dev:
7344 device_del(pmu->dev);
7345 put_device(pmu->dev);
7346
7347free_idr:
7348 if (pmu->type >= PERF_TYPE_MAX)
7349 idr_remove(&pmu_idr, pmu->type);
7350
7351free_pdc:
7352 free_percpu(pmu->pmu_disable_count);
7353 goto unlock;
7354}
7355EXPORT_SYMBOL_GPL(perf_pmu_register);
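
/*
 * A minimal registration sketch (hypothetical my_* callbacks, not from
 * this file): a driver only has to fill in the scheduling callbacks; the
 * nop/transaction defaults installed above cover the rest.
 *
 *	static struct pmu my_pmu = {
 *		.task_ctx_nr	= perf_sw_context,
 *		.event_init	= my_event_init,
 *		.add		= my_add,
 *		.del		= my_del,
 *		.start		= my_start,
 *		.stop		= my_stop,
 *		.read		= my_read,
 *	};
 *
 *	ret = perf_pmu_register(&my_pmu, "my_pmu", -1); // -1: auto type id
 */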
7356
7357void perf_pmu_unregister(struct pmu *pmu)
7358{
7359 mutex_lock(&pmus_lock);
7360 list_del_rcu(&pmu->entry);
	mutex_unlock(&pmus_lock);

	/*
	 * We dereference the pmu list under both SRCU and regular RCU, so
	 * synchronize against both of those.
	 */
7367 synchronize_srcu(&pmus_srcu);
7368 synchronize_rcu();
7369
7370 free_percpu(pmu->pmu_disable_count);
7371 if (pmu->type >= PERF_TYPE_MAX)
7372 idr_remove(&pmu_idr, pmu->type);
7373 device_del(pmu->dev);
7374 put_device(pmu->dev);
7375 free_pmu_context(pmu);
7376}
7377EXPORT_SYMBOL_GPL(perf_pmu_unregister);
7378
7379static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7380{
7381 struct perf_event_context *ctx = NULL;
7382 int ret;
7383
7384 if (!try_module_get(pmu->module))
7385 return -ENODEV;
7386
	if (event->group_leader != event) {
		/*
		 * This ctx::mutex can nest when we're called through
		 * inheritance. See the perf_event_ctx_lock_nested() comment.
		 */
7392 ctx = perf_event_ctx_lock_nested(event->group_leader,
7393 SINGLE_DEPTH_NESTING);
7394 BUG_ON(!ctx);
7395 }
7396
7397 event->pmu = pmu;
7398 ret = pmu->event_init(event);
7399
7400 if (ctx)
7401 perf_event_ctx_unlock(event->group_leader, ctx);
7402
7403 if (ret)
7404 module_put(pmu->module);
7405
7406 return ret;
7407}
7408
7409struct pmu *perf_init_event(struct perf_event *event)
7410{
7411 struct pmu *pmu = NULL;
7412 int idx;
7413 int ret;
7414
7415 idx = srcu_read_lock(&pmus_srcu);
7416
7417 rcu_read_lock();
7418 pmu = idr_find(&pmu_idr, event->attr.type);
7419 rcu_read_unlock();
7420 if (pmu) {
7421 ret = perf_try_init_event(pmu, event);
7422 if (ret)
7423 pmu = ERR_PTR(ret);
7424 goto unlock;
7425 }
7426
7427 list_for_each_entry_rcu(pmu, &pmus, entry) {
7428 ret = perf_try_init_event(pmu, event);
7429 if (!ret)
7430 goto unlock;
7431
7432 if (ret != -ENOENT) {
7433 pmu = ERR_PTR(ret);
7434 goto unlock;
7435 }
7436 }
7437 pmu = ERR_PTR(-ENOENT);
7438unlock:
7439 srcu_read_unlock(&pmus_srcu, idx);
7440
7441 return pmu;
7442}
7443
7444static void account_event_cpu(struct perf_event *event, int cpu)
7445{
7446 if (event->parent)
7447 return;
7448
7449 if (is_cgroup_event(event))
7450 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7451}
7452
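/*
 * Bump the global bookkeeping counters and static keys that gate the
 * side-band and scheduling code paths this event will exercise.
 */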
7453static void account_event(struct perf_event *event)
7454{
7455 if (event->parent)
7456 return;
7457
7458 if (event->attach_state & PERF_ATTACH_TASK)
7459 static_key_slow_inc(&perf_sched_events.key);
7460 if (event->attr.mmap || event->attr.mmap_data)
7461 atomic_inc(&nr_mmap_events);
7462 if (event->attr.comm)
7463 atomic_inc(&nr_comm_events);
7464 if (event->attr.task)
7465 atomic_inc(&nr_task_events);
7466 if (event->attr.freq) {
7467 if (atomic_inc_return(&nr_freq_events) == 1)
7468 tick_nohz_full_kick_all();
7469 }
7470 if (has_branch_stack(event))
7471 static_key_slow_inc(&perf_sched_events.key);
7472 if (is_cgroup_event(event))
7473 static_key_slow_inc(&perf_sched_events.key);
7474
7475 account_event_cpu(event, event->cpu);
}

/*
 * Allocate and initialize an event structure
 */
7481static struct perf_event *
7482perf_event_alloc(struct perf_event_attr *attr, int cpu,
7483 struct task_struct *task,
7484 struct perf_event *group_leader,
7485 struct perf_event *parent_event,
7486 perf_overflow_handler_t overflow_handler,
7487 void *context, int cgroup_fd)
7488{
7489 struct pmu *pmu;
7490 struct perf_event *event;
7491 struct hw_perf_event *hwc;
7492 long err = -EINVAL;
7493
7494 if ((unsigned)cpu >= nr_cpu_ids) {
7495 if (!task || cpu != -1)
7496 return ERR_PTR(-EINVAL);
7497 }
7498
7499 event = kzalloc(sizeof(*event), GFP_KERNEL);
7500 if (!event)
		return ERR_PTR(-ENOMEM);

	/*
	 * Single events are their own group leaders, with an
	 * empty sibling list:
	 */
7507 if (!group_leader)
7508 group_leader = event;
7509
7510 mutex_init(&event->child_mutex);
7511 INIT_LIST_HEAD(&event->child_list);
7512
7513 INIT_LIST_HEAD(&event->group_entry);
7514 INIT_LIST_HEAD(&event->event_entry);
7515 INIT_LIST_HEAD(&event->sibling_list);
7516 INIT_LIST_HEAD(&event->rb_entry);
7517 INIT_LIST_HEAD(&event->active_entry);
7518 INIT_HLIST_NODE(&event->hlist_entry);
7519
7520
7521 init_waitqueue_head(&event->waitq);
7522 init_irq_work(&event->pending, perf_pending_event);
7523
7524 mutex_init(&event->mmap_mutex);
7525
7526 atomic_long_set(&event->refcount, 1);
7527 event->cpu = cpu;
7528 event->attr = *attr;
7529 event->group_leader = group_leader;
7530 event->pmu = NULL;
7531 event->oncpu = -1;
7532
7533 event->parent = parent_event;
7534
7535 event->ns = get_pid_ns(task_active_pid_ns(current));
7536 event->id = atomic64_inc_return(&perf_event_id);
7537
7538 event->state = PERF_EVENT_STATE_INACTIVE;
7539
7540 if (task) {
		event->attach_state = PERF_ATTACH_TASK;
		/*
		 * XXX pmu::event_init needs to know what task to account to
		 * and we cannot use the ctx information because we need the
		 * pmu before we get a ctx.
		 */
7547 event->hw.target = task;
7548 }
7549
7550 event->clock = &local_clock;
7551 if (parent_event)
7552 event->clock = parent_event->clock;
7553
7554 if (!overflow_handler && parent_event) {
7555 overflow_handler = parent_event->overflow_handler;
7556 context = parent_event->overflow_handler_context;
7557 }
7558
7559 event->overflow_handler = overflow_handler;
7560 event->overflow_handler_context = context;
7561
7562 perf_event__state_init(event);
7563
7564 pmu = NULL;
7565
7566 hwc = &event->hw;
7567 hwc->sample_period = attr->sample_period;
7568 if (attr->freq && attr->sample_freq)
7569 hwc->sample_period = 1;
7570 hwc->last_period = hwc->sample_period;
7571
	local64_set(&hwc->period_left, hwc->sample_period);

	/*
	 * we currently do not support PERF_FORMAT_GROUP on inherited events
	 */
7577 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
7578 goto err_ns;
7579
7580 if (!has_branch_stack(event))
7581 event->attr.branch_sample_type = 0;
7582
7583 if (cgroup_fd != -1) {
7584 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
7585 if (err)
7586 goto err_ns;
7587 }
7588
7589 pmu = perf_init_event(event);
7590 if (!pmu)
7591 goto err_ns;
7592 else if (IS_ERR(pmu)) {
7593 err = PTR_ERR(pmu);
7594 goto err_ns;
7595 }
7596
7597 err = exclusive_event_init(event);
7598 if (err)
7599 goto err_pmu;
7600
7601 if (!event->parent) {
7602 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7603 err = get_callchain_buffers();
7604 if (err)
7605 goto err_per_task;
7606 }
7607 }
7608
7609 return event;
7610
7611err_per_task:
7612 exclusive_event_destroy(event);
7613
7614err_pmu:
7615 if (event->destroy)
7616 event->destroy(event);
7617 module_put(pmu->module);
7618err_ns:
7619 if (is_cgroup_event(event))
7620 perf_detach_cgroup(event);
7621 if (event->ns)
7622 put_pid_ns(event->ns);
7623 kfree(event);
7624
7625 return ERR_PTR(err);
7626}
7627
7628static int perf_copy_attr(struct perf_event_attr __user *uattr,
7629 struct perf_event_attr *attr)
7630{
7631 u32 size;
7632 int ret;
7633
7634 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
		return -EFAULT;

	/*
	 * zero the full structure, so that a short copy will be nice.
	 */
7640 memset(attr, 0, sizeof(*attr));
7641
7642 ret = get_user(size, &uattr->size);
7643 if (ret)
7644 return ret;
7645
7646 if (size > PAGE_SIZE)
7647 goto err_size;
7648
7649 if (!size)
7650 size = PERF_ATTR_SIZE_VER0;
7651
7652 if (size < PERF_ATTR_SIZE_VER0)
		goto err_size;

	/*
	 * If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
7661 if (size > sizeof(*attr)) {
7662 unsigned char __user *addr;
7663 unsigned char __user *end;
7664 unsigned char val;
7665
7666 addr = (void __user *)uattr + sizeof(*attr);
7667 end = (void __user *)uattr + size;
7668
7669 for (; addr < end; addr++) {
7670 ret = get_user(val, addr);
7671 if (ret)
7672 return ret;
7673 if (val)
7674 goto err_size;
7675 }
7676 size = sizeof(*attr);
7677 }
7678
7679 ret = copy_from_user(attr, uattr, size);
7680 if (ret)
7681 return -EFAULT;
7682
7683 if (attr->__reserved_1)
7684 return -EINVAL;
7685
7686 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
7687 return -EINVAL;
7688
7689 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
7690 return -EINVAL;
7691
7692 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
		u64 mask = attr->branch_sample_type;

		/* only using defined bits */
7696 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
			return -EINVAL;

		/* at least one branch bit must be set */
7700 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
			return -EINVAL;

		/* propagate priv level, when not set for branch */
		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {

			/* exclude_kernel checked on syscall entry */
7707 if (!attr->exclude_kernel)
7708 mask |= PERF_SAMPLE_BRANCH_KERNEL;
7709
7710 if (!attr->exclude_user)
7711 mask |= PERF_SAMPLE_BRANCH_USER;
7712
7713 if (!attr->exclude_hv)
				mask |= PERF_SAMPLE_BRANCH_HV;

			/*
			 * adjust user setting (for HW filter setup)
			 */
7718 attr->branch_sample_type = mask;
		}

		/* privileged levels capture (kernel, hv): check permissions */
7721 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
7722 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7723 return -EACCES;
7724 }
7725
7726 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
7727 ret = perf_reg_validate(attr->sample_regs_user);
7728 if (ret)
7729 return ret;
7730 }
7731
7732 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
7733 if (!arch_perf_have_user_stack_dump())
			return -ENOSYS;

		/*
		 * We have __u32 type for the size, but so far
		 * we can only use __u16 as maximum due to the
		 * __u16 sample size limit.
		 */
7741 if (attr->sample_stack_user >= USHRT_MAX)
7742 ret = -EINVAL;
7743 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
7744 ret = -EINVAL;
7745 }
7746
7747 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
7748 ret = perf_reg_validate(attr->sample_regs_intr);
7749out:
7750 return ret;
7751
7752err_size:
7753 put_user(sizeof(*attr), &uattr->size);
7754 ret = -E2BIG;
7755 goto out;
7756}
7757
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
	struct ring_buffer *rb = NULL;
	int ret = -EINVAL;

	if (!output_event)
		goto set;

	/* don't allow circular references */
	if (event == output_event)
		goto out;

	/*
	 * Don't allow cross-cpu buffers
	 */
	if (output_event->cpu != event->cpu)
		goto out;

	/*
	 * If its not a per-cpu rb, it must be the same task.
	 */
	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
		goto out;

	/*
	 * Mixing clocks in the same buffer is trouble you don't need.
	 */
	if (output_event->clock != event->clock)
		goto out;

	/*
	 * If both events generate aux data, they must be on the same PMU
	 */
	if (has_aux(event) && has_aux(output_event) &&
	    event->pmu != output_event->pmu)
		goto out;

set:
	mutex_lock(&event->mmap_mutex);
	/* Can't redirect output if we've got an active mmap() */
	if (atomic_read(&event->mmap_count))
		goto unlock;

	if (output_event) {
		/* get the rb we want to redirect to */
		rb = ring_buffer_get(output_event);
		if (!rb)
			goto unlock;
	}

	ring_buffer_attach(event, rb);

	ret = 0;
unlock:
	mutex_unlock(&event->mmap_mutex);

out:
	return ret;
}

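/*
 * Take two context mutexes in address order, so that two racing callers
 * locking the same pair can never deadlock (ABBA).
 */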
static void mutex_lock_double(struct mutex *a, struct mutex *b)
{
	if (b < a)
		swap(a, b);

	mutex_lock(a);
	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}

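/*
 * Honour attr.use_clockid/attr.clockid: select which kernel clock
 * provides the timestamps for this event's records.
 */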
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
	bool nmi_safe = false;

	switch (clk_id) {
	case CLOCK_MONOTONIC:
		event->clock = &ktime_get_mono_fast_ns;
		nmi_safe = true;
		break;

	case CLOCK_MONOTONIC_RAW:
		event->clock = &ktime_get_raw_fast_ns;
		nmi_safe = true;
		break;

	case CLOCK_REALTIME:
		event->clock = &ktime_get_real_ns;
		break;

	case CLOCK_BOOTTIME:
		event->clock = &ktime_get_boot_ns;
		break;

	case CLOCK_TAI:
		event->clock = &ktime_get_tai_ns;
		break;

	default:
		return -EINVAL;
	}

	/*
	 * Clocks that are not NMI-safe may only be used with PMUs that
	 * never deliver events from NMI context.
	 */
	if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
		return -EINVAL;

	return 0;
}

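/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader event fd
 */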
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_event *group_leader = NULL, *output_event = NULL;
	struct perf_event *event, *sibling;
	struct perf_event_attr attr;
	struct perf_event_context *ctx, *uninitialized_var(gctx);
	struct file *event_file = NULL;
	struct fd group = {NULL, 0};
	struct task_struct *task = NULL;
	struct pmu *pmu;
	int event_fd;
	int move_group = 0;
	int err;
	int f_flags = O_RDWR;
	int cgroup_fd = -1;

	/* for future expandability... */
	if (flags & ~PERF_FLAG_ALL)
		return -EINVAL;

	err = perf_copy_attr(attr_uptr, &attr);
	if (err)
		return err;

	if (!attr.exclude_kernel) {
		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	if (attr.freq) {
		if (attr.sample_freq > sysctl_perf_event_sample_rate)
			return -EINVAL;
	} else {
		if (attr.sample_period & (1ULL << 63))
			return -EINVAL;
	}

	/*
	 * In cgroup mode, the pid argument is used to pass the fd
	 * opened to the cgroup directory in cgroupfs. The cpu argument
	 * designates the cpu on which to monitor threads from that
	 * cgroup.
	 */
	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
		return -EINVAL;

	if (flags & PERF_FLAG_FD_CLOEXEC)
		f_flags |= O_CLOEXEC;

	event_fd = get_unused_fd_flags(f_flags);
	if (event_fd < 0)
		return event_fd;

	if (group_fd != -1) {
		err = perf_fget_light(group_fd, &group);
		if (err)
			goto err_fd;
		group_leader = group.file->private_data;
		if (flags & PERF_FLAG_FD_OUTPUT)
			output_event = group_leader;
		if (flags & PERF_FLAG_FD_NO_GROUP)
			group_leader = NULL;
	}

	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
		task = find_lively_task_by_vpid(pid);
		if (IS_ERR(task)) {
			err = PTR_ERR(task);
			goto err_group_fd;
		}
	}

	if (task && group_leader &&
	    group_leader->attr.inherit != attr.inherit) {
		err = -EINVAL;
		goto err_task;
	}

	get_online_cpus();

	if (flags & PERF_FLAG_PID_CGROUP)
		cgroup_fd = pid;

	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
				 NULL, NULL, cgroup_fd);
	if (IS_ERR(event)) {
		err = PTR_ERR(event);
		goto err_cpus;
	}

	if (is_sampling_event(event)) {
		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
			err = -ENOTSUPP;
			goto err_alloc;
		}
	}

	account_event(event);

	/*
	 * Special case software events and allow them to be part of
	 * any hardware group.
	 */
	pmu = event->pmu;

	if (attr.use_clockid) {
		err = perf_event_set_clock(event, attr.clockid);
		if (err)
			goto err_alloc;
	}

	if (group_leader &&
	    (is_software_event(event) != is_software_event(group_leader))) {
		if (is_software_event(event)) {
			/*
			 * If event and group_leader are not both a software
			 * event, and event is, then group leader is not.
			 *
			 * Allow the addition of software events to !software
			 * groups, however, only allow non-software events to
			 * be part of software groups.
			 */
			pmu = group_leader->pmu;
		} else if (is_software_event(group_leader) &&
			   (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
			/*
			 * In case the group is a pure software group, and we
			 * try to add a hardware event, move the whole group to
			 * the hardware context.
			 */
			move_group = 1;
		}
	}

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pmu, task, event);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto err_alloc;
	}

	if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
		err = -EBUSY;
		goto err_context;
	}

	if (task) {
		put_task_struct(task);
		task = NULL;
	}

	/*
	 * Look up the group leader (we will attach this event to it):
	 */
	if (group_leader) {
		err = -EINVAL;

		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_context;

		/* All events in a group should have the same clock */
		if (group_leader->clock != event->clock)
			goto err_context;

		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (move_group) {
			/*
			 * Make sure we're both on the same task, or both
			 * per-cpu events.
			 */
			if (group_leader->ctx->task != ctx->task)
				goto err_context;

			/*
			 * Make sure we're both events for the same CPU;
			 * grouping events for different CPUs is broken; since
			 * you can never concurrently schedule them anyhow.
			 */
			if (group_leader->cpu != event->cpu)
				goto err_context;
		} else {
			if (group_leader->ctx != ctx)
				goto err_context;
		}

		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (attr.exclusive || attr.pinned)
			goto err_context;
	}

	if (output_event) {
		err = perf_event_set_output(event, output_event);
		if (err)
			goto err_context;
	}

	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
					f_flags);
	if (IS_ERR(event_file)) {
		err = PTR_ERR(event_file);
		goto err_context;
	}

	if (move_group) {
		gctx = group_leader->ctx;

		/*
		 * See perf_event_ctx_lock() for comments on the details
		 * of the lock order.
		 */
		mutex_lock_double(&gctx->mutex, &ctx->mutex);

		perf_remove_from_context(group_leader, false);

		list_for_each_entry(sibling, &group_leader->sibling_list,
				    group_entry) {
			perf_remove_from_context(sibling, false);
			put_ctx(gctx);
		}
	} else {
		mutex_lock(&ctx->mutex);
	}

	WARN_ON_ONCE(ctx->parent_ctx);

	if (move_group) {
		/*
		 * Wait for everybody to stop referencing the events through
		 * the old lists, before installing it on new lists.
		 */
		synchronize_rcu();

		/*
		 * Install the group siblings before the group leader.
		 *
		 * Because a group leader will try and install the entire group
		 * (through the sibling list, which is still in-tact), we can
		 * end up with siblings installed in the wrong context.
		 *
		 * By installing siblings first we NO longer need to worry about
		 * the group leader existing in another context; by installing
		 * the group leader first we'll see all events exist on the new
		 * context.
		 */
		list_for_each_entry(sibling, &group_leader->sibling_list,
				    group_entry) {
			perf_event__state_init(sibling);
			perf_install_in_context(ctx, sibling, sibling->cpu);
			get_ctx(ctx);
		}

		/*
		 * Removing from the context ends up with disabled
		 * event. What we want here is event in the initial
		 * startup state, ready to be add into new context.
		 */
		perf_event__state_init(group_leader);
		perf_install_in_context(ctx, group_leader, group_leader->cpu);
		get_ctx(ctx);
	}

	if (!exclusive_event_installable(event, ctx)) {
		err = -EBUSY;
		mutex_unlock(&ctx->mutex);
		fput(event_file);
		goto err_context;
	}

	perf_install_in_context(ctx, event, event->cpu);
	perf_unpin_context(ctx);

	if (move_group) {
		mutex_unlock(&gctx->mutex);
		put_ctx(gctx);
	}
	mutex_unlock(&ctx->mutex);

	put_online_cpus();

	event->owner = current;

	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

	/*
	 * Precalculate sample_data sizes
	 */
	perf_event__header_size(event);
	perf_event__id_header_size(event);

	/*
	 * Drop the reference on the group_event after placing the
	 * new event on the sibling_list. This ensures destruction
	 * of the group leader will find the pointer to itself in
	 * perf_group_detach().
	 */
	fdput(group);
	fd_install(event_fd, event_file);
	return event_fd;

err_context:
	perf_unpin_context(ctx);
	put_ctx(ctx);
err_alloc:
	free_event(event);
err_cpus:
	put_online_cpus();
err_task:
	if (task)
		put_task_struct(task);
err_group_fd:
	fdput(group);
err_fd:
	put_unused_fd(event_fd);
	return err;
}

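/**
 * perf_event_create_kernel_counter
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu in which the counter is bound
 * @task: task to profile (NULL for percpu)
 */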
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
				 struct task_struct *task,
				 perf_overflow_handler_t overflow_handler,
				 void *context)
{
	struct perf_event_context *ctx;
	struct perf_event *event;
	int err;

	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
				 overflow_handler, context, -1);
	if (IS_ERR(event)) {
		err = PTR_ERR(event);
		goto err;
	}

	/* Mark owner so we could distinguish it from user events. */
	event->owner = EVENT_OWNER_KERNEL;

	account_event(event);

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(event->pmu, task, event);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto err_free;
	}

	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	if (!exclusive_event_installable(event, ctx)) {
		mutex_unlock(&ctx->mutex);
		perf_unpin_context(ctx);
		put_ctx(ctx);
		err = -EBUSY;
		goto err_free;
	}

	perf_install_in_context(ctx, event, cpu);
	perf_unpin_context(ctx);
	mutex_unlock(&ctx->mutex);

	return event;

err_free:
	free_event(event);
err:
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);

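/*
 * Move all of @pmu's events from @src_cpu's context over to @dst_cpu,
 * e.g. when a PMU's events must survive their home CPU going offline.
 */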
void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
{
	struct perf_event_context *src_ctx;
	struct perf_event_context *dst_ctx;
	struct perf_event *event, *tmp;
	LIST_HEAD(events);

	src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
	dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;

	/*
	 * See perf_event_ctx_lock() for comments on the details
	 * of the lock order.
	 */
	mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
	list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
				 event_entry) {
		perf_remove_from_context(event, false);
		unaccount_event_cpu(event, src_cpu);
		put_ctx(src_ctx);
		list_add(&event->migrate_entry, &events);
	}

	/*
	 * Wait for the events to quiesce before re-instating them.
	 */
	synchronize_rcu();

	/*
	 * Re-instate events in 2 passes.
	 *
	 * Skip over group leaders and only install siblings on this first
	 * pass, siblings will not get enabled without a leader, however a
	 * leader will enable its siblings, even if those are still on the old
	 * context.
	 */
	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
		if (event->group_leader == event)
			continue;

		list_del(&event->migrate_entry);
		if (event->state >= PERF_EVENT_STATE_OFF)
			event->state = PERF_EVENT_STATE_INACTIVE;
		account_event_cpu(event, dst_cpu);
		perf_install_in_context(dst_ctx, event, dst_cpu);
		get_ctx(dst_ctx);
	}

	/*
	 * Once all the siblings are setup properly, install the group leaders
	 * to make it go.
	 */
	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
		list_del(&event->migrate_entry);
		if (event->state >= PERF_EVENT_STATE_OFF)
			event->state = PERF_EVENT_STATE_INACTIVE;
		account_event_cpu(event, dst_cpu);
		perf_install_in_context(dst_ctx, event, dst_cpu);
		get_ctx(dst_ctx);
	}
	mutex_unlock(&dst_ctx->mutex);
	mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);

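/*
 * On child exit, fold the child event's counts and runtimes back into
 * the parent event and unlink the child from the parent's child_list.
 */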
static void sync_child_event(struct perf_event *child_event,
			     struct task_struct *child)
{
	struct perf_event *parent_event = child_event->parent;
	u64 child_val;

	if (child_event->attr.inherit_stat)
		perf_event_read_event(child_event, child);

	child_val = perf_event_count(child_event);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_event->child_count);
	atomic64_add(child_event->total_time_enabled,
		     &parent_event->child_total_time_enabled);
	atomic64_add(child_event->total_time_running,
		     &parent_event->child_total_time_running);

	/*
	 * Remove this event from the parent's list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_del_init(&child_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	/*
	 * Make sure user/parent get notified, that we just
	 * lost one event.
	 */
	perf_event_wakeup(parent_event);

	/*
	 * Release the parent event, if this was the last
	 * reference to it.
	 */
	put_event(parent_event);
}

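/*
 * Tear down a single child event at task exit: inherited events are
 * synced into their parent and freed, user-visible events are marked
 * as exited and their waiters woken.
 */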
static void
__perf_event_exit_task(struct perf_event *child_event,
		       struct perf_event_context *child_ctx,
		       struct task_struct *child)
{
	/*
	 * Do not destroy the 'original' grouping; because of the context
	 * switch optimization the original events could've ended up in a
	 * random child task.
	 *
	 * If we were to destroy the original group, all group related
	 * operations would cease to function properly after this random
	 * child dies.
	 *
	 * Do destroy all inherited groups, we don't care about those
	 * and being thorough is better.
	 */
	perf_remove_from_context(child_event, !!child_event->parent);

	/*
	 * It can happen that the parent exits first, and has events
	 * that are still around due to the child reference. These
	 * events need to be zapped.
	 */
	if (child_event->parent) {
		sync_child_event(child_event, child);
		free_event(child_event);
	} else {
		child_event->state = PERF_EVENT_STATE_EXIT;
		perf_event_wakeup(child_event);
	}
}

static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
	struct perf_event *child_event, *next;
	struct perf_event_context *child_ctx, *clone_ctx = NULL;
	unsigned long flags;

	if (likely(!child->perf_event_ctxp[ctxn])) {
		perf_event_task(child, NULL, 0);
		return;
	}

	local_irq_save(flags);
	/*
	 * We can't reschedule here because interrupts are disabled,
	 * and either child is current or it is a task that can't be
	 * scheduled, so we are now safe from rescheduling changing
	 * our context.
	 */
	child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);

	/*
	 * Take the context lock here so that if find_get_context is
	 * reading child->perf_event_ctxp, we wait until it has
	 * incremented the context's refcount before we do put_ctx below.
	 */
	raw_spin_lock(&child_ctx->lock);
	task_ctx_sched_out(child_ctx);
	child->perf_event_ctxp[ctxn] = NULL;

	/*
	 * If this context is a clone; unclone it so it can't get
	 * swapped to another process while we're removing all
	 * the events from it.
	 */
	clone_ctx = unclone_ctx(child_ctx);
	update_context_time(child_ctx);
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	if (clone_ctx)
		put_ctx(clone_ctx);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few PERF_RECORD_READ events.
	 */
	perf_event_task(child, child_ctx, 0);

	/*
	 * The context is now dead and will not be scheduled in again.
	 * Take the mutex and tear down every child event, syncing the
	 * inherited counts back into the parent events.
	 */
	mutex_lock(&child_ctx->mutex);

	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	mutex_unlock(&child_ctx->mutex);

	put_ctx(child_ctx);
}

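/*
 * When a child task exits, feed back event values to parent events.
 */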
void perf_event_exit_task(struct task_struct *child)
{
	struct perf_event *event, *tmp;
	int ctxn;

	mutex_lock(&child->perf_event_mutex);
	list_for_each_entry_safe(event, tmp, &child->perf_event_list,
				 owner_entry) {
		list_del_init(&event->owner_entry);

		/*
		 * Ensure the list deletion is visible before we clear
		 * the owner, closes a race against perf_release() where
		 * we need to serialize on the owner->perf_event_mutex.
		 */
		smp_wmb();
		event->owner = NULL;
	}
	mutex_unlock(&child->perf_event_mutex);

	for_each_task_context_nr(ctxn)
		perf_event_exit_task_context(child, ctxn);
}

static void perf_free_event(struct perf_event *event,
			    struct perf_event_context *ctx)
{
	struct perf_event *parent = event->parent;

	if (WARN_ON_ONCE(!parent))
		return;

	mutex_lock(&parent->child_mutex);
	list_del_init(&event->child_list);
	mutex_unlock(&parent->child_mutex);

	put_event(parent);

	raw_spin_lock_irq(&ctx->lock);
	perf_group_detach(event);
	list_del_event(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
	free_event(event);
}

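/*
 * Free an unexposed, unused context as created by inheritance by
 * perf_event_init_task below, used by fork() in case of fail.
 *
 * Not all locks are strictly required, but take them anyway to be nice and
 * help out with the lockdep assertions.
 */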
void perf_event_free_task(struct task_struct *task)
{
	struct perf_event_context *ctx;
	struct perf_event *event, *tmp;
	int ctxn;

	for_each_task_context_nr(ctxn) {
		ctx = task->perf_event_ctxp[ctxn];
		if (!ctx)
			continue;

		mutex_lock(&ctx->mutex);
again:
		list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
					 group_entry)
			perf_free_event(event, ctx);

		list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
					 group_entry)
			perf_free_event(event, ctx);

		/*
		 * Freeing a group leader moves its siblings back onto the
		 * group lists, so keep scanning until both lists are empty.
		 */
		if (!list_empty(&ctx->pinned_groups) ||
		    !list_empty(&ctx->flexible_groups))
			goto again;

		mutex_unlock(&ctx->mutex);

		put_ctx(ctx);
	}
}

void perf_event_delayed_put(struct task_struct *task)
{
	int ctxn;

	for_each_task_context_nr(ctxn)
		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}

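/*
 * Inherit an event from parent task to child task.
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL for orphaned events
 *  - IS_ERR() on error
 */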
static struct perf_event *
inherit_event(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event *group_leader,
	      struct perf_event_context *child_ctx)
{
	enum perf_event_active_state parent_state = parent_event->state;
	struct perf_event *child_event;
	unsigned long flags;

	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_event->parent)
		parent_event = parent_event->parent;

	child_event = perf_event_alloc(&parent_event->attr,
				       parent_event->cpu,
				       child,
				       group_leader, parent_event,
				       NULL, NULL, -1);
	if (IS_ERR(child_event))
		return child_event;

	if (is_orphaned_event(parent_event) ||
	    !atomic_long_inc_not_zero(&parent_event->refcount)) {
		free_event(child_event);
		return NULL;
	}

	get_ctx(child_ctx);

	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
	if (parent_state >= PERF_EVENT_STATE_INACTIVE)
		child_event->state = PERF_EVENT_STATE_INACTIVE;
	else
		child_event->state = PERF_EVENT_STATE_OFF;

	if (parent_event->attr.freq) {
		u64 sample_period = parent_event->hw.sample_period;
		struct hw_perf_event *hwc = &child_event->hw;

		hwc->sample_period = sample_period;
		hwc->last_period = sample_period;

		local64_set(&hwc->period_left, sample_period);
	}

	child_event->ctx = child_ctx;
	child_event->overflow_handler = parent_event->overflow_handler;
	child_event->overflow_handler_context
		= parent_event->overflow_handler_context;

	/*
	 * Precalculate sample_data sizes
	 */
	perf_event__header_size(child_event);
	perf_event__id_header_size(child_event);

	/*
	 * Link it up in the child's context:
	 */
	raw_spin_lock_irqsave(&child_ctx->lock, flags);
	add_event_to_ctx(child_event, child_ctx);
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Link this into the parent event's child list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_add_tail(&child_event->child_list, &parent_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	return child_event;
}

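/*
 * Inherit a group leader and all of its siblings into the child
 * context; each inherited sibling is attached to the new child leader.
 */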
static int inherit_group(struct perf_event *parent_event,
			 struct task_struct *parent,
			 struct perf_event_context *parent_ctx,
			 struct task_struct *child,
			 struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
			       child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					  child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);
	}
	return 0;
}

static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child, int ctxn,
		   int *inherited_all)
{
	int ret;
	struct perf_event_context *child_ctx;

	if (!event->attr.inherit) {
		*inherited_all = 0;
		return 0;
	}

	child_ctx = child->perf_event_ctxp[ctxn];
	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
		if (!child_ctx)
			return -ENOMEM;

		child->perf_event_ctxp[ctxn] = child_ctx;
	}

	ret = inherit_group(event, parent, parent_ctx,
			    child, child_ctx);

	if (ret)
		*inherited_all = 0;

	return ret;
}

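/*
 * Initialize one of the child task's perf_event contexts, inheriting
 * the parent's inheritable events into it.
 */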
static int perf_event_init_context(struct task_struct *child, int ctxn)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	unsigned long flags;
	int ret = 0;

	if (likely(!parent->perf_event_ctxp[ctxn]))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent, ctxn);
	if (!parent_ctx)
		return 0;

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We dont have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			break;
	}

	/*
	 * We can't hold ctx->lock when iterating the ->flexible_group list due
	 * to allocations, but we need to prevent rotation because
	 * rotate_ctx() will change the list from interrupt context.
	 */
	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 1;
	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			break;
	}

	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 0;

	child_ctx = child->perf_event_ctxp[ctxn];

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 *
		 * Note that if the parent is a clone, the holding of
		 * parent_ctx->lock avoids it from being uncloned.
		 */
		cloned_ctx = parent_ctx->parent_ctx;
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);
	put_ctx(parent_ctx);

	return ret;
}

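/*
 * Initialize the perf_event contexts in a freshly forked task_struct.
 */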
int perf_event_init_task(struct task_struct *child)
{
	int ctxn, ret;

	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	for_each_task_context_nr(ctxn) {
		ret = perf_event_init_context(child, ctxn);
		if (ret) {
			perf_event_free_task(child);
			return ret;
		}
	}

	return 0;
}

static void __init perf_event_init_all_cpus(void)
{
	struct swevent_htable *swhash;
	int cpu;

	for_each_possible_cpu(cpu) {
		swhash = &per_cpu(swevent_htable, cpu);
		mutex_init(&swhash->hlist_mutex);
		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
	}
}

static void perf_event_init_cpu(int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	swhash->online = true;
	if (swhash->hlist_refcount > 0) {
		struct swevent_hlist *hlist;

		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
		WARN_ON(!hlist);
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
	}
	mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
/*
 * Run via IPI on the dying CPU: detach and remove every event still on
 * this CPU's context.
 */
static void __perf_event_exit_context(void *__info)
{
	struct remove_event re = { .detach_group = true };
	struct perf_event_context *ctx = __info;

	rcu_read_lock();
	list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
		__perf_remove_from_context(&re);
	rcu_read_unlock();
}

static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_event_context *ctx;
	struct pmu *pmu;
	int idx;

	idx = srcu_read_lock(&pmus_srcu);
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;

		mutex_lock(&ctx->mutex);
		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
		mutex_unlock(&ctx->mutex);
	}
	srcu_read_unlock(&pmus_srcu, idx);
}

static void perf_event_exit_cpu(int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	perf_event_exit_cpu_context(cpu);

	mutex_lock(&swhash->hlist_mutex);
	swhash->online = false;
	swevent_hlist_release(swhash);
	mutex_unlock(&swhash->hlist_mutex);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
#endif

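/*
 * Reboot notifier callback: tear down the events on every online CPU so
 * no PMU activity is left while the machine goes down.
 */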
static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
	int cpu;

	for_each_online_cpu(cpu)
		perf_event_exit_cpu(cpu);

	return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
	.notifier_call = perf_reboot,
	.priority = INT_MIN,
};

static int
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {

	case CPU_UP_PREPARE:
	case CPU_DOWN_FAILED:
		perf_event_init_cpu(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DOWN_PREPARE:
		perf_event_exit_cpu(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

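/*
 * Boot-time initialization: register the built-in software and clock
 * PMUs, tracepoint support, the CPU hotplug and reboot notifiers, and
 * hardware breakpoints.
 */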
void __init perf_event_init(void)
{
	int ret;

	idr_init(&pmu_idr);

	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
	perf_pmu_register(&perf_cpu_clock, NULL, -1);
	perf_pmu_register(&perf_task_clock, NULL, -1);
	perf_tp_register();
	perf_cpu_notifier(perf_cpu_notify);
	register_reboot_notifier(&perf_reboot_notifier);

	ret = init_hw_breakpoint();
	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

	/* do not patch jump label more than once per second */
	jump_label_rate_limit(&perf_sched_events, HZ);

	/*
	 * Build time assertion that we keep the data_head at the intended
	 * location.  IOW, validation we got the __reserved[] size right.
	 */
	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
		     != 1024);
}

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s\n", pmu_attr->event_str);

	return 0;
}

static int __init perf_event_sysfs_init(void)
{
	struct pmu *pmu;
	int ret;

	mutex_lock(&pmus_lock);

	ret = bus_register(&pmu_bus);
	if (ret)
		goto unlock;

	list_for_each_entry(pmu, &pmus, entry) {
		if (!pmu->name || pmu->type < 0)
			continue;

		ret = pmu_dev_alloc(pmu);
		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
	}
	pmu_bus_running = 1;
	ret = 0;

unlock:
	mutex_unlock(&pmus_lock);

	return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct perf_cgroup *jc;

	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
	if (!jc)
		return ERR_PTR(-ENOMEM);

	jc->info = alloc_percpu(struct perf_cgroup_info);
	if (!jc->info) {
		kfree(jc);
		return ERR_PTR(-ENOMEM);
	}

	return &jc->css;
}

static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);

	free_percpu(jc->info);
	kfree(jc);
}

static int __perf_cgroup_move(void *info)
{
	struct task_struct *task = info;
	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
	return 0;
}

static void perf_cgroup_attach(struct cgroup_subsys_state *css,
			       struct cgroup_taskset *tset)
{
	struct task_struct *task;

	cgroup_taskset_for_each(task, tset)
		task_function_call(task, __perf_cgroup_move, task);
}

static void perf_cgroup_exit(struct cgroup_subsys_state *css,
			     struct cgroup_subsys_state *old_css,
			     struct task_struct *task)
{
	/*
	 * cgroup_exit() is called in the copy_process() failure path.
	 * Ignore this case since the task hasn't ran yet, this avoids
	 * trying to switch cgroups that aren't assigned to a task.
	 */
	if (!(task->flags & PF_EXITING))
		return;

	task_function_call(task, __perf_cgroup_move, task);
}

struct cgroup_subsys perf_event_cgrp_subsys = {
	.css_alloc	= perf_cgroup_css_alloc,
	.css_free	= perf_cgroup_css_free,
	.exit		= perf_cgroup_exit,
	.attach		= perf_cgroup_attach,
};
#endif /* CONFIG_CGROUP_PERF */