linux/kernel/trace/trace_event_perf.c
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises.
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
        perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int      total_ref_count;

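/*
 * Check whether the perf event is allowed to attach to this trace event:
 * consult the event's own ->perf_perm hook and apply the usual
 * paranoia/capability restrictions on raw tracepoint data.
 */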
static int perf_trace_event_perm(struct trace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        if (tp_event->perf_perm) {
                int ret = tp_event->perf_perm(tp_event, p_event);
                if (ret)
                        return ret;
        }

        /*
         * The parent was already checked and allowed when it was created,
         * so allow its children without checking again.
         */
        if (p_event->parent)
                return 0;

        /*
         * It's ok to check the current process's (owner's) permissions here,
         * because the code below is only reached via the perf_event_open()
         * syscall.
         */

        /* The ftrace function trace is allowed only for root. */
        if (ftrace_event_is_function(tp_event)) {
                if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
                        return -EPERM;

                if (!is_sampling_event(p_event))
                        return 0;

                /*
                 * We don't allow user space callchains for the function
                 * trace event, due to issues with page faults while tracing
                 * the page fault handler and its overall tricky nature.
                 */
                if (!p_event->attr.exclude_callchain_user)
                        return -EINVAL;

                /*
                 * Disable user stack dumps for the same reason as user
                 * space callchains above.
                 */
                if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
                        return -EINVAL;
        }

        /* No tracing, just counting, so no obvious leak */
        if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
                return 0;

        /* Some events are ok to be traced by non-root users... */
        if (p_event->attach_state == PERF_ATTACH_TASK) {
                if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
                        return 0;
        }

        /*
         * ...otherwise raw tracepoint data can be a severe data leak,
         * so only allow root to have it.
         */
        if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}

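/*
 * Register the trace event for perf use. The first perf user of an event
 * allocates its per-cpu hlist of perf events, and the very first trace
 * event registered overall allocates the shared per-context scratch
 * buffers.
 */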
static int perf_trace_event_reg(struct trace_event_call *tp_event,
                                struct perf_event *p_event)
{
        struct hlist_head __percpu *list;
        int ret = -ENOMEM;
        int cpu;

        p_event->tp_event = tp_event;
        if (tp_event->perf_refcount++ > 0)
                return 0;

        list = alloc_percpu(struct hlist_head);
        if (!list)
                goto fail;

        for_each_possible_cpu(cpu)
                INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

        tp_event->perf_events = list;

        if (!total_ref_count) {
                char __percpu *buf;
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        buf = (char __percpu *)alloc_percpu(perf_trace_t);
                        if (!buf)
                                goto fail;

                        perf_trace_buf[i] = buf;
                }
        }

        ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
        if (ret)
                goto fail;

        total_ref_count++;
        return 0;

fail:
        if (!total_ref_count) {
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }

        if (!--tp_event->perf_refcount) {
                free_percpu(tp_event->perf_events);
                tp_event->perf_events = NULL;
        }

        return ret;
}

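/*
 * Drop one perf reference on the trace event; the last unregister tears
 * down the per-cpu event lists and, once no trace event is in perf use
 * anymore, frees the shared scratch buffers.
 */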
static void perf_trace_event_unreg(struct perf_event *p_event)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        int i;

        if (--tp_event->perf_refcount > 0)
                goto out;

        tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

        /*
         * Ensure our callback won't be called anymore. The buffers
         * will be freed after that.
         */
        tracepoint_synchronize_unregister();

        free_percpu(tp_event->perf_events);
        tp_event->perf_events = NULL;

        if (!--total_ref_count) {
                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }
out:
        module_put(tp_event->mod);
}

static int perf_trace_event_open(struct perf_event *p_event)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

static int perf_trace_event_init(struct trace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        int ret;

        ret = perf_trace_event_perm(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_reg(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_open(p_event);
        if (ret) {
                perf_trace_event_unreg(p_event);
                return ret;
        }

        return 0;
}

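/*
 * Entry point from the perf core (perf_event_open()) for tracepoint
 * events: look up the trace event matching attr.config and initialize
 * it for perf use.
 */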
int perf_trace_init(struct perf_event *p_event)
{
        struct trace_event_call *tp_event;
        u64 event_id = p_event->attr.config;
        int ret = -EINVAL;

        mutex_lock(&event_mutex);
        list_for_each_entry(tp_event, &ftrace_events, list) {
                if (tp_event->event.type == event_id &&
                    tp_event->class && tp_event->class->reg &&
                    try_module_get(tp_event->mod)) {
                        ret = perf_trace_event_init(tp_event, p_event);
                        if (ret)
                                module_put(tp_event->mod);
                        break;
                }
        }
        mutex_unlock(&event_mutex);

        return ret;
}

void perf_trace_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);
}

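/*
 * pmu ->add() callback: link the event into this CPU's hlist so the
 * tracepoint callback can find it, then let the event class do any
 * per-event work.
 */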
int perf_trace_add(struct perf_event *p_event, int flags)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        struct hlist_head __percpu *pcpu_list;
        struct hlist_head *list;

        pcpu_list = tp_event->perf_events;
        if (WARN_ON_ONCE(!pcpu_list))
                return -EINVAL;

        if (!(flags & PERF_EF_START))
                p_event->hw.state = PERF_HES_STOPPED;

        list = this_cpu_ptr(pcpu_list);
        hlist_add_head_rcu(&p_event->hlist_entry, list);

        return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        hlist_del_rcu(&p_event->hlist_entry);
        tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}

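/*
 * Grab a recursion context and hand out the matching per-cpu scratch
 * buffer for building a raw sample. Returns NULL if the request is too
 * large or if we are already recursing in this context.
 */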
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
{
        char *raw_data;
        int rctx;

        BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
                      "perf buffer not large enough"))
                return NULL;

        *rctxp = rctx = perf_swevent_get_recursion_context();
        if (rctx < 0)
                return NULL;

        if (regs)
                *regs = this_cpu_ptr(&__perf_regs[rctx]);
        raw_data = this_cpu_ptr(perf_trace_buf[rctx]);
        /* Zero the alignment padding bytes so stack data doesn't leak to user space. */
        memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
        return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
NOKPROBE_SYMBOL(perf_trace_buf_alloc);

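/*
 * Fill in the common trace_entry header (irq flags, preempt count, pid)
 * and set the event type for a record built in the scratch buffer.
 */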
void perf_trace_buf_update(void *record, u16 type)
{
        struct trace_entry *entry = record;
        int pc = preempt_count();
        unsigned long flags;

        local_save_flags(flags);
        tracing_generic_entry_update(entry, flags, pc);
        entry->type = type;
}
NOKPROBE_SYMBOL(perf_trace_buf_update);

#ifdef CONFIG_FUNCTION_TRACER
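/*
 * ftrace callback for the perf function trace event: build an
 * ftrace_entry in the per-cpu scratch buffer and feed it to the perf
 * events on this CPU's list.
 */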
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
                          struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
        struct ftrace_entry *entry;
        struct hlist_head *head;
        struct pt_regs regs;
        int rctx;

        head = this_cpu_ptr(event_function.perf_events);
        if (hlist_empty(head))
                return;

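/*
 * Raw sample data is preceded by a u32 size field and must stay
 * u64-aligned overall, hence the ALIGN()/sizeof(u32) dance below.
 */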
#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
                    sizeof(u64)) - sizeof(u32))

        BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

        memset(&regs, 0, sizeof(regs));
        perf_fetch_caller_regs(&regs);

        entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
        if (!entry)
                return;

        entry->ip = ip;
        entry->parent_ip = parent_ip;
        perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
                              1, &regs, head, NULL);

#undef ENTRY_SIZE
}

static int perf_ftrace_function_register(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;

        ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
        ops->func = perf_ftrace_function_call;
        return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;
        int ret = unregister_ftrace_function(ops);
        ftrace_free_filter(ops);
        return ret;
}

static void perf_ftrace_function_enable(struct perf_event *event)
{
        ftrace_function_local_enable(&event->ftrace_ops);
}

static void perf_ftrace_function_disable(struct perf_event *event)
{
        ftrace_function_local_disable(&event->ftrace_ops);
}

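/*
 * class->reg() callback for the ftrace function trace event: map the
 * TRACE_REG_PERF_* requests onto ftrace_ops registration and per-cpu
 * enable/disable.
 */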
int perf_ftrace_event_register(struct trace_event_call *call,
                               enum trace_reg type, void *data)
{
        switch (type) {
        case TRACE_REG_REGISTER:
        case TRACE_REG_UNREGISTER:
                break;
        case TRACE_REG_PERF_REGISTER:
        case TRACE_REG_PERF_UNREGISTER:
                return 0;
        case TRACE_REG_PERF_OPEN:
                return perf_ftrace_function_register(data);
        case TRACE_REG_PERF_CLOSE:
                return perf_ftrace_function_unregister(data);
        case TRACE_REG_PERF_ADD:
                perf_ftrace_function_enable(data);
                return 0;
        case TRACE_REG_PERF_DEL:
                perf_ftrace_function_disable(data);
                return 0;
        }

        return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */