linux/kernel/trace/trace_event_perf.c
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
        perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int      total_ref_count;

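/*
 * Permission check before a perf_event may use this trace event.  Raw
 * tracepoint samples (and ftrace function tracing) can expose kernel data,
 * so they are gated on the perf paranoia settings and CAP_SYS_ADMIN; the
 * exception is per-task events on tracepoints marked TRACE_EVENT_FL_CAP_ANY.
 */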
static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        /* The ftrace function trace is allowed only for root. */
        if (ftrace_event_is_function(tp_event) &&
            perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        /* No tracing, just counting, so no obvious leak */
        if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
                return 0;

        /* Some events are ok to be traced by non-root users... */
        if (p_event->attach_state == PERF_ATTACH_TASK) {
                if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
                        return 0;
        }

        /*
         * ...otherwise raw tracepoint data can be a severe data leak,
         * only allow root to have these.
         */
        if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}

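/*
 * Register a perf_event with its trace event.  The first user of a given
 * event allocates the per-cpu hlist that the probe callback iterates, and
 * the first user system-wide also allocates the per-context scratch buffers
 * backing perf_trace_buf_prepare().
 */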
static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
                                struct perf_event *p_event)
{
        struct hlist_head __percpu *list;
        int ret = -ENOMEM;
        int cpu;

        p_event->tp_event = tp_event;
        if (tp_event->perf_refcount++ > 0)
                return 0;

        list = alloc_percpu(struct hlist_head);
        if (!list)
                goto fail;

        for_each_possible_cpu(cpu)
                INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

        tp_event->perf_events = list;

        if (!total_ref_count) {
                char __percpu *buf;
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        buf = (char __percpu *)alloc_percpu(perf_trace_t);
                        if (!buf)
                                goto fail;

                        perf_trace_buf[i] = buf;
                }
        }

        ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
        if (ret)
                goto fail;

        total_ref_count++;
        return 0;

fail:
        if (!total_ref_count) {
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }

        if (!--tp_event->perf_refcount) {
                free_percpu(tp_event->perf_events);
                tp_event->perf_events = NULL;
        }

        return ret;
}

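/*
 * Undo perf_trace_event_reg().  The last user of the event unregisters the
 * perf callback, waits for in-flight probes to finish and then frees the
 * per-cpu list; the last user system-wide also frees the scratch buffers.
 */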
static void perf_trace_event_unreg(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        int i;

        if (--tp_event->perf_refcount > 0)
                goto out;

        tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

        /*
         * Ensure our callback won't be called anymore. The buffers
         * will be freed after that.
         */
        tracepoint_synchronize_unregister();

        free_percpu(tp_event->perf_events);
        tp_event->perf_events = NULL;

        if (!--total_ref_count) {
                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }
out:
        module_put(tp_event->mod);
}

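/* Per perf_event open/close hooks, forwarded to the event class. */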
static int perf_trace_event_open(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

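/*
 * Full init path used by perf_trace_init(): permission check, registration
 * and per-event open, rolling back the registration if the open fails.
 */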
static int perf_trace_event_init(struct ftrace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        int ret;

        ret = perf_trace_event_perm(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_reg(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_open(p_event);
        if (ret) {
                perf_trace_event_unreg(p_event);
                return ret;
        }

        return 0;
}

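/*
 * Entry point from the perf core: find the trace event whose id matches
 * attr.config under event_mutex, pin its module and initialize the event.
 * The module reference is dropped again on failure, or later by
 * perf_trace_event_unreg() on teardown.
 */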
int perf_trace_init(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event;
        int event_id = p_event->attr.config;
        int ret = -EINVAL;

        mutex_lock(&event_mutex);
        list_for_each_entry(tp_event, &ftrace_events, list) {
                if (tp_event->event.type == event_id &&
                    tp_event->class && tp_event->class->reg &&
                    try_module_get(tp_event->mod)) {
                        ret = perf_trace_event_init(tp_event, p_event);
                        if (ret)
                                module_put(tp_event->mod);
                        break;
                }
        }
        mutex_unlock(&event_mutex);

        return ret;
}

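/* Counterpart of perf_trace_init(): close the event and drop its registration. */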
void perf_trace_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);
}

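/*
 * Called when the event is scheduled in on this CPU: honour PERF_EF_START
 * and link the event into the per-cpu hlist that the probe callback walks.
 */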
int perf_trace_add(struct perf_event *p_event, int flags)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        struct hlist_head __percpu *pcpu_list;
        struct hlist_head *list;

        pcpu_list = tp_event->perf_events;
        if (WARN_ON_ONCE(!pcpu_list))
                return -EINVAL;

        if (!(flags & PERF_EF_START))
                p_event->hw.state = PERF_HES_STOPPED;

        list = this_cpu_ptr(pcpu_list);
        hlist_add_head_rcu(&p_event->hlist_entry, list);

        return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
}

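/* Called when the event is scheduled out: unlink it from the per-cpu hlist. */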
void perf_trace_del(struct perf_event *p_event, int flags)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        hlist_del_rcu(&p_event->hlist_entry);
        tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}

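/*
 * Claim a per-context scratch buffer and initialize the common trace entry
 * header in it.  Returns NULL if the request is too large or recursion is
 * detected; otherwise *rctxp holds the recursion context that must later be
 * handed to perf_trace_buf_submit().  Sketch of the typical caller pattern
 * (see perf_ftrace_function_call() below for a concrete example):
 *
 *      entry = perf_trace_buf_prepare(size, event_type, regs, &rctx);
 *      if (!entry)
 *              return;
 *      ... fill in the event-specific fields ...
 *      perf_trace_buf_submit(entry, size, rctx, addr, count, regs, head, task);
 */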
__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
                                       struct pt_regs *regs, int *rctxp)
{
        struct trace_entry *entry;
        unsigned long flags;
        char *raw_data;
        int pc;

        BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
                        "perf buffer not large enough"))
                return NULL;

        pc = preempt_count();

        *rctxp = perf_swevent_get_recursion_context();
        if (*rctxp < 0)
                return NULL;

        raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);

        /* zero the dead bytes from align to not leak stack to user */
        memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));

        entry = (struct trace_entry *)raw_data;
        local_save_flags(flags);
        tracing_generic_entry_update(entry, flags, pc);
        entry->type = type;

        return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);

#ifdef CONFIG_FUNCTION_TRACER
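/*
 * ftrace_ops callback used for perf "function" events: build an ftrace_entry
 * in a perf trace buffer and submit it to every event on this CPU's list.
 */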
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
                          struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
        struct ftrace_entry *entry;
        struct hlist_head *head;
        struct pt_regs regs;
        int rctx;

        head = this_cpu_ptr(event_function.perf_events);
        if (hlist_empty(head))
                return;

#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
                    sizeof(u64)) - sizeof(u32))

        BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

        perf_fetch_caller_regs(&regs);

        entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
        if (!entry)
                return;

        entry->ip = ip;
        entry->parent_ip = parent_ip;
        perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
                              1, &regs, head, NULL);

#undef ENTRY_SIZE
}

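/*
 * Set up / tear down the per-event ftrace_ops.  FTRACE_OPS_FL_CONTROL lets
 * perf_trace_add()/perf_trace_del() enable and disable the callback on the
 * local CPU via the *_local_* helpers below.
 */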
static int perf_ftrace_function_register(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;

        ops->flags |= FTRACE_OPS_FL_CONTROL;
        ops->func = perf_ftrace_function_call;
        return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;
        int ret = unregister_ftrace_function(ops);
        ftrace_free_filter(ops);
        return ret;
}

static void perf_ftrace_function_enable(struct perf_event *event)
{
        ftrace_function_local_enable(&event->ftrace_ops);
}

static void perf_ftrace_function_disable(struct perf_event *event)
{
        ftrace_function_local_disable(&event->ftrace_ops);
}

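/*
 * reg() callback of the function trace event: map the TRACE_REG_PERF_*
 * requests from the perf glue above onto the ftrace_ops helpers.
 */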
int perf_ftrace_event_register(struct ftrace_event_call *call,
                               enum trace_reg type, void *data)
{
        switch (type) {
        case TRACE_REG_REGISTER:
        case TRACE_REG_UNREGISTER:
                break;
        case TRACE_REG_PERF_REGISTER:
        case TRACE_REG_PERF_UNREGISTER:
                return 0;
        case TRACE_REG_PERF_OPEN:
                return perf_ftrace_function_register(data);
        case TRACE_REG_PERF_CLOSE:
                return perf_ftrace_function_unregister(data);
        case TRACE_REG_PERF_ADD:
                perf_ftrace_function_enable(data);
                return 0;
        case TRACE_REG_PERF_DEL:
                perf_ftrace_function_disable(data);
                return 0;
        }

        return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */