linux/kernel/trace/trace_event_perf.c
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises.
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
        perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int      total_ref_count;

static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        /* The ftrace function trace is allowed only for root. */
        if (ftrace_event_is_function(tp_event) &&
            perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        /* No tracing, just counting, so no obvious leak */
        if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
                return 0;

        /* Some events are ok to be traced by non-root users... */
        if (p_event->attach_state == PERF_ATTACH_TASK) {
                if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
                        return 0;
        }

        /*
         * ...otherwise raw tracepoint data can be a severe data leak,
         * only allow root to have these.
         */
        if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}

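/*
 * Register a perf user of @tp_event. The first reference allocates the
 * per-cpu hlist heads that hold the active perf events; if this is the
 * first trace event used by perf at all, the per-context raw sample
 * buffers are allocated as well.
 */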
static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
                                struct perf_event *p_event)
{
        struct hlist_head __percpu *list;
        int ret = -ENOMEM;
        int cpu;

        p_event->tp_event = tp_event;
        if (tp_event->perf_refcount++ > 0)
                return 0;

        list = alloc_percpu(struct hlist_head);
        if (!list)
                goto fail;

        for_each_possible_cpu(cpu)
                INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

        tp_event->perf_events = list;

        if (!total_ref_count) {
                char __percpu *buf;
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        buf = (char __percpu *)alloc_percpu(perf_trace_t);
                        if (!buf)
                                goto fail;

                        perf_trace_buf[i] = buf;
                }
        }

        ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
        if (ret)
                goto fail;

        total_ref_count++;
        return 0;

fail:
        if (!total_ref_count) {
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }

        if (!--tp_event->perf_refcount) {
                free_percpu(tp_event->perf_events);
                tp_event->perf_events = NULL;
        }

        return ret;
}

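/*
 * Drop one perf reference on the event. The last reference unregisters
 * the perf probe, waits for in-flight callbacks to finish and then frees
 * the per-cpu event lists (and, once no trace event is used by perf at
 * all anymore, the sample buffers). The module reference taken in
 * perf_trace_init() is dropped in all cases.
 */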
static void perf_trace_event_unreg(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        int i;

        if (--tp_event->perf_refcount > 0)
                goto out;

        tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

        /*
         * Ensure our callback won't be called anymore. The buffers
         * will be freed after that.
         */
        tracepoint_synchronize_unregister();

        free_percpu(tp_event->perf_events);
        tp_event->perf_events = NULL;

        if (!--total_ref_count) {
                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }
out:
        module_put(tp_event->mod);
}

static int perf_trace_event_open(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

static int perf_trace_event_init(struct ftrace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        int ret;

        ret = perf_trace_event_perm(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_reg(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_open(p_event);
        if (ret) {
                perf_trace_event_unreg(p_event);
                return ret;
        }

        return 0;
}

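/*
 * Look up the trace event whose id matches attr.config under event_mutex,
 * pin the module that provides it and set the event up for perf tracing.
 */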
int perf_trace_init(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event;
        int event_id = p_event->attr.config;
        int ret = -EINVAL;

        mutex_lock(&event_mutex);
        list_for_each_entry(tp_event, &ftrace_events, list) {
                if (tp_event->event.type == event_id &&
                    tp_event->class && tp_event->class->reg &&
                    try_module_get(tp_event->mod)) {
                        ret = perf_trace_event_init(tp_event, p_event);
                        if (ret)
                                module_put(tp_event->mod);
                        break;
                }
        }
        mutex_unlock(&event_mutex);

        return ret;
}

void perf_trace_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);
}

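/*
 * Schedule the event in on this CPU: honour PERF_EF_START, link the
 * event into this CPU's hlist of active perf events and forward
 * TRACE_REG_PERF_ADD to the event class.
 */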
int perf_trace_add(struct perf_event *p_event, int flags)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        struct hlist_head __percpu *pcpu_list;
        struct hlist_head *list;

        pcpu_list = tp_event->perf_events;
        if (WARN_ON_ONCE(!pcpu_list))
                return -EINVAL;

        if (!(flags & PERF_EF_START))
                p_event->hw.state = PERF_HES_STOPPED;

        list = this_cpu_ptr(pcpu_list);
        hlist_add_head_rcu(&p_event->hlist_entry, list);

        return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        hlist_del_rcu(&p_event->hlist_entry);
        tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}

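/*
 * Claim the per-cpu raw sample buffer for the current recursion context
 * and initialize its generic trace_entry header. Returns NULL when no
 * recursion context is available; on success *rctxp holds the context
 * to hand back to perf_trace_buf_submit().
 */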
__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
                                       struct pt_regs *regs, int *rctxp)
{
        struct trace_entry *entry;
        unsigned long flags;
        char *raw_data;
        int pc;

        BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

        pc = preempt_count();

        *rctxp = perf_swevent_get_recursion_context();
        if (*rctxp < 0)
                return NULL;

        raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);

        /* zero the dead bytes from align to not leak stack to user */
        memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));

        entry = (struct trace_entry *)raw_data;
        local_save_flags(flags);
        tracing_generic_entry_update(entry, flags, pc);
        entry->type = type;

        return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);

#ifdef CONFIG_FUNCTION_TRACER
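/*
 * ftrace callback used for perf function events: record ip and parent_ip
 * into the perf raw buffer and submit the sample against the events on
 * this CPU's list.
 */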
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
                          struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
        struct ftrace_entry *entry;
        struct hlist_head *head;
        struct pt_regs regs;
        int rctx;

#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
                    sizeof(u64)) - sizeof(u32))

        BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

        perf_fetch_caller_regs(&regs);

        entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
        if (!entry)
                return;

        entry->ip = ip;
        entry->parent_ip = parent_ip;

        head = this_cpu_ptr(event_function.perf_events);
        perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
                              1, &regs, head, NULL);

#undef ENTRY_SIZE
}

static int perf_ftrace_function_register(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;

        ops->flags |= FTRACE_OPS_FL_CONTROL;
        ops->func = perf_ftrace_function_call;
        return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;
        int ret = unregister_ftrace_function(ops);
        ftrace_free_filter(ops);
        return ret;
}

static void perf_ftrace_function_enable(struct perf_event *event)
{
        ftrace_function_local_enable(&event->ftrace_ops);
}

static void perf_ftrace_function_disable(struct perf_event *event)
{
        ftrace_function_local_disable(&event->ftrace_ops);
}

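/*
 * reg() callback of the function trace event: dispatch the
 * TRACE_REG_PERF_* operations to the ftrace_ops register/unregister and
 * local enable/disable helpers above.
 */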
int perf_ftrace_event_register(struct ftrace_event_call *call,
                               enum trace_reg type, void *data)
{
        switch (type) {
        case TRACE_REG_REGISTER:
        case TRACE_REG_UNREGISTER:
                break;
        case TRACE_REG_PERF_REGISTER:
        case TRACE_REG_PERF_UNREGISTER:
                return 0;
        case TRACE_REG_PERF_OPEN:
                return perf_ftrace_function_register(data);
        case TRACE_REG_PERF_CLOSE:
                return perf_ftrace_function_unregister(data);
        case TRACE_REG_PERF_ADD:
                perf_ftrace_function_enable(data);
                return 0;
        case TRACE_REG_PERF_DEL:
                perf_ftrace_function_disable(data);
                return 0;
        }

        return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */