linux/kernel/trace/trace_event_perf.c
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"
#include "trace_probe.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
        perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int      total_ref_count;

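/*
 * Decide whether @p_event may use @tp_event.  A ->perf_perm() hook, if
 * present, gets the first say; children of an already-permitted parent
 * are allowed unconditionally.  Function-trace events and raw sample
 * data (PERF_SAMPLE_RAW) are otherwise restricted to CAP_SYS_ADMIN,
 * modulo the perf paranoia setting and TRACE_EVENT_FL_CAP_ANY.
 */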
static int perf_trace_event_perm(struct trace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        if (tp_event->perf_perm) {
                int ret = tp_event->perf_perm(tp_event, p_event);
                if (ret)
                        return ret;
        }

        /*
         * The parent was already checked and allowed when it was
         * created, so allow children without checking.
         */
        if (p_event->parent)
                return 0;

        /*
         * It's ok to check current process (owner) permissions in here,
         * because code below is called only via perf_event_open syscall.
         */

        /* The ftrace function trace is allowed only for root. */
        if (ftrace_event_is_function(tp_event)) {
                if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
                        return -EPERM;

                if (!is_sampling_event(p_event))
                        return 0;

                /*
                 * We don't allow user space callchains for the function trace
                 * event, due to issues with page faults while tracing the page
                 * fault handler and its overall trickiness.
                 */
                if (!p_event->attr.exclude_callchain_user)
                        return -EINVAL;

                /*
                 * Same reason to disable user stack dump as for user space
                 * callchains above.
                 */
                if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
                        return -EINVAL;
        }

        /* No tracing, just counting, so no obvious leak */
        if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
                return 0;

        /* Some events are ok to be traced by non-root users... */
        if (p_event->attach_state == PERF_ATTACH_TASK) {
                if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
                        return 0;
        }

        /*
         * ...otherwise raw tracepoint data can be a severe data leak,
         * only allow root to have these.
         */
        if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}

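/*
 * First perf user setup for @tp_event: bump its perf refcount, allocate
 * the per-CPU hlist that perf events hang off, register the class with
 * TRACE_REG_PERF_REGISTER and, if this is the very first perf trace
 * user system-wide, allocate the shared per-context scratch buffers.
 */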
static int perf_trace_event_reg(struct trace_event_call *tp_event,
                                struct perf_event *p_event)
{
        struct hlist_head __percpu *list;
        int ret = -ENOMEM;
        int cpu;

        p_event->tp_event = tp_event;
        if (tp_event->perf_refcount++ > 0)
                return 0;

        list = alloc_percpu(struct hlist_head);
        if (!list)
                goto fail;

        for_each_possible_cpu(cpu)
                INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

        tp_event->perf_events = list;

        if (!total_ref_count) {
                char __percpu *buf;
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        buf = (char __percpu *)alloc_percpu(perf_trace_t);
                        if (!buf)
                                goto fail;

                        perf_trace_buf[i] = buf;
                }
        }

        ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
        if (ret)
                goto fail;

        total_ref_count++;
        return 0;

fail:
        if (!total_ref_count) {
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }

        if (!--tp_event->perf_refcount) {
                free_percpu(tp_event->perf_events);
                tp_event->perf_events = NULL;
        }

        return ret;
}

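/*
 * Undo perf_trace_event_reg(): the last perf user of @tp_event
 * unregisters the class, waits for in-flight probe callbacks to finish
 * and frees the per-CPU hlist; the last user overall also frees the
 * shared per-context buffers.  The module reference taken in
 * perf_trace_init() is dropped unconditionally.
 */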
static void perf_trace_event_unreg(struct perf_event *p_event)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        int i;

        if (--tp_event->perf_refcount > 0)
                goto out;

        tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

        /*
         * Ensure our callback won't be called anymore. The buffers
         * will be freed after that.
         */
        tracepoint_synchronize_unregister();

        free_percpu(tp_event->perf_events);
        tp_event->perf_events = NULL;

        if (!--total_ref_count) {
                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }
out:
        module_put(tp_event->mod);
}

static int perf_trace_event_open(struct perf_event *p_event)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

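/*
 * Full setup path for one perf event: permission check, registration
 * of the trace_event_call, then a per-event TRACE_REG_PERF_OPEN.  On
 * open failure the registration is rolled back.
 */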
static int perf_trace_event_init(struct trace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        int ret;

        ret = perf_trace_event_perm(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_reg(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_open(p_event);
        if (ret) {
                perf_trace_event_unreg(p_event);
                return ret;
        }

        return 0;
}

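/*
 * Entry point for tracepoint events opened through perf_event_open():
 * look up the trace_event_call whose id matches attr.config, pin its
 * module and initialize the event on top of it.
 */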
int perf_trace_init(struct perf_event *p_event)
{
        struct trace_event_call *tp_event;
        u64 event_id = p_event->attr.config;
        int ret = -EINVAL;

        mutex_lock(&event_mutex);
        list_for_each_entry(tp_event, &ftrace_events, list) {
                if (tp_event->event.type == event_id &&
                    tp_event->class && tp_event->class->reg &&
                    try_module_get(tp_event->mod)) {
                        ret = perf_trace_event_init(tp_event, p_event);
                        if (ret)
                                module_put(tp_event->mod);
                        break;
                }
        }
        mutex_unlock(&event_mutex);

        return ret;
}

void perf_trace_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);
}

#ifdef CONFIG_KPROBE_EVENTS
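/*
 * Set up a perf event backed by a local (perf-only) kprobe: copy the
 * optional symbol name from user space, create a trace_kprobe that is
 * not visible in tracefs, and initialize the perf event on top of it.
 */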
int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
{
        int ret;
        char *func = NULL;
        struct trace_event_call *tp_event;

        if (p_event->attr.kprobe_func) {
                func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
                if (!func)
                        return -ENOMEM;
                ret = strncpy_from_user(
                        func, u64_to_user_ptr(p_event->attr.kprobe_func),
                        KSYM_NAME_LEN);
                if (ret == KSYM_NAME_LEN)
                        ret = -E2BIG;
                if (ret < 0)
                        goto out;

                if (func[0] == '\0') {
                        kfree(func);
                        func = NULL;
                }
        }

        tp_event = create_local_trace_kprobe(
                func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
                p_event->attr.probe_offset, is_retprobe);
        if (IS_ERR(tp_event)) {
                ret = PTR_ERR(tp_event);
                goto out;
        }

        ret = perf_trace_event_init(tp_event, p_event);
        if (ret)
                destroy_local_trace_kprobe(tp_event);
out:
        kfree(func);
        return ret;
}

void perf_kprobe_destroy(struct perf_event *p_event)
{
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);

        destroy_local_trace_kprobe(p_event->tp_event);
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
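/*
 * Set up a perf event backed by a local (perf-only) uprobe: copy the
 * target path from user space, create a trace_uprobe that is not
 * visible in tracefs, and initialize the perf event on top of it.
 */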
int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe)
{
        int ret;
        char *path = NULL;
        struct trace_event_call *tp_event;

        if (!p_event->attr.uprobe_path)
                return -EINVAL;
        path = kzalloc(PATH_MAX, GFP_KERNEL);
        if (!path)
                return -ENOMEM;
        ret = strncpy_from_user(
                path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX);
        if (ret == PATH_MAX) {
                ret = -E2BIG;
                goto out;
        }
        if (ret < 0)
                goto out;
        if (path[0] == '\0') {
                ret = -EINVAL;
                goto out;
        }

        tp_event = create_local_trace_uprobe(
                path, p_event->attr.probe_offset, is_retprobe);
        if (IS_ERR(tp_event)) {
                ret = PTR_ERR(tp_event);
                goto out;
        }

        /*
         * A local trace_uprobe needs to hold event_mutex to call
         * uprobe_buffer_enable() and uprobe_buffer_disable().
         * event_mutex is not required for local trace_kprobes.
         */
        mutex_lock(&event_mutex);
        ret = perf_trace_event_init(tp_event, p_event);
        if (ret)
                destroy_local_trace_uprobe(tp_event);
        mutex_unlock(&event_mutex);
out:
        kfree(path);
        return ret;
}

void perf_uprobe_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);
        destroy_local_trace_uprobe(p_event->tp_event);
}
#endif /* CONFIG_UPROBE_EVENTS */

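/*
 * Schedule @p_event in on this CPU: unless the event class performs
 * its own TRACE_REG_PERF_ADD handling, the event is hooked into the
 * per-CPU hlist that the tracepoint probes iterate.
 */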
int perf_trace_add(struct perf_event *p_event, int flags)
{
        struct trace_event_call *tp_event = p_event->tp_event;

        if (!(flags & PERF_EF_START))
                p_event->hw.state = PERF_HES_STOPPED;

        /*
         * If TRACE_REG_PERF_ADD returns false, no custom action was performed
         * and we need to take the default action of enqueueing our event on
         * the right per-cpu hlist.
         */
        if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
                struct hlist_head __percpu *pcpu_list;
                struct hlist_head *list;

                pcpu_list = tp_event->perf_events;
                if (WARN_ON_ONCE(!pcpu_list))
                        return -EINVAL;

                list = this_cpu_ptr(pcpu_list);
                hlist_add_head_rcu(&p_event->hlist_entry, list);
        }

        return 0;
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
        struct trace_event_call *tp_event = p_event->tp_event;

        /*
         * If TRACE_REG_PERF_DEL returns false, no custom action was performed
         * and we need to take the default action of dequeueing our event from
         * the right per-cpu hlist.
         */
        if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
                hlist_del_rcu(&p_event->hlist_entry);
}

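/*
 * Hand out the per-CPU scratch buffer for the current recursion
 * context, good for up to PERF_MAX_TRACE_SIZE bytes, and optionally a
 * pt_regs to sample into.  The recursion context stored in *rctxp must
 * be released again, normally by submitting the record.  A rough usage
 * sketch (see perf_ftrace_function_call() below for an in-tree user):
 *
 *	entry = perf_trace_buf_alloc(size, &regs, &rctx);
 *	if (!entry)
 *		return;
 *	... fill in the entry fields ...
 *	perf_trace_buf_submit(entry, size, rctx, type, 1, regs, head, NULL);
 */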
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
{
        char *raw_data;
        int rctx;

        BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
                      "perf buffer not large enough"))
                return NULL;

        *rctxp = rctx = perf_swevent_get_recursion_context();
        if (rctx < 0)
                return NULL;

        if (regs)
                *regs = this_cpu_ptr(&__perf_regs[rctx]);
        raw_data = this_cpu_ptr(perf_trace_buf[rctx]);

        /* zero the dead bytes left by the alignment so we don't leak stack to user */
        memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
        return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
NOKPROBE_SYMBOL(perf_trace_buf_alloc);

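/*
 * Fill in the common trace_entry header (irq flags, preempt count, pid)
 * for a record obtained from perf_trace_buf_alloc() and stamp it with
 * the given event type.
 */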
void perf_trace_buf_update(void *record, u16 type)
{
        struct trace_entry *entry = record;
        int pc = preempt_count();
        unsigned long flags;

        local_save_flags(flags);
        tracing_generic_entry_update(entry, flags, pc);
        entry->type = type;
}
NOKPROBE_SYMBOL(perf_trace_buf_update);

#ifdef CONFIG_FUNCTION_TRACER
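/*
 * ftrace_ops callback for perf function-trace events.  ops->private
 * carries the CPU the event is currently scheduled on, so callbacks
 * firing on any other CPU are ignored.
 */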
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
                          struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
        struct ftrace_entry *entry;
        struct perf_event *event;
        struct hlist_head head;
        struct pt_regs regs;
        int rctx;

        if ((unsigned long)ops->private != smp_processor_id())
                return;

        event = container_of(ops, struct perf_event, ftrace_ops);

        /*
         * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
         * the perf code does is hlist_for_each_entry_rcu(), so we can
         * get away with simply setting the @head.first pointer in order
         * to create a singular list.
         */
        head.first = &event->hlist_entry;

#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
                    sizeof(u64)) - sizeof(u32))

        BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

        memset(&regs, 0, sizeof(regs));
        perf_fetch_caller_regs(&regs);

        entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
        if (!entry)
                return;

        entry->ip = ip;
        entry->parent_ip = parent_ip;
        perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
                              1, &regs, &head, NULL);

#undef ENTRY_SIZE
}

static int perf_ftrace_function_register(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;

        ops->flags   = FTRACE_OPS_FL_RCU;
        ops->func    = perf_ftrace_function_call;
        ops->private = (void *)(unsigned long)nr_cpu_ids;

        return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;
        int ret = unregister_ftrace_function(ops);
        ftrace_free_filter(ops);
        return ret;
}

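/*
 * ->reg() callback for the function trace event when used by perf:
 * OPEN/CLOSE register and unregister the per-event ftrace_ops, while
 * ADD/DEL just record the owning CPU in ops->private and return 1 to
 * skip the default hlist handling.
 */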
int perf_ftrace_event_register(struct trace_event_call *call,
                               enum trace_reg type, void *data)
{
        struct perf_event *event = data;

        switch (type) {
        case TRACE_REG_REGISTER:
        case TRACE_REG_UNREGISTER:
                break;
        case TRACE_REG_PERF_REGISTER:
        case TRACE_REG_PERF_UNREGISTER:
                return 0;
        case TRACE_REG_PERF_OPEN:
                return perf_ftrace_function_register(data);
        case TRACE_REG_PERF_CLOSE:
                return perf_ftrace_function_unregister(data);
        case TRACE_REG_PERF_ADD:
                event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
                return 1;
        case TRACE_REG_PERF_DEL:
                event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
                return 1;
        }

        return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */
