linux/kernel/trace/trace_event_perf.c
// SPDX-License-Identifier: GPL-2.0
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises.
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
        perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int      total_ref_count;

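/*
 * Check whether the current task may attach a perf event to this trace
 * event: run any event-specific perf_perm hook, apply the extra
 * restrictions on the ftrace function trace event, and require tracepoint
 * privilege for raw tracepoint data unless the event is task-bound and
 * the trace event is marked TRACE_EVENT_FL_CAP_ANY.  Children of an
 * already-permitted parent are allowed without re-checking.
 */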
static int perf_trace_event_perm(struct trace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        int ret;

        if (tp_event->perf_perm) {
                ret = tp_event->perf_perm(tp_event, p_event);
                if (ret)
                        return ret;
        }

        /*
         * We already checked and allowed creating the parent,
         * so allow children without checking.
         */
        if (p_event->parent)
                return 0;

        /*
         * It's ok to check current process (owner) permissions in here,
         * because code below is called only via perf_event_open syscall.
         */

        /* The ftrace function trace is allowed only for root. */
        if (ftrace_event_is_function(tp_event)) {
                ret = perf_allow_tracepoint(&p_event->attr);
                if (ret)
                        return ret;

                if (!is_sampling_event(p_event))
                        return 0;

                /*
                 * We don't allow user space callchains for the function trace
                 * event, due to issues with page faults while tracing the page
                 * fault handler and the overall trickiness of that path.
                 */
                if (!p_event->attr.exclude_callchain_user)
                        return -EINVAL;

                /*
                 * Same reason to disable user stack dump as for user space
                 * callchains above.
                 */
                if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
                        return -EINVAL;
        }

        /* No tracing, just counting, so no obvious leak */
        if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
                return 0;

        /* Some events are ok to be traced by non-root users... */
        if (p_event->attach_state == PERF_ATTACH_TASK) {
                if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
                        return 0;
        }

        /*
         * ...otherwise raw tracepoint data can be a severe data leak,
         * only allow root to have these.
         */
        ret = perf_allow_tracepoint(&p_event->attr);
        if (ret)
                return ret;

        return 0;
}

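/*
 * Register a trace event for use by perf.  The first perf event attached
 * to a given trace event allocates the per-CPU hlist of attached events
 * and, if this is the first trace event used by perf at all, the
 * per-context perf_trace_buf buffers; later attachments only bump
 * perf_refcount.
 */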
static int perf_trace_event_reg(struct trace_event_call *tp_event,
                                struct perf_event *p_event)
{
        struct hlist_head __percpu *list;
        int ret = -ENOMEM;
        int cpu;

        p_event->tp_event = tp_event;
        if (tp_event->perf_refcount++ > 0)
                return 0;

        list = alloc_percpu(struct hlist_head);
        if (!list)
                goto fail;

        for_each_possible_cpu(cpu)
                INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

        tp_event->perf_events = list;

        if (!total_ref_count) {
                char __percpu *buf;
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        buf = (char __percpu *)alloc_percpu(perf_trace_t);
                        if (!buf)
                                goto fail;

                        perf_trace_buf[i] = buf;
                }
        }

        ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
        if (ret)
                goto fail;

        total_ref_count++;
        return 0;

fail:
        if (!total_ref_count) {
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }

        if (!--tp_event->perf_refcount) {
                free_percpu(tp_event->perf_events);
                tp_event->perf_events = NULL;
        }

        return ret;
}

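/*
 * Drop one perf reference on the trace event.  The last unregister tears
 * down the class registration, waits for in-flight tracepoint callbacks
 * to finish, and frees the per-CPU hlist (and the shared buffers once no
 * trace event is used by perf anymore).  The module reference, if any, is
 * dropped in all cases.
 */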
static void perf_trace_event_unreg(struct perf_event *p_event)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        int i;

        if (--tp_event->perf_refcount > 0)
                goto out;

        tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

        /*
         * Ensure our callback won't be called anymore. The buffers
         * will be freed after that.
         */
        tracepoint_synchronize_unregister();

        free_percpu(tp_event->perf_events);
        tp_event->perf_events = NULL;

        if (!--total_ref_count) {
                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }
out:
        module_put(tp_event->mod);
}

static int perf_trace_event_open(struct perf_event *p_event)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

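/*
 * Full setup of one perf event on a trace event: permission check,
 * registration, then the TRACE_REG_PERF_OPEN callback.  Callers in this
 * file invoke it with event_mutex held.
 */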
static int perf_trace_event_init(struct trace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        int ret;

        ret = perf_trace_event_perm(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_reg(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_open(p_event);
        if (ret) {
                perf_trace_event_unreg(p_event);
                return ret;
        }

        return 0;
}

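/*
 * perf_event_open() entry point for tracepoint events: look up the trace
 * event whose id matches attr.config, pin its module, and initialize the
 * perf event against it.
 */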
int perf_trace_init(struct perf_event *p_event)
{
        struct trace_event_call *tp_event;
        u64 event_id = p_event->attr.config;
        int ret = -EINVAL;

        mutex_lock(&event_mutex);
        list_for_each_entry(tp_event, &ftrace_events, list) {
                if (tp_event->event.type == event_id &&
                    tp_event->class && tp_event->class->reg &&
                    try_module_get(tp_event->mod)) {
                        ret = perf_trace_event_init(tp_event, p_event);
                        if (ret)
                                module_put(tp_event->mod);
                        break;
                }
        }
        mutex_unlock(&event_mutex);

        return ret;
}

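/* Tear down a tracepoint perf event: close it and drop the registration. */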
void perf_trace_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);
}

#ifdef CONFIG_KPROBE_EVENTS
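/*
 * Create a kprobe-based perf event without going through the kprobe_events
 * interface: copy the symbol name from attr.kprobe_func (if any), build a
 * local trace_kprobe on that symbol or on attr.kprobe_addr, and bind the
 * perf event to it.
 */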
int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
{
        int ret;
        char *func = NULL;
        struct trace_event_call *tp_event;

        if (p_event->attr.kprobe_func) {
                func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
                if (!func)
                        return -ENOMEM;
                ret = strncpy_from_user(
                        func, u64_to_user_ptr(p_event->attr.kprobe_func),
                        KSYM_NAME_LEN);
                if (ret == KSYM_NAME_LEN)
                        ret = -E2BIG;
                if (ret < 0)
                        goto out;

                if (func[0] == '\0') {
                        kfree(func);
                        func = NULL;
                }
        }

        tp_event = create_local_trace_kprobe(
                func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
                p_event->attr.probe_offset, is_retprobe);
        if (IS_ERR(tp_event)) {
                ret = PTR_ERR(tp_event);
                goto out;
        }

        mutex_lock(&event_mutex);
        ret = perf_trace_event_init(tp_event, p_event);
        if (ret)
                destroy_local_trace_kprobe(tp_event);
        mutex_unlock(&event_mutex);
out:
        kfree(func);
        return ret;
}

void perf_kprobe_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);

        destroy_local_trace_kprobe(p_event->tp_event);
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
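/*
 * Create a uprobe-based perf event: copy the target path from
 * attr.uprobe_path, build a local trace_uprobe at the given offset (with
 * an optional reference counter offset), and bind the perf event to it.
 */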
int perf_uprobe_init(struct perf_event *p_event,
                     unsigned long ref_ctr_offset, bool is_retprobe)
{
        int ret;
        char *path = NULL;
        struct trace_event_call *tp_event;

        if (!p_event->attr.uprobe_path)
                return -EINVAL;

        path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path),
                            PATH_MAX);
        if (IS_ERR(path)) {
                ret = PTR_ERR(path);
                return (ret == -EINVAL) ? -E2BIG : ret;
        }
        if (path[0] == '\0') {
                ret = -EINVAL;
                goto out;
        }

        tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset,
                                             ref_ctr_offset, is_retprobe);
        if (IS_ERR(tp_event)) {
                ret = PTR_ERR(tp_event);
                goto out;
        }

        /*
         * A local trace_uprobe needs to hold event_mutex around the calls
         * to uprobe_buffer_enable() and uprobe_buffer_disable().
         * event_mutex is not required for local trace_kprobes.
         */
        mutex_lock(&event_mutex);
        ret = perf_trace_event_init(tp_event, p_event);
        if (ret)
                destroy_local_trace_uprobe(tp_event);
        mutex_unlock(&event_mutex);
out:
        kfree(path);
        return ret;
}

void perf_uprobe_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);
        destroy_local_trace_uprobe(p_event->tp_event);
}
#endif /* CONFIG_UPROBE_EVENTS */

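/*
 * Schedule the perf event in on this CPU.  Unless the class reg() callback
 * handles TRACE_REG_PERF_ADD itself, the event is added to the trace
 * event's per-CPU hlist so the tracepoint handler will see it.
 */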
int perf_trace_add(struct perf_event *p_event, int flags)
{
        struct trace_event_call *tp_event = p_event->tp_event;

        if (!(flags & PERF_EF_START))
                p_event->hw.state = PERF_HES_STOPPED;

        /*
         * If TRACE_REG_PERF_ADD returns false, no custom action was performed
         * and we need to take the default action of enqueueing our event on
         * the right per-cpu hlist.
         */
        if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
                struct hlist_head __percpu *pcpu_list;
                struct hlist_head *list;

                pcpu_list = tp_event->perf_events;
                if (WARN_ON_ONCE(!pcpu_list))
                        return -EINVAL;

                list = this_cpu_ptr(pcpu_list);
                hlist_add_head_rcu(&p_event->hlist_entry, list);
        }

        return 0;
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
        struct trace_event_call *tp_event = p_event->tp_event;

        /*
         * If TRACE_REG_PERF_DEL returns false, no custom action was performed
         * and we need to take the default action of dequeueing our event from
         * the right per-cpu hlist.
         */
        if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
                hlist_del_rcu(&p_event->hlist_entry);
}

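/*
 * Grab the per-CPU raw record buffer for the current recursion context.
 * Returns NULL if @size exceeds PERF_MAX_TRACE_SIZE or if this context is
 * already in use (recursion).  On success, *rctxp holds the recursion
 * context to pass to perf_trace_buf_submit(), and *regs (when non-NULL)
 * points at the per-CPU pt_regs reserved for that context.
 */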
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
{
        char *raw_data;
        int rctx;

        BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
                      "perf buffer not large enough"))
                return NULL;

        *rctxp = rctx = perf_swevent_get_recursion_context();
        if (rctx < 0)
                return NULL;

        if (regs)
                *regs = this_cpu_ptr(&__perf_regs[rctx]);
        raw_data = this_cpu_ptr(perf_trace_buf[rctx]);

        /* zero the dead bytes from alignment to avoid leaking stack to user */
        memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
        return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
NOKPROBE_SYMBOL(perf_trace_buf_alloc);

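/*
 * Fill in the common trace_entry header (type, irq flags, preempt count)
 * of a record obtained from perf_trace_buf_alloc().
 */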
void perf_trace_buf_update(void *record, u16 type)
{
        struct trace_entry *entry = record;
        int pc = preempt_count();
        unsigned long flags;

        local_save_flags(flags);
        tracing_generic_entry_update(entry, type, flags, pc);
}
NOKPROBE_SYMBOL(perf_trace_buf_update);

#ifdef CONFIG_FUNCTION_TRACER
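/*
 * ftrace callback for the function trace perf event.  ops->private holds
 * the CPU the event is currently scheduled on (nr_cpu_ids when it is not
 * scheduled anywhere), so every other CPU bails out early; a hit is then
 * recorded as a TRACE_FN sample through the perf trace buffer.
 */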
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
                          struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
        struct ftrace_entry *entry;
        struct perf_event *event;
        struct hlist_head head;
        struct pt_regs regs;
        int rctx;

        if ((unsigned long)ops->private != smp_processor_id())
                return;

        event = container_of(ops, struct perf_event, ftrace_ops);

        /*
         * @event->hlist_entry is NULL (per INIT_HLIST_NODE), and all
         * the perf code does is hlist_for_each_entry_rcu(), so we can
         * get away with simply setting the @head.first pointer in order
         * to create a singular list.
         */
        head.first = &event->hlist_entry;

#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
                    sizeof(u64)) - sizeof(u32))

        BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

        memset(&regs, 0, sizeof(regs));
        perf_fetch_caller_regs(&regs);

        entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
        if (!entry)
                return;

        entry->ip = ip;
        entry->parent_ip = parent_ip;
        perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
                              1, &regs, &head, NULL);

#undef ENTRY_SIZE
}

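/*
 * Install/remove the per-event ftrace_ops.  ops->private starts out as
 * nr_cpu_ids, so the callback stays quiet until perf_trace_add() points
 * it at a CPU.
 */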
static int perf_ftrace_function_register(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;

        ops->flags   = FTRACE_OPS_FL_RCU;
        ops->func    = perf_ftrace_function_call;
        ops->private = (void *)(unsigned long)nr_cpu_ids;

        return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;
        int ret = unregister_ftrace_function(ops);
        ftrace_free_filter(ops);
        return ret;
}

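/*
 * reg() callback of the ftrace function trace event.  OPEN/CLOSE map to
 * registering/unregistering the per-event ftrace_ops, while ADD/DEL only
 * update ops->private with the current CPU and return 1 to skip the
 * default per-CPU hlist handling in perf_trace_add()/perf_trace_del().
 */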
int perf_ftrace_event_register(struct trace_event_call *call,
                               enum trace_reg type, void *data)
{
        struct perf_event *event = data;

        switch (type) {
        case TRACE_REG_REGISTER:
        case TRACE_REG_UNREGISTER:
                break;
        case TRACE_REG_PERF_REGISTER:
        case TRACE_REG_PERF_UNREGISTER:
                return 0;
        case TRACE_REG_PERF_OPEN:
                return perf_ftrace_function_register(data);
        case TRACE_REG_PERF_CLOSE:
                return perf_ftrace_function_unregister(data);
        case TRACE_REG_PERF_ADD:
                event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
                return 1;
        case TRACE_REG_PERF_DEL:
                event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
                return 1;
        }

        return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */