linux/kernel/trace/trace_uprobe.c
<<
>>
Prefs
   1/*
   2 * uprobes-based tracing events
   3 *
   4 * This program is free software; you can redistribute it and/or modify
   5 * it under the terms of the GNU General Public License version 2 as
   6 * published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 * GNU General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public License
  14 * along with this program; if not, write to the Free Software
  15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  16 *
  17 * Copyright (C) IBM Corporation, 2010-2012
  18 * Author:      Srikar Dronamraju <srikar@linux.vnet.ibm.com>
  19 */
  20#define pr_fmt(fmt)     "trace_kprobe: " fmt
  21
  22#include <linux/module.h>
  23#include <linux/uaccess.h>
  24#include <linux/uprobes.h>
  25#include <linux/namei.h>
  26#include <linux/string.h>
  27#include <linux/rculist.h>
  28
  29#include "trace_probe.h"
  30
  31#define UPROBE_EVENT_SYSTEM     "uprobes"
  32
  33struct uprobe_trace_entry_head {
  34        struct trace_entry      ent;
  35        unsigned long           vaddr[];
  36};
  37
  38#define SIZEOF_TRACE_ENTRY(is_return)                   \
  39        (sizeof(struct uprobe_trace_entry_head) +       \
  40         sizeof(unsigned long) * (is_return ? 2 : 1))
  41
  42#define DATAOF_TRACE_ENTRY(entry, is_return)            \
  43        ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
  44
  45struct trace_uprobe_filter {
  46        rwlock_t                rwlock;
  47        int                     nr_systemwide;
  48        struct list_head        perf_events;
  49};
  50
  51/*
  52 * uprobe event core functions
  53 */
  54struct trace_uprobe {
  55        struct list_head                list;
  56        struct trace_uprobe_filter      filter;
  57        struct uprobe_consumer          consumer;
  58        struct inode                    *inode;
  59        char                            *filename;
  60        unsigned long                   offset;
  61        unsigned long                   nhit;
  62        struct trace_probe              tp;
  63};
  64
  65#define SIZEOF_TRACE_UPROBE(n)                          \
  66        (offsetof(struct trace_uprobe, tp.args) +       \
  67        (sizeof(struct probe_arg) * (n)))
  68
  69static int register_uprobe_event(struct trace_uprobe *tu);
  70static int unregister_uprobe_event(struct trace_uprobe *tu);
  71
  72static DEFINE_MUTEX(uprobe_lock);
  73static LIST_HEAD(uprobe_list);
  74
  75struct uprobe_dispatch_data {
  76        struct trace_uprobe     *tu;
  77        unsigned long           bp_addr;
  78};
  79
  80static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
  81static int uretprobe_dispatcher(struct uprobe_consumer *con,
  82                                unsigned long func, struct pt_regs *regs);
  83
  84#ifdef CONFIG_STACK_GROWSUP
  85static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
  86{
  87        return addr - (n * sizeof(long));
  88}
  89#else
  90static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
  91{
  92        return addr + (n * sizeof(long));
  93}
  94#endif
  95
  96static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
  97{
  98        unsigned long ret;
  99        unsigned long addr = user_stack_pointer(regs);
 100
 101        addr = adjust_stack_addr(addr, n);
 102
 103        if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret)))
 104                return 0;
 105
 106        return ret;
 107}
 108
 109/*
 110 * Uprobes-specific fetch functions
 111 */
 112#define DEFINE_FETCH_stack(type)                                        \
 113static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,          \
 114                                         void *offset, void *dest)      \
 115{                                                                       \
 116        *(type *)dest = (type)get_user_stack_nth(regs,                  \
 117                                              ((unsigned long)offset)); \
 118}
 119DEFINE_BASIC_FETCH_FUNCS(stack)
 120/* No string on the stack entry */
 121#define fetch_stack_string      NULL
 122#define fetch_stack_string_size NULL
 123
 124#define DEFINE_FETCH_memory(type)                                       \
 125static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,         \
 126                                          void *addr, void *dest)       \
 127{                                                                       \
 128        type retval;                                                    \
 129        void __user *vaddr = (void __force __user *) addr;              \
 130                                                                        \
 131        if (copy_from_user(&retval, vaddr, sizeof(type)))               \
 132                *(type *)dest = 0;                                      \
 133        else                                                            \
 134                *(type *) dest = retval;                                \
 135}
 136DEFINE_BASIC_FETCH_FUNCS(memory)
 137/*
 138 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
 139 * length and relative data location.
 140 */
 141static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
 142                                            void *addr, void *dest)
 143{
 144        long ret;
 145        u32 rloc = *(u32 *)dest;
 146        int maxlen  = get_rloc_len(rloc);
 147        u8 *dst = get_rloc_data(dest);
 148        void __user *src = (void __force __user *) addr;
 149
 150        if (!maxlen)
 151                return;
 152
 153        ret = strncpy_from_user(dst, src, maxlen);
 154
 155        if (ret < 0) {  /* Failed to fetch string */
 156                ((u8 *)get_rloc_data(dest))[0] = '\0';
 157                *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc));
 158        } else {
 159                *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc));
 160        }
 161}
 162
 163static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
 164                                                 void *addr, void *dest)
 165{
 166        int len;
 167        void __user *vaddr = (void __force __user *) addr;
 168
 169        len = strnlen_user(vaddr, MAX_STRING_SIZE);
 170
 171        if (len == 0 || len > MAX_STRING_SIZE)  /* Failed to check length */
 172                *(u32 *)dest = 0;
 173        else
 174                *(u32 *)dest = len;
 175}
 176
 177static unsigned long translate_user_vaddr(void *file_offset)
 178{
 179        unsigned long base_addr;
 180        struct uprobe_dispatch_data *udd;
 181
 182        udd = (void *) current->utask->vaddr;
 183
 184        base_addr = udd->bp_addr - udd->tu->offset;
 185        return base_addr + (unsigned long)file_offset;
 186}
 187
 188#define DEFINE_FETCH_file_offset(type)                                  \
 189static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,    \
 190                                               void *offset, void *dest)\
 191{                                                                       \
 192        void *vaddr = (void *)translate_user_vaddr(offset);             \
 193                                                                        \
 194        FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest);               \
 195}
 196DEFINE_BASIC_FETCH_FUNCS(file_offset)
 197DEFINE_FETCH_file_offset(string)
 198DEFINE_FETCH_file_offset(string_size)
 199
 200/* Fetch type information table */
 201static const struct fetch_type uprobes_fetch_type_table[] = {
 202        /* Special types */
 203        [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
 204                                        sizeof(u32), 1, "__data_loc char[]"),
 205        [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
 206                                        string_size, sizeof(u32), 0, "u32"),
 207        /* Basic types */
 208        ASSIGN_FETCH_TYPE(u8,  u8,  0),
 209        ASSIGN_FETCH_TYPE(u16, u16, 0),
 210        ASSIGN_FETCH_TYPE(u32, u32, 0),
 211        ASSIGN_FETCH_TYPE(u64, u64, 0),
 212        ASSIGN_FETCH_TYPE(s8,  u8,  1),
 213        ASSIGN_FETCH_TYPE(s16, u16, 1),
 214        ASSIGN_FETCH_TYPE(s32, u32, 1),
 215        ASSIGN_FETCH_TYPE(s64, u64, 1),
 216        ASSIGN_FETCH_TYPE_ALIAS(x8,  u8,  u8,  0),
 217        ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
 218        ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
 219        ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
 220
 221        ASSIGN_FETCH_TYPE_END
 222};
 223
 224static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
 225{
 226        rwlock_init(&filter->rwlock);
 227        filter->nr_systemwide = 0;
 228        INIT_LIST_HEAD(&filter->perf_events);
 229}
 230
 231static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
 232{
 233        return !filter->nr_systemwide && list_empty(&filter->perf_events);
 234}
 235
 236static inline bool is_ret_probe(struct trace_uprobe *tu)
 237{
 238        return tu->consumer.ret_handler != NULL;
 239}
 240
 241/*
 242 * Allocate new trace_uprobe and initialize it (including uprobes).
 243 */
 244static struct trace_uprobe *
 245alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
 246{
 247        struct trace_uprobe *tu;
 248
 249        if (!event || !is_good_name(event))
 250                return ERR_PTR(-EINVAL);
 251
 252        if (!group || !is_good_name(group))
 253                return ERR_PTR(-EINVAL);
 254
 255        tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
 256        if (!tu)
 257                return ERR_PTR(-ENOMEM);
 258
 259        tu->tp.call.class = &tu->tp.class;
 260        tu->tp.call.name = kstrdup(event, GFP_KERNEL);
 261        if (!tu->tp.call.name)
 262                goto error;
 263
 264        tu->tp.class.system = kstrdup(group, GFP_KERNEL);
 265        if (!tu->tp.class.system)
 266                goto error;
 267
 268        INIT_LIST_HEAD(&tu->list);
 269        INIT_LIST_HEAD(&tu->tp.files);
 270        tu->consumer.handler = uprobe_dispatcher;
 271        if (is_ret)
 272                tu->consumer.ret_handler = uretprobe_dispatcher;
 273        init_trace_uprobe_filter(&tu->filter);
 274        return tu;
 275
 276error:
 277        kfree(tu->tp.call.name);
 278        kfree(tu);
 279
 280        return ERR_PTR(-ENOMEM);
 281}
 282
 283static void free_trace_uprobe(struct trace_uprobe *tu)
 284{
 285        int i;
 286
 287        for (i = 0; i < tu->tp.nr_args; i++)
 288                traceprobe_free_probe_arg(&tu->tp.args[i]);
 289
 290        iput(tu->inode);
 291        kfree(tu->tp.call.class->system);
 292        kfree(tu->tp.call.name);
 293        kfree(tu->filename);
 294        kfree(tu);
 295}
 296
 297static struct trace_uprobe *find_probe_event(const char *event, const char *group)
 298{
 299        struct trace_uprobe *tu;
 300
 301        list_for_each_entry(tu, &uprobe_list, list)
 302                if (strcmp(trace_event_name(&tu->tp.call), event) == 0 &&
 303                    strcmp(tu->tp.call.class->system, group) == 0)
 304                        return tu;
 305
 306        return NULL;
 307}
 308
 309/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
 310static int unregister_trace_uprobe(struct trace_uprobe *tu)
 311{
 312        int ret;
 313
 314        ret = unregister_uprobe_event(tu);
 315        if (ret)
 316                return ret;
 317
 318        list_del(&tu->list);
 319        free_trace_uprobe(tu);
 320        return 0;
 321}
 322
 323/* Register a trace_uprobe and probe_event */
 324static int register_trace_uprobe(struct trace_uprobe *tu)
 325{
 326        struct trace_uprobe *old_tu;
 327        int ret;
 328
 329        mutex_lock(&uprobe_lock);
 330
 331        /* register as an event */
 332        old_tu = find_probe_event(trace_event_name(&tu->tp.call),
 333                        tu->tp.call.class->system);
 334        if (old_tu) {
 335                /* delete old event */
 336                ret = unregister_trace_uprobe(old_tu);
 337                if (ret)
 338                        goto end;
 339        }
 340
 341        ret = register_uprobe_event(tu);
 342        if (ret) {
 343                pr_warn("Failed to register probe event(%d)\n", ret);
 344                goto end;
 345        }
 346
 347        list_add_tail(&tu->list, &uprobe_list);
 348
 349end:
 350        mutex_unlock(&uprobe_lock);
 351
 352        return ret;
 353}
 354
 355/*
 356 * Argument syntax:
 357 *  - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
 358 *
 359 *  - Remove uprobe: -:[GRP/]EVENT
 360 */
 361static int create_trace_uprobe(int argc, char **argv)
 362{
 363        struct trace_uprobe *tu;
 364        struct inode *inode;
 365        char *arg, *event, *group, *filename;
 366        char buf[MAX_EVENT_NAME_LEN];
 367        struct path path;
 368        unsigned long offset;
 369        bool is_delete, is_return;
 370        int i, ret;
 371
 372        inode = NULL;
 373        ret = 0;
 374        is_delete = false;
 375        is_return = false;
 376        event = NULL;
 377        group = NULL;
 378
 379        /* argc must be >= 1 */
 380        if (argv[0][0] == '-')
 381                is_delete = true;
 382        else if (argv[0][0] == 'r')
 383                is_return = true;
 384        else if (argv[0][0] != 'p') {
 385                pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
 386                return -EINVAL;
 387        }
 388
 389        if (argv[0][1] == ':') {
 390                event = &argv[0][2];
 391                arg = strchr(event, '/');
 392
 393                if (arg) {
 394                        group = event;
 395                        event = arg + 1;
 396                        event[-1] = '\0';
 397
 398                        if (strlen(group) == 0) {
 399                                pr_info("Group name is not specified\n");
 400                                return -EINVAL;
 401                        }
 402                }
 403                if (strlen(event) == 0) {
 404                        pr_info("Event name is not specified\n");
 405                        return -EINVAL;
 406                }
 407        }
 408        if (!group)
 409                group = UPROBE_EVENT_SYSTEM;
 410
 411        if (is_delete) {
 412                int ret;
 413
 414                if (!event) {
 415                        pr_info("Delete command needs an event name.\n");
 416                        return -EINVAL;
 417                }
 418                mutex_lock(&uprobe_lock);
 419                tu = find_probe_event(event, group);
 420
 421                if (!tu) {
 422                        mutex_unlock(&uprobe_lock);
 423                        pr_info("Event %s/%s doesn't exist.\n", group, event);
 424                        return -ENOENT;
 425                }
 426                /* delete an event */
 427                ret = unregister_trace_uprobe(tu);
 428                mutex_unlock(&uprobe_lock);
 429                return ret;
 430        }
 431
 432        if (argc < 2) {
 433                pr_info("Probe point is not specified.\n");
 434                return -EINVAL;
 435        }
 436        /* Find the last occurrence, in case the path contains ':' too. */
 437        arg = strrchr(argv[1], ':');
 438        if (!arg) {
 439                ret = -EINVAL;
 440                goto fail_address_parse;
 441        }
 442
 443        *arg++ = '\0';
 444        filename = argv[1];
 445        ret = kern_path(filename, LOOKUP_FOLLOW, &path);
 446        if (ret)
 447                goto fail_address_parse;
 448
 449        inode = igrab(d_inode(path.dentry));
 450        path_put(&path);
 451
 452        if (!inode || !S_ISREG(inode->i_mode)) {
 453                ret = -EINVAL;
 454                goto fail_address_parse;
 455        }
 456
 457        ret = kstrtoul(arg, 0, &offset);
 458        if (ret)
 459                goto fail_address_parse;
 460
 461        argc -= 2;
 462        argv += 2;
 463
 464        /* setup a probe */
 465        if (!event) {
 466                char *tail;
 467                char *ptr;
 468
 469                tail = kstrdup(kbasename(filename), GFP_KERNEL);
 470                if (!tail) {
 471                        ret = -ENOMEM;
 472                        goto fail_address_parse;
 473                }
 474
 475                ptr = strpbrk(tail, ".-_");
 476                if (ptr)
 477                        *ptr = '\0';
 478
 479                snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
 480                event = buf;
 481                kfree(tail);
 482        }
 483
 484        tu = alloc_trace_uprobe(group, event, argc, is_return);
 485        if (IS_ERR(tu)) {
 486                pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
 487                ret = PTR_ERR(tu);
 488                goto fail_address_parse;
 489        }
 490        tu->offset = offset;
 491        tu->inode = inode;
 492        tu->filename = kstrdup(filename, GFP_KERNEL);
 493
 494        if (!tu->filename) {
 495                pr_info("Failed to allocate filename.\n");
 496                ret = -ENOMEM;
 497                goto error;
 498        }
 499
 500        /* parse arguments */
 501        ret = 0;
 502        for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
 503                struct probe_arg *parg = &tu->tp.args[i];
 504
 505                /* Increment count for freeing args in error case */
 506                tu->tp.nr_args++;
 507
 508                /* Parse argument name */
 509                arg = strchr(argv[i], '=');
 510                if (arg) {
 511                        *arg++ = '\0';
 512                        parg->name = kstrdup(argv[i], GFP_KERNEL);
 513                } else {
 514                        arg = argv[i];
 515                        /* If argument name is omitted, set "argN" */
 516                        snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
 517                        parg->name = kstrdup(buf, GFP_KERNEL);
 518                }
 519
 520                if (!parg->name) {
 521                        pr_info("Failed to allocate argument[%d] name.\n", i);
 522                        ret = -ENOMEM;
 523                        goto error;
 524                }
 525
 526                if (!is_good_name(parg->name)) {
 527                        pr_info("Invalid argument[%d] name: %s\n", i, parg->name);
 528                        ret = -EINVAL;
 529                        goto error;
 530                }
 531
 532                if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) {
 533                        pr_info("Argument[%d] name '%s' conflicts with "
 534                                "another field.\n", i, argv[i]);
 535                        ret = -EINVAL;
 536                        goto error;
 537                }
 538
 539                /* Parse fetch argument */
 540                ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
 541                                                 is_return, false,
 542                                                 uprobes_fetch_type_table);
 543                if (ret) {
 544                        pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
 545                        goto error;
 546                }
 547        }
 548
 549        ret = register_trace_uprobe(tu);
 550        if (ret)
 551                goto error;
 552        return 0;
 553
 554error:
 555        free_trace_uprobe(tu);
 556        return ret;
 557
 558fail_address_parse:
 559        iput(inode);
 560
 561        pr_info("Failed to parse address or file.\n");
 562
 563        return ret;
 564}
 565
 566static int cleanup_all_probes(void)
 567{
 568        struct trace_uprobe *tu;
 569        int ret = 0;
 570
 571        mutex_lock(&uprobe_lock);
 572        while (!list_empty(&uprobe_list)) {
 573                tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
 574                ret = unregister_trace_uprobe(tu);
 575                if (ret)
 576                        break;
 577        }
 578        mutex_unlock(&uprobe_lock);
 579        return ret;
 580}
 581
 582/* Probes listing interfaces */
 583static void *probes_seq_start(struct seq_file *m, loff_t *pos)
 584{
 585        mutex_lock(&uprobe_lock);
 586        return seq_list_start(&uprobe_list, *pos);
 587}
 588
 589static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
 590{
 591        return seq_list_next(v, &uprobe_list, pos);
 592}
 593
 594static void probes_seq_stop(struct seq_file *m, void *v)
 595{
 596        mutex_unlock(&uprobe_lock);
 597}
 598
 599static int probes_seq_show(struct seq_file *m, void *v)
 600{
 601        struct trace_uprobe *tu = v;
 602        char c = is_ret_probe(tu) ? 'r' : 'p';
 603        int i;
 604
 605        seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
 606                        trace_event_name(&tu->tp.call));
 607        seq_printf(m, " %s:", tu->filename);
 608
 609        /* Don't print "0x  (null)" when offset is 0 */
 610        if (tu->offset) {
 611                seq_printf(m, "0x%p", (void *)tu->offset);
 612        } else {
 613                switch (sizeof(void *)) {
 614                case 4:
 615                        seq_printf(m, "0x00000000");
 616                        break;
 617                case 8:
 618                default:
 619                        seq_printf(m, "0x0000000000000000");
 620                        break;
 621                }
 622        }
 623
 624        for (i = 0; i < tu->tp.nr_args; i++)
 625                seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
 626
 627        seq_putc(m, '\n');
 628        return 0;
 629}
 630
 631static const struct seq_operations probes_seq_op = {
 632        .start  = probes_seq_start,
 633        .next   = probes_seq_next,
 634        .stop   = probes_seq_stop,
 635        .show   = probes_seq_show
 636};
 637
 638static int probes_open(struct inode *inode, struct file *file)
 639{
 640        int ret;
 641
 642        if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
 643                ret = cleanup_all_probes();
 644                if (ret)
 645                        return ret;
 646        }
 647
 648        return seq_open(file, &probes_seq_op);
 649}
 650
 651static ssize_t probes_write(struct file *file, const char __user *buffer,
 652                            size_t count, loff_t *ppos)
 653{
 654        return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
 655}
 656
 657static const struct file_operations uprobe_events_ops = {
 658        .owner          = THIS_MODULE,
 659        .open           = probes_open,
 660        .read           = seq_read,
 661        .llseek         = seq_lseek,
 662        .release        = seq_release,
 663        .write          = probes_write,
 664};
 665
 666/* Probes profiling interfaces */
 667static int probes_profile_seq_show(struct seq_file *m, void *v)
 668{
 669        struct trace_uprobe *tu = v;
 670
 671        seq_printf(m, "  %s %-44s %15lu\n", tu->filename,
 672                        trace_event_name(&tu->tp.call), tu->nhit);
 673        return 0;
 674}
 675
 676static const struct seq_operations profile_seq_op = {
 677        .start  = probes_seq_start,
 678        .next   = probes_seq_next,
 679        .stop   = probes_seq_stop,
 680        .show   = probes_profile_seq_show
 681};
 682
 683static int profile_open(struct inode *inode, struct file *file)
 684{
 685        return seq_open(file, &profile_seq_op);
 686}
 687
 688static const struct file_operations uprobe_profile_ops = {
 689        .owner          = THIS_MODULE,
 690        .open           = profile_open,
 691        .read           = seq_read,
 692        .llseek         = seq_lseek,
 693        .release        = seq_release,
 694};
 695
 696struct uprobe_cpu_buffer {
 697        struct mutex mutex;
 698        void *buf;
 699};
 700static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
 701static int uprobe_buffer_refcnt;
 702
 703static int uprobe_buffer_init(void)
 704{
 705        int cpu, err_cpu;
 706
 707        uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer);
 708        if (uprobe_cpu_buffer == NULL)
 709                return -ENOMEM;
 710
 711        for_each_possible_cpu(cpu) {
 712                struct page *p = alloc_pages_node(cpu_to_node(cpu),
 713                                                  GFP_KERNEL, 0);
 714                if (p == NULL) {
 715                        err_cpu = cpu;
 716                        goto err;
 717                }
 718                per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p);
 719                mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex);
 720        }
 721
 722        return 0;
 723
 724err:
 725        for_each_possible_cpu(cpu) {
 726                if (cpu == err_cpu)
 727                        break;
 728                free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf);
 729        }
 730
 731        free_percpu(uprobe_cpu_buffer);
 732        return -ENOMEM;
 733}
 734
 735static int uprobe_buffer_enable(void)
 736{
 737        int ret = 0;
 738
 739        BUG_ON(!mutex_is_locked(&event_mutex));
 740
 741        if (uprobe_buffer_refcnt++ == 0) {
 742                ret = uprobe_buffer_init();
 743                if (ret < 0)
 744                        uprobe_buffer_refcnt--;
 745        }
 746
 747        return ret;
 748}
 749
 750static void uprobe_buffer_disable(void)
 751{
 752        int cpu;
 753
 754        BUG_ON(!mutex_is_locked(&event_mutex));
 755
 756        if (--uprobe_buffer_refcnt == 0) {
 757                for_each_possible_cpu(cpu)
 758                        free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer,
 759                                                             cpu)->buf);
 760
 761                free_percpu(uprobe_cpu_buffer);
 762                uprobe_cpu_buffer = NULL;
 763        }
 764}
 765
 766static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
 767{
 768        struct uprobe_cpu_buffer *ucb;
 769        int cpu;
 770
 771        cpu = raw_smp_processor_id();
 772        ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu);
 773
 774        /*
 775         * Use per-cpu buffers for fastest access, but we might migrate
 776         * so the mutex makes sure we have sole access to it.
 777         */
 778        mutex_lock(&ucb->mutex);
 779
 780        return ucb;
 781}
 782
 783static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
 784{
 785        mutex_unlock(&ucb->mutex);
 786}
 787
 788static void __uprobe_trace_func(struct trace_uprobe *tu,
 789                                unsigned long func, struct pt_regs *regs,
 790                                struct uprobe_cpu_buffer *ucb, int dsize,
 791                                struct trace_event_file *trace_file)
 792{
 793        struct uprobe_trace_entry_head *entry;
 794        struct ring_buffer_event *event;
 795        struct ring_buffer *buffer;
 796        void *data;
 797        int size, esize;
 798        struct trace_event_call *call = &tu->tp.call;
 799
 800        WARN_ON(call != trace_file->event_call);
 801
 802        if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
 803                return;
 804
 805        if (trace_trigger_soft_disabled(trace_file))
 806                return;
 807
 808        esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 809        size = esize + tu->tp.size + dsize;
 810        event = trace_event_buffer_lock_reserve(&buffer, trace_file,
 811                                                call->event.type, size, 0, 0);
 812        if (!event)
 813                return;
 814
 815        entry = ring_buffer_event_data(event);
 816        if (is_ret_probe(tu)) {
 817                entry->vaddr[0] = func;
 818                entry->vaddr[1] = instruction_pointer(regs);
 819                data = DATAOF_TRACE_ENTRY(entry, true);
 820        } else {
 821                entry->vaddr[0] = instruction_pointer(regs);
 822                data = DATAOF_TRACE_ENTRY(entry, false);
 823        }
 824
 825        memcpy(data, ucb->buf, tu->tp.size + dsize);
 826
 827        event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
 828}
 829
 830/* uprobe handler */
 831static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
 832                             struct uprobe_cpu_buffer *ucb, int dsize)
 833{
 834        struct event_file_link *link;
 835
 836        if (is_ret_probe(tu))
 837                return 0;
 838
 839        rcu_read_lock();
 840        list_for_each_entry_rcu(link, &tu->tp.files, list)
 841                __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
 842        rcu_read_unlock();
 843
 844        return 0;
 845}
 846
 847static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
 848                                 struct pt_regs *regs,
 849                                 struct uprobe_cpu_buffer *ucb, int dsize)
 850{
 851        struct event_file_link *link;
 852
 853        rcu_read_lock();
 854        list_for_each_entry_rcu(link, &tu->tp.files, list)
 855                __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
 856        rcu_read_unlock();
 857}
 858
 859/* Event entry printers */
 860static enum print_line_t
 861print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
 862{
 863        struct uprobe_trace_entry_head *entry;
 864        struct trace_seq *s = &iter->seq;
 865        struct trace_uprobe *tu;
 866        u8 *data;
 867        int i;
 868
 869        entry = (struct uprobe_trace_entry_head *)iter->ent;
 870        tu = container_of(event, struct trace_uprobe, tp.call.event);
 871
 872        if (is_ret_probe(tu)) {
 873                trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
 874                                 trace_event_name(&tu->tp.call),
 875                                 entry->vaddr[1], entry->vaddr[0]);
 876                data = DATAOF_TRACE_ENTRY(entry, true);
 877        } else {
 878                trace_seq_printf(s, "%s: (0x%lx)",
 879                                 trace_event_name(&tu->tp.call),
 880                                 entry->vaddr[0]);
 881                data = DATAOF_TRACE_ENTRY(entry, false);
 882        }
 883
 884        for (i = 0; i < tu->tp.nr_args; i++) {
 885                struct probe_arg *parg = &tu->tp.args[i];
 886
 887                if (!parg->type->print(s, parg->name, data + parg->offset, entry))
 888                        goto out;
 889        }
 890
 891        trace_seq_putc(s, '\n');
 892
 893 out:
 894        return trace_handle_return(s);
 895}
 896
 897typedef bool (*filter_func_t)(struct uprobe_consumer *self,
 898                                enum uprobe_filter_ctx ctx,
 899                                struct mm_struct *mm);
 900
 901static int
 902probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
 903                   filter_func_t filter)
 904{
 905        bool enabled = trace_probe_is_enabled(&tu->tp);
 906        struct event_file_link *link = NULL;
 907        int ret;
 908
 909        if (file) {
 910                if (tu->tp.flags & TP_FLAG_PROFILE)
 911                        return -EINTR;
 912
 913                link = kmalloc(sizeof(*link), GFP_KERNEL);
 914                if (!link)
 915                        return -ENOMEM;
 916
 917                link->file = file;
 918                list_add_tail_rcu(&link->list, &tu->tp.files);
 919
 920                tu->tp.flags |= TP_FLAG_TRACE;
 921        } else {
 922                if (tu->tp.flags & TP_FLAG_TRACE)
 923                        return -EINTR;
 924
 925                tu->tp.flags |= TP_FLAG_PROFILE;
 926        }
 927
 928        WARN_ON(!uprobe_filter_is_empty(&tu->filter));
 929
 930        if (enabled)
 931                return 0;
 932
 933        ret = uprobe_buffer_enable();
 934        if (ret)
 935                goto err_flags;
 936
 937        tu->consumer.filter = filter;
 938        ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
 939        if (ret)
 940                goto err_buffer;
 941
 942        return 0;
 943
 944 err_buffer:
 945        uprobe_buffer_disable();
 946
 947 err_flags:
 948        if (file) {
 949                list_del(&link->list);
 950                kfree(link);
 951                tu->tp.flags &= ~TP_FLAG_TRACE;
 952        } else {
 953                tu->tp.flags &= ~TP_FLAG_PROFILE;
 954        }
 955        return ret;
 956}
 957
 958static void
 959probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
 960{
 961        if (!trace_probe_is_enabled(&tu->tp))
 962                return;
 963
 964        if (file) {
 965                struct event_file_link *link;
 966
 967                link = find_event_file_link(&tu->tp, file);
 968                if (!link)
 969                        return;
 970
 971                list_del_rcu(&link->list);
 972                /* synchronize with u{,ret}probe_trace_func */
 973                synchronize_sched();
 974                kfree(link);
 975
 976                if (!list_empty(&tu->tp.files))
 977                        return;
 978        }
 979
 980        WARN_ON(!uprobe_filter_is_empty(&tu->filter));
 981
 982        uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
 983        tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
 984
 985        uprobe_buffer_disable();
 986}
 987
 988static int uprobe_event_define_fields(struct trace_event_call *event_call)
 989{
 990        int ret, i, size;
 991        struct uprobe_trace_entry_head field;
 992        struct trace_uprobe *tu = event_call->data;
 993
 994        if (is_ret_probe(tu)) {
 995                DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
 996                DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
 997                size = SIZEOF_TRACE_ENTRY(true);
 998        } else {
 999                DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
1000                size = SIZEOF_TRACE_ENTRY(false);
1001        }
1002        /* Set argument names as fields */
1003        for (i = 0; i < tu->tp.nr_args; i++) {
1004                struct probe_arg *parg = &tu->tp.args[i];
1005
1006                ret = trace_define_field(event_call, parg->type->fmttype,
1007                                         parg->name, size + parg->offset,
1008                                         parg->type->size, parg->type->is_signed,
1009                                         FILTER_OTHER);
1010
1011                if (ret)
1012                        return ret;
1013        }
1014        return 0;
1015}
1016
1017#ifdef CONFIG_PERF_EVENTS
1018static bool
1019__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
1020{
1021        struct perf_event *event;
1022
1023        if (filter->nr_systemwide)
1024                return true;
1025
1026        list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
1027                if (event->hw.target->mm == mm)
1028                        return true;
1029        }
1030
1031        return false;
1032}
1033
1034static inline bool
1035uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
1036{
1037        return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
1038}
1039
1040static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
1041{
1042        bool done;
1043
1044        write_lock(&tu->filter.rwlock);
1045        if (event->hw.target) {
1046                list_del(&event->hw.tp_list);
1047                done = tu->filter.nr_systemwide ||
1048                        (event->hw.target->flags & PF_EXITING) ||
1049                        uprobe_filter_event(tu, event);
1050        } else {
1051                tu->filter.nr_systemwide--;
1052                done = tu->filter.nr_systemwide;
1053        }
1054        write_unlock(&tu->filter.rwlock);
1055
1056        if (!done)
1057                return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
1058
1059        return 0;
1060}
1061
1062static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
1063{
1064        bool done;
1065        int err;
1066
1067        write_lock(&tu->filter.rwlock);
1068        if (event->hw.target) {
1069                /*
1070                 * event->parent != NULL means copy_process(), we can avoid
1071                 * uprobe_apply(). current->mm must be probed and we can rely
1072                 * on dup_mmap() which preserves the already installed bp's.
1073                 *
1074                 * attr.enable_on_exec means that exec/mmap will install the
1075                 * breakpoints we need.
1076                 */
1077                done = tu->filter.nr_systemwide ||
1078                        event->parent || event->attr.enable_on_exec ||
1079                        uprobe_filter_event(tu, event);
1080                list_add(&event->hw.tp_list, &tu->filter.perf_events);
1081        } else {
1082                done = tu->filter.nr_systemwide;
1083                tu->filter.nr_systemwide++;
1084        }
1085        write_unlock(&tu->filter.rwlock);
1086
1087        err = 0;
1088        if (!done) {
1089                err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
1090                if (err)
1091                        uprobe_perf_close(tu, event);
1092        }
1093        return err;
1094}
1095
1096static bool uprobe_perf_filter(struct uprobe_consumer *uc,
1097                                enum uprobe_filter_ctx ctx, struct mm_struct *mm)
1098{
1099        struct trace_uprobe *tu;
1100        int ret;
1101
1102        tu = container_of(uc, struct trace_uprobe, consumer);
1103        read_lock(&tu->filter.rwlock);
1104        ret = __uprobe_perf_filter(&tu->filter, mm);
1105        read_unlock(&tu->filter.rwlock);
1106
1107        return ret;
1108}
1109
1110static void __uprobe_perf_func(struct trace_uprobe *tu,
1111                               unsigned long func, struct pt_regs *regs,
1112                               struct uprobe_cpu_buffer *ucb, int dsize)
1113{
1114        struct trace_event_call *call = &tu->tp.call;
1115        struct uprobe_trace_entry_head *entry;
1116        struct bpf_prog *prog = call->prog;
1117        struct hlist_head *head;
1118        void *data;
1119        int size, esize;
1120        int rctx;
1121
1122        if (prog && !trace_call_bpf(prog, regs))
1123                return;
1124
1125        esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1126
1127        size = esize + tu->tp.size + dsize;
1128        size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
1129        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
1130                return;
1131
1132        preempt_disable();
1133        head = this_cpu_ptr(call->perf_events);
1134        if (hlist_empty(head))
1135                goto out;
1136
1137        entry = perf_trace_buf_alloc(size, NULL, &rctx);
1138        if (!entry)
1139                goto out;
1140
1141        if (is_ret_probe(tu)) {
1142                entry->vaddr[0] = func;
1143                entry->vaddr[1] = instruction_pointer(regs);
1144                data = DATAOF_TRACE_ENTRY(entry, true);
1145        } else {
1146                entry->vaddr[0] = instruction_pointer(regs);
1147                data = DATAOF_TRACE_ENTRY(entry, false);
1148        }
1149
1150        memcpy(data, ucb->buf, tu->tp.size + dsize);
1151
1152        if (size - esize > tu->tp.size + dsize) {
1153                int len = tu->tp.size + dsize;
1154
1155                memset(data + len, 0, size - esize - len);
1156        }
1157
1158        perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
1159                              head, NULL, NULL);
1160 out:
1161        preempt_enable();
1162}
1163
1164/* uprobe profile handler */
1165static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
1166                            struct uprobe_cpu_buffer *ucb, int dsize)
1167{
1168        if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
1169                return UPROBE_HANDLER_REMOVE;
1170
1171        if (!is_ret_probe(tu))
1172                __uprobe_perf_func(tu, 0, regs, ucb, dsize);
1173        return 0;
1174}
1175
/*
 * uretprobe profile handler: forwards straight to __uprobe_perf_func(),
 * passing @func through unchanged.
 */
static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
				struct pt_regs *regs,
				struct uprobe_cpu_buffer *ucb, int dsize)
{
	__uprobe_perf_func(tu, func, regs, ucb, dsize);
}
1182#endif  /* CONFIG_PERF_EVENTS */
1183
1184static int
1185trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
1186                      void *data)
1187{
1188        struct trace_uprobe *tu = event->data;
1189        struct trace_event_file *file = data;
1190
1191        switch (type) {
1192        case TRACE_REG_REGISTER:
1193                return probe_event_enable(tu, file, NULL);
1194
1195        case TRACE_REG_UNREGISTER:
1196                probe_event_disable(tu, file);
1197                return 0;
1198
1199#ifdef CONFIG_PERF_EVENTS
1200        case TRACE_REG_PERF_REGISTER:
1201                return probe_event_enable(tu, NULL, uprobe_perf_filter);
1202
1203        case TRACE_REG_PERF_UNREGISTER:
1204                probe_event_disable(tu, NULL);
1205                return 0;
1206
1207        case TRACE_REG_PERF_OPEN:
1208                return uprobe_perf_open(tu, data);
1209
1210        case TRACE_REG_PERF_CLOSE:
1211                return uprobe_perf_close(tu, data);
1212
1213#endif
1214        default:
1215                return 0;
1216        }
1217        return 0;
1218}
1219
1220static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
1221{
1222        struct trace_uprobe *tu;
1223        struct uprobe_dispatch_data udd;
1224        struct uprobe_cpu_buffer *ucb;
1225        int dsize, esize;
1226        int ret = 0;
1227
1228
1229        tu = container_of(con, struct trace_uprobe, consumer);
1230        tu->nhit++;
1231
1232        udd.tu = tu;
1233        udd.bp_addr = instruction_pointer(regs);
1234
1235        current->utask->vaddr = (unsigned long) &udd;
1236
1237        if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1238                return 0;
1239
1240        dsize = __get_data_size(&tu->tp, regs);
1241        esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1242
1243        ucb = uprobe_buffer_get();
1244        store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
1245
1246        if (tu->tp.flags & TP_FLAG_TRACE)
1247                ret |= uprobe_trace_func(tu, regs, ucb, dsize);
1248
1249#ifdef CONFIG_PERF_EVENTS
1250        if (tu->tp.flags & TP_FLAG_PROFILE)
1251                ret |= uprobe_perf_func(tu, regs, ucb, dsize);
1252#endif
1253        uprobe_buffer_put(ucb);
1254        return ret;
1255}
1256
1257static int uretprobe_dispatcher(struct uprobe_consumer *con,
1258                                unsigned long func, struct pt_regs *regs)
1259{
1260        struct trace_uprobe *tu;
1261        struct uprobe_dispatch_data udd;
1262        struct uprobe_cpu_buffer *ucb;
1263        int dsize, esize;
1264
1265        tu = container_of(con, struct trace_uprobe, consumer);
1266
1267        udd.tu = tu;
1268        udd.bp_addr = func;
1269
1270        current->utask->vaddr = (unsigned long) &udd;
1271
1272        if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1273                return 0;
1274
1275        dsize = __get_data_size(&tu->tp, regs);
1276        esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1277
1278        ucb = uprobe_buffer_get();
1279        store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
1280
1281        if (tu->tp.flags & TP_FLAG_TRACE)
1282                uretprobe_trace_func(tu, func, regs, ucb, dsize);
1283
1284#ifdef CONFIG_PERF_EVENTS
1285        if (tu->tp.flags & TP_FLAG_PROFILE)
1286                uretprobe_perf_func(tu, func, regs, ucb, dsize);
1287#endif
1288        uprobe_buffer_put(ucb);
1289        return 0;
1290}
1291
1292static struct trace_event_functions uprobe_funcs = {
1293        .trace          = print_uprobe_event
1294};
1295
1296static int register_uprobe_event(struct trace_uprobe *tu)
1297{
1298        struct trace_event_call *call = &tu->tp.call;
1299        int ret;
1300
1301        /* Initialize trace_event_call */
1302        INIT_LIST_HEAD(&call->class->fields);
1303        call->event.funcs = &uprobe_funcs;
1304        call->class->define_fields = uprobe_event_define_fields;
1305
1306        if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
1307                return -ENOMEM;
1308
1309        ret = register_trace_event(&call->event);
1310        if (!ret) {
1311                kfree(call->print_fmt);
1312                return -ENODEV;
1313        }
1314
1315        call->flags = TRACE_EVENT_FL_UPROBE;
1316        call->class->reg = trace_uprobe_register;
1317        call->data = tu;
1318        ret = trace_add_event_call(call);
1319
1320        if (ret) {
1321                pr_info("Failed to register uprobe event: %s\n",
1322                        trace_event_name(call));
1323                kfree(call->print_fmt);
1324                unregister_trace_event(&call->event);
1325        }
1326
1327        return ret;
1328}
1329
1330static int unregister_uprobe_event(struct trace_uprobe *tu)
1331{
1332        int ret;
1333
1334        /* tu->event is unregistered in trace_remove_event_call() */
1335        ret = trace_remove_event_call(&tu->tp.call);
1336        if (ret)
1337                return ret;
1338        kfree(tu->tp.call.print_fmt);
1339        tu->tp.call.print_fmt = NULL;
1340        return 0;
1341}
1342
/* Make a trace interface for controlling probe points */
1344static __init int init_uprobe_trace(void)
1345{
1346        struct dentry *d_tracer;
1347
1348        d_tracer = tracing_init_dentry();
1349        if (IS_ERR(d_tracer))
1350                return 0;
1351
1352        trace_create_file("uprobe_events", 0644, d_tracer,
1353                                    NULL, &uprobe_events_ops);
1354        /* Profile interface */
1355        trace_create_file("uprobe_profile", 0444, d_tracer,
1356                                    NULL, &uprobe_profile_ops);
1357        return 0;
1358}
1359
1360fs_initcall(init_uprobe_trace);
1361