linux/drivers/oprofile/buffer_sync.c
/**
 * @file buffer_sync.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf
 * @author Robert Richter <robert.richter@amd.com>
 *
 * This is the core of the buffer management. Each
 * CPU buffer is processed and entered into the
 * global event buffer. Such processing is necessary
 * in several circumstances, mentioned below.
 *
 * The processing does the job of converting the
 * transitory EIP value into a persistent dentry/offset
 * value that the profiler can record at its leisure.
 *
 * See fs/dcookies.c for a description of the dentry/offset
 * objects.
 */

#include <linux/file.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/dcookies.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/oprofile.h>
#include <linux/sched.h>
#include <linux/gfp.h>

#include "oprofile_stats.h"
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"

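/*
 * Task "mortuary": tasks handed over by the task-free notifier are parked
 * on dying_tasks, moved to dead_tasks on the next mortuary pass, and only
 * freed on the pass after that, by which point no CPU buffer can still
 * reference them. marked_cpus records which CPUs have been synced since
 * the last pass.
 */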
static LIST_HEAD(dying_tasks);
static LIST_HEAD(dead_tasks);
static cpumask_var_t marked_cpus;
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);

/* Take ownership of the task struct and place it on the
 * list for processing. Only after two full buffer syncs
 * does the task eventually get freed, because by then
 * we are sure we will not reference it again.
 * Can be invoked from softirq via RCU callback due to
 * call_rcu() of the task struct, hence the _irqsave.
 */
static int
task_free_notify(struct notifier_block *self, unsigned long val, void *data)
{
        unsigned long flags;
        struct task_struct *task = data;
        spin_lock_irqsave(&task_mortuary, flags);
        list_add(&task->tasks, &dying_tasks);
        spin_unlock_irqrestore(&task_mortuary, flags);
        return NOTIFY_OK;
}


/* The task is on its way out. A sync of the buffer means we can catch
 * any remaining samples for this task.
 */
static int
task_exit_notify(struct notifier_block *self, unsigned long val, void *data)
{
        /* To avoid latency problems, we only process the current CPU,
         * hoping that most samples for the task are on this CPU
         */
        sync_buffer(raw_smp_processor_id());
        return 0;
}


/* The task is about to try a do_munmap(). We peek at what it's going to
 * do, and if it's an executable region, process the samples first, so
 * we don't lose any. This does not have to be exact, it's a QoI issue
 * only.
 */
static int
munmap_notify(struct notifier_block *self, unsigned long val, void *data)
{
        unsigned long addr = (unsigned long)data;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *mpnt;

        down_read(&mm->mmap_sem);

        mpnt = find_vma(mm, addr);
        if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
                up_read(&mm->mmap_sem);
                /* To avoid latency problems, we only process the current CPU,
                 * hoping that most samples for the task are on this CPU
                 */
                sync_buffer(raw_smp_processor_id());
                return 0;
        }

        up_read(&mm->mmap_sem);
        return 0;
}


/* We need to be told about new modules so we don't attribute to a previously
 * loaded module, or drop the samples on the floor.
 */
static int
module_load_notify(struct notifier_block *self, unsigned long val, void *data)
{
#ifdef CONFIG_MODULES
        if (val != MODULE_STATE_COMING)
                return 0;

        /* FIXME: should we process all CPU buffers ? */
        mutex_lock(&buffer_mutex);
        add_event_entry(ESCAPE_CODE);
        add_event_entry(MODULE_LOADED_CODE);
        mutex_unlock(&buffer_mutex);
#endif
        return 0;
}


static struct notifier_block task_free_nb = {
        .notifier_call  = task_free_notify,
};

static struct notifier_block task_exit_nb = {
        .notifier_call  = task_exit_notify,
};

static struct notifier_block munmap_nb = {
        .notifier_call  = munmap_notify,
};

static struct notifier_block module_load_nb = {
        .notifier_call = module_load_notify,
};

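/* Flush both stages of the mortuary: the first pass frees whatever is
 * already on dead_tasks and moves dying_tasks across; the second pass
 * frees those too, so nothing queued so far can leak.
 */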
static void free_all_tasks(void)
{
        /* make sure we don't leak task structs */
        process_task_mortuary();
        process_task_mortuary();
}

int sync_start(void)
{
        int err;

        if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL))
                return -ENOMEM;

        err = task_handoff_register(&task_free_nb);
        if (err)
                goto out1;
        err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
        if (err)
                goto out2;
        err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
        if (err)
                goto out3;
        err = register_module_notifier(&module_load_nb);
        if (err)
                goto out4;

        start_cpu_work();

out:
        return err;
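        /*
         * Error unwind: each label below undoes the registrations that
         * succeeded before the failing step, in reverse order, then jumps
         * back up to "out".
         */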
out4:
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
out3:
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
out2:
        task_handoff_unregister(&task_free_nb);
        free_all_tasks();
out1:
        free_cpumask_var(marked_cpus);
        goto out;
}


void sync_stop(void)
{
        end_cpu_work();
        unregister_module_notifier(&module_load_nb);
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
        task_handoff_unregister(&task_free_nb);
        barrier();                      /* do all of the above first */

        flush_cpu_work();

        free_all_tasks();
        free_cpumask_var(marked_cpus);
}


/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer). */
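/* If DCACHE_COOKIE is already set, a cookie has been allocated for this
 * dentry before and the dentry pointer itself serves as the cookie value,
 * so get_dcookie() can be skipped entirely.
 */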
static inline unsigned long fast_get_dcookie(const struct path *path)
{
        unsigned long cookie;

        if (path->dentry->d_flags & DCACHE_COOKIE)
                return (unsigned long)path->dentry;
        get_dcookie(path, &cookie);
        return cookie;
}


/* Look up the dcookie for the task's mm->exe_file,
 * which corresponds loosely to "application name". This is
 * not strictly necessary but allows oprofile to associate
 * shared-library samples with particular applications.
 */
static unsigned long get_exec_dcookie(struct mm_struct *mm)
{
        unsigned long cookie = NO_COOKIE;
        struct file *exe_file;

        if (!mm)
                goto done;

        exe_file = get_mm_exe_file(mm);
        if (!exe_file)
                goto done;

        cookie = fast_get_dcookie(&exe_file->f_path);
        fput(exe_file);
done:
        return cookie;
}


/* Convert the EIP value of a sample into a persistent dentry/offset
 * pair that can then be added to the global event buffer. We make
 * sure to do this lookup before a mm->mmap modification happens so
 * we don't lose track.
 *
 * The caller must ensure the mm is not NULL (i.e. not a kernel thread).
 */
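/* If the address falls in a hole between VMAs, the loop runs off the end
 * and we return INVALID_COOKIE so the caller can count the sample as lost.
 */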
static unsigned long
lookup_dcookie(struct mm_struct *mm, unsigned long addr, off_t *offset)
{
        unsigned long cookie = NO_COOKIE;
        struct vm_area_struct *vma;

        down_read(&mm->mmap_sem);
        for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {

                if (addr < vma->vm_start || addr >= vma->vm_end)
                        continue;

                if (vma->vm_file) {
                        cookie = fast_get_dcookie(&vma->vm_file->f_path);
                        *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
                                vma->vm_start;
                } else {
                        /* must be an anonymous map */
                        *offset = addr;
                }

                break;
        }

        if (!vma)
                cookie = INVALID_COOKIE;
        up_read(&mm->mmap_sem);

        return cookie;
}

static unsigned long last_cookie = INVALID_COOKIE;

static void add_cpu_switch(int i)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CPU_SWITCH_CODE);
        add_event_entry(i);
        last_cookie = INVALID_COOKIE;
}

static void add_kernel_ctx_switch(unsigned int in_kernel)
{
        add_event_entry(ESCAPE_CODE);
        if (in_kernel)
                add_event_entry(KERNEL_ENTER_SWITCH_CODE);
        else
                add_event_entry(KERNEL_EXIT_SWITCH_CODE);
}

static void
add_user_ctx_switch(struct task_struct const *task, unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_SWITCH_CODE);
        add_event_entry(task->pid);
        add_event_entry(cookie);
        /* Another code for daemon back-compat */
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_TGID_CODE);
        add_event_entry(task->tgid);
}


static void add_cookie_switch(unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(COOKIE_SWITCH_CODE);
        add_event_entry(cookie);
}


static void add_trace_begin(void)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(TRACE_BEGIN_CODE);
}

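/* Translate one extended sample (a code word, a pc, and a variable number
 * of payload words) from the CPU buffer into event buffer entries,
 * converting the pc into a dcookie/offset pair when an mm is available.
 */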
static void add_data(struct op_entry *entry, struct mm_struct *mm)
{
        unsigned long code, pc, val;
        unsigned long cookie;
        off_t offset;

        if (!op_cpu_buffer_get_data(entry, &code))
                return;
        if (!op_cpu_buffer_get_data(entry, &pc))
                return;
        if (!op_cpu_buffer_get_size(entry))
                return;

        if (mm) {
                cookie = lookup_dcookie(mm, pc, &offset);

                if (cookie == NO_COOKIE)
                        offset = pc;
                if (cookie == INVALID_COOKIE) {
                        atomic_inc(&oprofile_stats.sample_lost_no_mapping);
                        offset = pc;
                }
                if (cookie != last_cookie) {
                        add_cookie_switch(cookie);
                        last_cookie = cookie;
                }
        } else
                offset = pc;

        add_event_entry(ESCAPE_CODE);
        add_event_entry(code);
        add_event_entry(offset);        /* Offset from Dcookie */

        while (op_cpu_buffer_get_data(entry, &val))
                add_event_entry(val);
}

static inline void add_sample_entry(unsigned long offset, unsigned long event)
{
        add_event_entry(offset);
        add_event_entry(event);
}


/*
 * Add a sample to the global event buffer. If possible the
 * sample is converted into a persistent dentry/offset pair
 * for later lookup from userspace. Return 0 on failure.
 */
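/* Kernel-mode samples are stored with their raw EIP; only user-mode
 * samples need the dcookie/offset translation.
 */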
static int
add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel)
{
        unsigned long cookie;
        off_t offset;

        if (in_kernel) {
                add_sample_entry(s->eip, s->event);
                return 1;
        }

        /* add userspace sample */

        if (!mm) {
                atomic_inc(&oprofile_stats.sample_lost_no_mm);
                return 0;
        }

        cookie = lookup_dcookie(mm, s->eip, &offset);

        if (cookie == INVALID_COOKIE) {
                atomic_inc(&oprofile_stats.sample_lost_no_mapping);
                return 0;
        }

        if (cookie != last_cookie) {
                add_cookie_switch(cookie);
                last_cookie = cookie;
        }

        add_sample_entry(offset, s->event);

        return 1;
}


static void release_mm(struct mm_struct *mm)
{
        if (!mm)
                return;
        mmput(mm);
}

static inline int is_code(unsigned long val)
{
        return val == ESCAPE_CODE;
}


/* Move tasks along towards death. Any tasks on dead_tasks
 * will definitely have no remaining references in any
 * CPU buffers at this point, because we use two lists,
 * and to have reached that list, a task must already have
 * gone through one full sync.
 */
static void process_task_mortuary(void)
{
        unsigned long flags;
        LIST_HEAD(local_dead_tasks);
        struct task_struct *task;
        struct task_struct *ttask;

        spin_lock_irqsave(&task_mortuary, flags);

        list_splice_init(&dead_tasks, &local_dead_tasks);
        list_splice_init(&dying_tasks, &dead_tasks);

        spin_unlock_irqrestore(&task_mortuary, flags);

        list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
                list_del(&task->tasks);
                free_task(task);
        }
}


static void mark_done(int cpu)
{
        int i;

        cpumask_set_cpu(cpu, marked_cpus);

        for_each_online_cpu(i) {
                if (!cpumask_test_cpu(i, marked_cpus))
                        return;
        }

        /* All CPUs have been processed at least once,
         * so we can process the mortuary once
         */
        process_task_mortuary();

        cpumask_clear(marked_cpus);
}


/* FIXME: this is not sufficient if we implement syscall barrier backtrace
 * traversal; the state switches to sb_sample_start at the first kernel
 * enter/exit switch, so we would need a fifth state and some special
 * handling in sync_buffer()
 */
typedef enum {
        sb_bt_ignore = -2,
        sb_buffer_start,
        sb_bt_start,
        sb_sample_start,
} sync_buffer_state;
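/*
 * Note the ordering: sb_bt_ignore (-2) and sb_buffer_start (-1) sort below
 * sb_bt_start (0), which lets sync_buffer() discard samples with a single
 * "state < sb_bt_start" test.
 */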

/* Sync one of the CPU's buffers into the global event buffer.
 * Here we need to go through each batch of samples punctuated
 * by context switch notes, taking the task's mmap_sem and doing
 * a lookup in task->mm->mmap to convert each EIP into a
 * dcookie/offset value.
 */
void sync_buffer(int cpu)
{
        struct mm_struct *mm = NULL;
        struct mm_struct *oldmm;
        unsigned long val;
        struct task_struct *new;
        unsigned long cookie = 0;
        int in_kernel = 1;
        sync_buffer_state state = sb_buffer_start;
        unsigned int i;
        unsigned long available;
        unsigned long flags;
        struct op_entry entry;
        struct op_sample *sample;

        mutex_lock(&buffer_mutex);

        add_cpu_switch(cpu);

        op_cpu_buffer_reset(cpu);
        available = op_cpu_buffer_entries(cpu);

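        /*
         * Each entry is either an escape record (eip == ESCAPE_CODE) that
         * updates state (trace begin, kernel/user switch, task switch, or
         * extended data), or an ordinary sample to be translated and copied
         * into the event buffer.
         */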
        for (i = 0; i < available; ++i) {
                sample = op_cpu_buffer_read_entry(&entry, cpu);
                if (!sample)
                        break;

                if (is_code(sample->eip)) {
                        flags = sample->event;
                        if (flags & TRACE_BEGIN) {
                                state = sb_bt_start;
                                add_trace_begin();
                        }
                        if (flags & KERNEL_CTX_SWITCH) {
                                /* kernel/userspace switch */
                                in_kernel = flags & IS_KERNEL;
                                if (state == sb_buffer_start)
                                        state = sb_sample_start;
                                add_kernel_ctx_switch(flags & IS_KERNEL);
                        }
                        if (flags & USER_CTX_SWITCH
                            && op_cpu_buffer_get_data(&entry, &val)) {
                                /* userspace context switch */
                                new = (struct task_struct *)val;
                                oldmm = mm;
                                release_mm(oldmm);
                                mm = get_task_mm(new);
                                if (mm != oldmm)
                                        cookie = get_exec_dcookie(mm);
                                add_user_ctx_switch(new, cookie);
                        }
                        if (op_cpu_buffer_get_size(&entry))
                                add_data(&entry, mm);
                        continue;
                }

                if (state < sb_bt_start)
                        /* ignore sample */
                        continue;

                if (add_sample(mm, sample, in_kernel))
                        continue;

                /* ignore backtraces if we failed to add a sample */
                if (state == sb_bt_start) {
                        state = sb_bt_ignore;
                        atomic_inc(&oprofile_stats.bt_lost_no_mapping);
                }
        }
        release_mm(mm);

        mark_done(cpu);

        mutex_unlock(&buffer_mutex);
}

/* This function can be used to add a buffer's worth of data directly to
 * the kernel event buffer. The buffer is assumed to be a circular buffer.
 * Take the entries starting at index start and ending at index stop,
 * wrapping at max.
 */
void oprofile_put_buff(unsigned long *buf, unsigned int start,
                       unsigned int stop, unsigned int max)
{
        int i;

        i = start;

        mutex_lock(&buffer_mutex);
        while (i != stop) {
                add_event_entry(buf[i++]);

                if (i >= max)
                        i = 0;
        }

        mutex_unlock(&buffer_mutex);
}