linux/drivers/oprofile/buffer_sync.c
/**
 * @file buffer_sync.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf
 * @author Robert Richter <robert.richter@amd.com>
 *
 * This is the core of the buffer-management code. Each
 * CPU buffer is processed and entered into the
 * global event buffer. Such processing is necessary
 * in several circumstances, mentioned below.
 *
 * The processing does the job of converting the
 * transitory EIP value into a persistent dentry/offset
 * value that the profiler can record at its leisure.
 *
 * See fs/dcookies.c for a description of the dentry/offset
 * objects.
 */

#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/dcookies.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/oprofile.h>
#include <linux/sched.h>

#include "oprofile_stats.h"
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"

static LIST_HEAD(dying_tasks);
static LIST_HEAD(dead_tasks);
static cpumask_var_t marked_cpus;
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);

/* Take ownership of the task struct and place it on the
 * list for processing. Only after two full buffer syncs
 * does the task eventually get freed, because by then
 * we are sure we will not reference it again.
 * Can be invoked from softirq via RCU callback due to
 * call_rcu() of the task struct, hence the _irqsave.
 */
static int
task_free_notify(struct notifier_block *self, unsigned long val, void *data)
{
        unsigned long flags;
        struct task_struct *task = data;
        spin_lock_irqsave(&task_mortuary, flags);
        list_add(&task->tasks, &dying_tasks);
        spin_unlock_irqrestore(&task_mortuary, flags);
        return NOTIFY_OK;
}


/* The task is on its way out. A sync of the buffer means we can catch
 * any remaining samples for this task.
 */
static int
task_exit_notify(struct notifier_block *self, unsigned long val, void *data)
{
        /* To avoid latency problems, we only process the current CPU,
         * hoping that most samples for the task are on this CPU
         */
        sync_buffer(raw_smp_processor_id());
        return 0;
}


/* The task is about to try a do_munmap(). We peek at what it's going to
 * do, and if it's an executable region, process the samples first, so
 * we don't lose any. This does not have to be exact; it is only a
 * quality-of-implementation (QoI) issue.
 */
static int
munmap_notify(struct notifier_block *self, unsigned long val, void *data)
{
        unsigned long addr = (unsigned long)data;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *mpnt;

        down_read(&mm->mmap_sem);

        mpnt = find_vma(mm, addr);
        if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
                up_read(&mm->mmap_sem);
                /* To avoid latency problems, we only process the current CPU,
                 * hoping that most samples for the task are on this CPU
                 */
                sync_buffer(raw_smp_processor_id());
                return 0;
        }

        up_read(&mm->mmap_sem);
        return 0;
}


/* We need to be told about new modules so we don't attribute to a previously
 * loaded module, or drop the samples on the floor.
 */
static int
module_load_notify(struct notifier_block *self, unsigned long val, void *data)
{
#ifdef CONFIG_MODULES
        if (val != MODULE_STATE_COMING)
                return 0;

        /* FIXME: should we process all CPU buffers ? */
        mutex_lock(&buffer_mutex);
        add_event_entry(ESCAPE_CODE);
        add_event_entry(MODULE_LOADED_CODE);
        mutex_unlock(&buffer_mutex);
#endif
        return 0;
}


static struct notifier_block task_free_nb = {
        .notifier_call  = task_free_notify,
};

static struct notifier_block task_exit_nb = {
        .notifier_call  = task_exit_notify,
};

static struct notifier_block munmap_nb = {
        .notifier_call  = munmap_notify,
};

static struct notifier_block module_load_nb = {
        .notifier_call = module_load_notify,
};


static void end_sync(void)
{
        end_cpu_work();
        /* make sure we don't leak task structs */
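        /* Running process_task_mortuary() twice here moves every remaining
         * task from dying_tasks through dead_tasks to its final free_task(),
         * as described above task_free_notify(). */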
        process_task_mortuary();
        process_task_mortuary();
}


int sync_start(void)
{
        int err;

        if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL))
                return -ENOMEM;

        start_cpu_work();

        err = task_handoff_register(&task_free_nb);
        if (err)
                goto out1;
        err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
        if (err)
                goto out2;
        err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
        if (err)
                goto out3;
        err = register_module_notifier(&module_load_nb);
        if (err)
                goto out4;

out:
        return err;
out4:
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
out3:
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
out2:
        task_handoff_unregister(&task_free_nb);
out1:
        end_sync();
        free_cpumask_var(marked_cpus);
        goto out;
}


void sync_stop(void)
{
        unregister_module_notifier(&module_load_nb);
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
        task_handoff_unregister(&task_free_nb);
        end_sync();
        free_cpumask_var(marked_cpus);
}


/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer). */
static inline unsigned long fast_get_dcookie(struct path *path)
{
        unsigned long cookie;

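        /* DCACHE_COOKIE means a dcookie has already been allocated for this
         * dentry; its value is simply the dentry's address (see fs/dcookies.c),
         * so the full lookup can be skipped. */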
        if (path->dentry->d_flags & DCACHE_COOKIE)
                return (unsigned long)path->dentry;
        get_dcookie(path, &cookie);
        return cookie;
}


/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
 * which corresponds loosely to "application name". This is
 * not strictly necessary but allows oprofile to associate
 * shared-library samples with particular applications.
 */
static unsigned long get_exec_dcookie(struct mm_struct *mm)
{
        unsigned long cookie = NO_COOKIE;
        struct vm_area_struct *vma;

        if (!mm)
                goto out;

        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (!vma->vm_file)
                        continue;
                if (!(vma->vm_flags & VM_EXECUTABLE))
                        continue;
                cookie = fast_get_dcookie(&vma->vm_file->f_path);
                break;
        }

out:
        return cookie;
}


/* Convert the EIP value of a sample into a persistent dentry/offset
 * pair that can then be added to the global event buffer. We make
 * sure to do this lookup before a mm->mmap modification happens so
 * we don't lose track.
 */
static unsigned long
lookup_dcookie(struct mm_struct *mm, unsigned long addr, off_t *offset)
{
        unsigned long cookie = NO_COOKIE;
        struct vm_area_struct *vma;

        for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {

                if (addr < vma->vm_start || addr >= vma->vm_end)
                        continue;

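                /* File-backed mapping: report the sample as an offset into
                 * the backing file; anonymous mappings keep the raw address. */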
                if (vma->vm_file) {
                        cookie = fast_get_dcookie(&vma->vm_file->f_path);
                        *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
                                vma->vm_start;
                } else {
                        /* must be an anonymous map */
                        *offset = addr;
                }

                break;
        }

        if (!vma)
                cookie = INVALID_COOKIE;

        return cookie;
}

static unsigned long last_cookie = INVALID_COOKIE;

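/* Records destined for the event buffer share a simple framing: an
 * ESCAPE_CODE entry, followed by a code word (CPU_SWITCH_CODE,
 * CTX_SWITCH_CODE, COOKIE_SWITCH_CODE, ...) and its payload. The helpers
 * below emit these records; the userspace daemon parses the same stream.
 */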
static void add_cpu_switch(int i)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CPU_SWITCH_CODE);
        add_event_entry(i);
        last_cookie = INVALID_COOKIE;
}

static void add_kernel_ctx_switch(unsigned int in_kernel)
{
        add_event_entry(ESCAPE_CODE);
        if (in_kernel)
                add_event_entry(KERNEL_ENTER_SWITCH_CODE);
        else
                add_event_entry(KERNEL_EXIT_SWITCH_CODE);
}

static void
add_user_ctx_switch(struct task_struct const *task, unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_SWITCH_CODE);
        add_event_entry(task->pid);
        add_event_entry(cookie);
        /* Another code for daemon back-compat */
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_TGID_CODE);
        add_event_entry(task->tgid);
}


static void add_cookie_switch(unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(COOKIE_SWITCH_CODE);
        add_event_entry(cookie);
}


static void add_trace_begin(void)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(TRACE_BEGIN_CODE);
}

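/* Forward an extended sample record (a code word, a pc and a variable-length
 * payload read from the CPU buffer entry) into the event buffer, converting
 * the pc into a dcookie/offset pair when an mm is available.
 */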
static void add_data(struct op_entry *entry, struct mm_struct *mm)
{
        unsigned long code, pc, val;
        unsigned long cookie;
        off_t offset;

        if (!op_cpu_buffer_get_data(entry, &code))
                return;
        if (!op_cpu_buffer_get_data(entry, &pc))
                return;
        if (!op_cpu_buffer_get_size(entry))
                return;

        if (mm) {
                cookie = lookup_dcookie(mm, pc, &offset);

                if (cookie == NO_COOKIE)
                        offset = pc;
                if (cookie == INVALID_COOKIE) {
                        atomic_inc(&oprofile_stats.sample_lost_no_mapping);
                        offset = pc;
                }
                if (cookie != last_cookie) {
                        add_cookie_switch(cookie);
                        last_cookie = cookie;
                }
        } else
                offset = pc;

        add_event_entry(ESCAPE_CODE);
        add_event_entry(code);
        add_event_entry(offset);        /* Offset from Dcookie */

        while (op_cpu_buffer_get_data(entry, &val))
                add_event_entry(val);
}

static inline void add_sample_entry(unsigned long offset, unsigned long event)
{
        add_event_entry(offset);
        add_event_entry(event);
}


/*
 * Add a sample to the global event buffer. If possible the
 * sample is converted into a persistent dentry/offset pair
 * for later lookup from userspace. Return 0 on failure.
 */
static int
add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel)
{
        unsigned long cookie;
        off_t offset;

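        /* Kernel-mode samples need no dcookie lookup; the raw EIP is
         * recorded directly. */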
        if (in_kernel) {
                add_sample_entry(s->eip, s->event);
                return 1;
        }

        /* add userspace sample */

        if (!mm) {
                atomic_inc(&oprofile_stats.sample_lost_no_mm);
                return 0;
        }

        cookie = lookup_dcookie(mm, s->eip, &offset);

        if (cookie == INVALID_COOKIE) {
                atomic_inc(&oprofile_stats.sample_lost_no_mapping);
                return 0;
        }

        if (cookie != last_cookie) {
                add_cookie_switch(cookie);
                last_cookie = cookie;
        }

        add_sample_entry(offset, s->event);

        return 1;
}


static void release_mm(struct mm_struct *mm)
{
        if (!mm)
                return;
        up_read(&mm->mmap_sem);
        mmput(mm);
}


static struct mm_struct *take_tasks_mm(struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);
        if (mm)
                down_read(&mm->mmap_sem);
        return mm;
}


static inline int is_code(unsigned long val)
{
        return val == ESCAPE_CODE;
}


/* Move tasks along towards death. Any tasks on dead_tasks
 * will definitely have no remaining references in any
 * CPU buffers at this point, because we use two lists,
 * and to have reached the list, it must have gone through
 * one full sync already.
 */
static void process_task_mortuary(void)
{
        unsigned long flags;
        LIST_HEAD(local_dead_tasks);
        struct task_struct *task;
        struct task_struct *ttask;

        spin_lock_irqsave(&task_mortuary, flags);

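        /* Promote in two stages: tasks that have already survived one full
         * sync (dead_tasks) are taken for freeing, while freshly exited tasks
         * (dying_tasks) move up to dead_tasks for the next pass. */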
        list_splice_init(&dead_tasks, &local_dead_tasks);
        list_splice_init(&dying_tasks, &dead_tasks);

        spin_unlock_irqrestore(&task_mortuary, flags);

        list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
                list_del(&task->tasks);
                free_task(task);
        }
}


static void mark_done(int cpu)
{
        int i;

        cpumask_set_cpu(cpu, marked_cpus);

        for_each_online_cpu(i) {
                if (!cpumask_test_cpu(i, marked_cpus))
                        return;
        }

        /* All CPUs have been processed at least once,
         * so we can process the mortuary once.
         */
        process_task_mortuary();

        cpumask_clear(marked_cpus);
}


/* FIXME: this is not sufficient if we implement syscall barrier backtrace
 * traversal: the state switches to sb_sample_start at the first kernel
 * enter/exit switch, so we would need a fifth state and some special
 * handling in sync_buffer().
 */
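/* State of the per-CPU sync loop, as used in sync_buffer() below:
 * sb_buffer_start - no context switch seen yet, samples cannot be
 *                   attributed and are dropped;
 * sb_sample_start - normal samples are attributed and emitted;
 * sb_bt_start     - inside a backtrace begun by a TRACE_BEGIN record;
 * sb_bt_ignore    - the rest of the current backtrace is discarded because
 *                   a sample in it could not be mapped.
 */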
typedef enum {
        sb_bt_ignore = -2,
        sb_buffer_start,
        sb_bt_start,
        sb_sample_start,
} sync_buffer_state;

/* Sync one of the CPU's buffers into the global event buffer.
 * Here we need to go through each batch of samples punctuated
 * by context switch notes, taking the task's mmap_sem and doing
 * lookup in task->mm->mmap to convert EIP into dcookie/offset
 * value.
 */
void sync_buffer(int cpu)
{
        struct mm_struct *mm = NULL;
        struct mm_struct *oldmm;
        unsigned long val;
        struct task_struct *new;
        unsigned long cookie = 0;
        int in_kernel = 1;
        sync_buffer_state state = sb_buffer_start;
        unsigned int i;
        unsigned long available;
        unsigned long flags;
        struct op_entry entry;
        struct op_sample *sample;

        mutex_lock(&buffer_mutex);

        add_cpu_switch(cpu);

        op_cpu_buffer_reset(cpu);
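        /* Snapshot how many entries are currently readable; anything added
         * to the CPU buffer after this point is left for the next sync. */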
        available = op_cpu_buffer_entries(cpu);

        for (i = 0; i < available; ++i) {
                sample = op_cpu_buffer_read_entry(&entry, cpu);
                if (!sample)
                        break;

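                /* An entry whose eip is ESCAPE_CODE carries metadata rather
                 * than a sample: its event field holds flags describing a
                 * trace begin, a kernel/user transition or a task switch,
                 * possibly followed by extra payload. */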
                if (is_code(sample->eip)) {
                        flags = sample->event;
                        if (flags & TRACE_BEGIN) {
                                state = sb_bt_start;
                                add_trace_begin();
                        }
                        if (flags & KERNEL_CTX_SWITCH) {
                                /* kernel/userspace switch */
                                in_kernel = flags & IS_KERNEL;
                                if (state == sb_buffer_start)
                                        state = sb_sample_start;
                                add_kernel_ctx_switch(flags & IS_KERNEL);
                        }
                        if (flags & USER_CTX_SWITCH
                            && op_cpu_buffer_get_data(&entry, &val)) {
                                /* userspace context switch */
                                new = (struct task_struct *)val;
                                oldmm = mm;
                                release_mm(oldmm);
                                mm = take_tasks_mm(new);
                                if (mm != oldmm)
                                        cookie = get_exec_dcookie(mm);
                                add_user_ctx_switch(new, cookie);
                        }
                        if (op_cpu_buffer_get_size(&entry))
                                add_data(&entry, mm);
                        continue;
                }

                if (state < sb_bt_start)
                        /* ignore sample */
                        continue;

                if (add_sample(mm, sample, in_kernel))
                        continue;

                /* ignore backtraces if failed to add a sample */
                if (state == sb_bt_start) {
                        state = sb_bt_ignore;
                        atomic_inc(&oprofile_stats.bt_lost_no_mapping);
                }
        }
        release_mm(mm);

        mark_done(cpu);

        mutex_unlock(&buffer_mutex);
}

/* This function can be used to add a buffer worth of data directly to
 * the kernel event buffer. The source is assumed to be a circular buffer:
 * entries are taken starting at index start and ending at index stop,
 * wrapping at max.
 */
void oprofile_put_buff(unsigned long *buf, unsigned int start,
                       unsigned int stop, unsigned int max)
{
        int i;

        i = start;

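        /* buffer_mutex serializes us against sync_buffer() and the event
         * buffer reader; add_event_entry() is only called with it held. */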
        mutex_lock(&buffer_mutex);
        while (i != stop) {
                add_event_entry(buf[i++]);

                if (i >= max)
                        i = 0;
        }

        mutex_unlock(&buffer_mutex);
}
