linux/drivers/oprofile/buffer_sync.c
/**
 * @file buffer_sync.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf
 * @author Robert Richter <robert.richter@amd.com>
 *
 * This is the core of the buffer management. Each
 * CPU buffer is processed and entered into the
 * global event buffer. Such processing is necessary
 * in several circumstances, mentioned below.
 *
 * The processing does the job of converting the
 * transitory EIP value into a persistent dentry/offset
 * value that the profiler can record at its leisure.
 *
 * See fs/dcookies.c for a description of the dentry/offset
 * objects.
 */

#include <linux/file.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/dcookies.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/oprofile.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/gfp.h>

#include "oprofile_stats.h"
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"

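/* Tasks queued on dying_tasks move to dead_tasks after one full sync of
 * every CPU buffer and are freed after a second; marked_cpus tracks which
 * CPU buffers have been synced in the current round (see mark_done()).
 */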
static LIST_HEAD(dying_tasks);
static LIST_HEAD(dead_tasks);
static cpumask_var_t marked_cpus;
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);

/* Take ownership of the task struct and place it on the
 * list for processing. Only after two full buffer syncs
 * does the task eventually get freed, because by then
 * we are sure we will not reference it again.
 * Can be invoked from softirq via RCU callback due to
 * call_rcu() of the task struct, hence the _irqsave.
 */
static int
task_free_notify(struct notifier_block *self, unsigned long val, void *data)
{
        unsigned long flags;
        struct task_struct *task = data;
        spin_lock_irqsave(&task_mortuary, flags);
        list_add(&task->tasks, &dying_tasks);
        spin_unlock_irqrestore(&task_mortuary, flags);
        return NOTIFY_OK;
}


/* The task is on its way out. A sync of the buffer means we can catch
 * any remaining samples for this task.
 */
static int
task_exit_notify(struct notifier_block *self, unsigned long val, void *data)
{
        /* To avoid latency problems, we only process the current CPU,
         * hoping that most samples for the task are on this CPU.
         */
        sync_buffer(raw_smp_processor_id());
        return 0;
}


/* The task is about to try a do_munmap(). We peek at what it's going to
 * do, and if it's an executable region, process the samples first, so
 * we don't lose any. This does not have to be exact; it is a
 * quality-of-implementation issue only.
 */
static int
munmap_notify(struct notifier_block *self, unsigned long val, void *data)
{
        unsigned long addr = (unsigned long)data;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *mpnt;

        down_read(&mm->mmap_sem);

        mpnt = find_vma(mm, addr);
        if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
                up_read(&mm->mmap_sem);
                /* To avoid latency problems, we only process the current CPU,
                 * hoping that most samples for the task are on this CPU.
                 */
                sync_buffer(raw_smp_processor_id());
                return 0;
        }

        up_read(&mm->mmap_sem);
        return 0;
}


/* We need to be told about new modules so we don't attribute to a previously
 * loaded module, or drop the samples on the floor.
 */
static int
module_load_notify(struct notifier_block *self, unsigned long val, void *data)
{
#ifdef CONFIG_MODULES
        if (val != MODULE_STATE_COMING)
                return 0;

        /* FIXME: should we process all CPU buffers ? */
        mutex_lock(&buffer_mutex);
        add_event_entry(ESCAPE_CODE);
        add_event_entry(MODULE_LOADED_CODE);
        mutex_unlock(&buffer_mutex);
#endif
        return 0;
}


static struct notifier_block task_free_nb = {
        .notifier_call  = task_free_notify,
};

static struct notifier_block task_exit_nb = {
        .notifier_call  = task_exit_notify,
};

static struct notifier_block munmap_nb = {
        .notifier_call  = munmap_notify,
};

static struct notifier_block module_load_nb = {
        .notifier_call = module_load_notify,
};

static void free_all_tasks(void)
{
        /* make sure we don't leak task structs */
        process_task_mortuary();
        process_task_mortuary();
}

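/* Register the notifiers that keep the event buffer consistent (task
 * handoff, task exit, munmap of executable mappings, module load) and
 * start the per-CPU work that periodically flushes the CPU buffers.
 * Returns 0 on success or a negative errno, unwinding any registrations
 * already made.
 */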
int sync_start(void)
{
        int err;

        if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL))
                return -ENOMEM;

        err = task_handoff_register(&task_free_nb);
        if (err)
                goto out1;
        err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
        if (err)
                goto out2;
        err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
        if (err)
                goto out3;
        err = register_module_notifier(&module_load_nb);
        if (err)
                goto out4;

        start_cpu_work();

out:
        return err;
out4:
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
out3:
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
out2:
        task_handoff_unregister(&task_free_nb);
        free_all_tasks();
out1:
        free_cpumask_var(marked_cpus);
        goto out;
}

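/* Undo sync_start(): stop scheduling new per-CPU work, unregister the
 * notifiers, flush any work already queued, then drain the task
 * mortuary and release the CPU mask.
 */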
void sync_stop(void)
{
        end_cpu_work();
        unregister_module_notifier(&module_load_nb);
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
        task_handoff_unregister(&task_free_nb);
        barrier();                      /* do all of the above first */

        flush_cpu_work();

        free_all_tasks();
        free_cpumask_var(marked_cpus);
}


/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer). */
static inline unsigned long fast_get_dcookie(const struct path *path)
{
        unsigned long cookie;

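        /* A dentry that already carries a dcookie (DCACHE_COOKIE set) uses
         * its own address as the cookie value, so it can be returned
         * directly without any dcookie bookkeeping.
         */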
        if (path->dentry->d_flags & DCACHE_COOKIE)
                return (unsigned long)path->dentry;
        get_dcookie(path, &cookie);
        return cookie;
}


/* Look up the dcookie for the task's mm->exe_file,
 * which corresponds loosely to "application name". This is
 * not strictly necessary but allows oprofile to associate
 * shared-library samples with particular applications.
 */
static unsigned long get_exec_dcookie(struct mm_struct *mm)
{
        unsigned long cookie = NO_COOKIE;
        struct file *exe_file;

        if (!mm)
                goto done;

        exe_file = get_mm_exe_file(mm);
        if (!exe_file)
                goto done;

        cookie = fast_get_dcookie(&exe_file->f_path);
        fput(exe_file);
done:
        return cookie;
}


/* Convert the EIP value of a sample into a persistent dentry/offset
 * pair that can then be added to the global event buffer. We make
 * sure to do this lookup before a mm->mmap modification happens so
 * we don't lose track.
 *
 * The caller must ensure the mm is not NULL (i.e. not a kernel thread).
 */
static unsigned long
lookup_dcookie(struct mm_struct *mm, unsigned long addr, off_t *offset)
{
        unsigned long cookie = NO_COOKIE;
        struct vm_area_struct *vma;

        down_read(&mm->mmap_sem);
        for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {

                if (addr < vma->vm_start || addr >= vma->vm_end)
                        continue;

                if (vma->vm_file) {
                        cookie = fast_get_dcookie(&vma->vm_file->f_path);
                        *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
                                vma->vm_start;
                } else {
                        /* must be an anonymous map */
                        *offset = addr;
                }

                break;
        }

        if (!vma)
                cookie = INVALID_COOKIE;
        up_read(&mm->mmap_sem);

        return cookie;
}

static unsigned long last_cookie = INVALID_COOKIE;

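/* Entries in the event buffer are either plain (offset, event) sample
 * pairs or records introduced by ESCAPE_CODE followed by a code telling
 * the daemon how to interpret what comes next, e.g. for a user context
 * switch: ESCAPE_CODE, CTX_SWITCH_CODE, pid, cookie, ESCAPE_CODE,
 * CTX_TGID_CODE, tgid.
 */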
static void add_cpu_switch(int i)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CPU_SWITCH_CODE);
        add_event_entry(i);
        last_cookie = INVALID_COOKIE;
}

static void add_kernel_ctx_switch(unsigned int in_kernel)
{
        add_event_entry(ESCAPE_CODE);
        if (in_kernel)
                add_event_entry(KERNEL_ENTER_SWITCH_CODE);
        else
                add_event_entry(KERNEL_EXIT_SWITCH_CODE);
}

static void
add_user_ctx_switch(struct task_struct const *task, unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_SWITCH_CODE);
        add_event_entry(task->pid);
        add_event_entry(cookie);
        /* Another code for daemon back-compat */
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_TGID_CODE);
        add_event_entry(task->tgid);
}


static void add_cookie_switch(unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(COOKIE_SWITCH_CODE);
        add_event_entry(cookie);
}


static void add_trace_begin(void)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(TRACE_BEGIN_CODE);
}

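/* Forward an extended data record (code, pc, payload words) from the CPU
 * buffer to the event buffer, converting the PC into a cookie/offset pair
 * when a user mm is available. This path carries model-specific data such
 * as AMD IBS samples.
 */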
static void add_data(struct op_entry *entry, struct mm_struct *mm)
{
        unsigned long code, pc, val;
        unsigned long cookie;
        off_t offset;

        if (!op_cpu_buffer_get_data(entry, &code))
                return;
        if (!op_cpu_buffer_get_data(entry, &pc))
                return;
        if (!op_cpu_buffer_get_size(entry))
                return;

        if (mm) {
                cookie = lookup_dcookie(mm, pc, &offset);

                if (cookie == NO_COOKIE)
                        offset = pc;
                if (cookie == INVALID_COOKIE) {
                        atomic_inc(&oprofile_stats.sample_lost_no_mapping);
                        offset = pc;
                }
                if (cookie != last_cookie) {
                        add_cookie_switch(cookie);
                        last_cookie = cookie;
                }
        } else
                offset = pc;

        add_event_entry(ESCAPE_CODE);
        add_event_entry(code);
        add_event_entry(offset);        /* Offset from Dcookie */

        while (op_cpu_buffer_get_data(entry, &val))
                add_event_entry(val);
}

static inline void add_sample_entry(unsigned long offset, unsigned long event)
{
        add_event_entry(offset);
        add_event_entry(event);
}


/*
 * Add a sample to the global event buffer. If possible the
 * sample is converted into a persistent dentry/offset pair
 * for later lookup from userspace. Return 0 on failure.
 */
static int
add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel)
{
        unsigned long cookie;
        off_t offset;

        if (in_kernel) {
                add_sample_entry(s->eip, s->event);
                return 1;
        }

        /* add userspace sample */

        if (!mm) {
                atomic_inc(&oprofile_stats.sample_lost_no_mm);
                return 0;
        }

        cookie = lookup_dcookie(mm, s->eip, &offset);

        if (cookie == INVALID_COOKIE) {
                atomic_inc(&oprofile_stats.sample_lost_no_mapping);
                return 0;
        }

        if (cookie != last_cookie) {
                add_cookie_switch(cookie);
                last_cookie = cookie;
        }

        add_sample_entry(offset, s->event);

        return 1;
}


static void release_mm(struct mm_struct *mm)
{
        if (!mm)
                return;
        mmput(mm);
}

static inline int is_code(unsigned long val)
{
        return val == ESCAPE_CODE;
}


/* Move tasks along towards death. Any tasks on dead_tasks
 * will definitely have no remaining references in any
 * CPU buffers at this point, because we use two lists,
 * and to have reached that list, a task must already have
 * gone through one full sync.
 */
static void process_task_mortuary(void)
{
        unsigned long flags;
        LIST_HEAD(local_dead_tasks);
        struct task_struct *task;
        struct task_struct *ttask;

        spin_lock_irqsave(&task_mortuary, flags);

        list_splice_init(&dead_tasks, &local_dead_tasks);
        list_splice_init(&dying_tasks, &dead_tasks);

        spin_unlock_irqrestore(&task_mortuary, flags);

        list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
                list_del(&task->tasks);
                free_task(task);
        }
}


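/* Record that this CPU's buffer has been synced. Once every online CPU
 * has been marked, one pass of the task mortuary can run and the mask
 * is cleared for the next round.
 */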
static void mark_done(int cpu)
{
        int i;

        cpumask_set_cpu(cpu, marked_cpus);

        for_each_online_cpu(i) {
                if (!cpumask_test_cpu(i, marked_cpus))
                        return;
        }

        /* All CPUs have been processed at least once;
         * we can now process the mortuary once.
         */
        process_task_mortuary();

        cpumask_clear(marked_cpus);
}


/* FIXME: this is not sufficient if we implement syscall barrier backtrace
 * traversal; the state switches to sb_sample_start at the first kernel
 * enter/exit switch, so we would need a fifth state and some special
 * handling in sync_buffer().
 */
typedef enum {
        sb_bt_ignore = -2,
        sb_buffer_start,
        sb_bt_start,
        sb_sample_start,
} sync_buffer_state;
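/* sb_buffer_start: nothing seen yet; samples are dropped until the first
 *                  kernel/user switch establishes context.
 * sb_sample_start: normal sample processing.
 * sb_bt_start:     a backtrace is being recorded.
 * sb_bt_ignore:    drop backtrace entries whose leading sample could not
 *                  be mapped.
 */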

/* Sync one of the CPU's buffers into the global event buffer.
 * Here we need to go through each batch of samples punctuated
 * by context switch notes, taking the task's mmap_sem and doing
 * a lookup in task->mm->mmap to convert each EIP into a
 * dcookie/offset value.
 */
void sync_buffer(int cpu)
{
        struct mm_struct *mm = NULL;
        struct mm_struct *oldmm;
        unsigned long val;
        struct task_struct *new;
        unsigned long cookie = 0;
        int in_kernel = 1;
        sync_buffer_state state = sb_buffer_start;
        unsigned int i;
        unsigned long available;
        unsigned long flags;
        struct op_entry entry;
        struct op_sample *sample;

        mutex_lock(&buffer_mutex);

        add_cpu_switch(cpu);

        op_cpu_buffer_reset(cpu);
        available = op_cpu_buffer_entries(cpu);

        for (i = 0; i < available; ++i) {
                sample = op_cpu_buffer_read_entry(&entry, cpu);
                if (!sample)
                        break;

                if (is_code(sample->eip)) {
                        flags = sample->event;
                        if (flags & TRACE_BEGIN) {
                                state = sb_bt_start;
                                add_trace_begin();
                        }
                        if (flags & KERNEL_CTX_SWITCH) {
                                /* kernel/userspace switch */
                                in_kernel = flags & IS_KERNEL;
                                if (state == sb_buffer_start)
                                        state = sb_sample_start;
                                add_kernel_ctx_switch(flags & IS_KERNEL);
                        }
                        if (flags & USER_CTX_SWITCH
                            && op_cpu_buffer_get_data(&entry, &val)) {
                                /* userspace context switch */
                                new = (struct task_struct *)val;
                                oldmm = mm;
                                release_mm(oldmm);
                                mm = get_task_mm(new);
                                if (mm != oldmm)
                                        cookie = get_exec_dcookie(mm);
                                add_user_ctx_switch(new, cookie);
                        }
                        if (op_cpu_buffer_get_size(&entry))
                                add_data(&entry, mm);
                        continue;
                }

                if (state < sb_bt_start)
                        /* ignore sample */
                        continue;

                if (add_sample(mm, sample, in_kernel))
                        continue;

                /* ignore backtraces if failed to add a sample */
                if (state == sb_bt_start) {
                        state = sb_bt_ignore;
                        atomic_inc(&oprofile_stats.bt_lost_no_mapping);
                }
        }
        release_mm(mm);

        mark_done(cpu);

        mutex_unlock(&buffer_mutex);
}

/* This function can be used to add a buffer worth of data directly to
 * the kernel buffer. The buffer is assumed to be circular: take the
 * entries from index start and end at index stop, wrapping at max.
 */
void oprofile_put_buff(unsigned long *buf, unsigned int start,
                       unsigned int stop, unsigned int max)
{
        int i;

        i = start;

        mutex_lock(&buffer_mutex);
        while (i != stop) {
                add_event_entry(buf[i++]);

                if (i >= max)
                        i = 0;
        }

        mutex_unlock(&buffer_mutex);
}