linux/drivers/oprofile/cpu_buffer.c
/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Robert Richter <robert.richter@amd.com>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer was used.
 */

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/errno.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

#define OP_BUFFER_FLAGS 0

static struct ring_buffer *op_ring_buffer;
DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

unsigned long oprofile_get_cpu_buffer_size(void)
{
        return oprofile_cpu_buffer_size;
}

void oprofile_cpu_buffer_inc_smpl_lost(void)
{
        struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);

        cpu_buf->sample_lost_overflow++;
}

void free_cpu_buffers(void)
{
        if (op_ring_buffer)
                ring_buffer_free(op_ring_buffer);
        op_ring_buffer = NULL;
}

#define RB_EVENT_HDR_SIZE 4

int alloc_cpu_buffers(void)
{
        int i;

        unsigned long buffer_size = oprofile_cpu_buffer_size;
        unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
                                                 RB_EVENT_HDR_SIZE);

        op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
        if (!op_ring_buffer)
                goto fail;

        for_each_possible_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

                b->last_task = NULL;
                b->last_is_kernel = -1;
                b->tracing = 0;
                b->buffer_size = buffer_size;
                b->sample_received = 0;
                b->sample_lost_overflow = 0;
                b->backtrace_aborted = 0;
                b->sample_invalid_eip = 0;
                b->cpu = i;
                INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
        }
        return 0;

fail:
        free_cpu_buffers();
        return -ENOMEM;
}

void start_cpu_work(void)
{
        int i;

        work_enabled = 1;

        for_each_online_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

                /*
                 * Spread the work by 1 jiffy per cpu so they don't all
                 * fire at once.
                 */
                schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
        }
}

void end_cpu_work(void)
{
        work_enabled = 0;
}

void flush_cpu_work(void)
{
        int i;

        for_each_online_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

                /* these work items are per-cpu, no need for flush_sync */
                flush_delayed_work(&b->work);
        }
}

/*
 * This function prepares the cpu buffer to write a sample.
 *
 * Struct op_entry is used during operations on the ring buffer while
 * struct op_sample contains the data that is stored in the ring
 * buffer. The entry struct may be uninitialized. The function reserves
 * a data array with the number of slots given by size. Call
 * op_cpu_buffer_write_commit() after preparing the sample. On error a
 * NULL pointer is returned, otherwise a pointer to the sample.
 */
struct op_sample
*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
{
        entry->event = ring_buffer_lock_reserve
                (op_ring_buffer, sizeof(struct op_sample) +
                 size * sizeof(entry->sample->data[0]));
        if (!entry->event)
                return NULL;
        entry->sample = ring_buffer_event_data(entry->event);
        entry->size = size;
        entry->data = entry->sample->data;

        return entry->sample;
}

int op_cpu_buffer_write_commit(struct op_entry *entry)
{
        return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
}
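
/*
 * Illustrative sketch, not part of the original file: how a caller
 * might use the reserve/commit API above to emit one sample that
 * carries two extra data words. The function name and payload values
 * are made up for the example.
 */
static inline int example_write_sample(unsigned long pc, unsigned long event,
                                       unsigned long d0, unsigned long d1)
{
        struct op_entry entry;
        struct op_sample *sample;

        /* reserve the sample header plus room for two data words */
        sample = op_cpu_buffer_write_reserve(&entry, 2);
        if (!sample)
                return -ENOMEM;

        sample->eip = pc;
        sample->event = event;
        op_cpu_buffer_add_data(&entry, d0);
        op_cpu_buffer_add_data(&entry, d1);

        /* publish the reserved slot to the consumer side */
        return op_cpu_buffer_write_commit(&entry);
}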

struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
        struct ring_buffer_event *e;
        e = ring_buffer_consume(op_ring_buffer, cpu, NULL, NULL);
        if (!e)
                return NULL;

        entry->event = e;
        entry->sample = ring_buffer_event_data(e);
        entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
                / sizeof(entry->sample->data[0]);
        entry->data = entry->sample->data;
        return entry->sample;
}

unsigned long op_cpu_buffer_entries(int cpu)
{
        return ring_buffer_entries_cpu(op_ring_buffer, cpu);
}
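
/*
 * Illustrative sketch, not part of the original file: how a consumer
 * (the real one is sync_buffer() in buffer_sync.c) could drain one
 * cpu's buffer with the read API above. The function name is made up;
 * only fields filled in by op_cpu_buffer_read_entry() are used.
 */
static inline unsigned long example_drain_cpu(int cpu)
{
        struct op_entry entry;
        struct op_sample *sample;
        unsigned long available = op_cpu_buffer_entries(cpu);
        unsigned long i, processed = 0;

        for (i = 0; i < available; i++) {
                sample = op_cpu_buffer_read_entry(&entry, cpu);
                if (!sample)
                        break;
                /*
                 * sample->eip, sample->event and the entry.size data
                 * words in entry.data[] would be interpreted here.
                 */
                processed++;
        }
        return processed;
}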

static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
            int is_kernel, struct task_struct *task)
{
        struct op_entry entry;
        struct op_sample *sample;
        unsigned long flags;
        int size;

        flags = 0;

        if (backtrace)
                flags |= TRACE_BEGIN;

        /* notice a switch from user->kernel or vice versa */
        is_kernel = !!is_kernel;
        if (cpu_buf->last_is_kernel != is_kernel) {
                cpu_buf->last_is_kernel = is_kernel;
                flags |= KERNEL_CTX_SWITCH;
                if (is_kernel)
                        flags |= IS_KERNEL;
        }

        /* notice a task switch */
        if (cpu_buf->last_task != task) {
                cpu_buf->last_task = task;
                flags |= USER_CTX_SWITCH;
        }

        if (!flags)
                /* nothing to do */
                return 0;

        if (flags & USER_CTX_SWITCH)
                size = 1;
        else
                size = 0;

        sample = op_cpu_buffer_write_reserve(&entry, size);
        if (!sample)
                return -ENOMEM;

        sample->eip = ESCAPE_CODE;
        sample->event = flags;

        if (size)
                op_cpu_buffer_add_data(&entry, (unsigned long)task);

        op_cpu_buffer_write_commit(&entry);

        return 0;
}
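
/*
 * Illustrative sketch, not part of the original file: records written
 * by op_add_code() carry ESCAPE_CODE in ->eip, the flags in ->event
 * and, on a task switch, the task pointer as the single data word. The
 * helper name below is made up.
 */
static inline int example_is_escape_record(const struct op_sample *sample)
{
        return sample->eip == ESCAPE_CODE;
}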

static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
              unsigned long pc, unsigned long event)
{
        struct op_entry entry;
        struct op_sample *sample;

        sample = op_cpu_buffer_write_reserve(&entry, 0);
        if (!sample)
                return -ENOMEM;

        sample->eip = pc;
        sample->event = event;

        return op_cpu_buffer_write_commit(&entry);
}

/*
 * This must be safe from any context.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes
 */
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
           unsigned long backtrace, int is_kernel, unsigned long event,
           struct task_struct *task)
{
        struct task_struct *tsk = task ? task : current;
        cpu_buf->sample_received++;

        if (pc == ESCAPE_CODE) {
                cpu_buf->sample_invalid_eip++;
                return 0;
        }

        if (op_add_code(cpu_buf, backtrace, is_kernel, tsk))
                goto fail;

        if (op_add_sample(cpu_buf, pc, event))
                goto fail;

        return 1;

fail:
        cpu_buf->sample_lost_overflow++;
        return 0;
}

static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
        cpu_buf->tracing = 1;
}

static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
        cpu_buf->tracing = 0;
}

static inline void
__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
                          unsigned long event, int is_kernel,
                          struct task_struct *task)
{
        struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
        unsigned long backtrace = oprofile_backtrace_depth;

        /*
         * If log_sample() fails we can't backtrace since we lost the
         * source of this event.
         */
        if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event, task))
                /* failed */
                return;

        if (!backtrace)
                return;

        oprofile_begin_trace(cpu_buf);
        oprofile_ops.backtrace(regs, backtrace);
        oprofile_end_trace(cpu_buf);
}

void oprofile_add_ext_hw_sample(unsigned long pc, struct pt_regs * const regs,
                                unsigned long event, int is_kernel,
                                struct task_struct *task)
{
        __oprofile_add_ext_sample(pc, regs, event, is_kernel, task);
}

void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
                             unsigned long event, int is_kernel)
{
        __oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
        int is_kernel;
        unsigned long pc;

        if (likely(regs)) {
                is_kernel = !user_mode(regs);
                pc = profile_pc(regs);
        } else {
                is_kernel = 0;    /* This value will not be used */
                pc = ESCAPE_CODE; /* as this causes an early return. */
        }

        __oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

/*
 * Add samples with data to the ring buffer.
 *
 * Use oprofile_add_data(&entry, val) to add data and
 * oprofile_write_commit(&entry) to commit the sample.
 */
void
oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
                       unsigned long pc, int code, int size)
{
        struct op_sample *sample;
        int is_kernel = !user_mode(regs);
        struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);

        cpu_buf->sample_received++;

        /* no backtraces for samples with data */
        if (op_add_code(cpu_buf, 0, is_kernel, current))
                goto fail;

        sample = op_cpu_buffer_write_reserve(entry, size + 2);
        if (!sample)
                goto fail;
        sample->eip = ESCAPE_CODE;
        sample->event = 0;              /* no flags */

        op_cpu_buffer_add_data(entry, code);
        op_cpu_buffer_add_data(entry, pc);

        return;

fail:
        entry->event = NULL;
        cpu_buf->sample_lost_overflow++;
}

int oprofile_add_data(struct op_entry *entry, unsigned long val)
{
        if (!entry->event)
                return 0;
        return op_cpu_buffer_add_data(entry, val);
}

int oprofile_add_data64(struct op_entry *entry, u64 val)
{
        if (!entry->event)
                return 0;
        if (op_cpu_buffer_get_size(entry) < 2)
                /*
                 * Return 0 to indicate the buffer is too small for a
                 * 64-bit value, even if there is still some space left.
                 */
                return 0;
        if (!op_cpu_buffer_add_data(entry, (u32)val))
                return 0;
        return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
}

int oprofile_write_commit(struct op_entry *entry)
{
        if (!entry->event)
                return -EINVAL;
        return op_cpu_buffer_write_commit(entry);
}
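
/*
 * Illustrative sketch, not part of the original file: how a model
 * driver could emit one sample with attached data through the
 * oprofile_write_reserve()/oprofile_add_data64()/oprofile_write_commit()
 * API above. The function name and the 0xfeed sample-type code are
 * made up.
 */
static inline void example_log_sample_with_data(struct pt_regs * const regs,
                                                unsigned long pc, u64 extra)
{
        struct op_entry entry;

        /* reserve room for the two data words holding the 64-bit payload */
        oprofile_write_reserve(&entry, regs, pc, 0xfeed /* made-up code */, 2);
        oprofile_add_data64(&entry, extra);
        /* a no-op returning -EINVAL if the reserve above failed */
        oprofile_write_commit(&entry);
}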

void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
        struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
        log_sample(cpu_buf, pc, 0, is_kernel, event, NULL);
}

void oprofile_add_trace(unsigned long pc)
{
        struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);

        if (!cpu_buf->tracing)
                return;

        /*
         * A broken frame can give an eip with the same value as an
         * escape code; abort the trace if we get it.
         */
        if (pc == ESCAPE_CODE)
                goto fail;

        if (op_add_sample(cpu_buf, pc, 0))
                goto fail;

        return;
fail:
        cpu_buf->tracing = 0;
        cpu_buf->backtrace_aborted++;
        return;
}
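
/*
 * Illustrative sketch, not part of the original file: the shape of an
 * architecture backtrace hook as invoked from
 * __oprofile_add_ext_sample() above via oprofile_ops.backtrace(). Real
 * implementations live under arch/<arch>/oprofile/ and walk the actual
 * stack frames; the frame reading here is only a placeholder.
 */
static inline void example_backtrace(struct pt_regs * const regs,
                                     unsigned int depth)
{
        unsigned int i;

        for (i = 0; i < depth; i++) {
                unsigned long frame_pc = 0;     /* placeholder: read frame i from regs */

                if (!frame_pc)
                        break;
                /* recorded only while cpu_buf->tracing is set */
                oprofile_add_trace(frame_pc);
        }
}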

/*
 * This serves to avoid cpu buffer overflow, and makes sure
 * the task mortuary progresses
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct cpu
 */
static void wq_sync_buffer(struct work_struct *work)
{
        struct oprofile_cpu_buffer *b =
                container_of(work, struct oprofile_cpu_buffer, work.work);
        if (b->cpu != smp_processor_id() && !cpu_online(b->cpu)) {
                cancel_delayed_work(&b->work);
                return;
        }
        sync_buffer(b->cpu);

        /* don't re-add the work if we're shutting down */
        if (work_enabled)
                schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}