linux/drivers/oprofile/cpu_buffer.c
/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Robert Richter <robert.richter@amd.com>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer was used.
 */

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/errno.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

#define OP_BUFFER_FLAGS 0

/*
 * Read and write access to the buffer uses spin locking. Thus, a write
 * to the buffer from the NMI handler (x86) could also occur during a
 * critical section in which the buffer is being read. To avoid this,
 * there are 2 buffers for independent read and write access. Read
 * access is in process context only, write access only in the NMI
 * handler. If the read buffer runs empty, both buffers are swapped
 * atomically. There is potentially a small window during swapping where
 * the buffers are disabled and samples could be lost.
 *
 * Using 2 buffers adds a little overhead, but the solution is clear and
 * does not require changes in the ring buffer implementation. It can be
 * changed to a single buffer solution when the ring buffer access is
 * implemented as non-locking atomic code.
 */
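
/*
 * For illustration, a minimal sketch of the read side of the scheme
 * described above (the real implementation is op_cpu_buffer_read_entry()
 * below): consume from the read buffer first and swap the buffers only
 * when it runs empty.
 *
 *      e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
 *      if (!e && !ring_buffer_swap_cpu(op_ring_buffer_read,
 *                                      op_ring_buffer_write, cpu))
 *              e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
 */
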
static struct ring_buffer *op_ring_buffer_read;
static struct ring_buffer *op_ring_buffer_write;
DEFINE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

unsigned long oprofile_get_cpu_buffer_size(void)
{
        return oprofile_cpu_buffer_size;
}

void oprofile_cpu_buffer_inc_smpl_lost(void)
{
        struct oprofile_cpu_buffer *cpu_buf
                = &__get_cpu_var(cpu_buffer);

        cpu_buf->sample_lost_overflow++;
}

void free_cpu_buffers(void)
{
        if (op_ring_buffer_read)
                ring_buffer_free(op_ring_buffer_read);
        op_ring_buffer_read = NULL;
        if (op_ring_buffer_write)
                ring_buffer_free(op_ring_buffer_write);
        op_ring_buffer_write = NULL;
}

#define RB_EVENT_HDR_SIZE 4

int alloc_cpu_buffers(void)
{
        int i;

        unsigned long buffer_size = oprofile_cpu_buffer_size;
        unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
                                                 RB_EVENT_HDR_SIZE);

        op_ring_buffer_read = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
        if (!op_ring_buffer_read)
                goto fail;
        op_ring_buffer_write = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
        if (!op_ring_buffer_write)
                goto fail;

        for_each_possible_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);

                b->last_task = NULL;
                b->last_is_kernel = -1;
                b->tracing = 0;
                b->buffer_size = buffer_size;
                b->sample_received = 0;
                b->sample_lost_overflow = 0;
                b->backtrace_aborted = 0;
                b->sample_invalid_eip = 0;
                b->cpu = i;
                INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
        }
        return 0;

fail:
        free_cpu_buffers();
        return -ENOMEM;
}

void start_cpu_work(void)
{
        int i;

        work_enabled = 1;

        for_each_online_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);

                /*
                 * Spread the work by 1 jiffy per cpu so they don't all
                 * fire at once.
                 */
                schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
        }
}

void end_cpu_work(void)
{
        int i;

        work_enabled = 0;

        for_each_online_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);

                cancel_delayed_work(&b->work);
        }

        flush_scheduled_work();
}

/*
 * This function prepares the cpu buffer to write a sample.
 *
 * Struct op_entry is used during operations on the ring buffer while
 * struct op_sample contains the data that is stored in the ring
 * buffer. Struct entry does not need to be initialized. The function
 * reserves a data array of the given size. Use
 * op_cpu_buffer_write_commit() after preparing the sample. On error a
 * NULL pointer is returned, otherwise a pointer to the sample.
 */
struct op_sample
*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
{
        entry->event = ring_buffer_lock_reserve
                (op_ring_buffer_write, sizeof(struct op_sample) +
                 size * sizeof(entry->sample->data[0]));
        if (entry->event)
                entry->sample = ring_buffer_event_data(entry->event);
        else
                entry->sample = NULL;

        if (!entry->sample)
                return NULL;

        entry->size = size;
        entry->data = entry->sample->data;

        return entry->sample;
}

int op_cpu_buffer_write_commit(struct op_entry *entry)
{
        return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event);
}
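
/*
 * Typical producer-side use of the reserve/commit API above, roughly as
 * done by op_add_code() later in this file (illustrative sketch only,
 * writing an escape record with one data word):
 *
 *      struct op_entry entry;
 *      struct op_sample *sample;
 *
 *      sample = op_cpu_buffer_write_reserve(&entry, 1);
 *      if (!sample)
 *              return -ENOMEM;
 *      sample->eip = ESCAPE_CODE;
 *      sample->event = flags;
 *      op_cpu_buffer_add_data(&entry, (unsigned long)task);
 *      op_cpu_buffer_write_commit(&entry);
 */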

struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
        struct ring_buffer_event *e;
        e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
        if (e)
                goto event;
        if (ring_buffer_swap_cpu(op_ring_buffer_read,
                                 op_ring_buffer_write,
                                 cpu))
                return NULL;
        e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
        if (e)
                goto event;
        return NULL;

event:
        entry->event = e;
        entry->sample = ring_buffer_event_data(e);
        entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
                / sizeof(entry->sample->data[0]);
        entry->data = entry->sample->data;
        return entry->sample;
}
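
/*
 * The read side runs in process context from sync_buffer() in
 * buffer_sync.c. A simplified consumer loop looks roughly like the
 * sketch below; process_sample() is a hypothetical placeholder for
 * whatever is done with each sample, not a real helper:
 *
 *      struct op_entry entry;
 *      struct op_sample *sample;
 *      unsigned long avail = op_cpu_buffer_entries(cpu);
 *
 *      while (avail--) {
 *              sample = op_cpu_buffer_read_entry(&entry, cpu);
 *              if (!sample)
 *                      break;
 *              process_sample(sample, entry.data, entry.size);
 *      }
 */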

unsigned long op_cpu_buffer_entries(int cpu)
{
        return ring_buffer_entries_cpu(op_ring_buffer_read, cpu)
                + ring_buffer_entries_cpu(op_ring_buffer_write, cpu);
}

static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
            int is_kernel, struct task_struct *task)
{
        struct op_entry entry;
        struct op_sample *sample;
        unsigned long flags;
        int size;

        flags = 0;

        if (backtrace)
                flags |= TRACE_BEGIN;

        /* notice a switch from user->kernel or vice versa */
        is_kernel = !!is_kernel;
        if (cpu_buf->last_is_kernel != is_kernel) {
                cpu_buf->last_is_kernel = is_kernel;
                flags |= KERNEL_CTX_SWITCH;
                if (is_kernel)
                        flags |= IS_KERNEL;
        }

        /* notice a task switch */
        if (cpu_buf->last_task != task) {
                cpu_buf->last_task = task;
                flags |= USER_CTX_SWITCH;
        }

        if (!flags)
                /* nothing to do */
                return 0;

        if (flags & USER_CTX_SWITCH)
                size = 1;
        else
                size = 0;

        sample = op_cpu_buffer_write_reserve(&entry, size);
        if (!sample)
                return -ENOMEM;

        sample->eip = ESCAPE_CODE;
        sample->event = flags;

        if (size)
                op_cpu_buffer_add_data(&entry, (unsigned long)task);

        op_cpu_buffer_write_commit(&entry);

        return 0;
}

static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
              unsigned long pc, unsigned long event)
{
        struct op_entry entry;
        struct op_sample *sample;

        sample = op_cpu_buffer_write_reserve(&entry, 0);
        if (!sample)
                return -ENOMEM;

        sample->eip = pc;
        sample->event = event;

        return op_cpu_buffer_write_commit(&entry);
}

/*
 * This must be safe from any context.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes.
 */
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
           unsigned long backtrace, int is_kernel, unsigned long event)
{
        cpu_buf->sample_received++;

        if (pc == ESCAPE_CODE) {
                cpu_buf->sample_invalid_eip++;
                return 0;
        }

        if (op_add_code(cpu_buf, backtrace, is_kernel, current))
                goto fail;

        if (op_add_sample(cpu_buf, pc, event))
                goto fail;

        return 1;

fail:
        cpu_buf->sample_lost_overflow++;
        return 0;
}

static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
        cpu_buf->tracing = 1;
}

static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
        cpu_buf->tracing = 0;
}

static inline void
__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
                          unsigned long event, int is_kernel)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
        unsigned long backtrace = oprofile_backtrace_depth;

        /*
         * if log_sample() fails we can't backtrace since we lost the
         * source of this event
         */
        if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event))
                /* failed */
                return;

        if (!backtrace)
                return;

        oprofile_begin_trace(cpu_buf);
        oprofile_ops.backtrace(regs, backtrace);
        oprofile_end_trace(cpu_buf);
}

void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
                             unsigned long event, int is_kernel)
{
        __oprofile_add_ext_sample(pc, regs, event, is_kernel);
}

void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
        int is_kernel = !user_mode(regs);
        unsigned long pc = profile_pc(regs);

        __oprofile_add_ext_sample(pc, regs, event, is_kernel);
}
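
/*
 * oprofile_add_sample() is the entry point typically called from an
 * architecture's counter-overflow (e.g. NMI) handler. A hedged sketch of
 * such a caller follows; my_check_ctrs(), counter_overflowed(),
 * reset_counter() and NUM_CTRS are placeholders, not real symbols:
 *
 *      static int my_check_ctrs(struct pt_regs * const regs)
 *      {
 *              int i;
 *
 *              for (i = 0; i < NUM_CTRS; ++i) {
 *                      if (!counter_overflowed(i))
 *                              continue;
 *                      oprofile_add_sample(regs, i);
 *                      reset_counter(i);
 *              }
 *              return 1;
 *      }
 */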

/*
 * Add samples with data to the ring buffer.
 *
 * Use oprofile_add_data(&entry, val) to add data and
 * oprofile_write_commit(&entry) to commit the sample.
 */
void
oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
                       unsigned long pc, int code, int size)
{
        struct op_sample *sample;
        int is_kernel = !user_mode(regs);
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);

        cpu_buf->sample_received++;

        /* no backtraces for samples with data */
        if (op_add_code(cpu_buf, 0, is_kernel, current))
                goto fail;

        sample = op_cpu_buffer_write_reserve(entry, size + 2);
        if (!sample)
                goto fail;
        sample->eip = ESCAPE_CODE;
        sample->event = 0;              /* no flags */

        op_cpu_buffer_add_data(entry, code);
        op_cpu_buffer_add_data(entry, pc);

        return;

fail:
        entry->event = NULL;
        cpu_buf->sample_lost_overflow++;
}

int oprofile_add_data(struct op_entry *entry, unsigned long val)
{
        if (!entry->event)
                return 0;
        return op_cpu_buffer_add_data(entry, val);
}

int oprofile_add_data64(struct op_entry *entry, u64 val)
{
        if (!entry->event)
                return 0;
        if (op_cpu_buffer_get_size(entry) < 2)
                /*
                 * return 0 to indicate that the buffer is too small,
                 * even if there is some space left
                 */
                return 0;
        if (!op_cpu_buffer_add_data(entry, (u32)val))
                return 0;
        return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
}

int oprofile_write_commit(struct op_entry *entry)
{
        if (!entry->event)
                return -EINVAL;
        return op_cpu_buffer_write_commit(entry);
}
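
/*
 * Putting the helpers above together with oprofile_write_reserve(): a
 * caller that attaches extra data to a sample (for example, architecture
 * model code logging hardware-provided sample payloads) would do roughly
 * the following, error checking omitted. MY_SAMPLE_CODE and the payload
 * variables are placeholders for this sketch, not symbols defined by
 * oprofile:
 *
 *      struct op_entry entry;
 *
 *      oprofile_write_reserve(&entry, regs, pc, MY_SAMPLE_CODE, 3);
 *      oprofile_add_data(&entry, payload_word);
 *      oprofile_add_data64(&entry, payload_qword);
 *      oprofile_write_commit(&entry);
 */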

void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
        log_sample(cpu_buf, pc, 0, is_kernel, event);
}

void oprofile_add_trace(unsigned long pc)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);

        if (!cpu_buf->tracing)
                return;

        /*
         * A broken frame can give an eip with the same value as an
         * escape code; abort the trace if we get it.
         */
        if (pc == ESCAPE_CODE)
                goto fail;

        if (op_add_sample(cpu_buf, pc, 0))
                goto fail;

        return;
fail:
        cpu_buf->tracing = 0;
        cpu_buf->backtrace_aborted++;
        return;
}

/*
 * This serves to avoid cpu buffer overflow, and makes sure
 * the task mortuary progresses.
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct cpu.
 */
static void wq_sync_buffer(struct work_struct *work)
{
        struct oprofile_cpu_buffer *b =
                container_of(work, struct oprofile_cpu_buffer, work.work);
        if (b->cpu != smp_processor_id()) {
                printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n",
                       smp_processor_id(), b->cpu);

                if (!cpu_online(b->cpu)) {
                        cancel_delayed_work(&b->work);
                        return;
                }
        }
        sync_buffer(b->cpu);

        /* don't re-add the work if we're shutting down */
        if (work_enabled)
                schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}