linux/kernel/trace/ring_buffer.c
   1/*
   2 * Generic ring buffer
   3 *
   4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
   5 */
   6#include <linux/ring_buffer.h>
   7#include <linux/trace_clock.h>
   8#include <linux/ftrace_irq.h>
   9#include <linux/spinlock.h>
  10#include <linux/debugfs.h>
  11#include <linux/uaccess.h>
  12#include <linux/hardirq.h>
  13#include <linux/kmemcheck.h>
  14#include <linux/module.h>
  15#include <linux/percpu.h>
  16#include <linux/mutex.h>
  17#include <linux/init.h>
  18#include <linux/hash.h>
  19#include <linux/list.h>
  20#include <linux/cpu.h>
  21#include <linux/fs.h>
  22
  23#include "trace.h"
  24
  25/*
   26 * The ring buffer header is special. We must manually keep it up to date.
  27 */
  28int ring_buffer_print_entry_header(struct trace_seq *s)
  29{
  30        int ret;
  31
  32        ret = trace_seq_printf(s, "# compressed entry header\n");
  33        ret = trace_seq_printf(s, "\ttype_len    :    5 bits\n");
  34        ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n");
  35        ret = trace_seq_printf(s, "\tarray       :   32 bits\n");
  36        ret = trace_seq_printf(s, "\n");
  37        ret = trace_seq_printf(s, "\tpadding     : type == %d\n",
  38                               RINGBUF_TYPE_PADDING);
  39        ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
  40                               RINGBUF_TYPE_TIME_EXTEND);
  41        ret = trace_seq_printf(s, "\tdata max type_len  == %d\n",
  42                               RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
  43
  44        return ret;
  45}
  46
  47/*
  48 * The ring buffer is made up of a list of pages. A separate list of pages is
  49 * allocated for each CPU. A writer may only write to a buffer that is
  50 * associated with the CPU it is currently executing on.  A reader may read
  51 * from any per cpu buffer.
  52 *
  53 * The reader is special. For each per cpu buffer, the reader has its own
  54 * reader page. When a reader has read the entire reader page, this reader
  55 * page is swapped with another page in the ring buffer.
  56 *
   57 * Now, as long as the writer is off the reader page, the reader can do
   58 * whatever it wants with that page. The writer will never write to that page
  59 * again (as long as it is out of the ring buffer).
  60 *
  61 * Here's some silly ASCII art.
  62 *
  63 *   +------+
  64 *   |reader|          RING BUFFER
  65 *   |page  |
  66 *   +------+        +---+   +---+   +---+
  67 *                   |   |-->|   |-->|   |
  68 *                   +---+   +---+   +---+
  69 *                     ^               |
  70 *                     |               |
  71 *                     +---------------+
  72 *
  73 *
  74 *   +------+
  75 *   |reader|          RING BUFFER
  76 *   |page  |------------------v
  77 *   +------+        +---+   +---+   +---+
  78 *                   |   |-->|   |-->|   |
  79 *                   +---+   +---+   +---+
  80 *                     ^               |
  81 *                     |               |
  82 *                     +---------------+
  83 *
  84 *
  85 *   +------+
  86 *   |reader|          RING BUFFER
  87 *   |page  |------------------v
  88 *   +------+        +---+   +---+   +---+
  89 *      ^            |   |-->|   |-->|   |
  90 *      |            +---+   +---+   +---+
  91 *      |                              |
  92 *      |                              |
  93 *      +------------------------------+
  94 *
  95 *
  96 *   +------+
  97 *   |buffer|          RING BUFFER
  98 *   |page  |------------------v
  99 *   +------+        +---+   +---+   +---+
 100 *      ^            |   |   |   |-->|   |
 101 *      |   New      +---+   +---+   +---+
 102 *      |  Reader------^               |
 103 *      |   page                       |
 104 *      +------------------------------+
 105 *
 106 *
 107 * After we make this swap, the reader can hand this page off to the splice
 108 * code and be done with it. It can even allocate a new page if it needs to
 109 * and swap that into the ring buffer.
 110 *
 111 * We will be using cmpxchg soon to make all this lockless.
 112 *
 113 */
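/*
 * A rough sketch of the swap described above (illustrative only; the
 * real, lockless version lives in rb_get_reader_page() further down,
 * and the pointer juggling here is deliberately simplified):
 *
 *	next = head->list.next;			sketch: page after old head
 *	reader->list.next = next;		reader page takes head's slot
 *	reader->list.prev = head->list.prev;
 *	head->list.prev->next = &reader->list;
 *	next->prev = &reader->list;
 *	reader = head;				old head is the new reader page
 */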
 114
 115/*
 116 * A fast way to enable or disable all ring buffers is to
 117 * call tracing_on or tracing_off. Turning off the ring buffers
 118 * prevents all ring buffers from being recorded to.
  119 * Turning this switch on makes it OK to write to the
 120 * ring buffer, if the ring buffer is enabled itself.
 121 *
  122 * There are three layers that must be on in order to write
 123 * to the ring buffer.
 124 *
 125 * 1) This global flag must be set.
 126 * 2) The ring buffer must be enabled for recording.
 127 * 3) The per cpu buffer must be enabled for recording.
 128 *
 129 * In case of an anomaly, this global flag has a bit set that
  130 * will permanently disable all ring buffers.
 131 */
 132
 133/*
 134 * Global flag to disable all recording to ring buffers
 135 *  This has two bits: ON, DISABLED
 136 *
 137 *  ON   DISABLED
 138 * ---- ----------
 139 *   0      0        : ring buffers are off
 140 *   1      0        : ring buffers are on
 141 *   X      1        : ring buffers are permanently disabled
 142 */
 143
 144enum {
 145        RB_BUFFERS_ON_BIT       = 0,
 146        RB_BUFFERS_DISABLED_BIT = 1,
 147};
 148
 149enum {
 150        RB_BUFFERS_ON           = 1 << RB_BUFFERS_ON_BIT,
 151        RB_BUFFERS_DISABLED     = 1 << RB_BUFFERS_DISABLED_BIT,
 152};
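/*
 * For example (an illustrative sketch, not code used by this file):
 * with only RB_BUFFERS_ON set the flags word is 0x1 and recording is
 * allowed. Once RB_BUFFERS_DISABLED is set the word becomes 0x2 or
 * 0x3, and since tracing_is_on() below tests for exact equality with
 * RB_BUFFERS_ON, recording can never be re-enabled:
 *
 *	unsigned long flags = RB_BUFFERS_ON;	0x1: recording allowed
 *	flags |= RB_BUFFERS_DISABLED;		0x3: permanently dead
 */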
 153
 154static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
 155
 156#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
 157
 158/**
 159 * tracing_on - enable all tracing buffers
 160 *
 161 * This function enables all tracing buffers that may have been
 162 * disabled with tracing_off.
 163 */
 164void tracing_on(void)
 165{
 166        set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
 167}
 168EXPORT_SYMBOL_GPL(tracing_on);
 169
 170/**
 171 * tracing_off - turn off all tracing buffers
 172 *
 173 * This function stops all tracing buffers from recording data.
 174 * It does not disable any overhead the tracers themselves may
 175 * be causing. This function simply causes all recording to
 176 * the ring buffers to fail.
 177 */
 178void tracing_off(void)
 179{
 180        clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
 181}
 182EXPORT_SYMBOL_GPL(tracing_off);
 183
 184/**
 185 * tracing_off_permanent - permanently disable ring buffers
 186 *
 187 * This function, once called, will disable all ring buffers
 188 * permanently.
 189 */
 190void tracing_off_permanent(void)
 191{
 192        set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
 193}
 194
 195/**
  196 * tracing_is_on - return the current on/off state of the ring buffers
 197 */
 198int tracing_is_on(void)
 199{
 200        return ring_buffer_flags == RB_BUFFERS_ON;
 201}
 202EXPORT_SYMBOL_GPL(tracing_is_on);
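/*
 * Illustrative usage (a sketch; do_something_noisy() is hypothetical):
 * a caller pausing all recording around a critical section. This flips
 * only layer 1 of the three layers described above; the per-buffer and
 * per-cpu enables are controlled separately.
 *
 *	tracing_off();
 *	do_something_noisy();
 *	tracing_on();
 */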
 203
 204#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 205#define RB_ALIGNMENT            4U
 206#define RB_MAX_SMALL_DATA       (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 207#define RB_EVNT_MIN_SIZE        8U      /* two 32bit words */
 208
 209/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
 210#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
 211
 212enum {
 213        RB_LEN_TIME_EXTEND = 8,
 214        RB_LEN_TIME_STAMP = 16,
 215};
 216
 217static inline int rb_null_event(struct ring_buffer_event *event)
 218{
 219        return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
 220}
 221
 222static void rb_event_set_padding(struct ring_buffer_event *event)
 223{
 224        /* padding has a NULL time_delta */
 225        event->type_len = RINGBUF_TYPE_PADDING;
 226        event->time_delta = 0;
 227}
 228
 229static unsigned
 230rb_event_data_length(struct ring_buffer_event *event)
 231{
 232        unsigned length;
 233
 234        if (event->type_len)
 235                length = event->type_len * RB_ALIGNMENT;
 236        else
 237                length = event->array[0];
 238        return length + RB_EVNT_HDR_SIZE;
 239}
 240
 241/* inline for ring buffer fast paths */
 242static unsigned
 243rb_event_length(struct ring_buffer_event *event)
 244{
 245        switch (event->type_len) {
 246        case RINGBUF_TYPE_PADDING:
 247                if (rb_null_event(event))
 248                        /* undefined */
 249                        return -1;
 250                return  event->array[0] + RB_EVNT_HDR_SIZE;
 251
 252        case RINGBUF_TYPE_TIME_EXTEND:
 253                return RB_LEN_TIME_EXTEND;
 254
 255        case RINGBUF_TYPE_TIME_STAMP:
 256                return RB_LEN_TIME_STAMP;
 257
 258        case RINGBUF_TYPE_DATA:
 259                return rb_event_data_length(event);
 260        default:
 261                BUG();
 262        }
 263        /* not hit */
 264        return 0;
 265}
 266
 267/**
 268 * ring_buffer_event_length - return the length of the event
 269 * @event: the event to get the length of
 270 */
 271unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 272{
 273        unsigned length = rb_event_length(event);
 274        if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 275                return length;
 276        length -= RB_EVNT_HDR_SIZE;
 277        if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
 278                length -= sizeof(event->array[0]);
 279        return length;
 280}
 281EXPORT_SYMBOL_GPL(ring_buffer_event_length);
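/*
 * Worked example (sketch): a 12-byte payload is encoded with
 * type_len = DIV_ROUND_UP(12, RB_ALIGNMENT) = 3, so rb_event_length()
 * returns 3 * 4 + RB_EVNT_HDR_SIZE = 16, and ring_buffer_event_length()
 * hands the original 12 bytes back to the caller. A payload larger
 * than RB_MAX_SMALL_DATA (112 bytes) is stored with type_len = 0 and
 * its length kept in array[0] instead.
 */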
 282
 283/* inline for ring buffer fast paths */
 284static void *
 285rb_event_data(struct ring_buffer_event *event)
 286{
 287        BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 288        /* If length is in len field, then array[0] has the data */
 289        if (event->type_len)
 290                return (void *)&event->array[0];
 291        /* Otherwise length is in array[0] and array[1] has the data */
 292        return (void *)&event->array[1];
 293}
 294
 295/**
 296 * ring_buffer_event_data - return the data of the event
 297 * @event: the event to get the data from
 298 */
 299void *ring_buffer_event_data(struct ring_buffer_event *event)
 300{
 301        return rb_event_data(event);
 302}
 303EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 304
 305#define for_each_buffer_cpu(buffer, cpu)                \
 306        for_each_cpu(cpu, buffer->cpumask)
 307
 308#define TS_SHIFT        27
 309#define TS_MASK         ((1ULL << TS_SHIFT) - 1)
 310#define TS_DELTA_TEST   (~TS_MASK)
 311
 312struct buffer_data_page {
 313        u64              time_stamp;    /* page time stamp */
 314        local_t          commit;        /* write committed index */
 315        unsigned char    data[];        /* data of buffer page */
 316};
 317
 318/*
 319 * Note, the buffer_page list must be first. The buffer pages
 320 * are allocated in cache lines, which means that each buffer
 321 * page will be at the beginning of a cache line, and thus
 322 * the least significant bits will be zero. We use this to
 323 * add flags in the list struct pointers, to make the ring buffer
 324 * lockless.
 325 */
 326struct buffer_page {
 327        struct list_head list;          /* list of buffer pages */
 328        local_t          write;         /* index for next write */
 329        unsigned         read;          /* index for next read */
 330        local_t          entries;       /* entries on this page */
 331        struct buffer_data_page *page;  /* Actual data page */
 332};
 333
 334/*
 335 * The buffer page counters, write and entries, must be reset
 336 * atomically when crossing page boundaries. To synchronize this
 337 * update, two counters are inserted into the number. One is
 338 * the actual counter for the write position or count on the page.
 339 *
 340 * The other is a counter of updaters. Before an update happens
 341 * the update partition of the counter is incremented. This will
 342 * allow the updater to update the counter atomically.
 343 *
  344 * The counter is 20 bits, and the updater count uses the other 12.
 345 */
 346#define RB_WRITE_MASK           0xfffff
 347#define RB_WRITE_INTCNT         (1 << 20)
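/*
 * Example (sketch): if two nested updaters have bumped the counter
 * while the write index on the page is 100, the raw value decomposes
 * as follows:
 *
 *	val = 2 * RB_WRITE_INTCNT + 100;
 *	val & RB_WRITE_MASK;	100  (write index)
 *	val >> 20;		2    (nested updaters)
 */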
 348
 349static void rb_init_page(struct buffer_data_page *bpage)
 350{
 351        local_set(&bpage->commit, 0);
 352}
 353
 354/**
 355 * ring_buffer_page_len - the size of data on the page.
 356 * @page: The page to read
 357 *
 358 * Returns the amount of data on the page, including buffer page header.
 359 */
 360size_t ring_buffer_page_len(void *page)
 361{
 362        return local_read(&((struct buffer_data_page *)page)->commit)
 363                + BUF_PAGE_HDR_SIZE;
 364}
 365
 366/*
 367 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
 368 * this issue out.
 369 */
 370static void free_buffer_page(struct buffer_page *bpage)
 371{
 372        free_page((unsigned long)bpage->page);
 373        kfree(bpage);
 374}
 375
 376/*
 377 * We need to fit the time_stamp delta into 27 bits.
 378 */
 379static inline int test_time_stamp(u64 delta)
 380{
 381        if (delta & TS_DELTA_TEST)
 382                return 1;
 383        return 0;
 384}
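/*
 * Example (sketch): with a nanosecond clock, 1 << 27 ns is roughly
 * 134 ms. Deltas beyond that cannot fit in the 27-bit time_delta
 * field and force a TIME_EXTEND event:
 *
 *	test_time_stamp(100 * 1000 * 1000);	0: 100 ms fits
 *	test_time_stamp(200 * 1000 * 1000);	1: needs TIME_EXTEND
 */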
 385
 386#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 387
  388/* Max payload is BUF_PAGE_SIZE - header (8 bytes) */
 389#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
 390
 391/* Max number of timestamps that can fit on a page */
 392#define RB_TIMESTAMPS_PER_PAGE  (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
 393
 394int ring_buffer_print_page_header(struct trace_seq *s)
 395{
 396        struct buffer_data_page field;
 397        int ret;
 398
 399        ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
 400                               "offset:0;\tsize:%u;\n",
 401                               (unsigned int)sizeof(field.time_stamp));
 402
 403        ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
 404                               "offset:%u;\tsize:%u;\n",
 405                               (unsigned int)offsetof(typeof(field), commit),
 406                               (unsigned int)sizeof(field.commit));
 407
 408        ret = trace_seq_printf(s, "\tfield: char data;\t"
 409                               "offset:%u;\tsize:%u;\n",
 410                               (unsigned int)offsetof(typeof(field), data),
 411                               (unsigned int)BUF_PAGE_SIZE);
 412
 413        return ret;
 414}
 415
 416/*
 417 * head_page == tail_page && head == tail then buffer is empty.
 418 */
 419struct ring_buffer_per_cpu {
 420        int                             cpu;
 421        struct ring_buffer              *buffer;
 422        spinlock_t                      reader_lock;    /* serialize readers */
 423        raw_spinlock_t                  lock;
 424        struct lock_class_key           lock_key;
 425        struct list_head                *pages;
 426        struct buffer_page              *head_page;     /* read from head */
 427        struct buffer_page              *tail_page;     /* write to tail */
 428        struct buffer_page              *commit_page;   /* committed pages */
 429        struct buffer_page              *reader_page;
 430        local_t                         commit_overrun;
 431        local_t                         overrun;
 432        local_t                         entries;
 433        local_t                         committing;
 434        local_t                         commits;
 435        unsigned long                   read;
 436        u64                             write_stamp;
 437        u64                             read_stamp;
 438        atomic_t                        record_disabled;
 439};
 440
 441struct ring_buffer {
 442        unsigned                        pages;
 443        unsigned                        flags;
 444        int                             cpus;
 445        atomic_t                        record_disabled;
 446        cpumask_var_t                   cpumask;
 447
 448        struct lock_class_key           *reader_lock_key;
 449
 450        struct mutex                    mutex;
 451
 452        struct ring_buffer_per_cpu      **buffers;
 453
 454#ifdef CONFIG_HOTPLUG_CPU
 455        struct notifier_block           cpu_notify;
 456#endif
 457        u64                             (*clock)(void);
 458};
 459
 460struct ring_buffer_iter {
 461        struct ring_buffer_per_cpu      *cpu_buffer;
 462        unsigned long                   head;
 463        struct buffer_page              *head_page;
 464        u64                             read_stamp;
 465};
 466
 467/* buffer may be either ring_buffer or ring_buffer_per_cpu */
 468#define RB_WARN_ON(b, cond)                                             \
 469        ({                                                              \
 470                int _____ret = unlikely(cond);                          \
 471                if (_____ret) {                                         \
 472                        if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
 473                                struct ring_buffer_per_cpu *__b =       \
 474                                        (void *)b;                      \
 475                                atomic_inc(&__b->buffer->record_disabled); \
 476                        } else                                          \
 477                                atomic_inc(&b->record_disabled);        \
 478                        WARN_ON(1);                                     \
 479                }                                                       \
 480                _____ret;                                               \
 481        })
 482
 483/* Up this if you want to test the TIME_EXTENTS and normalization */
 484#define DEBUG_SHIFT 0
 485
 486static inline u64 rb_time_stamp(struct ring_buffer *buffer)
 487{
 488        /* shift to debug/test normalization and TIME_EXTENTS */
 489        return buffer->clock() << DEBUG_SHIFT;
 490}
 491
 492u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
 493{
 494        u64 time;
 495
 496        preempt_disable_notrace();
 497        time = rb_time_stamp(buffer);
 498        preempt_enable_no_resched_notrace();
 499
 500        return time;
 501}
 502EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
 503
 504void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
 505                                      int cpu, u64 *ts)
 506{
  507        /* Just for testing the normalize function and deltas */
 508        *ts >>= DEBUG_SHIFT;
 509}
 510EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
 511
 512/*
 513 * Making the ring buffer lockless makes things tricky.
  514 * Writes only happen on the CPU they execute on, so writers
  515 * only need to worry about interrupts. Reads, however, can
  516 * happen on any CPU.
 517 *
 518 * The reader page is always off the ring buffer, but when the
 519 * reader finishes with a page, it needs to swap its page with
 520 * a new one from the buffer. The reader needs to take from
 521 * the head (writes go to the tail). But if a writer is in overwrite
 522 * mode and wraps, it must push the head page forward.
 523 *
 524 * Here lies the problem.
 525 *
 526 * The reader must be careful to replace only the head page, and
 527 * not another one. As described at the top of the file in the
 528 * ASCII art, the reader sets its old page to point to the next
 529 * page after head. It then sets the page after head to point to
 530 * the old reader page. But if the writer moves the head page
 531 * during this operation, the reader could end up with the tail.
 532 *
  533 * We use cmpxchg to help prevent this race. We also do something
  534 * special with the next pointer of the page before head: we set its LSB to 1.
 535 *
 536 * When the writer must push the page forward, it will clear the
 537 * bit that points to the head page, move the head, and then set
 538 * the bit that points to the new head page.
 539 *
  540 * We also don't want an interrupt coming in and moving the head
  541 * page out from under another writer. We use the second LSB to
  542 * catch that case too. Thus:
 543 *
 544 * head->list->prev->next        bit 1          bit 0
 545 *                              -------        -------
 546 * Normal page                     0              0
 547 * Points to head page             0              1
 548 * New head page                   1              0
 549 *
 550 * Note we can not trust the prev pointer of the head page, because:
 551 *
 552 * +----+       +-----+        +-----+
 553 * |    |------>|  T  |---X--->|  N  |
 554 * |    |<------|     |        |     |
 555 * +----+       +-----+        +-----+
 556 *   ^                           ^ |
 557 *   |          +-----+          | |
 558 *   +----------|  R  |----------+ |
 559 *              |     |<-----------+
 560 *              +-----+
 561 *
 562 * Key:  ---X-->  HEAD flag set in pointer
 563 *         T      Tail page
 564 *         R      Reader page
 565 *         N      Next page
 566 *
 567 * (see __rb_reserve_next() to see where this happens)
 568 *
 569 *  What the above shows is that the reader just swapped out
 570 *  the reader page with a page in the buffer, but before it
  571 *  could make the new head pointer point back to the new page added,
 572 *  it was preempted by a writer. The writer moved forward onto
 573 *  the new page added by the reader and is about to move forward
 574 *  again.
 575 *
  576 *  You can see it is legitimate for the previous pointer of
  577 *  the head (or any page) not to point back to the true
  578 *  previous page. But only temporarily.
 579 */
 580
 581#define RB_PAGE_NORMAL          0UL
 582#define RB_PAGE_HEAD            1UL
 583#define RB_PAGE_UPDATE          2UL
 584
 585
 586#define RB_FLAG_MASK            3UL
 587
 588/* PAGE_MOVED is not part of the mask */
 589#define RB_PAGE_MOVED           4UL
 590
 591/*
  592 * rb_list_head - strip the flag bits from a list pointer
 593 */
 594static struct list_head *rb_list_head(struct list_head *list)
 595{
 596        unsigned long val = (unsigned long)list;
 597
 598        return (struct list_head *)(val & ~RB_FLAG_MASK);
 599}
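/*
 * Example (sketch): a next pointer that designates the head page
 * carries RB_PAGE_HEAD in its low bits; rb_list_head() strips it:
 *
 *	struct list_head *next = (struct list_head *)
 *		((unsigned long)&bpage->list | RB_PAGE_HEAD);
 *
 *	rb_list_head(next);	== &bpage->list
 */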
 600
 601/*
 602 * rb_is_head_page - test if the given page is the head page
 603 *
 604 * Because the reader may move the head_page pointer, we can
 605 * not trust what the head page is (it may be pointing to
  606 * the reader page). But if the pointer to the next page has
  607 * its flag bits set, the next page is the head page.
 608 */
  609static inline int
 610rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
 611                struct buffer_page *page, struct list_head *list)
 612{
 613        unsigned long val;
 614
 615        val = (unsigned long)list->next;
 616
 617        if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
 618                return RB_PAGE_MOVED;
 619
 620        return val & RB_FLAG_MASK;
 621}
 622
 623/*
 624 * rb_is_reader_page
 625 *
  626 * The unique thing about the reader page is that, if the
 627 * writer is ever on it, the previous pointer never points
 628 * back to the reader page.
 629 */
 630static int rb_is_reader_page(struct buffer_page *page)
 631{
 632        struct list_head *list = page->list.prev;
 633
 634        return rb_list_head(list->next) != &page->list;
 635}
 636
 637/*
 638 * rb_set_list_to_head - set a list_head to be pointing to head.
 639 */
 640static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
 641                                struct list_head *list)
 642{
 643        unsigned long *ptr;
 644
 645        ptr = (unsigned long *)&list->next;
 646        *ptr |= RB_PAGE_HEAD;
 647        *ptr &= ~RB_PAGE_UPDATE;
 648}
 649
 650/*
 651 * rb_head_page_activate - sets up head page
 652 */
 653static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
 654{
 655        struct buffer_page *head;
 656
 657        head = cpu_buffer->head_page;
 658        if (!head)
 659                return;
 660
 661        /*
 662         * Set the previous list pointer to have the HEAD flag.
 663         */
 664        rb_set_list_to_head(cpu_buffer, head->list.prev);
 665}
 666
 667static void rb_list_head_clear(struct list_head *list)
 668{
 669        unsigned long *ptr = (unsigned long *)&list->next;
 670
 671        *ptr &= ~RB_FLAG_MASK;
 672}
 673
 674/*
  675 * rb_head_page_deactivate - clears head page ptr (for free list)
 676 */
 677static void
 678rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
 679{
 680        struct list_head *hd;
 681
 682        /* Go through the whole list and clear any pointers found. */
 683        rb_list_head_clear(cpu_buffer->pages);
 684
 685        list_for_each(hd, cpu_buffer->pages)
 686                rb_list_head_clear(hd);
 687}
 688
 689static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
 690                            struct buffer_page *head,
 691                            struct buffer_page *prev,
 692                            int old_flag, int new_flag)
 693{
 694        struct list_head *list;
 695        unsigned long val = (unsigned long)&head->list;
 696        unsigned long ret;
 697
 698        list = &prev->list;
 699
 700        val &= ~RB_FLAG_MASK;
 701
 702        ret = cmpxchg((unsigned long *)&list->next,
 703                      val | old_flag, val | new_flag);
 704
 705        /* check if the reader took the page */
 706        if ((ret & ~RB_FLAG_MASK) != val)
 707                return RB_PAGE_MOVED;
 708
 709        return ret & RB_FLAG_MASK;
 710}
 711
 712static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
 713                                   struct buffer_page *head,
 714                                   struct buffer_page *prev,
 715                                   int old_flag)
 716{
 717        return rb_head_page_set(cpu_buffer, head, prev,
 718                                old_flag, RB_PAGE_UPDATE);
 719}
 720
 721static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
 722                                 struct buffer_page *head,
 723                                 struct buffer_page *prev,
 724                                 int old_flag)
 725{
 726        return rb_head_page_set(cpu_buffer, head, prev,
 727                                old_flag, RB_PAGE_HEAD);
 728}
 729
 730static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
 731                                   struct buffer_page *head,
 732                                   struct buffer_page *prev,
 733                                   int old_flag)
 734{
 735        return rb_head_page_set(cpu_buffer, head, prev,
 736                                old_flag, RB_PAGE_NORMAL);
 737}
 738
 739static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
 740                               struct buffer_page **bpage)
 741{
 742        struct list_head *p = rb_list_head((*bpage)->list.next);
 743
 744        *bpage = list_entry(p, struct buffer_page, list);
 745}
 746
 747static struct buffer_page *
 748rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
 749{
 750        struct buffer_page *head;
 751        struct buffer_page *page;
 752        struct list_head *list;
 753        int i;
 754
 755        if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
 756                return NULL;
 757
 758        /* sanity check */
 759        list = cpu_buffer->pages;
 760        if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
 761                return NULL;
 762
 763        page = head = cpu_buffer->head_page;
 764        /*
  765         * It is possible that the writer moves the head page behind
  766         * where we started, and we miss it in one loop.
  767         * A second loop should grab the head page, but we'll do
  768         * three loops just because I'm paranoid.
 769         */
 770        for (i = 0; i < 3; i++) {
 771                do {
 772                        if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
 773                                cpu_buffer->head_page = page;
 774                                return page;
 775                        }
 776                        rb_inc_page(cpu_buffer, &page);
 777                } while (page != head);
 778        }
 779
 780        RB_WARN_ON(cpu_buffer, 1);
 781
 782        return NULL;
 783}
 784
 785static int rb_head_page_replace(struct buffer_page *old,
 786                                struct buffer_page *new)
 787{
 788        unsigned long *ptr = (unsigned long *)&old->list.prev->next;
 789        unsigned long val;
 790        unsigned long ret;
 791
 792        val = *ptr & ~RB_FLAG_MASK;
 793        val |= RB_PAGE_HEAD;
 794
 795        ret = cmpxchg(ptr, val, (unsigned long)&new->list);
 796
 797        return ret == val;
 798}
 799
 800/*
 801 * rb_tail_page_update - move the tail page forward
 802 *
  803 * Returns 1 if we moved the tail page, 0 if someone else did.
 804 */
 805static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
 806                               struct buffer_page *tail_page,
 807                               struct buffer_page *next_page)
 808{
 809        struct buffer_page *old_tail;
 810        unsigned long old_entries;
 811        unsigned long old_write;
 812        int ret = 0;
 813
 814        /*
 815         * The tail page now needs to be moved forward.
 816         *
  817         * We need to reset the tail page, but without erasing
  818         * data brought in by interrupts that have moved the tail
  819         * page and are currently writing to it.
 820         *
 821         * We add a counter to the write field to denote this.
 822         */
 823        old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
 824        old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
 825
 826        /*
 827         * Just make sure we have seen our old_write and synchronize
 828         * with any interrupts that come in.
 829         */
 830        barrier();
 831
 832        /*
 833         * If the tail page is still the same as what we think
 834         * it is, then it is up to us to update the tail
 835         * pointer.
 836         */
 837        if (tail_page == cpu_buffer->tail_page) {
 838                /* Zero the write counter */
 839                unsigned long val = old_write & ~RB_WRITE_MASK;
 840                unsigned long eval = old_entries & ~RB_WRITE_MASK;
 841
 842                /*
  843                 * This will only succeed if an interrupt did
  844                 * not come in and change it; in that case, we
  845                 * do not want to modify it.
 846                 *
 847                 * We add (void) to let the compiler know that we do not care
 848                 * about the return value of these functions. We use the
 849                 * cmpxchg to only update if an interrupt did not already
 850                 * do it for us. If the cmpxchg fails, we don't care.
 851                 */
 852                (void)local_cmpxchg(&next_page->write, old_write, val);
 853                (void)local_cmpxchg(&next_page->entries, old_entries, eval);
 854
 855                /*
  856                 * No need to worry about races with clearing out the commit:
  857                 * it can only increment when a commit takes place. But that
  858                 * only happens in the outermost nested commit.
 859                 */
 860                local_set(&next_page->page->commit, 0);
 861
 862                old_tail = cmpxchg(&cpu_buffer->tail_page,
 863                                   tail_page, next_page);
 864
 865                if (old_tail == tail_page)
 866                        ret = 1;
 867        }
 868
 869        return ret;
 870}
 871
 872static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
 873                          struct buffer_page *bpage)
 874{
 875        unsigned long val = (unsigned long)bpage;
 876
 877        if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
 878                return 1;
 879
 880        return 0;
 881}
 882
 883/**
  884 * rb_check_list - make sure a list pointer has its flag bits clear
 885 */
 886static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
 887                         struct list_head *list)
 888{
 889        if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
 890                return 1;
 891        if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
 892                return 1;
 893        return 0;
 894}
 895
 896/**
  897 * rb_check_pages - integrity check of buffer pages
 898 * @cpu_buffer: CPU buffer with pages to test
 899 *
 900 * As a safety measure we check to make sure the data pages have not
 901 * been corrupted.
 902 */
 903static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
 904{
 905        struct list_head *head = cpu_buffer->pages;
 906        struct buffer_page *bpage, *tmp;
 907
 908        rb_head_page_deactivate(cpu_buffer);
 909
 910        if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
 911                return -1;
 912        if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
 913                return -1;
 914
 915        if (rb_check_list(cpu_buffer, head))
 916                return -1;
 917
 918        list_for_each_entry_safe(bpage, tmp, head, list) {
 919                if (RB_WARN_ON(cpu_buffer,
 920                               bpage->list.next->prev != &bpage->list))
 921                        return -1;
 922                if (RB_WARN_ON(cpu_buffer,
 923                               bpage->list.prev->next != &bpage->list))
 924                        return -1;
 925                if (rb_check_list(cpu_buffer, &bpage->list))
 926                        return -1;
 927        }
 928
 929        rb_head_page_activate(cpu_buffer);
 930
 931        return 0;
 932}
 933
 934static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 935                             unsigned nr_pages)
 936{
 937        struct buffer_page *bpage, *tmp;
 938        unsigned long addr;
 939        LIST_HEAD(pages);
 940        unsigned i;
 941
 942        WARN_ON(!nr_pages);
 943
 944        for (i = 0; i < nr_pages; i++) {
 945                bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 946                                    GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
 947                if (!bpage)
 948                        goto free_pages;
 949
 950                rb_check_bpage(cpu_buffer, bpage);
 951
 952                list_add(&bpage->list, &pages);
 953
 954                addr = __get_free_page(GFP_KERNEL);
 955                if (!addr)
 956                        goto free_pages;
 957                bpage->page = (void *)addr;
 958                rb_init_page(bpage->page);
 959        }
 960
 961        /*
 962         * The ring buffer page list is a circular list that does not
 963         * start and end with a list head. All page list items point to
 964         * other pages.
 965         */
 966        cpu_buffer->pages = pages.next;
 967        list_del(&pages);
 968
 969        rb_check_pages(cpu_buffer);
 970
 971        return 0;
 972
 973 free_pages:
 974        list_for_each_entry_safe(bpage, tmp, &pages, list) {
 975                list_del_init(&bpage->list);
 976                free_buffer_page(bpage);
 977        }
 978        return -ENOMEM;
 979}
 980
 981static struct ring_buffer_per_cpu *
 982rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 983{
 984        struct ring_buffer_per_cpu *cpu_buffer;
 985        struct buffer_page *bpage;
 986        unsigned long addr;
 987        int ret;
 988
 989        cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
 990                                  GFP_KERNEL, cpu_to_node(cpu));
 991        if (!cpu_buffer)
 992                return NULL;
 993
 994        cpu_buffer->cpu = cpu;
 995        cpu_buffer->buffer = buffer;
 996        spin_lock_init(&cpu_buffer->reader_lock);
 997        lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
 998        cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 999
1000        bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1001                            GFP_KERNEL, cpu_to_node(cpu));
1002        if (!bpage)
1003                goto fail_free_buffer;
1004
1005        rb_check_bpage(cpu_buffer, bpage);
1006
1007        cpu_buffer->reader_page = bpage;
1008        addr = __get_free_page(GFP_KERNEL);
1009        if (!addr)
1010                goto fail_free_reader;
1011        bpage->page = (void *)addr;
1012        rb_init_page(bpage->page);
1013
1014        INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1015
1016        ret = rb_allocate_pages(cpu_buffer, buffer->pages);
1017        if (ret < 0)
1018                goto fail_free_reader;
1019
1020        cpu_buffer->head_page
1021                = list_entry(cpu_buffer->pages, struct buffer_page, list);
1022        cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
1023
1024        rb_head_page_activate(cpu_buffer);
1025
1026        return cpu_buffer;
1027
1028 fail_free_reader:
1029        free_buffer_page(cpu_buffer->reader_page);
1030
1031 fail_free_buffer:
1032        kfree(cpu_buffer);
1033        return NULL;
1034}
1035
1036static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
1037{
1038        struct list_head *head = cpu_buffer->pages;
1039        struct buffer_page *bpage, *tmp;
1040
1041        free_buffer_page(cpu_buffer->reader_page);
1042
1043        rb_head_page_deactivate(cpu_buffer);
1044
1045        if (head) {
1046                list_for_each_entry_safe(bpage, tmp, head, list) {
1047                        list_del_init(&bpage->list);
1048                        free_buffer_page(bpage);
1049                }
1050                bpage = list_entry(head, struct buffer_page, list);
1051                free_buffer_page(bpage);
1052        }
1053
1054        kfree(cpu_buffer);
1055}
1056
1057#ifdef CONFIG_HOTPLUG_CPU
1058static int rb_cpu_notify(struct notifier_block *self,
1059                         unsigned long action, void *hcpu);
1060#endif
1061
1062/**
1063 * ring_buffer_alloc - allocate a new ring_buffer
1064 * @size: the size in bytes per cpu that is needed.
1065 * @flags: attributes to set for the ring buffer.
1066 *
1067 * Currently the only flag that is available is the RB_FL_OVERWRITE
1068 * flag. This flag means that the buffer will overwrite old data
1069 * when the buffer wraps. If this flag is not set, the buffer will
1070 * drop data when the tail hits the head.
1071 */
1072struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1073                                        struct lock_class_key *key)
1074{
1075        struct ring_buffer *buffer;
1076        int bsize;
1077        int cpu;
1078
1079        /* keep it in its own cache line */
1080        buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
1081                         GFP_KERNEL);
1082        if (!buffer)
1083                return NULL;
1084
1085        if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
1086                goto fail_free_buffer;
1087
1088        buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1089        buffer->flags = flags;
1090        buffer->clock = trace_clock_local;
1091        buffer->reader_lock_key = key;
1092
1093        /* need at least two pages */
1094        if (buffer->pages < 2)
1095                buffer->pages = 2;
1096
1097        /*
 1098         * In the non-hotplug-cpu case, if the ring buffer is allocated
 1099         * in an early initcall, it will not be notified of secondary cpus.
 1100         * In that case, we need to allocate for all possible cpus.
1101         */
1102#ifdef CONFIG_HOTPLUG_CPU
1103        get_online_cpus();
1104        cpumask_copy(buffer->cpumask, cpu_online_mask);
1105#else
1106        cpumask_copy(buffer->cpumask, cpu_possible_mask);
1107#endif
1108        buffer->cpus = nr_cpu_ids;
1109
1110        bsize = sizeof(void *) * nr_cpu_ids;
1111        buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
1112                                  GFP_KERNEL);
1113        if (!buffer->buffers)
1114                goto fail_free_cpumask;
1115
1116        for_each_buffer_cpu(buffer, cpu) {
1117                buffer->buffers[cpu] =
1118                        rb_allocate_cpu_buffer(buffer, cpu);
1119                if (!buffer->buffers[cpu])
1120                        goto fail_free_buffers;
1121        }
1122
1123#ifdef CONFIG_HOTPLUG_CPU
1124        buffer->cpu_notify.notifier_call = rb_cpu_notify;
1125        buffer->cpu_notify.priority = 0;
1126        register_cpu_notifier(&buffer->cpu_notify);
1127#endif
1128
1129        put_online_cpus();
1130        mutex_init(&buffer->mutex);
1131
1132        return buffer;
1133
1134 fail_free_buffers:
1135        for_each_buffer_cpu(buffer, cpu) {
1136                if (buffer->buffers[cpu])
1137                        rb_free_cpu_buffer(buffer->buffers[cpu]);
1138        }
1139        kfree(buffer->buffers);
1140
1141 fail_free_cpumask:
1142        free_cpumask_var(buffer->cpumask);
1143        put_online_cpus();
1144
1145 fail_free_buffer:
1146        kfree(buffer);
1147        return NULL;
1148}
1149EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
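/*
 * Illustrative usage (a sketch): callers normally go through the
 * ring_buffer_alloc() wrapper in linux/ring_buffer.h, which supplies
 * the lock_class_key for lockdep.
 *
 *	struct ring_buffer *rb;
 *
 *	rb = ring_buffer_alloc(64 * 1024, RB_FL_OVERWRITE);
 *	if (!rb)
 *		return -ENOMEM;
 *	...
 *	ring_buffer_free(rb);
 */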
1150
1151/**
1152 * ring_buffer_free - free a ring buffer.
1153 * @buffer: the buffer to free.
1154 */
1155void
1156ring_buffer_free(struct ring_buffer *buffer)
1157{
1158        int cpu;
1159
1160        get_online_cpus();
1161
1162#ifdef CONFIG_HOTPLUG_CPU
1163        unregister_cpu_notifier(&buffer->cpu_notify);
1164#endif
1165
1166        for_each_buffer_cpu(buffer, cpu)
1167                rb_free_cpu_buffer(buffer->buffers[cpu]);
1168
1169        put_online_cpus();
1170
1171        kfree(buffer->buffers);
1172        free_cpumask_var(buffer->cpumask);
1173
1174        kfree(buffer);
1175}
1176EXPORT_SYMBOL_GPL(ring_buffer_free);
1177
1178void ring_buffer_set_clock(struct ring_buffer *buffer,
1179                           u64 (*clock)(void))
1180{
1181        buffer->clock = clock;
1182}
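/*
 * Sketch: a tracer that needs cross-cpu ordering can substitute a
 * global clock (trace_clock_global() is declared in
 * linux/trace_clock.h, already included above):
 *
 *	ring_buffer_set_clock(rb, trace_clock_global);
 */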
1183
1184static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
1185
1186static void
1187rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1188{
1189        struct buffer_page *bpage;
1190        struct list_head *p;
1191        unsigned i;
1192
1193        atomic_inc(&cpu_buffer->record_disabled);
1194        synchronize_sched();
1195
1196        spin_lock_irq(&cpu_buffer->reader_lock);
1197        rb_head_page_deactivate(cpu_buffer);
1198
1199        for (i = 0; i < nr_pages; i++) {
1200                if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1201                        return;
1202                p = cpu_buffer->pages->next;
1203                bpage = list_entry(p, struct buffer_page, list);
1204                list_del_init(&bpage->list);
1205                free_buffer_page(bpage);
1206        }
1207        if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1208                return;
1209
1210        rb_reset_cpu(cpu_buffer);
1211        spin_unlock_irq(&cpu_buffer->reader_lock);
1212
1213        rb_check_pages(cpu_buffer);
1214
1215        atomic_dec(&cpu_buffer->record_disabled);
1216
1217}
1218
1219static void
1220rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1221                struct list_head *pages, unsigned nr_pages)
1222{
1223        struct buffer_page *bpage;
1224        struct list_head *p;
1225        unsigned i;
1226
1227        atomic_inc(&cpu_buffer->record_disabled);
1228        synchronize_sched();
1229
1230        spin_lock_irq(&cpu_buffer->reader_lock);
1231        rb_head_page_deactivate(cpu_buffer);
1232
1233        for (i = 0; i < nr_pages; i++) {
1234                if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1235                        return;
1236                p = pages->next;
1237                bpage = list_entry(p, struct buffer_page, list);
1238                list_del_init(&bpage->list);
1239                list_add_tail(&bpage->list, cpu_buffer->pages);
1240        }
1241        rb_reset_cpu(cpu_buffer);
1242        spin_unlock_irq(&cpu_buffer->reader_lock);
1243
1244        rb_check_pages(cpu_buffer);
1245
1246        atomic_dec(&cpu_buffer->record_disabled);
1247}
1248
1249/**
1250 * ring_buffer_resize - resize the ring buffer
1251 * @buffer: the buffer to resize.
1252 * @size: the new size.
1253 *
1254 * The tracer is responsible for making sure that the buffer is
1255 * not being used while changing the size.
1256 * Note: We may be able to change the above requirement by using
1257 *  RCU synchronizations.
1258 *
1259 * Minimum size is 2 * BUF_PAGE_SIZE.
1260 *
1261 * Returns -1 on failure.
1262 */
1263int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1264{
1265        struct ring_buffer_per_cpu *cpu_buffer;
1266        unsigned nr_pages, rm_pages, new_pages;
1267        struct buffer_page *bpage, *tmp;
1268        unsigned long buffer_size;
1269        unsigned long addr;
1270        LIST_HEAD(pages);
1271        int i, cpu;
1272
1273        /*
1274         * Always succeed at resizing a non-existent buffer:
1275         */
1276        if (!buffer)
1277                return size;
1278
1279        size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1280        size *= BUF_PAGE_SIZE;
1281        buffer_size = buffer->pages * BUF_PAGE_SIZE;
1282
1283        /* we need a minimum of two pages */
1284        if (size < BUF_PAGE_SIZE * 2)
1285                size = BUF_PAGE_SIZE * 2;
1286
1287        if (size == buffer_size)
1288                return size;
1289
1290        mutex_lock(&buffer->mutex);
1291        get_online_cpus();
1292
1293        nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1294
1295        if (size < buffer_size) {
1296
1297                /* easy case, just free pages */
1298                if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
1299                        goto out_fail;
1300
1301                rm_pages = buffer->pages - nr_pages;
1302
1303                for_each_buffer_cpu(buffer, cpu) {
1304                        cpu_buffer = buffer->buffers[cpu];
1305                        rb_remove_pages(cpu_buffer, rm_pages);
1306                }
1307                goto out;
1308        }
1309
1310        /*
1311         * This is a bit more difficult. We only want to add pages
1312         * when we can allocate enough for all CPUs. We do this
1313         * by allocating all the pages and storing them on a local
 1314         * linked list. If we succeed in our allocation, then we
 1315         * add these pages to the cpu_buffers. Otherwise we just free
 1316         * them all and return -ENOMEM.
1317         */
1318        if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
1319                goto out_fail;
1320
1321        new_pages = nr_pages - buffer->pages;
1322
1323        for_each_buffer_cpu(buffer, cpu) {
1324                for (i = 0; i < new_pages; i++) {
1325                        bpage = kzalloc_node(ALIGN(sizeof(*bpage),
1326                                                  cache_line_size()),
1327                                            GFP_KERNEL, cpu_to_node(cpu));
1328                        if (!bpage)
1329                                goto free_pages;
1330                        list_add(&bpage->list, &pages);
1331                        addr = __get_free_page(GFP_KERNEL);
1332                        if (!addr)
1333                                goto free_pages;
1334                        bpage->page = (void *)addr;
1335                        rb_init_page(bpage->page);
1336                }
1337        }
1338
1339        for_each_buffer_cpu(buffer, cpu) {
1340                cpu_buffer = buffer->buffers[cpu];
1341                rb_insert_pages(cpu_buffer, &pages, new_pages);
1342        }
1343
1344        if (RB_WARN_ON(buffer, !list_empty(&pages)))
1345                goto out_fail;
1346
1347 out:
1348        buffer->pages = nr_pages;
1349        put_online_cpus();
1350        mutex_unlock(&buffer->mutex);
1351
1352        return size;
1353
1354 free_pages:
1355        list_for_each_entry_safe(bpage, tmp, &pages, list) {
1356                list_del_init(&bpage->list);
1357                free_buffer_page(bpage);
1358        }
1359        put_online_cpus();
1360        mutex_unlock(&buffer->mutex);
1361        return -ENOMEM;
1362
1363        /*
1364         * Something went totally wrong, and we are too paranoid
1365         * to even clean up the mess.
1366         */
1367 out_fail:
1368        put_online_cpus();
1369        mutex_unlock(&buffer->mutex);
1370        return -1;
1371}
1372EXPORT_SYMBOL_GPL(ring_buffer_resize);
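/*
 * Illustrative usage (sketch): growing each per-cpu buffer to 1 MB.
 * The value returned is the actual (page-rounded) size on success, or
 * a negative value on failure, in which case the old size is kept.
 *
 *	ret = ring_buffer_resize(rb, 1024 * 1024);
 *	if (ret < 0)
 *		return ret;
 */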
1373
1374static inline void *
1375__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1376{
1377        return bpage->data + index;
1378}
1379
1380static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
1381{
1382        return bpage->page->data + index;
1383}
1384
1385static inline struct ring_buffer_event *
1386rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
1387{
1388        return __rb_page_index(cpu_buffer->reader_page,
1389                               cpu_buffer->reader_page->read);
1390}
1391
1392static inline struct ring_buffer_event *
1393rb_iter_head_event(struct ring_buffer_iter *iter)
1394{
1395        return __rb_page_index(iter->head_page, iter->head);
1396}
1397
1398static inline unsigned long rb_page_write(struct buffer_page *bpage)
1399{
1400        return local_read(&bpage->write) & RB_WRITE_MASK;
1401}
1402
1403static inline unsigned rb_page_commit(struct buffer_page *bpage)
1404{
1405        return local_read(&bpage->page->commit);
1406}
1407
1408static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1409{
1410        return local_read(&bpage->entries) & RB_WRITE_MASK;
1411}
1412
 1413/* Size is determined by what has been committed */
1414static inline unsigned rb_page_size(struct buffer_page *bpage)
1415{
1416        return rb_page_commit(bpage);
1417}
1418
1419static inline unsigned
1420rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
1421{
1422        return rb_page_commit(cpu_buffer->commit_page);
1423}
1424
1425static inline unsigned
1426rb_event_index(struct ring_buffer_event *event)
1427{
1428        unsigned long addr = (unsigned long)event;
1429
1430        return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
1431}
1432
1433static inline int
1434rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1435                   struct ring_buffer_event *event)
1436{
1437        unsigned long addr = (unsigned long)event;
1438        unsigned long index;
1439
1440        index = rb_event_index(event);
1441        addr &= PAGE_MASK;
1442
1443        return cpu_buffer->commit_page->page == (void *)addr &&
1444                rb_commit_index(cpu_buffer) == index;
1445}
1446
1447static void
1448rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1449{
1450        unsigned long max_count;
1451
1452        /*
1453         * We only race with interrupts and NMIs on this CPU.
1454         * If we own the commit event, then we can commit
1455         * all others that interrupted us, since the interruptions
1456         * are in stack format (they finish before they come
1457         * back to us). This allows us to do a simple loop to
1458         * assign the commit to the tail.
1459         */
1460 again:
1461        max_count = cpu_buffer->buffer->pages * 100;
1462
1463        while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1464                if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1465                        return;
1466                if (RB_WARN_ON(cpu_buffer,
1467                               rb_is_reader_page(cpu_buffer->tail_page)))
1468                        return;
1469                local_set(&cpu_buffer->commit_page->page->commit,
1470                          rb_page_write(cpu_buffer->commit_page));
1471                rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1472                cpu_buffer->write_stamp =
1473                        cpu_buffer->commit_page->page->time_stamp;
1474                /* add barrier to keep gcc from optimizing too much */
1475                barrier();
1476        }
1477        while (rb_commit_index(cpu_buffer) !=
1478               rb_page_write(cpu_buffer->commit_page)) {
1479
1480                local_set(&cpu_buffer->commit_page->page->commit,
1481                          rb_page_write(cpu_buffer->commit_page));
1482                RB_WARN_ON(cpu_buffer,
1483                           local_read(&cpu_buffer->commit_page->page->commit) &
1484                           ~RB_WRITE_MASK);
1485                barrier();
1486        }
1487
1488        /* again, keep gcc from optimizing */
1489        barrier();
1490
1491        /*
1492         * If an interrupt came in just after the first while loop
1493         * and pushed the tail page forward, we will be left with
1494         * a dangling commit that will never go forward.
1495         */
1496        if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
1497                goto again;
1498}
1499
1500static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1501{
1502        cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
1503        cpu_buffer->reader_page->read = 0;
1504}
1505
1506static void rb_inc_iter(struct ring_buffer_iter *iter)
1507{
1508        struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1509
1510        /*
1511         * The iterator could be on the reader page (it starts there).
1512         * But the head could have moved, since the reader was
1513         * found. Check for this case and assign the iterator
1514         * to the head page instead of next.
1515         */
1516        if (iter->head_page == cpu_buffer->reader_page)
1517                iter->head_page = rb_set_head_page(cpu_buffer);
1518        else
1519                rb_inc_page(cpu_buffer, &iter->head_page);
1520
1521        iter->read_stamp = iter->head_page->page->time_stamp;
1522        iter->head = 0;
1523}
1524
1525/**
 1526 * rb_update_event - update event type and data
 1527 * @event: the event to update
1528 * @type: the type of event
1529 * @length: the size of the event field in the ring buffer
1530 *
1531 * Update the type and data fields of the event. The length
1532 * is the actual size that is written to the ring buffer,
1533 * and with this, we can determine what to place into the
1534 * data field.
1535 */
1536static void
1537rb_update_event(struct ring_buffer_event *event,
1538                         unsigned type, unsigned length)
1539{
1540        event->type_len = type;
1541
1542        switch (type) {
1543
1544        case RINGBUF_TYPE_PADDING:
1545        case RINGBUF_TYPE_TIME_EXTEND:
1546        case RINGBUF_TYPE_TIME_STAMP:
1547                break;
1548
1549        case 0:
1550                length -= RB_EVNT_HDR_SIZE;
1551                if (length > RB_MAX_SMALL_DATA)
1552                        event->array[0] = length;
1553                else
1554                        event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1555                break;
1556        default:
1557                BUG();
1558        }
1559}
1560
1561/*
1562 * rb_handle_head_page - writer hit the head page
1563 *
1564 * Returns: +1 to retry page
1565 *           0 to continue
1566 *          -1 on error
1567 */
1568static int
1569rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1570                    struct buffer_page *tail_page,
1571                    struct buffer_page *next_page)
1572{
1573        struct buffer_page *new_head;
1574        int entries;
1575        int type;
1576        int ret;
1577
1578        entries = rb_page_entries(next_page);
1579
1580        /*
1581         * The hard part is here. We need to move the head
1582         * forward, and protect against both readers on
1583         * other CPUs and writers coming in via interrupts.
1584         */
1585        type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
1586                                       RB_PAGE_HEAD);
1587
1588        /*
1589         * type can be one of four:
1590         *  NORMAL - an interrupt already moved it for us
1591         *  HEAD   - we are the first to get here.
1592         *  UPDATE - we are the interrupt interrupting
1593         *           a current move.
1594         *  MOVED  - a reader on another CPU moved the next
1595         *           pointer to its reader page. Give up
1596         *           and try again.
1597         */
1598
1599        switch (type) {
1600        case RB_PAGE_HEAD:
1601                /*
1602                 * We changed the head to UPDATE, thus
1603                 * it is our responsibility to update
1604                 * the counters.
1605                 */
1606                local_add(entries, &cpu_buffer->overrun);
1607
1608                /*
1609                 * The entries will be zeroed out when we move the
1610                 * tail page.
1611                 */
1612
1613                /* still more to do */
1614                break;
1615
1616        case RB_PAGE_UPDATE:
1617                /*
1618                 * This is an interrupt that interrupted the
1619                 * previous update. Still more to do.
1620                 */
1621                break;
1622        case RB_PAGE_NORMAL:
1623                /*
1624                 * An interrupt came in before the update
1625                 * and processed this for us.
1626                 * Nothing left to do.
1627                 */
1628                return 1;
1629        case RB_PAGE_MOVED:
1630                /*
1631                 * The reader is on another CPU and just did
1632                 * a swap with our next_page.
1633                 * Try again.
1634                 */
1635                return 1;
1636        default:
1637                RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
1638                return -1;
1639        }
1640
1641        /*
1642         * Now that we are here, the old head pointer is
1643         * set to UPDATE. This will keep the reader from
1644         * swapping the head page with the reader page.
1645         * The reader (on another CPU) will spin till
1646         * we are finished.
1647         *
1648         * We just need to protect against interrupts
1649         * doing the job. We will set the next pointer
1650         * to HEAD. After that, we set the old pointer
1651         * to NORMAL, but only if it was HEAD before;
1652         * otherwise we are in an interrupt, and only
1653         * want the outermost commit to reset it.
1654         */
1655        new_head = next_page;
1656        rb_inc_page(cpu_buffer, &new_head);
1657
1658        ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
1659                                    RB_PAGE_NORMAL);
1660
1661        /*
1662         * Valid returns are:
1663         *  HEAD   - an interrupt came in and already set it.
1664         *  NORMAL - One of two things:
1665         *            1) We really set it.
1666         *            2) A bunch of interrupts came in and moved
1667         *               the page forward again.
1668         */
1669        switch (ret) {
1670        case RB_PAGE_HEAD:
1671        case RB_PAGE_NORMAL:
1672                /* OK */
1673                break;
1674        default:
1675                RB_WARN_ON(cpu_buffer, 1);
1676                return -1;
1677        }
1678
1679        /*
1680         * It is possible that an interrupt came in,
1681         * set the head up, then more interrupts came in
1682         * and moved it again. When we get back here,
1683         * the page would have been set to NORMAL but we
1684         * just set it back to HEAD.
1685         *
1686         * How do you detect this? Well, if that happened
1687         * the tail page would have moved.
1688         */
1689        if (ret == RB_PAGE_NORMAL) {
1690                /*
1691                 * If the tail had moved past next, then we need
1692                 * to reset the pointer.
1693                 */
1694                if (cpu_buffer->tail_page != tail_page &&
1695                    cpu_buffer->tail_page != next_page)
1696                        rb_head_page_set_normal(cpu_buffer, new_head,
1697                                                next_page,
1698                                                RB_PAGE_HEAD);
1699        }
1700
1701        /*
1702         * If this was the outermost commit (the one that
1703         * changed the original pointer from HEAD to UPDATE),
1704         * then it is up to us to reset it to NORMAL.
1705         */
1706        if (type == RB_PAGE_HEAD) {
1707                ret = rb_head_page_set_normal(cpu_buffer, next_page,
1708                                              tail_page,
1709                                              RB_PAGE_UPDATE);
1710                if (RB_WARN_ON(cpu_buffer,
1711                               ret != RB_PAGE_UPDATE))
1712                        return -1;
1713        }
1714
1715        return 0;
1716}
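
/*
 * The three-way return above is consumed as a small retry protocol;
 * rb_move_tail() below is the real caller.  A condensed sketch of the
 * pattern, with tail_page/next_page assumed to be in scope:
 *
 *	ret = rb_handle_head_page(cpu_buffer, tail_page, next_page);
 *	if (ret < 0)
 *		goto out_reset;		- corruption, give up the write
 *	if (ret)
 *		goto out_again;		- page moved under us, start over
 *	- ret == 0: the head moved, keep pushing the tail forward
 */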
1717
1718static unsigned rb_calculate_event_length(unsigned length)
1719{
1720        struct ring_buffer_event event; /* Used only for sizeof array */
1721
1722        /* zero length can cause confusion */
1723        if (!length)
1724                length = 1;
1725
1726        if (length > RB_MAX_SMALL_DATA)
1727                length += sizeof(event.array[0]);
1728
1729        length += RB_EVNT_HDR_SIZE;
1730        length = ALIGN(length, RB_ALIGNMENT);
1731
1732        return length;
1733}
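
/*
 * Worked example of the math above, assuming 4-byte RB_ALIGNMENT and
 * a 4-byte event header: a 3-byte payload becomes 3 + 4 = 7, aligned
 * up to 8 bytes.  A 200-byte payload exceeds RB_MAX_SMALL_DATA, so it
 * gains an extra array[0] length slot: 200 + 4 + 4 = 208 bytes, which
 * is already aligned.
 */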
1734
1735static inline void
1736rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1737              struct buffer_page *tail_page,
1738              unsigned long tail, unsigned long length)
1739{
1740        struct ring_buffer_event *event;
1741
1742        /*
1743         * Only the event that crossed the page boundary
1744         * must fill the old tail_page with padding.
1745         */
1746        if (tail >= BUF_PAGE_SIZE) {
1747                local_sub(length, &tail_page->write);
1748                return;
1749        }
1750
1751        event = __rb_page_index(tail_page, tail);
1752        kmemcheck_annotate_bitfield(event, bitfield);
1753
1754        /*
1755         * If this event is bigger than the minimum size, then
1756         * we need to be careful that we don't subtract the
1757         * write counter enough to allow another writer to slip
1758         * in on this page.
1759         * We put in a discarded commit instead, to make sure
1760         * that this space is not used again.
1761         *
1762         * If we are less than the minimum size, we don't need to
1763         * worry about it.
1764         */
1765        if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
1766                /* No room for any events */
1767
1768                /* Mark the rest of the page with padding */
1769                rb_event_set_padding(event);
1770
1771                /* Set the write back to the previous setting */
1772                local_sub(length, &tail_page->write);
1773                return;
1774        }
1775
1776        /* Put in a discarded event */
1777        event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
1778        event->type_len = RINGBUF_TYPE_PADDING;
1779        /* time delta must be non zero */
1780        event->time_delta = 1;
1781
1782        /* Set write to end of buffer */
1783        length = (tail + length) - BUF_PAGE_SIZE;
1784        local_sub(length, &tail_page->write);
1785}
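
/*
 * A worked example of the reset above, assuming a 4096-byte
 * BUF_PAGE_SIZE: a 64-byte event reserved at tail == 4080 pushes
 * write to 4144 and crosses the page boundary.  The 16 bytes left on
 * the old page become a padding event (array[0] = 16 - 4 header
 * bytes = 12), and write is pulled back by 4144 - 4096 = 48 so the
 * page ends exactly at BUF_PAGE_SIZE.  Nested writers that landed
 * entirely past the boundary take the tail >= BUF_PAGE_SIZE branch
 * and simply subtract their own lengths.
 */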
1786
1787static struct ring_buffer_event *
1788rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1789             unsigned long length, unsigned long tail,
1790             struct buffer_page *commit_page,
1791             struct buffer_page *tail_page, u64 *ts)
1792{
1793        struct ring_buffer *buffer = cpu_buffer->buffer;
1794        struct buffer_page *next_page;
1795        int ret;
1796
1797        next_page = tail_page;
1798
1799        rb_inc_page(cpu_buffer, &next_page);
1800
1801        /*
1802         * If for some reason, we had an interrupt storm that made
1803         * it all the way around the buffer, bail, and warn
1804         * about it.
1805         */
1806        if (unlikely(next_page == commit_page)) {
1807                local_inc(&cpu_buffer->commit_overrun);
1808                goto out_reset;
1809        }
1810
1811        /*
1812         * This is where the fun begins!
1813         *
1814         * We are fighting against races between a reader that
1815         * could be on another CPU trying to swap its reader
1816         * page with the buffer head.
1817         *
1818         * We are also fighting against interrupts coming in and
1819         * moving the head or tail on us as well.
1820         *
1821         * If the next page is the head page then we have filled
1822         * the buffer, unless the commit page is still on the
1823         * reader page.
1824         */
1825        if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
1826
1827                /*
1828                 * If the commit is not on the reader page, then
1829                 * move the head page.
1830                 */
1831                if (!rb_is_reader_page(cpu_buffer->commit_page)) {
1832                        /*
1833                         * If we are not in overwrite mode,
1834                         * this is easy, just stop here.
1835                         */
1836                        if (!(buffer->flags & RB_FL_OVERWRITE))
1837                                goto out_reset;
1838
1839                        ret = rb_handle_head_page(cpu_buffer,
1840                                                  tail_page,
1841                                                  next_page);
1842                        if (ret < 0)
1843                                goto out_reset;
1844                        if (ret)
1845                                goto out_again;
1846                } else {
1847                        /*
1848                         * We need to be careful here too. The
1849                         * commit page could still be on the reader
1850                         * page. We could have a small buffer, and
1851                         * have filled up the buffer with events
1852                         * from interrupts and such, and wrapped.
1853                         *
1854         * Note, if the tail page is also on the
1855                         * reader_page, we let it move out.
1856                         */
1857                        if (unlikely((cpu_buffer->commit_page !=
1858                                      cpu_buffer->tail_page) &&
1859                                     (cpu_buffer->commit_page ==
1860                                      cpu_buffer->reader_page))) {
1861                                local_inc(&cpu_buffer->commit_overrun);
1862                                goto out_reset;
1863                        }
1864                }
1865        }
1866
1867        ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
1868        if (ret) {
1869                /*
1870                 * Nested commits always have zero deltas, so
1871                 * just reread the time stamp
1872                 */
1873                *ts = rb_time_stamp(buffer);
1874                next_page->page->time_stamp = *ts;
1875        }
1876
1877 out_again:
1878
1879        rb_reset_tail(cpu_buffer, tail_page, tail, length);
1880
1881        /* fail and let the caller try again */
1882        return ERR_PTR(-EAGAIN);
1883
1884 out_reset:
1885        /* reset write */
1886        rb_reset_tail(cpu_buffer, tail_page, tail, length);
1887
1888        return NULL;
1889}
1890
1891static struct ring_buffer_event *
1892__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1893                  unsigned type, unsigned long length, u64 *ts)
1894{
1895        struct buffer_page *tail_page, *commit_page;
1896        struct ring_buffer_event *event;
1897        unsigned long tail, write;
1898
1899        commit_page = cpu_buffer->commit_page;
1900        /* we just need to protect against interrupts */
1901        barrier();
1902        tail_page = cpu_buffer->tail_page;
1903        write = local_add_return(length, &tail_page->write);
1904
1905        /* set write to only the index of the write */
1906        write &= RB_WRITE_MASK;
1907        tail = write - length;
1908
1909        /* See if we shot past the end of this buffer page */
1910        if (write > BUF_PAGE_SIZE)
1911                return rb_move_tail(cpu_buffer, length, tail,
1912                                    commit_page, tail_page, ts);
1913
1914        /* We reserved something on the buffer */
1915
1916        event = __rb_page_index(tail_page, tail);
1917        kmemcheck_annotate_bitfield(event, bitfield);
1918        rb_update_event(event, type, length);
1919
1920        /* The passed in type is zero for DATA */
1921        if (likely(!type))
1922                local_inc(&tail_page->entries);
1923
1924        /*
1925         * If this is the first commit on the page, then update
1926         * its timestamp.
1927         */
1928        if (!tail)
1929                tail_page->page->time_stamp = *ts;
1930
1931        return event;
1932}
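
/*
 * Index math sketch for the reservation above: local_add_return()
 * reserves space and returns the new offset in one atomic step, so
 * nested-interrupt writers each get a disjoint [tail, write) range.
 * For example, with a previous write offset of 96 and length == 32,
 * write becomes 128 and tail = 128 - 32 = 96: the event starts at
 * byte 96 of the page.
 */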
1933
1934static inline int
1935rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1936                  struct ring_buffer_event *event)
1937{
1938        unsigned long new_index, old_index;
1939        struct buffer_page *bpage;
1940        unsigned long index;
1941        unsigned long addr;
1942
1943        new_index = rb_event_index(event);
1944        old_index = new_index + rb_event_length(event);
1945        addr = (unsigned long)event;
1946        addr &= PAGE_MASK;
1947
1948        bpage = cpu_buffer->tail_page;
1949
1950        if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1951                unsigned long write_mask =
1952                        local_read(&bpage->write) & ~RB_WRITE_MASK;
1953                /*
1954                 * This is on the tail page. It is possible that
1955                 * a write could come in and move the tail page
1956                 * and write to the next page. That is fine
1957                 * because we just shorten what is on this page.
1958                 */
1959                old_index += write_mask;
1960                new_index += write_mask;
1961                index = local_cmpxchg(&bpage->write, old_index, new_index);
1962                if (index == old_index)
1963                        return 1;
1964        }
1965
1966        /* could not discard */
1967        return 0;
1968}
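
/*
 * The discard above is a "pop only if still on top" pattern: the
 * write index is rewound with cmpxchg, which can only succeed if no
 * later writer has reserved space behind us.  A minimal stand-alone
 * sketch of the same idea (hypothetical helper, for illustration):
 */
static inline int example_unreserve(local_t *write, unsigned long start,
				    unsigned long end)
{
	/* rewind [start, end) only if it is still the last reservation */
	return local_cmpxchg(write, end, start) == end;
}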
1969
1970static int
1971rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1972                  u64 *ts, u64 *delta)
1973{
1974        struct ring_buffer_event *event;
1975        static int once;
1976        int ret;
1977
1978        if (unlikely(*delta > (1ULL << 59) && !once++)) {
1979                printk(KERN_WARNING "Delta way too big! %llu"
1980                       " ts=%llu write stamp = %llu\n",
1981                       (unsigned long long)*delta,
1982                       (unsigned long long)*ts,
1983                       (unsigned long long)cpu_buffer->write_stamp);
1984                WARN_ON(1);
1985        }
1986
1987        /*
1988         * The delta is too big; we need to add a
1989         * new timestamp.
1990         */
1991        event = __rb_reserve_next(cpu_buffer,
1992                                  RINGBUF_TYPE_TIME_EXTEND,
1993                                  RB_LEN_TIME_EXTEND,
1994                                  ts);
1995        if (!event)
1996                return -EBUSY;
1997
1998        if (PTR_ERR(event) == -EAGAIN)
1999                return -EAGAIN;
2000
2001        /* Only a committed time event can update the write stamp */
2002        if (rb_event_is_commit(cpu_buffer, event)) {
2003                /*
2004                 * If this is the first on the page, then it was
2005                 * updated with the page itself. Try to discard it
2006                 * and if we can't, just make it zero.
2007                 */
2008                if (rb_event_index(event)) {
2009                        event->time_delta = *delta & TS_MASK;
2010                        event->array[0] = *delta >> TS_SHIFT;
2011                } else {
2012                        /* try to discard, since we do not need this */
2013                        if (!rb_try_to_discard(cpu_buffer, event)) {
2014                                /* nope, just zero it */
2015                                event->time_delta = 0;
2016                                event->array[0] = 0;
2017                        }
2018                }
2019                cpu_buffer->write_stamp = *ts;
2020                /* let the caller know this was the commit */
2021                ret = 1;
2022        } else {
2023                /* Try to discard the event */
2024                if (!rb_try_to_discard(cpu_buffer, event)) {
2025                        /* Darn, this is just wasted space */
2026                        event->time_delta = 0;
2027                        event->array[0] = 0;
2028                }
2029                ret = 0;
2030        }
2031
2032        *delta = 0;
2033
2034        return ret;
2035}
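
/*
 * Sketch of the split performed above: a delta too large for the
 * 27-bit time_delta field becomes a TIME_EXTEND event that carries
 * the low TS_SHIFT bits in time_delta and the upper bits in array[0].
 * The reader side rebuilds it, as in rb_update_read_stamp() below:
 *
 *	delta = ((u64)event->array[0] << TS_SHIFT) + event->time_delta;
 */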
2036
2037static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2038{
2039        local_inc(&cpu_buffer->committing);
2040        local_inc(&cpu_buffer->commits);
2041}
2042
2043static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2044{
2045        unsigned long commits;
2046
2047        if (RB_WARN_ON(cpu_buffer,
2048                       !local_read(&cpu_buffer->committing)))
2049                return;
2050
2051 again:
2052        commits = local_read(&cpu_buffer->commits);
2053        /* synchronize with interrupts */
2054        barrier();
2055        if (local_read(&cpu_buffer->committing) == 1)
2056                rb_set_commit_to_write(cpu_buffer);
2057
2058        local_dec(&cpu_buffer->committing);
2059
2060        /* synchronize with interrupts */
2061        barrier();
2062
2063        /*
2064         * Need to account for interrupts coming in between the
2065         * updating of the commit page and the clearing of the
2066         * committing counter.
2067         */
2068        if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
2069            !local_read(&cpu_buffer->committing)) {
2070                local_inc(&cpu_buffer->committing);
2071                goto again;
2072        }
2073}
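
/*
 * The committing/commits pair above nests much like a recursive lock:
 * every writer, including one running from an interrupt that fired
 * mid-commit, brackets its work with rb_start_commit()/rb_end_commit(),
 * and only the outermost writer, the one that reads committing == 1,
 * publishes the commit pointer.  Schematically:
 *
 *	rb_start_commit(cpu_buffer);	committing: 0 -> 1
 *	    (interrupt)
 *	    rb_start_commit(cpu_buffer);	committing: 1 -> 2
 *	    rb_end_commit(cpu_buffer);		committing: 2 -> 1, no publish
 *	rb_end_commit(cpu_buffer);	committing: 1 -> 0, publishes the commit
 */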
2074
2075static struct ring_buffer_event *
2076rb_reserve_next_event(struct ring_buffer *buffer,
2077                      struct ring_buffer_per_cpu *cpu_buffer,
2078                      unsigned long length)
2079{
2080        struct ring_buffer_event *event;
2081        u64 ts, delta = 0;
2082        int commit = 0;
2083        int nr_loops = 0;
2084
2085        rb_start_commit(cpu_buffer);
2086
2087#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2088        /*
2089         * Due to the ability to swap a cpu buffer from a buffer
2090         * it is possible it was swapped before we committed.
2091         * (committing stops a swap). We check for it here and
2092         * if it happened, we have to fail the write.
2093         */
2094        barrier();
2095        if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2096                local_dec(&cpu_buffer->committing);
2097                local_dec(&cpu_buffer->commits);
2098                return NULL;
2099        }
2100#endif
2101
2102        length = rb_calculate_event_length(length);
2103 again:
2104        /*
2105         * We allow for interrupts to reenter here and do a trace.
2106         * If one does, it will cause this original code to loop
2107         * back here. Even with heavy interrupts happening, this
2108         * should only happen a few times in a row. If this happens
2109         * 1000 times in a row, there must be either an interrupt
2110         * storm or we have something buggy.
2111         * Bail!
2112         */
2113        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
2114                goto out_fail;
2115
2116        ts = rb_time_stamp(cpu_buffer->buffer);
2117
2118        /*
2119         * Only the first commit can update the timestamp.
2120         * Yes there is a race here. If an interrupt comes in
2121         * just after the conditional and it traces too, then it
2122         * will also check the deltas. More than one timestamp may
2123         * also be made. But only the entry that did the actual
2124         * commit will be something other than zero.
2125         */
2126        if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2127                   rb_page_write(cpu_buffer->tail_page) ==
2128                   rb_commit_index(cpu_buffer))) {
2129                u64 diff;
2130
2131                diff = ts - cpu_buffer->write_stamp;
2132
2133                /* make sure this diff is calculated here */
2134                barrier();
2135
2136                /* Did the write stamp get updated already? */
2137                if (unlikely(ts < cpu_buffer->write_stamp))
2138                        goto get_event;
2139
2140                delta = diff;
2141                if (unlikely(test_time_stamp(delta))) {
2142
2143                        commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
2144                        if (commit == -EBUSY)
2145                                goto out_fail;
2146
2147                        if (commit == -EAGAIN)
2148                                goto again;
2149
2150                        RB_WARN_ON(cpu_buffer, commit < 0);
2151                }
2152        }
2153
2154 get_event:
2155        event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
2156        if (unlikely(PTR_ERR(event) == -EAGAIN))
2157                goto again;
2158
2159        if (!event)
2160                goto out_fail;
2161
2162        if (!rb_event_is_commit(cpu_buffer, event))
2163                delta = 0;
2164
2165        event->time_delta = delta;
2166
2167        return event;
2168
2169 out_fail:
2170        rb_end_commit(cpu_buffer);
2171        return NULL;
2172}
2173
2174#ifdef CONFIG_TRACING
2175
2176#define TRACE_RECURSIVE_DEPTH 16
2177
2178static int trace_recursive_lock(void)
2179{
2180        current->trace_recursion++;
2181
2182        if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2183                return 0;
2184
2185        /* Disable all tracing before we do anything else */
2186        tracing_off_permanent();
2187
2188        printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2189                    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2190                    current->trace_recursion,
2191                    hardirq_count() >> HARDIRQ_SHIFT,
2192                    softirq_count() >> SOFTIRQ_SHIFT,
2193                    in_nmi());
2194
2195        WARN_ON_ONCE(1);
2196        return -1;
2197}
2198
2199static void trace_recursive_unlock(void)
2200{
2201        WARN_ON_ONCE(!current->trace_recursion);
2202
2203        current->trace_recursion--;
2204}
2205
2206#else
2207
2208#define trace_recursive_lock()          (0)
2209#define trace_recursive_unlock()        do { } while (0)
2210
2211#endif
2212
2213static DEFINE_PER_CPU(int, rb_need_resched);
2214
2215/**
2216 * ring_buffer_lock_reserve - reserve a part of the buffer
2217 * @buffer: the ring buffer to reserve from
2218 * @length: the length of the data to reserve (excluding event header)
2219 *
2220 * Returns a reserved event on the ring buffer to copy directly to.
2221 * The user of this interface will need to get the body to write into
2222 * and can use the ring_buffer_event_data() interface.
2223 *
2224 * The length is the length of the data needed, not the event length
2225 * which also includes the event header.
2226 *
2227 * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
2228 * If NULL is returned, then nothing has been allocated or locked.
2229 */
2230struct ring_buffer_event *
2231ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2232{
2233        struct ring_buffer_per_cpu *cpu_buffer;
2234        struct ring_buffer_event *event;
2235        int cpu, resched;
2236
2237        if (ring_buffer_flags != RB_BUFFERS_ON)
2238                return NULL;
2239
2240        if (atomic_read(&buffer->record_disabled))
2241                return NULL;
2242
2243        /* If we are tracing schedule, we don't want to recurse */
2244        resched = ftrace_preempt_disable();
2245
2246        if (trace_recursive_lock())
2247                goto out_nocheck;
2248
2249        cpu = raw_smp_processor_id();
2250
2251        if (!cpumask_test_cpu(cpu, buffer->cpumask))
2252                goto out;
2253
2254        cpu_buffer = buffer->buffers[cpu];
2255
2256        if (atomic_read(&cpu_buffer->record_disabled))
2257                goto out;
2258
2259        if (length > BUF_MAX_DATA_SIZE)
2260                goto out;
2261
2262        event = rb_reserve_next_event(buffer, cpu_buffer, length);
2263        if (!event)
2264                goto out;
2265
2266        /*
2267         * Need to store resched state on this cpu.
2268         * Only the first needs to.
2269         */
2270
2271        if (preempt_count() == 1)
2272                per_cpu(rb_need_resched, cpu) = resched;
2273
2274        return event;
2275
2276 out:
2277        trace_recursive_unlock();
2278
2279 out_nocheck:
2280        ftrace_preempt_enable(resched);
2281        return NULL;
2282}
2283EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
2284
2285static void
2286rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2287                      struct ring_buffer_event *event)
2288{
2289        /*
2290         * The event first in the commit queue updates the
2291         * time stamp.
2292         */
2293        if (rb_event_is_commit(cpu_buffer, event))
2294                cpu_buffer->write_stamp += event->time_delta;
2295}
2296
2297static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2298                      struct ring_buffer_event *event)
2299{
2300        local_inc(&cpu_buffer->entries);
2301        rb_update_write_stamp(cpu_buffer, event);
2302        rb_end_commit(cpu_buffer);
2303}
2304
2305/**
2306 * ring_buffer_unlock_commit - commit a reserved event
2307 * @buffer: The buffer to commit to
2308 * @event: The event pointer to commit.
2309 *
2310 * This commits the data to the ring buffer, and releases any locks held.
2311 *
2312 * Must be paired with ring_buffer_lock_reserve.
2313 */
2314int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2315                              struct ring_buffer_event *event)
2316{
2317        struct ring_buffer_per_cpu *cpu_buffer;
2318        int cpu = raw_smp_processor_id();
2319
2320        cpu_buffer = buffer->buffers[cpu];
2321
2322        rb_commit(cpu_buffer, event);
2323
2324        trace_recursive_unlock();
2325
2326        /*
2327         * Only the last preempt count needs to restore preemption.
2328         */
2329        if (preempt_count() == 1)
2330                ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2331        else
2332                preempt_enable_no_resched_notrace();
2333
2334        return 0;
2335}
2336EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
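
/*
 * A minimal usage sketch of the reserve/commit pair, assuming a
 * caller-defined payload struct; everything here besides the two
 * ring buffer calls and ring_buffer_event_data() is hypothetical:
 */
struct example_payload {
	unsigned long ip;
	unsigned long parent_ip;
};

static inline int example_trace(struct ring_buffer *buffer,
				unsigned long ip, unsigned long parent_ip)
{
	struct ring_buffer_event *event;
	struct example_payload *entry;

	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return -EBUSY;	/* buffer full or recording disabled */

	entry = ring_buffer_event_data(event);
	entry->ip = ip;
	entry->parent_ip = parent_ip;

	return ring_buffer_unlock_commit(buffer, event);
}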
2337
2338static inline void rb_event_discard(struct ring_buffer_event *event)
2339{
2340        /* array[0] holds the actual length for the discarded event */
2341        event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2342        event->type_len = RINGBUF_TYPE_PADDING;
2343        /* time delta must be non zero */
2344        if (!event->time_delta)
2345                event->time_delta = 1;
2346}
2347
2348/*
2349 * Decrement the entries to the page that an event is on.
2350 * The event does not even need to exist, only the pointer
2351 * to the page it is on. This may only be called before the commit
2352 * takes place.
2353 */
2354static inline void
2355rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
2356                   struct ring_buffer_event *event)
2357{
2358        unsigned long addr = (unsigned long)event;
2359        struct buffer_page *bpage = cpu_buffer->commit_page;
2360        struct buffer_page *start;
2361
2362        addr &= PAGE_MASK;
2363
2364        /* Do the likely case first */
2365        if (likely(bpage->page == (void *)addr)) {
2366                local_dec(&bpage->entries);
2367                return;
2368        }
2369
2370        /*
2371         * Because the commit page may be on the reader page, we
2372         * start with the next page and check for the end of the loop there.
2373         */
2374        rb_inc_page(cpu_buffer, &bpage);
2375        start = bpage;
2376        do {
2377                if (bpage->page == (void *)addr) {
2378                        local_dec(&bpage->entries);
2379                        return;
2380                }
2381                rb_inc_page(cpu_buffer, &bpage);
2382        } while (bpage != start);
2383
2384        /* commit not part of this buffer?? */
2385        RB_WARN_ON(cpu_buffer, 1);
2386}
2387
2388/**
2389 * ring_buffer_commit_discard - discard an event that has not been committed
2390 * @buffer: the ring buffer
2391 * @event: non committed event to discard
2392 *
2393 * Sometimes an event that is in the ring buffer needs to be ignored.
2394 * This function lets the user discard an event in the ring buffer
2395 * and then that event will not be read later.
2396 *
2397 * This function only works if it is called before the item has been
2398 * committed. It will try to free the event from the ring buffer
2399 * if another event has not been added behind it.
2400 *
2401 * If another event has been added behind it, it will set the event
2402 * up as discarded, and perform the commit.
2403 *
2404 * If this function is called, do not call ring_buffer_unlock_commit on
2405 * the event.
2406 */
2407void ring_buffer_discard_commit(struct ring_buffer *buffer,
2408                                struct ring_buffer_event *event)
2409{
2410        struct ring_buffer_per_cpu *cpu_buffer;
2411        int cpu;
2412
2413        /* The event is discarded regardless */
2414        rb_event_discard(event);
2415
2416        cpu = smp_processor_id();
2417        cpu_buffer = buffer->buffers[cpu];
2418
2419        /*
2420         * This must only be called if the event has not been
2421         * committed yet. Thus we can assume that preemption
2422         * is still disabled.
2423         */
2424        RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
2425
2426        rb_decrement_entry(cpu_buffer, event);
2427        if (rb_try_to_discard(cpu_buffer, event))
2428                goto out;
2429
2430        /*
2431         * The commit is still visible to the reader, so we
2432         * must still update the timestamp.
2433         */
2434        rb_update_write_stamp(cpu_buffer, event);
2435 out:
2436        rb_end_commit(cpu_buffer);
2437
2438        trace_recursive_unlock();
2439
2440        /*
2441         * Only the last preempt count needs to restore preemption.
2442         */
2443        if (preempt_count() == 1)
2444                ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2445        else
2446                preempt_enable_no_resched_notrace();
2447
2448}
2449EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
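
/*
 * A sketch of the reserve-then-discard flow described above, e.g. for
 * a filter that rejects an event only after filling it in
 * (example_filter_match() and fill_entry() are hypothetical):
 *
 *	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
 *	if (!event)
 *		return -EBUSY;
 *	entry = ring_buffer_event_data(event);
 *	fill_entry(entry);
 *	if (!example_filter_match(entry))
 *		ring_buffer_discard_commit(buffer, event);
 *	else
 *		ring_buffer_unlock_commit(buffer, event);
 */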
2450
2451/**
2452 * ring_buffer_write - write data to the buffer without reserving
2453 * @buffer: The ring buffer to write to.
2454 * @length: The length of the data being written (excluding the event header)
2455 * @data: The data to write to the buffer.
2456 *
2457 * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
2458 * one function. If you already have the data to write to the buffer, it
2459 * may be easier to simply call this function.
2460 *
2461 * Note, like ring_buffer_lock_reserve, the length is the length of the data
2462 * and not the length of the event which would hold the header.
2463 */
2464int ring_buffer_write(struct ring_buffer *buffer,
2465                        unsigned long length,
2466                        void *data)
2467{
2468        struct ring_buffer_per_cpu *cpu_buffer;
2469        struct ring_buffer_event *event;
2470        void *body;
2471        int ret = -EBUSY;
2472        int cpu, resched;
2473
2474        if (ring_buffer_flags != RB_BUFFERS_ON)
2475                return -EBUSY;
2476
2477        if (atomic_read(&buffer->record_disabled))
2478                return -EBUSY;
2479
2480        resched = ftrace_preempt_disable();
2481
2482        cpu = raw_smp_processor_id();
2483
2484        if (!cpumask_test_cpu(cpu, buffer->cpumask))
2485                goto out;
2486
2487        cpu_buffer = buffer->buffers[cpu];
2488
2489        if (atomic_read(&cpu_buffer->record_disabled))
2490                goto out;
2491
2492        if (length > BUF_MAX_DATA_SIZE)
2493                goto out;
2494
2495        event = rb_reserve_next_event(buffer, cpu_buffer, length);
2496        if (!event)
2497                goto out;
2498
2499        body = rb_event_data(event);
2500
2501        memcpy(body, data, length);
2502
2503        rb_commit(cpu_buffer, event);
2504
2505        ret = 0;
2506 out:
2507        ftrace_preempt_enable(resched);
2508
2509        return ret;
2510}
2511EXPORT_SYMBOL_GPL(ring_buffer_write);
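
/*
 * Usage sketch: when the data already exists in memory, this one-call
 * form replaces the reserve/copy/commit dance (payload is assumed to
 * be a caller-owned buffer):
 *
 *	char payload[] = "hello";
 *
 *	if (ring_buffer_write(buffer, sizeof(payload), payload))
 *		return -EBUSY;
 */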
2512
2513static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
2514{
2515        struct buffer_page *reader = cpu_buffer->reader_page;
2516        struct buffer_page *head = rb_set_head_page(cpu_buffer);
2517        struct buffer_page *commit = cpu_buffer->commit_page;
2518
2519        /* In case of error, head will be NULL */
2520        if (unlikely(!head))
2521                return 1;
2522
2523        return reader->read == rb_page_commit(reader) &&
2524                (commit == reader ||
2525                 (commit == head &&
2526                  head->read == rb_page_commit(commit)));
2527}
2528
2529/**
2530 * ring_buffer_record_disable - stop all writes into the buffer
2531 * @buffer: The ring buffer to stop writes to.
2532 *
2533 * This prevents all writes to the buffer. Any attempt to write
2534 * to the buffer after this will fail and return NULL.
2535 *
2536 * The caller should call synchronize_sched() after this.
2537 */
2538void ring_buffer_record_disable(struct ring_buffer *buffer)
2539{
2540        atomic_inc(&buffer->record_disabled);
2541}
2542EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
2543
2544/**
2545 * ring_buffer_record_enable - enable writes to the buffer
2546 * @buffer: The ring buffer to enable writes
2547 *
2548 * Note, multiple disables will need the same number of enables
2549 * to truly enable the writing (much like preempt_disable).
2550 */
2551void ring_buffer_record_enable(struct ring_buffer *buffer)
2552{
2553        atomic_dec(&buffer->record_disabled);
2554}
2555EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
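
/*
 * Because the disable/enable pair nests by counting, a safe
 * "stop writers, then inspect" sequence looks like this sketch:
 *
 *	ring_buffer_record_disable(buffer);
 *	synchronize_sched();	- wait for in-flight writers to finish
 *	... read or drain the buffer ...
 *	ring_buffer_record_enable(buffer);
 */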
2556
2557/**
2558 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
2559 * @buffer: The ring buffer to stop writes to.
2560 * @cpu: The CPU buffer to stop
2561 *
2562 * This prevents all writes to the buffer. Any attempt to write
2563 * to the buffer after this will fail and return NULL.
2564 *
2565 * The caller should call synchronize_sched() after this.
2566 */
2567void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
2568{
2569        struct ring_buffer_per_cpu *cpu_buffer;
2570
2571        if (!cpumask_test_cpu(cpu, buffer->cpumask))
2572                return;
2573
2574        cpu_buffer = buffer->buffers[cpu];
2575        atomic_inc(&cpu_buffer->record_disabled);
2576}
2577EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
2578
2579/**
2580 * ring_buffer_record_enable_cpu - enable writes to the buffer
2581 * @buffer: The ring buffer to enable writes
2582 * @cpu: The CPU to enable.
2583 *
2584 * Note, multiple disables will need the same number of enables
2585 * to truly enable the writing (much like preempt_disable).
2586 */
2587void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2588{
2589        struct ring_buffer_per_cpu *cpu_buffer;
2590
2591        if (!cpumask_test_cpu(cpu, buffer->cpumask))
2592                return;
2593
2594        cpu_buffer = buffer->buffers[cpu];
2595        atomic_dec(&cpu_buffer->record_disabled);
2596}
2597EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2598
2599/**
2600 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2601 * @buffer: The ring buffer
2602 * @cpu: The per CPU buffer to get the entries from.
2603 */
2604unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2605{
2606        struct ring_buffer_per_cpu *cpu_buffer;
2607        unsigned long ret;
2608
2609        if (!cpumask_test_cpu(cpu, buffer->cpumask))
2610                return 0;
2611
2612        cpu_buffer = buffer->buffers[cpu];
2613        ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2614                - cpu_buffer->read;
2615
2616        return ret;
2617}
2618EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2619
2620/**
2621 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
2622 * @buffer: The ring buffer
2623 * @cpu: The per CPU buffer to get the number of overruns from
2624 */
2625unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
2626{
2627        struct ring_buffer_per_cpu *cpu_buffer;
2628        unsigned long ret;
2629
2630        if (!cpumask_test_cpu(cpu, buffer->cpumask))
2631                return 0;
2632
2633        cpu_buffer = buffer->buffers[cpu];
2634        ret = local_read(&cpu_buffer->overrun);
2635
2636        return ret;
2637}
2638EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
2639
2640/**
2641 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
2642 * @buffer: The ring buffer
2643 * @cpu: The per CPU buffer to get the number of overruns from
2644 */
2645unsigned long
2646ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
2647{
2648        struct ring_buffer_per_cpu *cpu_buffer;
2649        unsigned long ret;
2650
2651        if (!cpumask_test_cpu(cpu, buffer->cpumask))
2652                return 0;
2653
2654        cpu_buffer = buffer->buffers[cpu];
2655        ret = local_read(&cpu_buffer->commit_overrun);
2656
2657        return ret;
2658}
2659EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
2660
2661/**
2662 * ring_buffer_entries - get the number of entries in a buffer
2663 * @buffer: The ring buffer
2664 *
2665 * Returns the total number of entries in the ring buffer
2666 * (all CPU entries)
2667 */
2668unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2669{
2670        struct ring_buffer_per_cpu *cpu_buffer;
2671        unsigned long entries = 0;
2672        int cpu;
2673
2674        /* if you care about this being correct, lock the buffer */
2675        for_each_buffer_cpu(buffer, cpu) {
2676                cpu_buffer = buffer->buffers[cpu];
2677                entries += (local_read(&cpu_buffer->entries) -
2678                            local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2679        }
2680
2681        return entries;
2682}
2683EXPORT_SYMBOL_GPL(ring_buffer_entries);
2684
2685/**
2686 * ring_buffer_overruns - get the number of overruns in buffer
2687 * @buffer: The ring buffer
2688 *
2689 * Returns the total number of overruns in the ring buffer
2690 * (all CPU entries)
2691 */
2692unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
2693{
2694        struct ring_buffer_per_cpu *cpu_buffer;
2695        unsigned long overruns = 0;
2696        int cpu;
2697
2698        /* if you care about this being correct, lock the buffer */
2699        for_each_buffer_cpu(buffer, cpu) {
2700                cpu_buffer = buffer->buffers[cpu];
2701                overruns += local_read(&cpu_buffer->overrun);
2702        }
2703
2704        return overruns;
2705}
2706EXPORT_SYMBOL_GPL(ring_buffer_overruns);
2707
2708static void rb_iter_reset(struct ring_buffer_iter *iter)
2709{
2710        struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2711
2712        /* Iterator usage is expected to have record disabled */
2713        if (list_empty(&cpu_buffer->reader_page->list)) {
2714                iter->head_page = rb_set_head_page(cpu_buffer);
2715                if (unlikely(!iter->head_page))
2716                        return;
2717                iter->head = iter->head_page->read;
2718        } else {
2719                iter->head_page = cpu_buffer->reader_page;
2720                iter->head = cpu_buffer->reader_page->read;
2721        }
2722        if (iter->head)
2723                iter->read_stamp = cpu_buffer->read_stamp;
2724        else
2725                iter->read_stamp = iter->head_page->page->time_stamp;
2726}
2727
2728/**
2729 * ring_buffer_iter_reset - reset an iterator
2730 * @iter: The iterator to reset
2731 *
2732 * Resets the iterator, so that it will start from the beginning
2733 * again.
2734 */
2735void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
2736{
2737        struct ring_buffer_per_cpu *cpu_buffer;
2738        unsigned long flags;
2739
2740        if (!iter)
2741                return;
2742
2743        cpu_buffer = iter->cpu_buffer;
2744
2745        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2746        rb_iter_reset(iter);
2747        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2748}
2749EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
2750
2751/**
2752 * ring_buffer_iter_empty - check if an iterator has no more to read
2753 * @iter: The iterator to check
2754 */
2755int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
2756{
2757        struct ring_buffer_per_cpu *cpu_buffer;
2758
2759        cpu_buffer = iter->cpu_buffer;
2760
2761        return iter->head_page == cpu_buffer->commit_page &&
2762                iter->head == rb_commit_index(cpu_buffer);
2763}
2764EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
2765
2766static void
2767rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2768                     struct ring_buffer_event *event)
2769{
2770        u64 delta;
2771
2772        switch (event->type_len) {
2773        case RINGBUF_TYPE_PADDING:
2774                return;
2775
2776        case RINGBUF_TYPE_TIME_EXTEND:
2777                delta = event->array[0];
2778                delta <<= TS_SHIFT;
2779                delta += event->time_delta;
2780                cpu_buffer->read_stamp += delta;
2781                return;
2782
2783        case RINGBUF_TYPE_TIME_STAMP:
2784                /* FIXME: not implemented */
2785                return;
2786
2787        case RINGBUF_TYPE_DATA:
2788                cpu_buffer->read_stamp += event->time_delta;
2789                return;
2790
2791        default:
2792                BUG();
2793        }
2794        return;
2795}
2796
2797static void
2798rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
2799                          struct ring_buffer_event *event)
2800{
2801        u64 delta;
2802
2803        switch (event->type_len) {
2804        case RINGBUF_TYPE_PADDING:
2805                return;
2806
2807        case RINGBUF_TYPE_TIME_EXTEND:
2808                delta = event->array[0];
2809                delta <<= TS_SHIFT;
2810                delta += event->time_delta;
2811                iter->read_stamp += delta;
2812                return;
2813
2814        case RINGBUF_TYPE_TIME_STAMP:
2815                /* FIXME: not implemented */
2816                return;
2817
2818        case RINGBUF_TYPE_DATA:
2819                iter->read_stamp += event->time_delta;
2820                return;
2821
2822        default:
2823                BUG();
2824        }
2825        return;
2826}
2827
2828static struct buffer_page *
2829rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2830{
2831        struct buffer_page *reader = NULL;
2832        unsigned long flags;
2833        int nr_loops = 0;
2834        int ret;
2835
2836        local_irq_save(flags);
2837        __raw_spin_lock(&cpu_buffer->lock);
2838
2839 again:
2840        /*
2841         * This should normally only loop twice. But because the
2842         * start of the reader inserts an empty page, it causes
2843         * a case where we will loop three times. There should be no
2844         * reason to loop four times (that I know of).
2845         */
2846        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
2847                reader = NULL;
2848                goto out;
2849        }
2850
2851        reader = cpu_buffer->reader_page;
2852
2853        /* If there's more to read, return this page */
2854        if (cpu_buffer->reader_page->read < rb_page_size(reader))
2855                goto out;
2856
2857        /* Never should we have an index greater than the size */
2858        if (RB_WARN_ON(cpu_buffer,
2859                       cpu_buffer->reader_page->read > rb_page_size(reader)))
2860                goto out;
2861
2862        /* check if we caught up to the tail */
2863        reader = NULL;
2864        if (cpu_buffer->commit_page == cpu_buffer->reader_page)
2865                goto out;
2866
2867        /*
2868         * Reset the reader page to size zero.
2869         */
2870        local_set(&cpu_buffer->reader_page->write, 0);
2871        local_set(&cpu_buffer->reader_page->entries, 0);
2872        local_set(&cpu_buffer->reader_page->page->commit, 0);
2873
2874 spin:
2875        /*
2876         * Splice the empty reader page into the list around the head.
2877         */
2878        reader = rb_set_head_page(cpu_buffer);
2879        cpu_buffer->reader_page->list.next = reader->list.next;
2880        cpu_buffer->reader_page->list.prev = reader->list.prev;
2881
2882        /*
2883         * cpu_buffer->pages just needs to point to the buffer, it
2884         *  has no specific buffer page to point to. Let's move it out
2885         *  of our way so we don't accidentally swap it.
2886         */
2887        cpu_buffer->pages = reader->list.prev;
2888
2889        /* The reader page will be pointing to the new head */
2890        rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2891
2892        /*
2893         * Here's the tricky part.
2894         *
2895         * We need to move the pointer past the head page.
2896         * But we can only do that if a writer is not currently
2897         * moving it. The page before the head page has the
2898         * flag bit '1' set if it is pointing to the page we want.
2899         * But if the writer is in the process of moving it,
2900         * then it will be '2', or '0' if it has already moved.
2901         */
2902
2903        ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
2904
2905        /*
2906         * If we did not convert it, then we must try again.
2907         */
2908        if (!ret)
2909                goto spin;
2910
2911        /*
2912         * Yeah! We succeeded in replacing the page.
2913         *
2914         * Now make the new head point back to the reader page.
2915         */
2916        reader->list.next->prev = &cpu_buffer->reader_page->list;
2917        rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2918
2919        /* Finally update the reader page to the new head */
2920        cpu_buffer->reader_page = reader;
2921        rb_reset_reader_page(cpu_buffer);
2922
2923        goto again;
2924
2925 out:
2926        __raw_spin_unlock(&cpu_buffer->lock);
2927        local_irq_restore(flags);
2928
2929        return reader;
2930}
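
/*
 * Sketch of the pointer tagging that the swap above relies on: the
 * HEAD/UPDATE flags live in the low bits of the previous page's
 * ->next pointer, so readers and writers can claim the head page with
 * a single cmpxchg on that one word.  Conceptually, with the values
 * quoted in the comment above:
 *
 *	val = (unsigned long)prev->list.next;
 *	(val & 3UL) == 1UL	- next page is the head page (HEAD)
 *	(val & 3UL) == 2UL	- a writer is moving it (UPDATE)
 *	(val & 3UL) == 0UL	- it has already been moved (NORMAL)
 */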
2931
2932static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2933{
2934        struct ring_buffer_event *event;
2935        struct buffer_page *reader;
2936        unsigned length;
2937
2938        reader = rb_get_reader_page(cpu_buffer);
2939
2940        /* This function should not be called when buffer is empty */
2941        if (RB_WARN_ON(cpu_buffer, !reader))
2942                return;
2943
2944        event = rb_reader_event(cpu_buffer);
2945
2946        if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
2947                cpu_buffer->read++;
2948
2949        rb_update_read_stamp(cpu_buffer, event);
2950
2951        length = rb_event_length(event);
2952        cpu_buffer->reader_page->read += length;
2953}
2954
2955static void rb_advance_iter(struct ring_buffer_iter *iter)
2956{
2957        struct ring_buffer *buffer;
2958        struct ring_buffer_per_cpu *cpu_buffer;
2959        struct ring_buffer_event *event;
2960        unsigned length;
2961
2962        cpu_buffer = iter->cpu_buffer;
2963        buffer = cpu_buffer->buffer;
2964
2965        /*
2966         * Check if we are at the end of the buffer.
2967         */
2968        if (iter->head >= rb_page_size(iter->head_page)) {
2969                /* discarded commits can make the page empty */
2970                if (iter->head_page == cpu_buffer->commit_page)
2971                        return;
2972                rb_inc_iter(iter);
2973                return;
2974        }
2975
2976        event = rb_iter_head_event(iter);
2977
2978        length = rb_event_length(event);
2979
2980        /*
2981         * This should not be called to advance the header if we are
2982         * at the tail of the buffer.
2983         */
2984        if (RB_WARN_ON(cpu_buffer,
2985                       (iter->head_page == cpu_buffer->commit_page) &&
2986                       (iter->head + length > rb_commit_index(cpu_buffer))))
2987                return;
2988
2989        rb_update_iter_read_stamp(iter, event);
2990
2991        iter->head += length;
2992
2993        /* check for end of page padding */
2994        if ((iter->head >= rb_page_size(iter->head_page)) &&
2995            (iter->head_page != cpu_buffer->commit_page))
2996                rb_advance_iter(iter);
2997}
2998
2999static struct ring_buffer_event *
3000rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3001{
3002        struct ring_buffer_event *event;
3003        struct buffer_page *reader;
3004        int nr_loops = 0;
3005
3006 again:
3007        /*
3008         * We repeat when a timestamp is encountered. It is possible
3009         * to get multiple timestamps from an interrupt entering just
3010         * as one timestamp is about to be written, or from discarded
3011         * commits. The most that we can have is the number on a single page.
3012         */
3013        if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
3014                return NULL;
3015
3016        reader = rb_get_reader_page(cpu_buffer);
3017        if (!reader)
3018                return NULL;
3019
3020        event = rb_reader_event(cpu_buffer);
3021
3022        switch (event->type_len) {
3023        case RINGBUF_TYPE_PADDING:
3024                if (rb_null_event(event))
3025                        RB_WARN_ON(cpu_buffer, 1);
3026                /*
3027                 * Because the writer could be discarding every
3028                 * event it creates (which would probably be bad)
3029                 * if we were to go back to "again" then we may never
3030                 * catch up, and will trigger the warn on, or lock
3031                 * the box. Return the padding, and we will release
3032                 * the current locks, and try again.
3033                 */
3034                return event;
3035
3036        case RINGBUF_TYPE_TIME_EXTEND:
3037                /* Internal data, OK to advance */
3038                rb_advance_reader(cpu_buffer);
3039                goto again;
3040
3041        case RINGBUF_TYPE_TIME_STAMP:
3042                /* FIXME: not implemented */
3043                rb_advance_reader(cpu_buffer);
3044                goto again;
3045
3046        case RINGBUF_TYPE_DATA:
3047                if (ts) {
3048                        *ts = cpu_buffer->read_stamp + event->time_delta;
3049                        ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3050                                                         cpu_buffer->cpu, ts);
3051                }
3052                return event;
3053
3054        default:
3055                BUG();
3056        }
3057
3058        return NULL;
3059}
3060EXPORT_SYMBOL_GPL(ring_buffer_peek);
3061
3062static struct ring_buffer_event *
3063rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3064{
3065        struct ring_buffer *buffer;
3066        struct ring_buffer_per_cpu *cpu_buffer;
3067        struct ring_buffer_event *event;
3068        int nr_loops = 0;
3069
3070        if (ring_buffer_iter_empty(iter))
3071                return NULL;
3072
3073        cpu_buffer = iter->cpu_buffer;
3074        buffer = cpu_buffer->buffer;
3075
3076 again:
3077        /*
3078         * We repeat when a timestamp is encountered.
3079         * We can get multiple timestamps by nested interrupts or also
3080         * if filtering is on (discarding commits). Since discarding
3081         * commits can be frequent we can get a lot of timestamps.
3082         * But we limit them by not adding timestamps if they begin
3083         * at the start of a page.
3084         */
3085        if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
3086                return NULL;
3087
3088        if (rb_per_cpu_empty(cpu_buffer))
3089                return NULL;
3090
3091        event = rb_iter_head_event(iter);
3092
3093        switch (event->type_len) {
3094        case RINGBUF_TYPE_PADDING:
3095                if (rb_null_event(event)) {
3096                        rb_inc_iter(iter);
3097                        goto again;
3098                }
3099                rb_advance_iter(iter);
3100                return event;
3101
3102        case RINGBUF_TYPE_TIME_EXTEND:
3103                /* Internal data, OK to advance */
3104                rb_advance_iter(iter);
3105                goto again;
3106
3107        case RINGBUF_TYPE_TIME_STAMP:
3108                /* FIXME: not implemented */
3109                rb_advance_iter(iter);
3110                goto again;
3111
3112        case RINGBUF_TYPE_DATA:
3113                if (ts) {
3114                        *ts = iter->read_stamp + event->time_delta;
3115                        ring_buffer_normalize_time_stamp(buffer,
3116                                                         cpu_buffer->cpu, ts);
3117                }
3118                return event;
3119
3120        default:
3121                BUG();
3122        }
3123
3124        return NULL;
3125}
3126EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
3127
3128static inline int rb_ok_to_lock(void)
3129{
3130        /*
3131         * If an NMI die handler dumps out the content of the ring buffer,
3132         * do not grab locks. We also permanently disable the ring
3133         * buffer too. A one time deal is all you get from reading
3134         * the ring buffer from an NMI.
3135         */
3136        if (likely(!in_nmi()))
3137                return 1;
3138
3139        tracing_off_permanent();
3140        return 0;
3141}
3142
3143/**
3144 * ring_buffer_peek - peek at the next event to be read
3145 * @buffer: The ring buffer to read
3146 * @cpu: The cpu to peek at
3147 * @ts: The timestamp counter of this event.
3148 *
3149 * This will return the event that will be read next, but does
3150 * not consume the data.
3151 */
3152struct ring_buffer_event *
3153ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3154{
3155        struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3156        struct ring_buffer_event *event;
3157        unsigned long flags;
3158        int dolock;
3159
3160        if (!cpumask_test_cpu(cpu, buffer->cpumask))
3161                return NULL;
3162
3163        dolock = rb_ok_to_lock();
3164 again:
3165        local_irq_save(flags);
3166        if (dolock)
3167                spin_lock(&cpu_buffer->reader_lock);
3168        event = rb_buffer_peek(cpu_buffer, ts);
3169        if (event && event->type_len == RINGBUF_TYPE_PADDING)
3170                rb_advance_reader(cpu_buffer);
3171        if (dolock)
3172                spin_unlock(&cpu_buffer->reader_lock);
3173        local_irq_restore(flags);
3174
3175        if (event && event->type_len == RINGBUF_TYPE_PADDING)
3176                goto again;
3177
3178        return event;
3179}
3180
3181/**
3182 * ring_buffer_iter_peek - peek at the next event to be read
3183 * @iter: The ring buffer iterator
3184 * @ts: The timestamp counter of this event.
3185 *
3186 * This will return the event that will be read next, but does
3187 * not increment the iterator.
3188 */
3189struct ring_buffer_event *
3190ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3191{
3192        struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3193        struct ring_buffer_event *event;
3194        unsigned long flags;
3195
3196 again:
3197        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3198        event = rb_iter_peek(iter, ts);
3199        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3200
3201        if (event && event->type_len == RINGBUF_TYPE_PADDING)
3202                goto again;
3203
3204        return event;
3205}
3206
3207/**
3208 * ring_buffer_consume - return an event and consume it
3209 * @buffer: The ring buffer to get the next event from
 * @cpu: the cpu to read the buffer from
 * @ts: a variable to store the event's timestamp (may be NULL)
3210 *
3211 * Returns the next event in the ring buffer, and that event is consumed.
3212 * Meaning that sequential reads will keep returning different events,
3213 * and will eventually empty the ring buffer if the producer is slower.
3214 */
3215struct ring_buffer_event *
3216ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3217{
3218        struct ring_buffer_per_cpu *cpu_buffer;
3219        struct ring_buffer_event *event = NULL;
3220        unsigned long flags;
3221        int dolock;
3222
3223        dolock = rb_ok_to_lock();
3224
3225 again:
3226        /* might be called in atomic */
3227        preempt_disable();
3228
3229        if (!cpumask_test_cpu(cpu, buffer->cpumask))
3230                goto out;
3231
3232        cpu_buffer = buffer->buffers[cpu];
3233        local_irq_save(flags);
3234        if (dolock)
3235                spin_lock(&cpu_buffer->reader_lock);
3236
3237        event = rb_buffer_peek(cpu_buffer, ts);
3238        if (event)
3239                rb_advance_reader(cpu_buffer);
3240
3241        if (dolock)
3242                spin_unlock(&cpu_buffer->reader_lock);
3243        local_irq_restore(flags);
3244
3245 out:
3246        preempt_enable();
3247
3248        if (event && event->type_len == RINGBUF_TYPE_PADDING)
3249                goto again;
3250
3251        return event;
3252}
3253EXPORT_SYMBOL_GPL(ring_buffer_consume);
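
/*
 * Example (illustrative sketch only; my_process() is a hypothetical
 * caller-side helper): draining one cpu buffer with consuming reads.
 *
 *	u64 ts;
 *	struct ring_buffer_event *event;
 *
 *	while ((event = ring_buffer_consume(buffer, cpu, &ts)))
 *		my_process(ring_buffer_event_data(event),
 *			   ring_buffer_event_length(event), ts);
 */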
3254
3255/**
3256 * ring_buffer_read_start - start a non consuming read of the buffer
3257 * @buffer: The ring buffer to read from
3258 * @cpu: The cpu buffer to iterate over
3259 *
3260 * This starts up an iteration through the buffer. It also disables
3261 * the recording to the buffer until the reading is finished.
3262 * This prevents the reading from being corrupted. This is not
3263 * a consuming read, so a producer is not expected.
3264 *
3265 * Must be paired with ring_buffer_read_finish.
3266 */
3267struct ring_buffer_iter *
3268ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3269{
3270        struct ring_buffer_per_cpu *cpu_buffer;
3271        struct ring_buffer_iter *iter;
3272        unsigned long flags;
3273
3274        if (!cpumask_test_cpu(cpu, buffer->cpumask))
3275                return NULL;
3276
3277        iter = kmalloc(sizeof(*iter), GFP_KERNEL);
3278        if (!iter)
3279                return NULL;
3280
3281        cpu_buffer = buffer->buffers[cpu];
3282
3283        iter->cpu_buffer = cpu_buffer;
3284
3285        atomic_inc(&cpu_buffer->record_disabled);
3286        synchronize_sched();
3287
3288        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3289        __raw_spin_lock(&cpu_buffer->lock);
3290        rb_iter_reset(iter);
3291        __raw_spin_unlock(&cpu_buffer->lock);
3292        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3293
3294        return iter;
3295}
3296EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3297
3298/**
3299 * ring_buffer_read_finish - finish reading the iterator of the buffer
3300 * @iter: The iterator retrieved by ring_buffer_read_start
3301 *
3302 * This re-enables the recording to the buffer, and frees the
3303 * iterator.
3304 */
3305void
3306ring_buffer_read_finish(struct ring_buffer_iter *iter)
3307{
3308        struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3309
3310        atomic_dec(&cpu_buffer->record_disabled);
3311        kfree(iter);
3312}
3313EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
3314
3315/**
3316 * ring_buffer_read - read the next item in the ring buffer by the iterator
3317 * @iter: The ring buffer iterator
3318 * @ts: The time stamp of the event read.
3319 *
3320 * This reads the next event in the ring buffer and increments the iterator.
3321 */
3322struct ring_buffer_event *
3323ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3324{
3325        struct ring_buffer_event *event;
3326        struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3327        unsigned long flags;
3328
3329        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3330 again:
3331        event = rb_iter_peek(iter, ts);
3332        if (!event)
3333                goto out;
3334
3335        if (event->type_len == RINGBUF_TYPE_PADDING)
3336                goto again;
3337
3338        rb_advance_iter(iter);
3339 out:
3340        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3341
3342        return event;
3343}
3344EXPORT_SYMBOL_GPL(ring_buffer_read);
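
/*
 * Example (illustrative sketch only; my_process() is a hypothetical
 * caller-side helper): a full non-consuming pass over one cpu buffer.
 * Recording to that buffer is disabled between read_start and
 * read_finish.
 *
 *	struct ring_buffer_iter *iter;
 *	struct ring_buffer_event *event;
 *	u64 ts;
 *
 *	iter = ring_buffer_read_start(buffer, cpu);
 *	if (!iter)
 *		return;
 *	while ((event = ring_buffer_read(iter, &ts)))
 *		my_process(ring_buffer_event_data(event), ts);
 *	ring_buffer_read_finish(iter);
 */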
3345
3346/**
3347 * ring_buffer_size - return the size (in bytes) of one per-cpu buffer
3348 * @buffer: The ring buffer.
3349 */
3350unsigned long ring_buffer_size(struct ring_buffer *buffer)
3351{
3352        return BUF_PAGE_SIZE * buffer->pages;
3353}
3354EXPORT_SYMBOL_GPL(ring_buffer_size);
3355
3356static void
3357rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3358{
3359        rb_head_page_deactivate(cpu_buffer);
3360
3361        cpu_buffer->head_page
3362                = list_entry(cpu_buffer->pages, struct buffer_page, list);
3363        local_set(&cpu_buffer->head_page->write, 0);
3364        local_set(&cpu_buffer->head_page->entries, 0);
3365        local_set(&cpu_buffer->head_page->page->commit, 0);
3366
3367        cpu_buffer->head_page->read = 0;
3368
3369        cpu_buffer->tail_page = cpu_buffer->head_page;
3370        cpu_buffer->commit_page = cpu_buffer->head_page;
3371
3372        INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
3373        local_set(&cpu_buffer->reader_page->write, 0);
3374        local_set(&cpu_buffer->reader_page->entries, 0);
3375        local_set(&cpu_buffer->reader_page->page->commit, 0);
3376        cpu_buffer->reader_page->read = 0;
3377
3378        local_set(&cpu_buffer->commit_overrun, 0);
3379        local_set(&cpu_buffer->overrun, 0);
3380        local_set(&cpu_buffer->entries, 0);
3381        local_set(&cpu_buffer->committing, 0);
3382        local_set(&cpu_buffer->commits, 0);
3383        cpu_buffer->read = 0;
3384
3385        cpu_buffer->write_stamp = 0;
3386        cpu_buffer->read_stamp = 0;
3387
3388        rb_head_page_activate(cpu_buffer);
3389}
3390
3391/**
3392 * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
3393 * @buffer: The ring buffer to reset a per cpu buffer of
3394 * @cpu: The CPU buffer to be reset
3395 */
3396void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3397{
3398        struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3399        unsigned long flags;
3400
3401        if (!cpumask_test_cpu(cpu, buffer->cpumask))
3402                return;
3403
3404        atomic_inc(&cpu_buffer->record_disabled);
3405
3406        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3407
3408        if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3409                goto out;
3410
3411        __raw_spin_lock(&cpu_buffer->lock);
3412
3413        rb_reset_cpu(cpu_buffer);
3414
3415        __raw_spin_unlock(&cpu_buffer->lock);
3416
3417 out:
3418        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3419
3420        atomic_dec(&cpu_buffer->record_disabled);
3421}
3422EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
3423
3424/**
3425 * ring_buffer_reset - reset a ring buffer
3426 * @buffer: The ring buffer to reset all cpu buffers
3427 */
3428void ring_buffer_reset(struct ring_buffer *buffer)
3429{
3430        int cpu;
3431
3432        for_each_buffer_cpu(buffer, cpu)
3433                ring_buffer_reset_cpu(buffer, cpu);
3434}
3435EXPORT_SYMBOL_GPL(ring_buffer_reset);
3436
3437/**
3438 * ring_buffer_empty - is the ring buffer empty?
3439 * @buffer: The ring buffer to test
3440 */
3441int ring_buffer_empty(struct ring_buffer *buffer)
3442{
3443        struct ring_buffer_per_cpu *cpu_buffer;
3444        unsigned long flags;
3445        int dolock;
3446        int cpu;
3447        int ret;
3448
3449        dolock = rb_ok_to_lock();
3450
3451        /* yes this is racy, but if you don't like the race, lock the buffer */
3452        for_each_buffer_cpu(buffer, cpu) {
3453                cpu_buffer = buffer->buffers[cpu];
3454                local_irq_save(flags);
3455                if (dolock)
3456                        spin_lock(&cpu_buffer->reader_lock);
3457                ret = rb_per_cpu_empty(cpu_buffer);
3458                if (dolock)
3459                        spin_unlock(&cpu_buffer->reader_lock);
3460                local_irq_restore(flags);
3461
3462                if (!ret)
3463                        return 0;
3464        }
3465
3466        return 1;
3467}
3468EXPORT_SYMBOL_GPL(ring_buffer_empty);
3469
3470/**
3471 * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
3472 * @buffer: The ring buffer
3473 * @cpu: The CPU buffer to test
3474 */
3475int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
3476{
3477        struct ring_buffer_per_cpu *cpu_buffer;
3478        unsigned long flags;
3479        int dolock;
3480        int ret;
3481
3482        if (!cpumask_test_cpu(cpu, buffer->cpumask))
3483                return 1;
3484
3485        dolock = rb_ok_to_lock();
3486
3487        cpu_buffer = buffer->buffers[cpu];
3488        local_irq_save(flags);
3489        if (dolock)
3490                spin_lock(&cpu_buffer->reader_lock);
3491        ret = rb_per_cpu_empty(cpu_buffer);
3492        if (dolock)
3493                spin_unlock(&cpu_buffer->reader_lock);
3494        local_irq_restore(flags);
3495
3496        return ret;
3497}
3498EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
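
/*
 * Example (illustrative sketch; my_wait_for_data() is a hypothetical
 * caller-side helper): the ring buffer never sleeps or wakes readers
 * itself, so a caller typically polls emptiness before consuming.
 *
 *	while (ring_buffer_empty_cpu(buffer, cpu))
 *		my_wait_for_data();
 *	event = ring_buffer_consume(buffer, cpu, &ts);
 */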
3499
3500#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
3501/**
3502 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
3503 * @buffer_a: One buffer to swap with
3504 * @buffer_b: The other buffer to swap with
 * @cpu: the CPU buffer to swap
3505 *
3506 * This function is useful for tracers that want to take a "snapshot"
3507 * of a CPU buffer and have another backup buffer lying around.
3508 * It is expected that the tracer handles the cpu buffer not being
3509 * used at the moment.
3510 */
3511int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
3512                         struct ring_buffer *buffer_b, int cpu)
3513{
3514        struct ring_buffer_per_cpu *cpu_buffer_a;
3515        struct ring_buffer_per_cpu *cpu_buffer_b;
3516        int ret = -EINVAL;
3517
3518        if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
3519            !cpumask_test_cpu(cpu, buffer_b->cpumask))
3520                goto out;
3521
3522        /* At least make sure the two buffers are somewhat the same */
3523        if (buffer_a->pages != buffer_b->pages)
3524                goto out;
3525
3526        ret = -EAGAIN;
3527
3528        if (ring_buffer_flags != RB_BUFFERS_ON)
3529                goto out;
3530
3531        if (atomic_read(&buffer_a->record_disabled))
3532                goto out;
3533
3534        if (atomic_read(&buffer_b->record_disabled))
3535                goto out;
3536
3537        cpu_buffer_a = buffer_a->buffers[cpu];
3538        cpu_buffer_b = buffer_b->buffers[cpu];
3539
3540        if (atomic_read(&cpu_buffer_a->record_disabled))
3541                goto out;
3542
3543        if (atomic_read(&cpu_buffer_b->record_disabled))
3544                goto out;
3545
3546        /*
3547         * We can't do a synchronize_sched here because this
3548         * function can be called in atomic context.
3549         * Normally this will be called from the same CPU as cpu.
3550         * If not it's up to the caller to protect this.
3551         */
3552        atomic_inc(&cpu_buffer_a->record_disabled);
3553        atomic_inc(&cpu_buffer_b->record_disabled);
3554
3555        ret = -EBUSY;
3556        if (local_read(&cpu_buffer_a->committing))
3557                goto out_dec;
3558        if (local_read(&cpu_buffer_b->committing))
3559                goto out_dec;
3560
3561        buffer_a->buffers[cpu] = cpu_buffer_b;
3562        buffer_b->buffers[cpu] = cpu_buffer_a;
3563
3564        cpu_buffer_b->buffer = buffer_a;
3565        cpu_buffer_a->buffer = buffer_b;
3566
3567        ret = 0;
3568
3569out_dec:
3570        atomic_dec(&cpu_buffer_a->record_disabled);
3571        atomic_dec(&cpu_buffer_b->record_disabled);
3572out:
3573        return ret;
3574}
3575EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
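
/*
 * Example (illustrative sketch; "snap_buffer" is a hypothetical spare
 * buffer allocated by the caller with the same number of pages, and
 * my_process() a hypothetical helper): the snapshot pattern this
 * enables. After a successful swap the live data sits in snap_buffer's
 * cpu buffer and can be read at leisure.
 *
 *	if (ring_buffer_swap_cpu(buffer, snap_buffer, cpu) == 0) {
 *		while ((event = ring_buffer_consume(snap_buffer, cpu, &ts)))
 *			my_process(ring_buffer_event_data(event), ts);
 *	}
 */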
3576#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
3577
3578/**
3579 * ring_buffer_alloc_read_page - allocate a page to read from buffer
3580 * @buffer: the buffer to allocate for.
3581 *
3582 * This function is used in conjunction with ring_buffer_read_page.
3583 * When reading a full page from the ring buffer, these functions
3584 * can be used to speed up the process. The calling function should
3585 * allocate a few pages first with this function. Then when it
3586 * needs to get pages from the ring buffer, it passes the result
3587 * of this function into ring_buffer_read_page, which will swap
3588 * the page that was allocated with the read page of the buffer.
3589 *
3590 * Returns:
3591 *  The page allocated, or NULL on error.
3592 */
3593void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
3594{
3595        struct buffer_data_page *bpage;
3596        unsigned long addr;
3597
3598        addr = __get_free_page(GFP_KERNEL);
3599        if (!addr)
3600                return NULL;
3601
3602        bpage = (void *)addr;
3603
3604        rb_init_page(bpage);
3605
3606        return bpage;
3607}
3608EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
3609
3610/**
3611 * ring_buffer_free_read_page - free an allocated read page
3612 * @buffer: the buffer the page was allocated for
3613 * @data: the page to free
3614 *
3615 * Free a page allocated from ring_buffer_alloc_read_page.
3616 */
3617void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
3618{
3619        free_page((unsigned long)data);
3620}
3621EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
3622
3623/**
3624 * ring_buffer_read_page - extract a page from the ring buffer
3625 * @buffer: buffer to extract from
3626 * @data_page: the page to use allocated from ring_buffer_alloc_read_page
3627 * @len: amount to extract
3628 * @cpu: the cpu of the buffer to extract
3629 * @full: should the extraction only happen when the page is full.
3630 *
3631 * This function will pull out a page from the ring buffer and consume it.
3632 * @data_page must be the address of the variable that was returned
3633 * from ring_buffer_alloc_read_page. This is because the page might be used
3634 * to swap with a page in the ring buffer.
3635 *
3636 * for example:
3637 *      rpage = ring_buffer_alloc_read_page(buffer);
3638 *      if (!rpage)
3639 *              return error;
3640 *      ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
3641 *      if (ret >= 0)
3642 *              process_page(rpage, ret);
3643 *
3644 * When @full is set, the function will not return data unless
3645 * the writer is off the reader page.
3646 *
3647 * Note: it is up to the calling functions to handle sleeps and wakeups.
3648 *  The ring buffer can be used anywhere in the kernel and can not
3649 *  blindly call wake_up. The layer that uses the ring buffer must be
3650 *  responsible for that.
3651 *
3652 * Returns:
3653 *  >=0 if data has been transferred, returns the offset of consumed data.
3654 *  <0 if no data has been transferred.
3655 */
3656int ring_buffer_read_page(struct ring_buffer *buffer,
3657                          void **data_page, size_t len, int cpu, int full)
3658{
3659        struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3660        struct ring_buffer_event *event;
3661        struct buffer_data_page *bpage;
3662        struct buffer_page *reader;
3663        unsigned long flags;
3664        unsigned int commit;
3665        unsigned int read;
3666        u64 save_timestamp;
3667        int ret = -1;
3668
3669        if (!cpumask_test_cpu(cpu, buffer->cpumask))
3670                goto out;
3671
3672        /*
3673         * If len is not big enough to hold the page header, then
3674         * we can not copy anything.
3675         */
3676        if (len <= BUF_PAGE_HDR_SIZE)
3677                goto out;
3678
3679        len -= BUF_PAGE_HDR_SIZE;
3680
3681        if (!data_page)
3682                goto out;
3683
3684        bpage = *data_page;
3685        if (!bpage)
3686                goto out;
3687
3688        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3689
3690        reader = rb_get_reader_page(cpu_buffer);
3691        if (!reader)
3692                goto out_unlock;
3693
3694        event = rb_reader_event(cpu_buffer);
3695
3696        read = reader->read;
3697        commit = rb_page_commit(reader);
3698
3699        /*
3700         * If this page has been partially read or
3701         * if len is not big enough to read the rest of the page or
3702         * a writer is still on the page, then
3703         * we must copy the data from the page to the buffer.
3704         * Otherwise, we can simply swap the page with the one passed in.
3705         */
3706        if (read || (len < (commit - read)) ||
3707            cpu_buffer->reader_page == cpu_buffer->commit_page) {
3708                struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
3709                unsigned int rpos = read;
3710                unsigned int pos = 0;
3711                unsigned int size;
3712
3713                if (full)
3714                        goto out_unlock;
3715
3716                if (len > (commit - read))
3717                        len = (commit - read);
3718
3719                size = rb_event_length(event);
3720
3721                if (len < size)
3722                        goto out_unlock;
3723
3724                /* save the current timestamp, since the user will need it */
3725                save_timestamp = cpu_buffer->read_stamp;
3726
3727                /* Need to copy one event at a time */
3728                do {
3729                        memcpy(bpage->data + pos, rpage->data + rpos, size);
3730
3731                        len -= size;
3732
3733                        rb_advance_reader(cpu_buffer);
3734                        rpos = reader->read;
3735                        pos += size;
3736
3737                        event = rb_reader_event(cpu_buffer);
3738                        size = rb_event_length(event);
3739                } while (len > size);
3740
3741                /* update bpage */
3742                local_set(&bpage->commit, pos);
3743                bpage->time_stamp = save_timestamp;
3744
3745                /* we copied everything to the beginning */
3746                read = 0;
3747        } else {
3748                /* update the entry counter */
3749                cpu_buffer->read += rb_page_entries(reader);
3750
3751                /* swap the pages */
3752                rb_init_page(bpage);
3753                bpage = reader->page;
3754                reader->page = *data_page;
3755                local_set(&reader->write, 0);
3756                local_set(&reader->entries, 0);
3757                reader->read = 0;
3758                *data_page = bpage;
3759        }
3760        ret = read;
3761
3762 out_unlock:
3763        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3764
3765 out:
3766        return ret;
3767}
3768EXPORT_SYMBOL_GPL(ring_buffer_read_page);
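
/*
 * Example (illustrative sketch; my_splice_page() is a hypothetical
 * caller-side helper): note that @data_page is passed by address
 * because a swap may hand back a different page, and that the page
 * must eventually be released with ring_buffer_free_read_page().
 *
 *	void *rpage = ring_buffer_alloc_read_page(buffer);
 *	int ret;
 *
 *	if (!rpage)
 *		return -ENOMEM;
 *	ret = ring_buffer_read_page(buffer, &rpage, PAGE_SIZE, cpu, 0);
 *	if (ret >= 0)
 *		my_splice_page(rpage, ret);
 *	ring_buffer_free_read_page(buffer, rpage);
 */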
3769
3770#ifdef CONFIG_TRACING
3771static ssize_t
3772rb_simple_read(struct file *filp, char __user *ubuf,
3773               size_t cnt, loff_t *ppos)
3774{
3775        unsigned long *p = filp->private_data;
3776        char buf[64];
3777        int r;
3778
3779        if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
3780                r = sprintf(buf, "permanently disabled\n");
3781        else
3782                r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
3783
3784        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3785}
3786
3787static ssize_t
3788rb_simple_write(struct file *filp, const char __user *ubuf,
3789                size_t cnt, loff_t *ppos)
3790{
3791        unsigned long *p = filp->private_data;
3792        char buf[64];
3793        unsigned long val;
3794        int ret;
3795
3796        if (cnt >= sizeof(buf))
3797                return -EINVAL;
3798
3799        if (copy_from_user(&buf, ubuf, cnt))
3800                return -EFAULT;
3801
3802        buf[cnt] = 0;
3803
3804        ret = strict_strtoul(buf, 10, &val);
3805        if (ret < 0)
3806                return ret;
3807
3808        if (val)
3809                set_bit(RB_BUFFERS_ON_BIT, p);
3810        else
3811                clear_bit(RB_BUFFERS_ON_BIT, p);
3812
3813        (*ppos)++;
3814
3815        return cnt;
3816}
3817
3818static const struct file_operations rb_simple_fops = {
3819        .open           = tracing_open_generic,
3820        .read           = rb_simple_read,
3821        .write          = rb_simple_write,
3822};
3823
3824
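/*
 * The "tracing_on" file created below gates all ring buffer writes.
 * With debugfs mounted in the usual place it is typically reached as
 * /sys/kernel/debug/tracing/tracing_on (the exact path depends on
 * where debugfs is mounted):
 *
 *	# echo 0 > /sys/kernel/debug/tracing/tracing_on
 *	# echo 1 > /sys/kernel/debug/tracing/tracing_on
 */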
3825static __init int rb_init_debugfs(void)
3826{
3827        struct dentry *d_tracer;
3828
3829        d_tracer = tracing_init_dentry();
3830
3831        trace_create_file("tracing_on", 0644, d_tracer,
3832                            &ring_buffer_flags, &rb_simple_fops);
3833
3834        return 0;
3835}
3836
3837fs_initcall(rb_init_debugfs);
3838#endif
3839
3840#ifdef CONFIG_HOTPLUG_CPU
3841static int rb_cpu_notify(struct notifier_block *self,
3842                         unsigned long action, void *hcpu)
3843{
3844        struct ring_buffer *buffer =
3845                container_of(self, struct ring_buffer, cpu_notify);
3846        long cpu = (long)hcpu;
3847
3848        switch (action) {
3849        case CPU_UP_PREPARE:
3850        case CPU_UP_PREPARE_FROZEN:
3851                if (cpumask_test_cpu(cpu, buffer->cpumask))
3852                        return NOTIFY_OK;
3853
3854                buffer->buffers[cpu] =
3855                        rb_allocate_cpu_buffer(buffer, cpu);
3856                if (!buffer->buffers[cpu]) {
3857                        WARN(1, "failed to allocate ring buffer on CPU %ld\n",
3858                             cpu);
3859                        return NOTIFY_OK;
3860                }
3861                smp_wmb();
3862                cpumask_set_cpu(cpu, buffer->cpumask);
3863                break;
3864        case CPU_DOWN_PREPARE:
3865        case CPU_DOWN_PREPARE_FROZEN:
3866                /*
3867                 * Do nothing.
3868                 *  If we were to free the buffer, then the user would
3869                 *  lose any trace that was in the buffer.
3870                 */
3871                break;
3872        default:
3873                break;
3874        }
3875        return NOTIFY_OK;
3876}
3877#endif
3878