linux/arch/x86/kernel/ds.c
<<
>>
Prefs
   1/*
   2 * Debug Store support
   3 *
   4 * This provides a low-level interface to the hardware's Debug Store
   5 * feature that is used for branch trace store (BTS) and
   6 * precise-event based sampling (PEBS).
   7 *
   8 * It manages:
   9 * - DS and BTS hardware configuration
  10 * - buffer overflow handling (to be done)
  11 * - buffer access
  12 *
  13 * It does not do:
  14 * - security checking (is the caller allowed to trace the task)
  15 * - buffer allocation (memory accounting)
  16 *
  17 *
  18 * Copyright (C) 2007-2009 Intel Corporation.
  19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
  20 */
  21
  22#include <linux/kernel.h>
  23#include <linux/string.h>
  24#include <linux/errno.h>
  25#include <linux/sched.h>
  26#include <linux/slab.h>
  27#include <linux/mm.h>
  28#include <linux/trace_clock.h>
  29
  30#include <asm/ds.h>
  31
  32#include "ds_selftest.h"
  33
  34/*
  35 * The configuration for a particular DS hardware implementation:
  36 */
  37struct ds_configuration {
  38        /* The name of the configuration: */
  39        const char              *name;
  40
  41        /* The size of pointer-typed fields in DS, BTS, and PEBS: */
  42        unsigned char           sizeof_ptr_field;
  43
  44        /* The size of a BTS/PEBS record in bytes: */
  45        unsigned char           sizeof_rec[2];
  46
  47        /* The number of pebs counter reset values in the DS structure. */
  48        unsigned char           nr_counter_reset;
  49
  50        /* Control bit-masks indexed by enum ds_feature: */
  51        unsigned long           ctl[dsf_ctl_max];
  52};
  53static struct ds_configuration ds_cfg __read_mostly;
  54
  55
  56/* Maximal size of a DS configuration: */
  57#define MAX_SIZEOF_DS           0x80
  58
  59/* Maximal size of a BTS record: */
  60#define MAX_SIZEOF_BTS          (3 * 8)
  61
  62/* BTS and PEBS buffer alignment: */
  63#define DS_ALIGNMENT            (1 << 3)
  64
  65/* Number of buffer pointers in DS: */
  66#define NUM_DS_PTR_FIELDS       8
  67
  68/* Size of a pebs reset value in DS: */
  69#define PEBS_RESET_FIELD_SIZE   8
  70
  71/* Mask of control bits in the DS MSR register: */
  72#define BTS_CONTROL                               \
  73        ( ds_cfg.ctl[dsf_bts]                   | \
  74          ds_cfg.ctl[dsf_bts_kernel]            | \
  75          ds_cfg.ctl[dsf_bts_user]              | \
  76          ds_cfg.ctl[dsf_bts_overflow] )
  77
  78/*
  79 * A BTS or PEBS tracer.
  80 *
  81 * This holds the configuration of the tracer and serves as a handle
  82 * to identify tracers.
  83 */
  84struct ds_tracer {
  85        /* The DS context (partially) owned by this tracer. */
  86        struct ds_context       *context;
  87        /* The buffer provided on ds_request() and its size in bytes. */
  88        void                    *buffer;
  89        size_t                  size;
  90};
  91
  92struct bts_tracer {
  93        /* The common DS part: */
  94        struct ds_tracer        ds;
  95
  96        /* The trace including the DS configuration: */
  97        struct bts_trace        trace;
  98
  99        /* Buffer overflow notification function: */
 100        bts_ovfl_callback_t     ovfl;
 101
 102        /* Active flags affecting trace collection. */
 103        unsigned int            flags;
 104};
 105
 106struct pebs_tracer {
 107        /* The common DS part: */
 108        struct ds_tracer        ds;
 109
 110        /* The trace including the DS configuration: */
 111        struct pebs_trace       trace;
 112
 113        /* Buffer overflow notification function: */
 114        pebs_ovfl_callback_t    ovfl;
 115};
 116
 117/*
 118 * Debug Store (DS) save area configuration (see Intel64 and IA32
 119 * Architectures Software Developer's Manual, section 18.5)
 120 *
 121 * The DS configuration consists of the following fields; different
  122 * architectures vary in the size of those fields.
 123 *
 124 * - double-word aligned base linear address of the BTS buffer
 125 * - write pointer into the BTS buffer
 126 * - end linear address of the BTS buffer (one byte beyond the end of
 127 *   the buffer)
 128 * - interrupt pointer into BTS buffer
 129 *   (interrupt occurs when write pointer passes interrupt pointer)
 130 * - double-word aligned base linear address of the PEBS buffer
 131 * - write pointer into the PEBS buffer
 132 * - end linear address of the PEBS buffer (one byte beyond the end of
 133 *   the buffer)
 134 * - interrupt pointer into PEBS buffer
 135 *   (interrupt occurs when write pointer passes interrupt pointer)
 136 * - value to which counter is reset following counter overflow
 137 *
 138 * Later architectures use 64bit pointers throughout, whereas earlier
 139 * architectures use 32bit pointers in 32bit mode.
 140 *
 141 *
 142 * We compute the base address for the first 8 fields based on:
 143 * - the field size stored in the DS configuration
 144 * - the relative field position
 145 * - an offset giving the start of the respective region
 146 *
 147 * This offset is further used to index various arrays holding
 148 * information for BTS and PEBS at the respective index.
 149 *
 150 * On later 32bit processors, we only access the lower 32bit of the
 151 * 64bit pointer fields. The upper halves will be zeroed out.
 152 */
 153
 154enum ds_field {
 155        ds_buffer_base = 0,
 156        ds_index,
 157        ds_absolute_maximum,
 158        ds_interrupt_threshold,
 159};
 160
 161enum ds_qualifier {
 162        ds_bts = 0,
 163        ds_pebs
 164};
 165
 166static inline unsigned long
 167ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
 168{
 169        base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 170        return *(unsigned long *)base;
 171}
 172
 173static inline void
 174ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
 175       unsigned long value)
 176{
 177        base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 178        (*(unsigned long *)base) = value;
 179}
 180
 181
 182/*
 183 * Locking is done only for allocating BTS or PEBS resources.
 184 */
 185static DEFINE_SPINLOCK(ds_lock);
 186
 187/*
 188 * We either support (system-wide) per-cpu or per-thread allocation.
 189 * We distinguish the two based on the task_struct pointer, where a
 190 * NULL pointer indicates per-cpu allocation for the current cpu.
 191 *
 192 * Allocations are use-counted. As soon as resources are allocated,
 193 * further allocations must be of the same type (per-cpu or
 194 * per-thread). We model this by counting allocations (i.e. the number
 195 * of tracers of a certain type) for one type negatively:
 196 *   =0  no tracers
 197 *   >0  number of per-thread tracers
 198 *   <0  number of per-cpu tracers
 199 *
 200 * Tracers essentially gives the number of ds contexts for a certain
 201 * type of allocation.
 202 */
 203static atomic_t tracers = ATOMIC_INIT(0);
 204
 205static inline int get_tracer(struct task_struct *task)
 206{
 207        int error;
 208
 209        spin_lock_irq(&ds_lock);
 210
 211        if (task) {
 212                error = -EPERM;
 213                if (atomic_read(&tracers) < 0)
 214                        goto out;
 215                atomic_inc(&tracers);
 216        } else {
 217                error = -EPERM;
 218                if (atomic_read(&tracers) > 0)
 219                        goto out;
 220                atomic_dec(&tracers);
 221        }
 222
 223        error = 0;
 224out:
 225        spin_unlock_irq(&ds_lock);
 226        return error;
 227}
 228
 229static inline void put_tracer(struct task_struct *task)
 230{
 231        if (task)
 232                atomic_dec(&tracers);
 233        else
 234                atomic_inc(&tracers);
 235}
 236
 237/*
 238 * The DS context is either attached to a thread or to a cpu:
 239 * - in the former case, the thread_struct contains a pointer to the
 240 *   attached context.
 241 * - in the latter case, we use a static array of per-cpu context
 242 *   pointers.
 243 *
 244 * Contexts are use-counted. They are allocated on first access and
 245 * deallocated when the last user puts the context.
 246 */
 247struct ds_context {
 248        /* The DS configuration; goes into MSR_IA32_DS_AREA: */
 249        unsigned char           ds[MAX_SIZEOF_DS];
 250
 251        /* The owner of the BTS and PEBS configuration, respectively: */
 252        struct bts_tracer       *bts_master;
 253        struct pebs_tracer      *pebs_master;
 254
 255        /* Use count: */
 256        unsigned long           count;
 257
 258        /* Pointer to the context pointer field: */
 259        struct ds_context       **this;
 260
 261        /* The traced task; NULL for cpu tracing: */
 262        struct task_struct      *task;
 263
 264        /* The traced cpu; only valid if task is NULL: */
 265        int                     cpu;
 266};
 267
 268static DEFINE_PER_CPU(struct ds_context *, cpu_context);
 269
 270
 271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
 272{
 273        struct ds_context **p_context =
 274                (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
 275        struct ds_context *context = NULL;
 276        struct ds_context *new_context = NULL;
 277
 278        /* Chances are small that we already have a context. */
 279        new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
 280        if (!new_context)
 281                return NULL;
 282
 283        spin_lock_irq(&ds_lock);
 284
 285        context = *p_context;
 286        if (likely(!context)) {
 287                context = new_context;
 288
 289                context->this = p_context;
 290                context->task = task;
 291                context->cpu = cpu;
 292                context->count = 0;
 293
 294                *p_context = context;
 295        }
 296
 297        context->count++;
 298
 299        spin_unlock_irq(&ds_lock);
 300
 301        if (context != new_context)
 302                kfree(new_context);
 303
 304        return context;
 305}
 306
 307static void ds_put_context(struct ds_context *context)
 308{
 309        struct task_struct *task;
 310        unsigned long irq;
 311
 312        if (!context)
 313                return;
 314
 315        spin_lock_irqsave(&ds_lock, irq);
 316
 317        if (--context->count) {
 318                spin_unlock_irqrestore(&ds_lock, irq);
 319                return;
 320        }
 321
 322        *(context->this) = NULL;
 323
 324        task = context->task;
 325
 326        if (task)
 327                clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
 328
 329        /*
 330         * We leave the (now dangling) pointer to the DS configuration in
 331         * the DS_AREA msr. This is as good or as bad as replacing it with
 332         * NULL - the hardware would crash if we enabled tracing.
 333         *
 334         * This saves us some problems with having to write an msr on a
 335         * different cpu while preventing others from doing the same for the
 336         * next context for that same cpu.
 337         */
 338
 339        spin_unlock_irqrestore(&ds_lock, irq);
 340
 341        /* The context might still be in use for context switching. */
 342        if (task && (task != current))
 343                wait_task_context_switch(task);
 344
 345        kfree(context);
 346}
 347
 348static void ds_install_ds_area(struct ds_context *context)
 349{
 350        unsigned long ds;
 351
 352        ds = (unsigned long)context->ds;
 353
 354        /*
 355         * There is a race between the bts master and the pebs master.
 356         *
 357         * The thread/cpu access is synchronized via get/put_cpu() for
 358         * task tracing and via wrmsr_on_cpu for cpu tracing.
 359         *
 360         * If bts and pebs are collected for the same task or same cpu,
 361         * the same confiuration is written twice.
 362         */
 363        if (context->task) {
 364                get_cpu();
 365                if (context->task == current)
 366                        wrmsrl(MSR_IA32_DS_AREA, ds);
 367                set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
 368                put_cpu();
 369        } else
 370                wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
 371                             (u32)((u64)ds), (u32)((u64)ds >> 32));
 372}
 373
 374/*
 375 * Call the tracer's callback on a buffer overflow.
 376 *
 377 * context: the ds context
 378 * qual: the buffer type
 379 */
 380static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
 381{
 382        switch (qual) {
 383        case ds_bts:
 384                if (context->bts_master &&
 385                    context->bts_master->ovfl)
 386                        context->bts_master->ovfl(context->bts_master);
 387                break;
 388        case ds_pebs:
 389                if (context->pebs_master &&
 390                    context->pebs_master->ovfl)
 391                        context->pebs_master->ovfl(context->pebs_master);
 392                break;
 393        }
 394}
 395
 396
 397/*
 398 * Write raw data into the BTS or PEBS buffer.
 399 *
 400 * The remainder of any partially written record is zeroed out.
 401 *
 402 * context: the DS context
 403 * qual:    the buffer type
 404 * record:  the data to write
 405 * size:    the size of the data
 406 */
 407static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 408                    const void *record, size_t size)
 409{
 410        int bytes_written = 0;
 411
 412        if (!record)
 413                return -EINVAL;
 414
 415        while (size) {
 416                unsigned long base, index, end, write_end, int_th;
 417                unsigned long write_size, adj_write_size;
 418
 419                /*
 420                 * Write as much as possible without producing an
 421                 * overflow interrupt.
 422                 *
 423                 * Interrupt_threshold must either be
 424                 * - bigger than absolute_maximum or
 425                 * - point to a record between buffer_base and absolute_maximum
 426                 *
 427                 * Index points to a valid record.
 428                 */
 429                base   = ds_get(context->ds, qual, ds_buffer_base);
 430                index  = ds_get(context->ds, qual, ds_index);
 431                end    = ds_get(context->ds, qual, ds_absolute_maximum);
 432                int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
 433
 434                write_end = min(end, int_th);
 435
 436                /*
 437                 * If we are already beyond the interrupt threshold,
 438                 * we fill the entire buffer.
 439                 */
 440                if (write_end <= index)
 441                        write_end = end;
 442
 443                if (write_end <= index)
 444                        break;
 445
 446                write_size = min((unsigned long) size, write_end - index);
 447                memcpy((void *)index, record, write_size);
 448
 449                record = (const char *)record + write_size;
 450                size -= write_size;
 451                bytes_written += write_size;
 452
 453                adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
 454                adj_write_size *= ds_cfg.sizeof_rec[qual];
 455
 456                /* Zero out trailing bytes. */
 457                memset((char *)index + write_size, 0,
 458                       adj_write_size - write_size);
 459                index += adj_write_size;
 460
 461                if (index >= end)
 462                        index = base;
 463                ds_set(context->ds, qual, ds_index, index);
 464
 465                if (index >= int_th)
 466                        ds_overflow(context, qual);
 467        }
 468
 469        return bytes_written;
 470}
 471
 472
 473/*
 474 * Branch Trace Store (BTS) uses the following format. Different
 475 * architectures vary in the size of those fields.
 476 * - source linear address
 477 * - destination linear address
 478 * - flags
 479 *
 480 * Later architectures use 64bit pointers throughout, whereas earlier
 481 * architectures use 32bit pointers in 32bit mode.
 482 *
 483 * We compute the base address for the fields based on:
 484 * - the field size stored in the DS configuration
 485 * - the relative field position
 486 *
 487 * In order to store additional information in the BTS buffer, we use
 488 * a special source address to indicate that the record requires
 489 * special interpretation.
 490 *
 491 * Netburst indicated via a bit in the flags field whether the branch
 492 * was predicted; this is ignored.
 493 *
 494 * We use two levels of abstraction:
 495 * - the raw data level defined here
 496 * - an arch-independent level defined in ds.h
 497 */
 498
 499enum bts_field {
 500        bts_from,
 501        bts_to,
 502        bts_flags,
 503
 504        bts_qual                = bts_from,
 505        bts_clock               = bts_to,
 506        bts_pid                 = bts_flags,
 507
 508        bts_qual_mask           = (bts_qual_max - 1),
 509        bts_escape              = ((unsigned long)-1 & ~bts_qual_mask)
 510};
 511
 512static inline unsigned long bts_get(const char *base, unsigned long field)
 513{
 514        base += (ds_cfg.sizeof_ptr_field * field);
 515        return *(unsigned long *)base;
 516}
 517
 518static inline void bts_set(char *base, unsigned long field, unsigned long val)
 519{
 520        base += (ds_cfg.sizeof_ptr_field * field);
 521        (*(unsigned long *)base) = val;
 522}
 523
 524
 525/*
 526 * The raw BTS data is architecture dependent.
 527 *
 528 * For higher-level users, we give an arch-independent view.
 529 * - ds.h defines struct bts_struct
 530 * - bts_read translates one raw bts record into a bts_struct
 531 * - bts_write translates one bts_struct into the raw format and
 532 *   writes it into the top of the parameter tracer's buffer.
 533 *
 534 * return: bytes read/written on success; -Eerrno, otherwise
 535 */
 536static int
 537bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
 538{
 539        if (!tracer)
 540                return -EINVAL;
 541
 542        if (at < tracer->trace.ds.begin)
 543                return -EINVAL;
 544
 545        if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
 546                return -EINVAL;
 547
 548        memset(out, 0, sizeof(*out));
 549        if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
 550                out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
 551                out->variant.event.clock = bts_get(at, bts_clock);
 552                out->variant.event.pid = bts_get(at, bts_pid);
 553        } else {
 554                out->qualifier = bts_branch;
 555                out->variant.lbr.from = bts_get(at, bts_from);
 556                out->variant.lbr.to   = bts_get(at, bts_to);
 557
 558                if (!out->variant.lbr.from && !out->variant.lbr.to)
 559                        out->qualifier = bts_invalid;
 560        }
 561
 562        return ds_cfg.sizeof_rec[ds_bts];
 563}
 564
 565static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
 566{
 567        unsigned char raw[MAX_SIZEOF_BTS];
 568
 569        if (!tracer)
 570                return -EINVAL;
 571
 572        if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
 573                return -EOVERFLOW;
 574
 575        switch (in->qualifier) {
 576        case bts_invalid:
 577                bts_set(raw, bts_from, 0);
 578                bts_set(raw, bts_to, 0);
 579                bts_set(raw, bts_flags, 0);
 580                break;
 581        case bts_branch:
 582                bts_set(raw, bts_from, in->variant.lbr.from);
 583                bts_set(raw, bts_to,   in->variant.lbr.to);
 584                bts_set(raw, bts_flags, 0);
 585                break;
 586        case bts_task_arrives:
 587        case bts_task_departs:
 588                bts_set(raw, bts_qual, (bts_escape | in->qualifier));
 589                bts_set(raw, bts_clock, in->variant.event.clock);
 590                bts_set(raw, bts_pid, in->variant.event.pid);
 591                break;
 592        default:
 593                return -EINVAL;
 594        }
 595
 596        return ds_write(tracer->ds.context, ds_bts, raw,
 597                        ds_cfg.sizeof_rec[ds_bts]);
 598}
 599
 600
 601static void ds_write_config(struct ds_context *context,
 602                            struct ds_trace *cfg, enum ds_qualifier qual)
 603{
 604        unsigned char *ds = context->ds;
 605
 606        ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
 607        ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
 608        ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
 609        ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
 610}
 611
 612static void ds_read_config(struct ds_context *context,
 613                           struct ds_trace *cfg, enum ds_qualifier qual)
 614{
 615        unsigned char *ds = context->ds;
 616
 617        cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
 618        cfg->top = (void *)ds_get(ds, qual, ds_index);
 619        cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
 620        cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
 621}
 622
 623static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 624                             void *base, size_t size, size_t ith,
 625                             unsigned int flags) {
 626        unsigned long buffer, adj;
 627
 628        /*
 629         * Adjust the buffer address and size to meet alignment
 630         * constraints:
 631         * - buffer is double-word aligned
 632         * - size is multiple of record size
 633         *
 634         * We checked the size at the very beginning; we have enough
 635         * space to do the adjustment.
 636         */
 637        buffer = (unsigned long)base;
 638
 639        adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
 640        buffer += adj;
 641        size   -= adj;
 642
 643        trace->n = size / ds_cfg.sizeof_rec[qual];
 644        trace->size = ds_cfg.sizeof_rec[qual];
 645
 646        size = (trace->n * trace->size);
 647
 648        trace->begin = (void *)buffer;
 649        trace->top = trace->begin;
 650        trace->end = (void *)(buffer + size);
 651        /*
 652         * The value for 'no threshold' is -1, which will set the
 653         * threshold outside of the buffer, just like we want it.
 654         */
 655        ith *= ds_cfg.sizeof_rec[qual];
 656        trace->ith = (void *)(buffer + size - ith);
 657
 658        trace->flags = flags;
 659}
 660
 661
 662static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 663                      enum ds_qualifier qual, struct task_struct *task,
 664                      int cpu, void *base, size_t size, size_t th)
 665{
 666        struct ds_context *context;
 667        int error;
 668        size_t req_size;
 669
 670        error = -EOPNOTSUPP;
 671        if (!ds_cfg.sizeof_rec[qual])
 672                goto out;
 673
 674        error = -EINVAL;
 675        if (!base)
 676                goto out;
 677
 678        req_size = ds_cfg.sizeof_rec[qual];
 679        /* We might need space for alignment adjustments. */
 680        if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
 681                req_size += DS_ALIGNMENT;
 682
 683        error = -EINVAL;
 684        if (size < req_size)
 685                goto out;
 686
 687        if (th != (size_t)-1) {
 688                th *= ds_cfg.sizeof_rec[qual];
 689
 690                error = -EINVAL;
 691                if (size <= th)
 692                        goto out;
 693        }
 694
 695        tracer->buffer = base;
 696        tracer->size = size;
 697
 698        error = -ENOMEM;
 699        context = ds_get_context(task, cpu);
 700        if (!context)
 701                goto out;
 702        tracer->context = context;
 703
 704        /*
 705         * Defer any tracer-specific initialization work for the context until
 706         * context ownership has been clarified.
 707         */
 708
 709        error = 0;
 710 out:
 711        return error;
 712}
 713
 714static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
 715                                         void *base, size_t size,
 716                                         bts_ovfl_callback_t ovfl, size_t th,
 717                                         unsigned int flags)
 718{
 719        struct bts_tracer *tracer;
 720        int error;
 721
 722        /* Buffer overflow notification is not yet implemented. */
 723        error = -EOPNOTSUPP;
 724        if (ovfl)
 725                goto out;
 726
 727        error = get_tracer(task);
 728        if (error < 0)
 729                goto out;
 730
 731        error = -ENOMEM;
 732        tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
 733        if (!tracer)
 734                goto out_put_tracer;
 735        tracer->ovfl = ovfl;
 736
 737        /* Do some more error checking and acquire a tracing context. */
 738        error = ds_request(&tracer->ds, &tracer->trace.ds,
 739                           ds_bts, task, cpu, base, size, th);
 740        if (error < 0)
 741                goto out_tracer;
 742
 743        /* Claim the bts part of the tracing context we acquired above. */
 744        spin_lock_irq(&ds_lock);
 745
 746        error = -EPERM;
 747        if (tracer->ds.context->bts_master)
 748                goto out_unlock;
 749        tracer->ds.context->bts_master = tracer;
 750
 751        spin_unlock_irq(&ds_lock);
 752
 753        /*
 754         * Now that we own the bts part of the context, let's complete the
 755         * initialization for that part.
 756         */
 757        ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
 758        ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
 759        ds_install_ds_area(tracer->ds.context);
 760
 761        tracer->trace.read  = bts_read;
 762        tracer->trace.write = bts_write;
 763
 764        /* Start tracing. */
 765        ds_resume_bts(tracer);
 766
 767        return tracer;
 768
 769 out_unlock:
 770        spin_unlock_irq(&ds_lock);
 771        ds_put_context(tracer->ds.context);
 772 out_tracer:
 773        kfree(tracer);
 774 out_put_tracer:
 775        put_tracer(task);
 776 out:
 777        return ERR_PTR(error);
 778}
 779
 780struct bts_tracer *ds_request_bts_task(struct task_struct *task,
 781                                       void *base, size_t size,
 782                                       bts_ovfl_callback_t ovfl,
 783                                       size_t th, unsigned int flags)
 784{
 785        return ds_request_bts(task, 0, base, size, ovfl, th, flags);
 786}
 787
 788struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
 789                                      bts_ovfl_callback_t ovfl,
 790                                      size_t th, unsigned int flags)
 791{
 792        return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
 793}
 794
 795static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
 796                                           void *base, size_t size,
 797                                           pebs_ovfl_callback_t ovfl, size_t th,
 798                                           unsigned int flags)
 799{
 800        struct pebs_tracer *tracer;
 801        int error;
 802
 803        /* Buffer overflow notification is not yet implemented. */
 804        error = -EOPNOTSUPP;
 805        if (ovfl)
 806                goto out;
 807
 808        error = get_tracer(task);
 809        if (error < 0)
 810                goto out;
 811
 812        error = -ENOMEM;
 813        tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
 814        if (!tracer)
 815                goto out_put_tracer;
 816        tracer->ovfl = ovfl;
 817
 818        /* Do some more error checking and acquire a tracing context. */
 819        error = ds_request(&tracer->ds, &tracer->trace.ds,
 820                           ds_pebs, task, cpu, base, size, th);
 821        if (error < 0)
 822                goto out_tracer;
 823
 824        /* Claim the pebs part of the tracing context we acquired above. */
 825        spin_lock_irq(&ds_lock);
 826
 827        error = -EPERM;
 828        if (tracer->ds.context->pebs_master)
 829                goto out_unlock;
 830        tracer->ds.context->pebs_master = tracer;
 831
 832        spin_unlock_irq(&ds_lock);
 833
 834        /*
 835         * Now that we own the pebs part of the context, let's complete the
 836         * initialization for that part.
 837         */
 838        ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
 839        ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
 840        ds_install_ds_area(tracer->ds.context);
 841
 842        /* Start tracing. */
 843        ds_resume_pebs(tracer);
 844
 845        return tracer;
 846
 847 out_unlock:
 848        spin_unlock_irq(&ds_lock);
 849        ds_put_context(tracer->ds.context);
 850 out_tracer:
 851        kfree(tracer);
 852 out_put_tracer:
 853        put_tracer(task);
 854 out:
 855        return ERR_PTR(error);
 856}
 857
 858struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
 859                                         void *base, size_t size,
 860                                         pebs_ovfl_callback_t ovfl,
 861                                         size_t th, unsigned int flags)
 862{
 863        return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
 864}
 865
 866struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
 867                                        pebs_ovfl_callback_t ovfl,
 868                                        size_t th, unsigned int flags)
 869{
 870        return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
 871}
 872
 873static void ds_free_bts(struct bts_tracer *tracer)
 874{
 875        struct task_struct *task;
 876
 877        task = tracer->ds.context->task;
 878
 879        WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
 880        tracer->ds.context->bts_master = NULL;
 881
 882        /* Make sure tracing stopped and the tracer is not in use. */
 883        if (task && (task != current))
 884                wait_task_context_switch(task);
 885
 886        ds_put_context(tracer->ds.context);
 887        put_tracer(task);
 888
 889        kfree(tracer);
 890}
 891
 892void ds_release_bts(struct bts_tracer *tracer)
 893{
 894        might_sleep();
 895
 896        if (!tracer)
 897                return;
 898
 899        ds_suspend_bts(tracer);
 900        ds_free_bts(tracer);
 901}
 902
 903int ds_release_bts_noirq(struct bts_tracer *tracer)
 904{
 905        struct task_struct *task;
 906        unsigned long irq;
 907        int error;
 908
 909        if (!tracer)
 910                return 0;
 911
 912        task = tracer->ds.context->task;
 913
 914        local_irq_save(irq);
 915
 916        error = -EPERM;
 917        if (!task &&
 918            (tracer->ds.context->cpu != smp_processor_id()))
 919                goto out;
 920
 921        error = -EPERM;
 922        if (task && (task != current))
 923                goto out;
 924
 925        ds_suspend_bts_noirq(tracer);
 926        ds_free_bts(tracer);
 927
 928        error = 0;
 929 out:
 930        local_irq_restore(irq);
 931        return error;
 932}
 933
 934static void update_task_debugctlmsr(struct task_struct *task,
 935                                    unsigned long debugctlmsr)
 936{
 937        task->thread.debugctlmsr = debugctlmsr;
 938
 939        get_cpu();
 940        if (task == current)
 941                update_debugctlmsr(debugctlmsr);
 942        put_cpu();
 943}
 944
 945void ds_suspend_bts(struct bts_tracer *tracer)
 946{
 947        struct task_struct *task;
 948        unsigned long debugctlmsr;
 949        int cpu;
 950
 951        if (!tracer)
 952                return;
 953
 954        tracer->flags = 0;
 955
 956        task = tracer->ds.context->task;
 957        cpu  = tracer->ds.context->cpu;
 958
 959        WARN_ON(!task && irqs_disabled());
 960
 961        debugctlmsr = (task ?
 962                       task->thread.debugctlmsr :
 963                       get_debugctlmsr_on_cpu(cpu));
 964        debugctlmsr &= ~BTS_CONTROL;
 965
 966        if (task)
 967                update_task_debugctlmsr(task, debugctlmsr);
 968        else
 969                update_debugctlmsr_on_cpu(cpu, debugctlmsr);
 970}
 971
 972int ds_suspend_bts_noirq(struct bts_tracer *tracer)
 973{
 974        struct task_struct *task;
 975        unsigned long debugctlmsr, irq;
 976        int cpu, error = 0;
 977
 978        if (!tracer)
 979                return 0;
 980
 981        tracer->flags = 0;
 982
 983        task = tracer->ds.context->task;
 984        cpu  = tracer->ds.context->cpu;
 985
 986        local_irq_save(irq);
 987
 988        error = -EPERM;
 989        if (!task && (cpu != smp_processor_id()))
 990                goto out;
 991
 992        debugctlmsr = (task ?
 993                       task->thread.debugctlmsr :
 994                       get_debugctlmsr());
 995        debugctlmsr &= ~BTS_CONTROL;
 996
 997        if (task)
 998                update_task_debugctlmsr(task, debugctlmsr);
 999        else
1000                update_debugctlmsr(debugctlmsr);
1001
1002        error = 0;
1003 out:
1004        local_irq_restore(irq);
1005        return error;
1006}
1007
1008static unsigned long ds_bts_control(struct bts_tracer *tracer)
1009{
1010        unsigned long control;
1011
1012        control = ds_cfg.ctl[dsf_bts];
1013        if (!(tracer->trace.ds.flags & BTS_KERNEL))
1014                control |= ds_cfg.ctl[dsf_bts_kernel];
1015        if (!(tracer->trace.ds.flags & BTS_USER))
1016                control |= ds_cfg.ctl[dsf_bts_user];
1017
1018        return control;
1019}
1020
1021void ds_resume_bts(struct bts_tracer *tracer)
1022{
1023        struct task_struct *task;
1024        unsigned long debugctlmsr;
1025        int cpu;
1026
1027        if (!tracer)
1028                return;
1029
1030        tracer->flags = tracer->trace.ds.flags;
1031
1032        task = tracer->ds.context->task;
1033        cpu  = tracer->ds.context->cpu;
1034
1035        WARN_ON(!task && irqs_disabled());
1036
1037        debugctlmsr = (task ?
1038                       task->thread.debugctlmsr :
1039                       get_debugctlmsr_on_cpu(cpu));
1040        debugctlmsr |= ds_bts_control(tracer);
1041
1042        if (task)
1043                update_task_debugctlmsr(task, debugctlmsr);
1044        else
1045                update_debugctlmsr_on_cpu(cpu, debugctlmsr);
1046}
1047
1048int ds_resume_bts_noirq(struct bts_tracer *tracer)
1049{
1050        struct task_struct *task;
1051        unsigned long debugctlmsr, irq;
1052        int cpu, error = 0;
1053
1054        if (!tracer)
1055                return 0;
1056
1057        tracer->flags = tracer->trace.ds.flags;
1058
1059        task = tracer->ds.context->task;
1060        cpu  = tracer->ds.context->cpu;
1061
1062        local_irq_save(irq);
1063
1064        error = -EPERM;
1065        if (!task && (cpu != smp_processor_id()))
1066                goto out;
1067
1068        debugctlmsr = (task ?
1069                       task->thread.debugctlmsr :
1070                       get_debugctlmsr());
1071        debugctlmsr |= ds_bts_control(tracer);
1072
1073        if (task)
1074                update_task_debugctlmsr(task, debugctlmsr);
1075        else
1076                update_debugctlmsr(debugctlmsr);
1077
1078        error = 0;
1079 out:
1080        local_irq_restore(irq);
1081        return error;
1082}
1083
1084static void ds_free_pebs(struct pebs_tracer *tracer)
1085{
1086        struct task_struct *task;
1087
1088        task = tracer->ds.context->task;
1089
1090        WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
1091        tracer->ds.context->pebs_master = NULL;
1092
1093        ds_put_context(tracer->ds.context);
1094        put_tracer(task);
1095
1096        kfree(tracer);
1097}
1098
/*
 * Release a PEBS tracer: suspend tracing, then free the tracer.
 *
 * Annotated with might_sleep(), which is evaluated even for a NULL
 * tracer. A NULL tracer is otherwise ignored.
 */
void ds_release_pebs(struct pebs_tracer *tracer)
{
	might_sleep();

	if (tracer) {
		ds_suspend_pebs(tracer);
		ds_free_pebs(tracer);
	}
}
1109
1110int ds_release_pebs_noirq(struct pebs_tracer *tracer)
1111{
1112        struct task_struct *task;
1113        unsigned long irq;
1114        int error;
1115
1116        if (!tracer)
1117                return 0;
1118
1119        task = tracer->ds.context->task;
1120
1121        local_irq_save(irq);
1122
1123        error = -EPERM;
1124        if (!task &&
1125            (tracer->ds.context->cpu != smp_processor_id()))
1126                goto out;
1127
1128        error = -EPERM;
1129        if (task && (task != current))
1130                goto out;
1131
1132        ds_suspend_pebs_noirq(tracer);
1133        ds_free_pebs(tracer);
1134
1135        error = 0;
1136 out:
1137        local_irq_restore(irq);
1138        return error;
1139}
1140
/*
 * Suspend PEBS tracing.
 *
 * Currently a stub: the tracer argument is not used and nothing is
 * changed.
 */
void ds_suspend_pebs(struct pebs_tracer *tracer)
{
	/* Nothing to do. */
}
1145
/*
 * Suspend PEBS tracing, irq-safe variant.
 *
 * Currently a stub: the tracer argument is not used and 0 is returned
 * unconditionally.
 */
int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
{
	/* Nothing to do. */
	return 0;
}
1150
/*
 * Resume PEBS tracing.
 *
 * Currently a stub: the tracer argument is not used and nothing is
 * changed.
 */
void ds_resume_pebs(struct pebs_tracer *tracer)
{
	/* Nothing to do. */
}
1155
/*
 * Resume PEBS tracing, irq-safe variant.
 *
 * Currently a stub: the tracer argument is not used and 0 is returned
 * unconditionally.
 */
int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
{
	/* Nothing to do. */
	return 0;
}
1160
1161const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
1162{
1163        if (!tracer)
1164                return NULL;
1165
1166        ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
1167        return &tracer->trace;
1168}
1169
1170const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
1171{
1172        if (!tracer)
1173                return NULL;
1174
1175        ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
1176
1177        tracer->trace.counters = ds_cfg.nr_counter_reset;
1178        memcpy(tracer->trace.counter_reset,
1179               tracer->ds.context->ds +
1180               (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
1181               ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
1182
1183        return &tracer->trace;
1184}
1185
1186int ds_reset_bts(struct bts_tracer *tracer)
1187{
1188        if (!tracer)
1189                return -EINVAL;
1190
1191        tracer->trace.ds.top = tracer->trace.ds.begin;
1192
1193        ds_set(tracer->ds.context->ds, ds_bts, ds_index,
1194               (unsigned long)tracer->trace.ds.top);
1195
1196        return 0;
1197}
1198
1199int ds_reset_pebs(struct pebs_tracer *tracer)
1200{
1201        if (!tracer)
1202                return -EINVAL;
1203
1204        tracer->trace.ds.top = tracer->trace.ds.begin;
1205
1206        ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
1207               (unsigned long)tracer->trace.ds.top);
1208
1209        return 0;
1210}
1211
1212int ds_set_pebs_reset(struct pebs_tracer *tracer,
1213                      unsigned int counter, u64 value)
1214{
1215        if (!tracer)
1216                return -EINVAL;
1217
1218        if (ds_cfg.nr_counter_reset < counter)
1219                return -EINVAL;
1220
1221        *(u64 *)(tracer->ds.context->ds +
1222                 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
1223                 (counter * PEBS_RESET_FIELD_SIZE)) = value;
1224
1225        return 0;
1226}
1227
1228static const struct ds_configuration ds_cfg_netburst = {
1229        .name = "Netburst",
1230        .ctl[dsf_bts]           = (1 << 2) | (1 << 3),
1231        .ctl[dsf_bts_kernel]    = (1 << 5),
1232        .ctl[dsf_bts_user]      = (1 << 6),
1233        .nr_counter_reset       = 1,
1234};
1235static const struct ds_configuration ds_cfg_pentium_m = {
1236        .name = "Pentium M",
1237        .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
1238        .nr_counter_reset       = 1,
1239};
1240static const struct ds_configuration ds_cfg_core2_atom = {
1241        .name = "Core 2/Atom",
1242        .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
1243        .ctl[dsf_bts_kernel]    = (1 << 9),
1244        .ctl[dsf_bts_user]      = (1 << 10),
1245        .nr_counter_reset       = 1,
1246};
1247static const struct ds_configuration ds_cfg_core_i7 = {
1248        .name = "Core i7",
1249        .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
1250        .ctl[dsf_bts_kernel]    = (1 << 9),
1251        .ctl[dsf_bts_user]      = (1 << 10),
1252        .nr_counter_reset       = 4,
1253};
1254
1255static void
1256ds_configure(const struct ds_configuration *cfg,
1257             struct cpuinfo_x86 *cpu)
1258{
1259        unsigned long nr_pebs_fields = 0;
1260
1261        printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
1262
1263#ifdef __i386__
1264        nr_pebs_fields = 10;
1265#else
1266        nr_pebs_fields = 18;
1267#endif
1268
1269        /*
1270         * Starting with version 2, architectural performance
1271         * monitoring supports a format specifier.
1272         */
1273        if ((cpuid_eax(0xa) & 0xff) > 1) {
1274                unsigned long perf_capabilities, format;
1275
1276                rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
1277
1278                format = (perf_capabilities >> 8) & 0xf;
1279
1280                switch (format) {
1281                case 0:
1282                        nr_pebs_fields = 18;
1283                        break;
1284                case 1:
1285                        nr_pebs_fields = 22;
1286                        break;
1287                default:
1288                        printk(KERN_INFO
1289                               "[ds] unknown PEBS format: %lu\n", format);
1290                        nr_pebs_fields = 0;
1291                        break;
1292                }
1293        }
1294
1295        memset(&ds_cfg, 0, sizeof(ds_cfg));
1296        ds_cfg = *cfg;
1297
1298        ds_cfg.sizeof_ptr_field =
1299                (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
1300
1301        ds_cfg.sizeof_rec[ds_bts]  = ds_cfg.sizeof_ptr_field * 3;
1302        ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
1303
1304        if (!cpu_has(cpu, X86_FEATURE_BTS)) {
1305                ds_cfg.sizeof_rec[ds_bts] = 0;
1306                printk(KERN_INFO "[ds] bts not available\n");
1307        }
1308        if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
1309                ds_cfg.sizeof_rec[ds_pebs] = 0;
1310                printk(KERN_INFO "[ds] pebs not available\n");
1311        }
1312
1313        printk(KERN_INFO "[ds] sizes: address: %u bit, ",
1314               8 * ds_cfg.sizeof_ptr_field);
1315        printk("bts/pebs record: %u/%u bytes\n",
1316               ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
1317
1318        WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
1319}
1320
1321void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
1322{
1323        /* Only configure the first cpu. Others are identical. */
1324        if (ds_cfg.name)
1325                return;
1326
1327        switch (c->x86) {
1328        case 0x6:
1329                switch (c->x86_model) {
1330                case 0x9:
1331                case 0xd: /* Pentium M */
1332                        ds_configure(&ds_cfg_pentium_m, c);
1333                        break;
1334                case 0xf:
1335                case 0x17: /* Core2 */
1336                case 0x1c: /* Atom */
1337                        ds_configure(&ds_cfg_core2_atom, c);
1338                        break;
1339                case 0x1a: /* Core i7 */
1340                        ds_configure(&ds_cfg_core_i7, c);
1341                        break;
1342                default:
1343                        /* Sorry, don't know about them. */
1344                        break;
1345                }
1346                break;
1347        case 0xf:
1348                switch (c->x86_model) {
1349                case 0x0:
1350                case 0x1:
1351                case 0x2: /* Netburst */
1352                        ds_configure(&ds_cfg_netburst, c);
1353                        break;
1354                default:
1355                        /* Sorry, don't know about them. */
1356                        break;
1357                }
1358                break;
1359        default:
1360                /* Sorry, don't know about them. */
1361                break;
1362        }
1363}
1364
1365static inline void ds_take_timestamp(struct ds_context *context,
1366                                     enum bts_qualifier qualifier,
1367                                     struct task_struct *task)
1368{
1369        struct bts_tracer *tracer = context->bts_master;
1370        struct bts_struct ts;
1371
1372        /* Prevent compilers from reading the tracer pointer twice. */
1373        barrier();
1374
1375        if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
1376                return;
1377
1378        memset(&ts, 0, sizeof(ts));
1379        ts.qualifier            = qualifier;
1380        ts.variant.event.clock  = trace_clock_global();
1381        ts.variant.event.pid    = task->pid;
1382
1383        bts_write(tracer, &ts);
1384}
1385
1386/*
1387 * Change the DS configuration from tracing prev to tracing next.
1388 */
1389void ds_switch_to(struct task_struct *prev, struct task_struct *next)
1390{
1391        struct ds_context *prev_ctx     = prev->thread.ds_ctx;
1392        struct ds_context *next_ctx     = next->thread.ds_ctx;
1393        unsigned long debugctlmsr       = next->thread.debugctlmsr;
1394
1395        /* Make sure all data is read before we start. */
1396        barrier();
1397
1398        if (prev_ctx) {
1399                update_debugctlmsr(0);
1400
1401                ds_take_timestamp(prev_ctx, bts_task_departs, prev);
1402        }
1403
1404        if (next_ctx) {
1405                ds_take_timestamp(next_ctx, bts_task_arrives, next);
1406
1407                wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1408        }
1409
1410        update_debugctlmsr(debugctlmsr);
1411}
1412
1413static __init int ds_selftest(void)
1414{
1415        if (ds_cfg.sizeof_rec[ds_bts]) {
1416                int error;
1417
1418                error = ds_selftest_bts();
1419                if (error) {
1420                        WARN(1, "[ds] selftest failed. disabling bts.\n");
1421                        ds_cfg.sizeof_rec[ds_bts] = 0;
1422                }
1423        }
1424
1425        if (ds_cfg.sizeof_rec[ds_pebs]) {
1426                int error;
1427
1428                error = ds_selftest_pebs();
1429                if (error) {
1430                        WARN(1, "[ds] selftest failed. disabling pebs.\n");
1431                        ds_cfg.sizeof_rec[ds_pebs] = 0;
1432                }
1433        }
1434
1435        return 0;
1436}
1437device_initcall(ds_selftest);
1438