linux/arch/x86/events/intel/bts.c
/*
 * BTS PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/coredump.h>

#include <asm-generic/sizes.h>
#include <asm/perf_event.h>

#include "../perf_event.h"

struct bts_ctx {
        struct perf_output_handle       handle;
        struct debug_store              ds_back;
        int                             state;
};

/* BTS context states: */
enum {
        /* no ongoing AUX transactions */
        BTS_STATE_STOPPED = 0,
        /* AUX transaction is on, BTS tracing is disabled */
        BTS_STATE_INACTIVE,
        /* AUX transaction is on, BTS tracing is running */
        BTS_STATE_ACTIVE,
};

static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);

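/*
 * One BTS record is three u64s (branch-from, branch-to, flags), i.e. 24
 * bytes in the 64-bit DS format. The safety margin keeps the PMI threshold
 * far enough below the absolute maximum that records landing between the
 * threshold interrupt firing and tracing actually stopping still fit.
 */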
#define BTS_RECORD_SIZE         24
#define BTS_SAFETY_MARGIN       4080

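/*
 * A physically contiguous chunk of the AUX buffer:
 * @page:         first page of a (possibly high-order) allocation
 * @size:         usable bytes, trimmed down to whole BTS records
 * @offset:       byte offset of this chunk within the AUX buffer
 * @displacement: bytes skipped at the start to keep records aligned
 */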
struct bts_phys {
        struct page     *page;
        unsigned long   size;
        unsigned long   offset;
        unsigned long   displacement;
};

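/*
 * Per-event view of the AUX buffer:
 * @real_size:  usable size, a multiple of BTS_RECORD_SIZE
 * @nr_pages:   total number of AUX pages
 * @nr_bufs:    number of physically contiguous chunks in @buf
 * @cur_buf:    chunk currently programmed into the DS area
 * @snapshot:   true in overwrite ("snapshot") mode
 * @data_size:  bytes written since the last perf_aux_output_end()
 * @head:       current logical write offset within the AUX buffer
 * @end:        end of the region the hardware may write to
 */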
struct bts_buffer {
        size_t          real_size;      /* multiple of BTS_RECORD_SIZE */
        unsigned int    nr_pages;
        unsigned int    nr_bufs;
        unsigned int    cur_buf;
        bool            snapshot;
        local_t         data_size;
        local_t         head;
        unsigned long   end;
        void            **data_pages;
        struct bts_phys buf[];
};

static struct pmu bts_pmu;

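/*
 * High-order AUX allocations record their page order in page_private();
 * plain order-0 pages don't have the Private flag set.
 */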
static int buf_nr_pages(struct page *page)
{
        if (!PagePrivate(page))
                return 1;

        return 1 << page_private(page);
}

static size_t buf_size(struct page *page)
{
        return buf_nr_pages(page) * PAGE_SIZE;
}

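/*
 * Build the driver-private AUX state for @nr_pages ring-buffer pages:
 * group them into physically contiguous chunks and trim each chunk to
 * a whole number of BTS records.
 */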
static void *
bts_buffer_setup_aux(struct perf_event *event, void **pages,
                     int nr_pages, bool overwrite)
{
        struct bts_buffer *buf;
        struct page *page;
        int cpu = event->cpu;
        int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
        unsigned long offset;
        size_t size = nr_pages << PAGE_SHIFT;
        int pg, nbuf, pad;

        /* count all the high-order buffers */
        for (pg = 0, nbuf = 0; pg < nr_pages;) {
                page = virt_to_page(pages[pg]);
                pg += buf_nr_pages(page);
                nbuf++;
        }

        /*
         * to avoid interrupts in overwrite mode, only allow one
         * physical buffer
         */
        if (overwrite && nbuf > 1)
                return NULL;

        buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
        if (!buf)
                return NULL;

        buf->nr_pages = nr_pages;
        buf->nr_bufs = nbuf;
        buf->snapshot = overwrite;
        buf->data_pages = pages;
        buf->real_size = size - size % BTS_RECORD_SIZE;

        for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
                unsigned int __nr_pages;

                page = virt_to_page(pages[pg]);
                __nr_pages = buf_nr_pages(page);
                buf->buf[nbuf].page = page;
                buf->buf[nbuf].offset = offset;
                buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
                buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
                pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
                buf->buf[nbuf].size -= pad;

                pg += __nr_pages;
                offset += __nr_pages << PAGE_SHIFT;
        }

        return buf;
}

static void bts_buffer_free_aux(void *data)
{
        kfree(data);
}

static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
{
        return buf->buf[idx].offset + buf->buf[idx].displacement;
}

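/*
 * Program the DS area for the current chunk: buffer base, write index,
 * absolute maximum and PMI threshold. In snapshot mode the threshold is
 * placed past the absolute maximum so that no PMI is ever raised.
 */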
static void
bts_config_buffer(struct bts_buffer *buf)
{
        int cpu = raw_smp_processor_id();
        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
        struct bts_phys *phys = &buf->buf[buf->cur_buf];
        unsigned long index, thresh = 0, end = phys->size;
        struct page *page = phys->page;

        index = local_read(&buf->head);

        if (!buf->snapshot) {
                if (buf->end < phys->offset + buf_size(page))
                        end = buf->end - phys->offset - phys->displacement;

                index -= phys->offset + phys->displacement;

                if (end - index > BTS_SAFETY_MARGIN)
                        thresh = end - BTS_SAFETY_MARGIN;
                else if (end - index > BTS_RECORD_SIZE)
                        thresh = end - BTS_RECORD_SIZE;
                else
                        thresh = end;
        }

        ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
        ds->bts_index = ds->bts_buffer_base + index;
        ds->bts_absolute_maximum = ds->bts_buffer_base + end;
        ds->bts_interrupt_threshold = !buf->snapshot
                ? ds->bts_buffer_base + thresh
                : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
}

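/*
 * Zero the unused tail of the current chunk so that stale bytes are
 * never handed to userspace when we skip ahead to the next chunk.
 */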
static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
{
        unsigned long index = head - phys->offset;

        memset(page_address(phys->page) + index, 0, phys->size - index);
}

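/*
 * Fold the hardware write pointer from the DS area into buf::head and
 * account the newly written bytes in buf::data_size.
 */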
static void bts_update(struct bts_ctx *bts)
{
        int cpu = raw_smp_processor_id();
        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
        struct bts_buffer *buf = perf_get_aux(&bts->handle);
        unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;

        if (!buf)
                return;

        head = index + bts_buffer_offset(buf, buf->cur_buf);
        old = local_xchg(&buf->head, head);

        if (!buf->snapshot) {
                if (old == head)
                        return;

                if (ds->bts_index >= ds->bts_absolute_maximum)
                        perf_aux_output_flag(&bts->handle,
                                             PERF_AUX_FLAG_TRUNCATED);

                /*
                 * old and head are always in the same physical buffer, so we
                 * can subtract them to get the data size.
                 */
                local_add(head - old, &buf->data_size);
        } else {
                local_set(&buf->data_size, head);
        }
}

static int
bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle);

/*
 * Ordering PMU callbacks wrt themselves and the PMI is done by means
 * of bts::state, which:
 *  - is set when bts::handle::event is valid, that is, between
 *    perf_aux_output_begin() and perf_aux_output_end();
 *  - is zero otherwise;
 *  - is ordered against bts::handle::event with a compiler barrier.
 */

static void __bts_event_start(struct perf_event *event)
{
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        struct bts_buffer *buf = perf_get_aux(&bts->handle);
        u64 config = 0;

        if (!buf->snapshot)
                config |= ARCH_PERFMON_EVENTSEL_INT;
        if (!event->attr.exclude_kernel)
                config |= ARCH_PERFMON_EVENTSEL_OS;
        if (!event->attr.exclude_user)
                config |= ARCH_PERFMON_EVENTSEL_USR;

        bts_config_buffer(buf);

        /*
         * local barrier to make sure that ds configuration made it
         * before we enable BTS and bts::state goes ACTIVE
         */
        wmb();

        /* INACTIVE/STOPPED -> ACTIVE */
        WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);

        intel_pmu_enable_bts(config);
}

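/*
 * ->start() callback: claim the AUX buffer, save the DS fields we are
 * about to clobber, and enable the hardware.
 */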
static void bts_event_start(struct perf_event *event, int flags)
{
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        struct bts_buffer *buf;

        buf = perf_aux_output_begin(&bts->handle, event);
        if (!buf)
                goto fail_stop;

        if (bts_buffer_reset(buf, &bts->handle))
                goto fail_end_stop;

        bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
        bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
        bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;

        perf_event_itrace_started(event);
        event->hw.state = 0;

        __bts_event_start(event);

        return;

fail_end_stop:
        perf_aux_output_end(&bts->handle, 0);

fail_stop:
        event->hw.state = PERF_HES_STOPPED;
}

static void __bts_event_stop(struct perf_event *event, int state)
{
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

        /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
        WRITE_ONCE(bts->state, state);

        /*
         * No extra synchronization is mandated by the documentation to have
         * BTS data stores globally visible.
         */
        intel_pmu_disable_bts();
}

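/*
 * ->stop() callback: disable the hardware and, for PERF_EF_UPDATE,
 * flush the collected data and restore the saved DS configuration.
 */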
static void bts_event_stop(struct perf_event *event, int flags)
{
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        struct bts_buffer *buf = NULL;
        int state = READ_ONCE(bts->state);

        if (state == BTS_STATE_ACTIVE)
                __bts_event_stop(event, BTS_STATE_STOPPED);

        if (state != BTS_STATE_STOPPED)
                buf = perf_get_aux(&bts->handle);

        event->hw.state |= PERF_HES_STOPPED;

        if (flags & PERF_EF_UPDATE) {
                bts_update(bts);

                if (buf) {
                        if (buf->snapshot)
                                bts->handle.head =
                                        local_xchg(&buf->data_size,
                                                   buf->nr_pages << PAGE_SHIFT);
                        perf_aux_output_end(&bts->handle,
                                            local_xchg(&buf->data_size, 0));
                }

                cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
                cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
                cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
                cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
        }
}

void intel_bts_enable_local(void)
{
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        int state = READ_ONCE(bts->state);

        /*
         * Here we transition from INACTIVE to ACTIVE;
         * if we instead are STOPPED from the interrupt handler,
         * stay that way. Can't be ACTIVE here though.
         */
        if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
                return;

        if (state == BTS_STATE_STOPPED)
                return;

        if (bts->handle.event)
                __bts_event_start(bts->handle.event);
}

void intel_bts_disable_local(void)
{
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

        /*
         * Here we transition from ACTIVE to INACTIVE;
         * do nothing for STOPPED or INACTIVE.
         */
        if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
                return;

        if (bts->handle.event)
                __bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
}

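/*
 * Carve out the next region of the AUX buffer for the hardware to write
 * into: stay in the current chunk while it has room, otherwise pad it
 * out, skip the inter-chunk gap and move on to the next chunk.
 */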
static int
bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
{
        unsigned long head, space, next_space, pad, gap, skip, wakeup;
        unsigned int next_buf;
        struct bts_phys *phys, *next_phys;
        int ret;

        if (buf->snapshot)
                return 0;

        head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);

        phys = &buf->buf[buf->cur_buf];
        space = phys->offset + phys->displacement + phys->size - head;
        pad = space;
        if (space > handle->size) {
                space = handle->size;
                space -= space % BTS_RECORD_SIZE;
        }
        if (space <= BTS_SAFETY_MARGIN) {
                /* See if the next phys buffer has more space */
                next_buf = buf->cur_buf + 1;
                if (next_buf >= buf->nr_bufs)
                        next_buf = 0;
                next_phys = &buf->buf[next_buf];
                gap = buf_size(phys->page) - phys->displacement - phys->size +
                      next_phys->displacement;
                skip = pad + gap;
                if (handle->size >= skip) {
                        next_space = next_phys->size;
                        if (next_space + skip > handle->size) {
                                next_space = handle->size - skip;
                                next_space -= next_space % BTS_RECORD_SIZE;
                        }
                        if (next_space > space || !space) {
                                if (pad)
                                        bts_buffer_pad_out(phys, head);
                                ret = perf_aux_output_skip(handle, skip);
                                if (ret)
                                        return ret;
                                /* Advance to the next phys buffer */
                                phys = next_phys;
                                space = next_space;
                                head = phys->offset + phys->displacement;
                                /*
                                 * After this, cur_buf and head won't match ds
                                 * anymore, so we must not be racing with
                                 * bts_update().
                                 */
                                buf->cur_buf = next_buf;
                                local_set(&buf->head, head);
                        }
                }
        }

        /* Don't go far beyond the wakeup watermark */
        wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
                 handle->head;
        if (space > wakeup) {
                space = wakeup;
                space -= space % BTS_RECORD_SIZE;
        }

        buf->end = head + space;

        /*
         * If we have no space, the lost notification would have been sent when
         * we hit absolute_maximum - see bts_update().
         */
        if (!space)
                return -ENOSPC;

        return 0;
}

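/*
 * PMI handler: flush what the hardware has written so far and restart
 * the AUX transaction. Returns nonzero if the PMI was (or could have
 * been) ours.
 */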
int intel_bts_interrupt(void)
{
        struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        struct perf_event *event = bts->handle.event;
        struct bts_buffer *buf;
        s64 old_head;
        int err = -ENOSPC, handled = 0;

        /*
         * The only surefire way of knowing if this NMI is ours is by checking
         * the write ptr against the PMI threshold.
         */
        if (ds && (ds->bts_index >= ds->bts_interrupt_threshold))
                handled = 1;

        /*
         * this is wrapped in intel_bts_enable_local/intel_bts_disable_local,
         * so we can only be INACTIVE or STOPPED
         */
        if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
                return handled;

        buf = perf_get_aux(&bts->handle);
        if (!buf)
                return handled;

        /*
         * Skip snapshot counters: they don't use the interrupt, but
         * there's no other way of telling, because the pointer will
         * keep moving.
         */
        if (buf->snapshot)
                return 0;

        old_head = local_read(&buf->head);
        bts_update(bts);

        /* no new data */
        if (old_head == local_read(&buf->head))
                return handled;

        perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0));

        buf = perf_aux_output_begin(&bts->handle, event);
        if (buf)
                err = bts_buffer_reset(buf, &bts->handle);

        if (err) {
                WRITE_ONCE(bts->state, BTS_STATE_STOPPED);

                if (buf) {
                        /*
                         * BTS_STATE_STOPPED should be visible before
                         * handle::event is cleared
                         */
                        barrier();
                        perf_aux_output_end(&bts->handle, 0);
                }
        }

        return 1;
}

static void bts_event_del(struct perf_event *event, int mode)
{
        bts_event_stop(event, PERF_EF_UPDATE);
}

static int bts_event_add(struct perf_event *event, int mode)
{
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct hw_perf_event *hwc = &event->hw;

        event->hw.state = PERF_HES_STOPPED;

        if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
                return -EBUSY;

        if (bts->handle.event)
                return -EBUSY;

        if (mode & PERF_EF_START) {
                bts_event_start(event, 0);
                if (hwc->state & PERF_HES_STOPPED)
                        return -EINVAL;
        }

        return 0;
}

static void bts_event_destroy(struct perf_event *event)
{
        x86_release_hardware();
        x86_del_exclusive(x86_lbr_exclusive_bts);
}

static int bts_event_init(struct perf_event *event)
{
        int ret;

        if (event->attr.type != bts_pmu.type)
                return -ENOENT;

        /*
         * BTS leaks kernel addresses even when CPL0 tracing is
         * disabled, so disallow intel_bts driver for unprivileged
         * users on paranoid systems since it provides trace data
         * to the user in a zero-copy fashion.
         *
         * Note that the default paranoia setting permits unprivileged
         * users to profile the kernel.
         */
        if (event->attr.exclude_kernel) {
                ret = perf_allow_kernel(&event->attr);
                if (ret)
                        return ret;
        }

        if (x86_add_exclusive(x86_lbr_exclusive_bts))
                return -EBUSY;

        ret = x86_reserve_hardware();
        if (ret) {
                x86_del_exclusive(x86_lbr_exclusive_bts);
                return ret;
        }

        event->destroy = bts_event_destroy;

        return 0;
}

static void bts_event_read(struct perf_event *event)
{
}

static __init int bts_init(void)
{
        if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
                return -ENODEV;

        if (boot_cpu_has(X86_FEATURE_PTI)) {
                /*
                 * BTS hardware writes through a virtual memory map; we must
                 * either use the kernel physical map or the user mapping of
                 * the AUX buffer.
                 *
                 * However, since this driver supports per-CPU and per-task
                 * inherit, we cannot use the user mapping since it will not
                 * be available if we're not running the owning process.
                 *
                 * With PTI we can't use the kernel map either, because it's
                 * not there when we run userspace.
                 *
                 * For now, disable this driver when using PTI.
                 */
                return -ENODEV;
        }

        bts_pmu.capabilities    = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
                                  PERF_PMU_CAP_EXCLUSIVE;
        bts_pmu.task_ctx_nr     = perf_sw_context;
        bts_pmu.event_init      = bts_event_init;
        bts_pmu.add             = bts_event_add;
        bts_pmu.del             = bts_event_del;
        bts_pmu.start           = bts_event_start;
        bts_pmu.stop            = bts_event_stop;
        bts_pmu.read            = bts_event_read;
        bts_pmu.setup_aux       = bts_buffer_setup_aux;
        bts_pmu.free_aux        = bts_buffer_free_aux;

        return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
arch_initcall(bts_init);