linux/arch/x86/events/intel/bts.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * BTS PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/coredump.h>

#include <linux/sizes.h>
#include <asm/perf_event.h>

#include "../perf_event.h"

struct bts_ctx {
	struct perf_output_handle	handle;
	struct debug_store		ds_back;
	int				state;
};

/* BTS context states: */
enum {
	/* no ongoing AUX transactions */
	BTS_STATE_STOPPED = 0,
	/* AUX transaction is on, BTS tracing is disabled */
	BTS_STATE_INACTIVE,
	/* AUX transaction is on, BTS tracing is running */
	BTS_STATE_ACTIVE,
};

static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);

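/*
 * Each BTS record is three 64-bit fields (branch-from, branch-to, flags),
 * i.e. 24 bytes in the 64-bit DS format. The safety margin (170 records'
 * worth of space) is the headroom kept between the PMI threshold and the
 * absolute maximum, so tracing can be stopped before the hardware runs
 * past the end of the usable area.
 */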
#define BTS_RECORD_SIZE		24
#define BTS_SAFETY_MARGIN	4080

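/*
 * A bts_phys describes one physically contiguous chunk of the AUX buffer;
 * bts_buffer is the per-AUX-buffer driver state built in setup_aux().
 */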
struct bts_phys {
	struct page	*page;
	unsigned long	size;
	unsigned long	offset;
	unsigned long	displacement;
};

struct bts_buffer {
	size_t		real_size;	/* multiple of BTS_RECORD_SIZE */
	unsigned int	nr_pages;
	unsigned int	nr_bufs;
	unsigned int	cur_buf;
	bool		snapshot;
	local_t		data_size;
	local_t		head;
	unsigned long	end;
	void		**data_pages;
	struct bts_phys	buf[0];
};

static struct pmu bts_pmu;

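/*
 * High-order AUX allocations record their page order in page_private() of
 * the head page, so buf_nr_pages()/buf_size() recover how many pages and
 * bytes each physically contiguous chunk spans.
 */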
static int buf_nr_pages(struct page *page)
{
	if (!PagePrivate(page))
		return 1;

	return 1 << page_private(page);
}

static size_t buf_size(struct page *page)
{
	return buf_nr_pages(page) * PAGE_SIZE;
}

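/*
 * Group the AUX pages into physically contiguous chunks and record each
 * chunk's offset and displacement so that every chunk holds a whole number
 * of BTS records. Overwrite (snapshot) mode is only allowed with a single
 * chunk, since no PMI is taken to switch between chunks.
 */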
static void *
bts_buffer_setup_aux(struct perf_event *event, void **pages,
		     int nr_pages, bool overwrite)
{
	struct bts_buffer *buf;
	struct page *page;
	int cpu = event->cpu;
	int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	unsigned long offset;
	size_t size = nr_pages << PAGE_SHIFT;
	int pg, nbuf, pad;

	/* count all the high order buffers */
	for (pg = 0, nbuf = 0; pg < nr_pages;) {
		page = virt_to_page(pages[pg]);
		pg += buf_nr_pages(page);
		nbuf++;
	}

	/*
	 * to avoid interrupts in overwrite mode, only allow one physical buffer
	 */
	if (overwrite && nbuf > 1)
		return NULL;

	buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
	if (!buf)
		return NULL;

	buf->nr_pages = nr_pages;
	buf->nr_bufs = nbuf;
	buf->snapshot = overwrite;
	buf->data_pages = pages;
	buf->real_size = size - size % BTS_RECORD_SIZE;

	for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
		unsigned int __nr_pages;

		page = virt_to_page(pages[pg]);
		__nr_pages = buf_nr_pages(page);
		buf->buf[nbuf].page = page;
		buf->buf[nbuf].offset = offset;
		buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
		buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
		pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
		buf->buf[nbuf].size -= pad;

		pg += __nr_pages;
		offset += __nr_pages << PAGE_SHIFT;
	}

	return buf;
}

static void bts_buffer_free_aux(void *data)
{
	kfree(data);
}

static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
{
	return buf->buf[idx].offset + buf->buf[idx].displacement;
}

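/*
 * Program the DS area (buffer base, index, absolute maximum and interrupt
 * threshold) for the chunk currently being written. In snapshot mode the
 * threshold is placed beyond the absolute maximum so that no PMI is raised.
 */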
static void
bts_config_buffer(struct bts_buffer *buf)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_phys *phys = &buf->buf[buf->cur_buf];
	unsigned long index, thresh = 0, end = phys->size;
	struct page *page = phys->page;

	index = local_read(&buf->head);

	if (!buf->snapshot) {
		if (buf->end < phys->offset + buf_size(page))
			end = buf->end - phys->offset - phys->displacement;

		index -= phys->offset + phys->displacement;

		if (end - index > BTS_SAFETY_MARGIN)
			thresh = end - BTS_SAFETY_MARGIN;
		else if (end - index > BTS_RECORD_SIZE)
			thresh = end - BTS_RECORD_SIZE;
		else
			thresh = end;
	}

	ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
	ds->bts_index = ds->bts_buffer_base + index;
	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
	ds->bts_interrupt_threshold = !buf->snapshot
		? ds->bts_buffer_base + thresh
		: ds->bts_absolute_maximum + BTS_RECORD_SIZE;
}

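/* Zero out the unused tail of the current chunk so it reads as padding. */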
static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
{
	unsigned long index = head - phys->offset;

	memset(page_address(phys->page) + index, 0, phys->size - index);
}

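/*
 * Fold the hardware write pointer (ds->bts_index) back into buf->head and
 * account the newly written bytes in buf->data_size; flag the handle as
 * TRUNCATED if the write pointer hit the absolute maximum.
 */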
static void bts_update(struct bts_ctx *bts)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_buffer *buf = perf_get_aux(&bts->handle);
	unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;

	if (!buf)
		return;

	head = index + bts_buffer_offset(buf, buf->cur_buf);
	old = local_xchg(&buf->head, head);

	if (!buf->snapshot) {
		if (old == head)
			return;

		if (ds->bts_index >= ds->bts_absolute_maximum)
			perf_aux_output_flag(&bts->handle,
					     PERF_AUX_FLAG_TRUNCATED);

		/*
		 * old and head are always in the same physical buffer, so we
		 * can subtract them to get the data size.
		 */
		local_add(head - old, &buf->data_size);
	} else {
		local_set(&buf->data_size, head);
	}
}

static int
bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle);

/*
 * Ordering PMU callbacks wrt themselves and the PMI is done by means
 * of bts::state, which:
 *  - is set when bts::handle::event is valid, that is, between
 *    perf_aux_output_begin() and perf_aux_output_end();
 *  - is zero otherwise;
 *  - is ordered against bts::handle::event with a compiler barrier.
 */

static void __bts_event_start(struct perf_event *event)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct bts_buffer *buf = perf_get_aux(&bts->handle);
	u64 config = 0;

	if (!buf->snapshot)
		config |= ARCH_PERFMON_EVENTSEL_INT;
	if (!event->attr.exclude_kernel)
		config |= ARCH_PERFMON_EVENTSEL_OS;
	if (!event->attr.exclude_user)
		config |= ARCH_PERFMON_EVENTSEL_USR;

	bts_config_buffer(buf);

	/*
	 * local barrier to make sure that ds configuration made it
	 * before we enable BTS and bts::state goes ACTIVE
	 */
	wmb();

	/* INACTIVE/STOPPED -> ACTIVE */
	WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);

	intel_pmu_enable_bts(config);
}

static void bts_event_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct bts_buffer *buf;

	buf = perf_aux_output_begin(&bts->handle, event);
	if (!buf)
		goto fail_stop;

	if (bts_buffer_reset(buf, &bts->handle))
		goto fail_end_stop;

	bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;

	perf_event_itrace_started(event);
	event->hw.state = 0;

	__bts_event_start(event);

	return;

fail_end_stop:
	perf_aux_output_end(&bts->handle, 0);

fail_stop:
	event->hw.state = PERF_HES_STOPPED;
}

static void __bts_event_stop(struct perf_event *event, int state)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

	/* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
	WRITE_ONCE(bts->state, state);

	/*
	 * No extra synchronization is mandated by the documentation to have
	 * BTS data stores globally visible.
	 */
	intel_pmu_disable_bts();
}

static void bts_event_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct bts_buffer *buf = NULL;
	int state = READ_ONCE(bts->state);

	if (state == BTS_STATE_ACTIVE)
		__bts_event_stop(event, BTS_STATE_STOPPED);

	if (state != BTS_STATE_STOPPED)
		buf = perf_get_aux(&bts->handle);

	event->hw.state |= PERF_HES_STOPPED;

	if (flags & PERF_EF_UPDATE) {
		bts_update(bts);

		if (buf) {
			if (buf->snapshot)
				bts->handle.head =
					local_xchg(&buf->data_size,
						   buf->nr_pages << PAGE_SHIFT);
			perf_aux_output_end(&bts->handle,
					    local_xchg(&buf->data_size, 0));
		}

		cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
		cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
	}
}

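/*
 * intel_bts_enable_local()/intel_bts_disable_local() are called from the
 * core PMU enable/disable paths (e.g. around the PMI) to pause and resume
 * BTS tracing without ending the current AUX transaction.
 */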
void intel_bts_enable_local(void)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	int state = READ_ONCE(bts->state);

	/*
	 * Here we transition from INACTIVE to ACTIVE;
	 * if we instead are STOPPED from the interrupt handler,
	 * stay that way. Can't be ACTIVE here though.
	 */
	if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
		return;

	if (state == BTS_STATE_STOPPED)
		return;

	if (bts->handle.event)
		__bts_event_start(bts->handle.event);
}

void intel_bts_disable_local(void)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

	/*
	 * Here we transition from ACTIVE to INACTIVE;
	 * do nothing for STOPPED or INACTIVE.
	 */
	if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
		return;

	if (bts->handle.event)
		__bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
}

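/*
 * Work out how much of the current chunk can be written starting at
 * handle->head: clamp to the space left in the AUX handle and to the
 * wakeup watermark, and, if the current chunk is nearly full, pad it out
 * and skip ahead to the next chunk. Sets buf->end for bts_config_buffer().
 */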
static int
bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
{
	unsigned long head, space, next_space, pad, gap, skip, wakeup;
	unsigned int next_buf;
	struct bts_phys *phys, *next_phys;
	int ret;

	if (buf->snapshot)
		return 0;

	head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);

	phys = &buf->buf[buf->cur_buf];
	space = phys->offset + phys->displacement + phys->size - head;
	pad = space;
	if (space > handle->size) {
		space = handle->size;
		space -= space % BTS_RECORD_SIZE;
	}
	if (space <= BTS_SAFETY_MARGIN) {
		/* See if next phys buffer has more space */
		next_buf = buf->cur_buf + 1;
		if (next_buf >= buf->nr_bufs)
			next_buf = 0;
		next_phys = &buf->buf[next_buf];
		gap = buf_size(phys->page) - phys->displacement - phys->size +
		      next_phys->displacement;
		skip = pad + gap;
		if (handle->size >= skip) {
			next_space = next_phys->size;
			if (next_space + skip > handle->size) {
				next_space = handle->size - skip;
				next_space -= next_space % BTS_RECORD_SIZE;
			}
			if (next_space > space || !space) {
				if (pad)
					bts_buffer_pad_out(phys, head);
				ret = perf_aux_output_skip(handle, skip);
				if (ret)
					return ret;
				/* Advance to next phys buffer */
				phys = next_phys;
				space = next_space;
				head = phys->offset + phys->displacement;
				/*
				 * After this, cur_buf and head won't match ds
				 * anymore, so we must not be racing with
				 * bts_update().
				 */
				buf->cur_buf = next_buf;
				local_set(&buf->head, head);
			}
		}
	}

	/* Don't go far beyond wakeup watermark */
	wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
		 handle->head;
	if (space > wakeup) {
		space = wakeup;
		space -= space % BTS_RECORD_SIZE;
	}

	buf->end = head + space;

	/*
	 * If we have no space, the lost notification would have been sent when
	 * we hit absolute_maximum - see bts_update()
	 */
	if (!space)
		return -ENOSPC;

	return 0;
}

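/*
 * PMI handler: claim the NMI if the BTS write pointer crossed the interrupt
 * threshold, publish the data written so far with perf_aux_output_end() and
 * start a new AUX transaction, or latch STOPPED if no space remains.
 */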
int intel_bts_interrupt(void)
{
	struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct perf_event *event = bts->handle.event;
	struct bts_buffer *buf;
	s64 old_head;
	int err = -ENOSPC, handled = 0;

	/*
	 * The only surefire way of knowing if this NMI is ours is by checking
	 * the write ptr against the PMI threshold.
	 */
	if (ds && (ds->bts_index >= ds->bts_interrupt_threshold))
		handled = 1;

	/*
	 * this is wrapped in intel_bts_enable_local/intel_bts_disable_local,
	 * so we can only be INACTIVE or STOPPED
	 */
	if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
		return handled;

	buf = perf_get_aux(&bts->handle);
	if (!buf)
		return handled;

	/*
	 * Skip snapshot counters: they don't use the interrupt, but
	 * there's no other way of telling, because the pointer will
	 * keep moving
	 */
	if (buf->snapshot)
		return 0;

	old_head = local_read(&buf->head);
	bts_update(bts);

	/* no new data */
	if (old_head == local_read(&buf->head))
		return handled;

	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0));

	buf = perf_aux_output_begin(&bts->handle, event);
	if (buf)
		err = bts_buffer_reset(buf, &bts->handle);

	if (err) {
		WRITE_ONCE(bts->state, BTS_STATE_STOPPED);

		if (buf) {
			/*
			 * BTS_STATE_STOPPED should be visible before
			 * cleared handle::event
			 */
			barrier();
			perf_aux_output_end(&bts->handle, 0);
		}
	}

	return 1;
}

static void bts_event_del(struct perf_event *event, int mode)
{
	bts_event_stop(event, PERF_EF_UPDATE);
}

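/*
 * Refuse to add the event if the fixed BTS counter is already active or
 * another event owns the per-CPU AUX handle; otherwise optionally start it.
 */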
static int bts_event_add(struct perf_event *event, int mode)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	event->hw.state = PERF_HES_STOPPED;

	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
		return -EBUSY;

	if (bts->handle.event)
		return -EBUSY;

	if (mode & PERF_EF_START) {
		bts_event_start(event, 0);
		if (hwc->state & PERF_HES_STOPPED)
			return -EINVAL;
	}

	return 0;
}

static void bts_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	x86_del_exclusive(x86_lbr_exclusive_bts);
}

static int bts_event_init(struct perf_event *event)
{
	int ret;

	if (event->attr.type != bts_pmu.type)
		return -ENOENT;

	/*
	 * BTS leaks kernel addresses even when CPL0 tracing is
	 * disabled, so disallow intel_bts driver for unprivileged
	 * users on paranoid systems since it provides trace data
	 * to the user in a zero-copy fashion.
	 *
	 * Note that the default paranoia setting permits unprivileged
	 * users to profile the kernel.
	 */
	if (event->attr.exclude_kernel) {
		ret = perf_allow_kernel(&event->attr);
		if (ret)
			return ret;
	}

	if (x86_add_exclusive(x86_lbr_exclusive_bts))
		return -EBUSY;

	ret = x86_reserve_hardware();
	if (ret) {
		x86_del_exclusive(x86_lbr_exclusive_bts);
		return ret;
	}

	event->destroy = bts_event_destroy;

	return 0;
}

static void bts_event_read(struct perf_event *event)
{
}

static __init int bts_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
		return -ENODEV;

	if (boot_cpu_has(X86_FEATURE_PTI)) {
		/*
		 * BTS hardware writes through a virtual memory map; we must
		 * either use the kernel physical map, or the user mapping of
		 * the AUX buffer.
		 *
		 * However, since this driver supports per-CPU and per-task inherit
		 * we cannot use the user mapping since it will not be available
		 * if we're not running the owning process.
		 *
		 * With PTI we can't use the kernel map either, because it's not
		 * there when we run userspace.
		 *
		 * For now, disable this driver when using PTI.
		 */
		return -ENODEV;
	}

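	/*
	 * PERF_PMU_CAP_AUX_NO_SG: the AUX buffer is handed to us as
	 * high-order, physically contiguous chunks, since BTS writes
	 * linearly through the kernel mapping of one chunk at a time.
	 */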
	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
				  PERF_PMU_CAP_EXCLUSIVE;
	bts_pmu.task_ctx_nr	= perf_sw_context;
	bts_pmu.event_init	= bts_event_init;
	bts_pmu.add		= bts_event_add;
	bts_pmu.del		= bts_event_del;
	bts_pmu.start		= bts_event_start;
	bts_pmu.stop		= bts_event_stop;
	bts_pmu.read		= bts_event_read;
	bts_pmu.setup_aux	= bts_buffer_setup_aux;
	bts_pmu.free_aux	= bts_buffer_free_aux;

	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
arch_initcall(bts_init);