linux/arch/x86/events/intel/pt.c
/*
 * Intel(R) Processor Trace PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
 * Programming Reference:
 * http://software.intel.com/en-us/intel-isa-extensions
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/intel_pt.h>

#include "../perf_event.h"
#include "pt.h"

static DEFINE_PER_CPU(struct pt, pt_ctx);

static struct pt_pmu pt_pmu;

enum cpuid_regs {
        CR_EAX = 0,
        CR_ECX,
        CR_EDX,
        CR_EBX
};

/*
 * Capabilities of Intel PT hardware, such as number of address bits or
 * supported output schemes, are cached and exported to userspace as the
 * "caps" attribute group of the pt pmu device
 * (/sys/bus/event_source/devices/intel_pt/caps/), so that userspace can
 * store relevant bits together with intel_pt traces.
 *
 * These are necessary both for trace decoding (payloads_lip, for example,
 * determines the address width encoded in IP-related packets) and for event
 * configuration (bitmasks with permitted values for certain bit fields).
 */
#define PT_CAP(_n, _l, _r, _m)                                          \
        [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,        \
                            .reg = _r, .mask = _m }

static struct pt_cap_desc {
        const char      *name;
        u32             leaf;
        u8              reg;
        u32             mask;
} pt_caps[] = {
        PT_CAP(max_subleaf,             0, CR_EAX, 0xffffffff),
        PT_CAP(cr3_filtering,           0, CR_EBX, BIT(0)),
        PT_CAP(psb_cyc,                 0, CR_EBX, BIT(1)),
        PT_CAP(mtc,                     0, CR_EBX, BIT(3)),
        PT_CAP(topa_output,             0, CR_ECX, BIT(0)),
        PT_CAP(topa_multiple_entries,   0, CR_ECX, BIT(1)),
        PT_CAP(single_range_output,     0, CR_ECX, BIT(2)),
        PT_CAP(payloads_lip,            0, CR_ECX, BIT(31)),
        PT_CAP(mtc_periods,             1, CR_EAX, 0xffff0000),
        PT_CAP(cycle_thresholds,        1, CR_EBX, 0xffff),
        PT_CAP(psb_periods,             1, CR_EBX, 0xffff0000),
};

static u32 pt_cap_get(enum pt_capabilities cap)
{
        struct pt_cap_desc *cd = &pt_caps[cap];
        u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
        unsigned int shift = __ffs(cd->mask);

        return (c & cd->mask) >> shift;
}
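
/*
 * Worked example of the lookup above (CPUID value hypothetical):
 * mtc_periods is declared in pt_caps[] as leaf 1, CR_EAX, mask 0xffff0000.
 * If CPUID leaf 0x14, subleaf 1 returned EAX = 0x02490003, then
 * pt_cap_get(PT_CAP_mtc_periods) computes
 *
 *      shift = __ffs(0xffff0000) = 16
 *      (0x02490003 & 0xffff0000) >> 16 = 0x0249
 *
 * i.e. a bitmask in which bits 0, 3, 6 and 9 mark the supported MTC
 * period encodings.
 */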

static ssize_t pt_cap_show(struct device *cdev,
                           struct device_attribute *attr,
                           char *buf)
{
        struct dev_ext_attribute *ea =
                container_of(attr, struct dev_ext_attribute, attr);
        enum pt_capabilities cap = (long)ea->var;

        return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
}

static struct attribute_group pt_cap_group = {
        .name   = "caps",
};

PMU_FORMAT_ATTR(cyc,            "config:1"      );
PMU_FORMAT_ATTR(mtc,            "config:9"      );
PMU_FORMAT_ATTR(tsc,            "config:10"     );
PMU_FORMAT_ATTR(noretcomp,      "config:11"     );
PMU_FORMAT_ATTR(mtc_period,     "config:14-17"  );
PMU_FORMAT_ATTR(cyc_thresh,     "config:19-22"  );
PMU_FORMAT_ATTR(psb_period,     "config:24-27"  );

static struct attribute *pt_formats_attr[] = {
        &format_attr_cyc.attr,
        &format_attr_mtc.attr,
        &format_attr_tsc.attr,
        &format_attr_noretcomp.attr,
        &format_attr_mtc_period.attr,
        &format_attr_cyc_thresh.attr,
        &format_attr_psb_period.attr,
        NULL,
};

static struct attribute_group pt_format_group = {
        .name   = "format",
        .attrs  = pt_formats_attr,
};
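
/*
 * The format attributes above map perf_event_attr::config bits to names
 * that the perf tool resolves at event-parsing time.  A sketch of how a
 * user would exercise them (assuming a perf tool that knows about
 * intel_pt; the workload name is a placeholder):
 *
 *      perf record -e intel_pt/tsc=1,mtc=1,mtc_period=3/u -- workload
 *
 * which the tool translates into attr.config with bit 10 (tsc) and bit 9
 * (mtc) set and 3 written into bits 14-17 (mtc_period), subject to
 * validation by pt_event_valid() below.
 */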

static const struct attribute_group *pt_attr_groups[] = {
        &pt_cap_group,
        &pt_format_group,
        NULL,
};

static int __init pt_pmu_hw_init(void)
{
        struct dev_ext_attribute *de_attrs;
        struct attribute **attrs;
        size_t size;
        u64 reg;
        int ret;
        long i;

        if (boot_cpu_has(X86_FEATURE_VMX)) {
                /*
                 * Intel SDM, 36.5 "Tracing post-VMXON" says that
                 * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
                 * post-VMXON.
                 */
                rdmsrl(MSR_IA32_VMX_MISC, reg);
                if (reg & BIT(14))
                        pt_pmu.vmx = true;
        }

        attrs = NULL;

        /* CPUID leaf 0x14 (20 decimal) enumerates Intel PT capabilities */
        for (i = 0; i < PT_CPUID_LEAVES; i++) {
                cpuid_count(20, i,
                            &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
                            &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
                            &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
                            &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
        }

        ret = -ENOMEM;
        size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
        attrs = kzalloc(size, GFP_KERNEL);
        if (!attrs)
                goto fail;

        size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
        de_attrs = kzalloc(size, GFP_KERNEL);
        if (!de_attrs)
                goto fail;

        for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
                struct dev_ext_attribute *de_attr = de_attrs + i;

                de_attr->attr.attr.name = pt_caps[i].name;

                sysfs_attr_init(&de_attr->attr.attr);

                de_attr->attr.attr.mode         = S_IRUGO;
                de_attr->attr.show              = pt_cap_show;
                de_attr->var                    = (void *)i;

                attrs[i] = &de_attr->attr.attr;
        }

        pt_cap_group.attrs = attrs;

        return 0;

fail:
        kfree(attrs);

        return ret;
}

#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC     | \
                          RTIT_CTL_CYC_THRESH   | \
                          RTIT_CTL_PSB_FREQ)

#define RTIT_CTL_MTC    (RTIT_CTL_MTC_EN        | \
                         RTIT_CTL_MTC_RANGE)

#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN         | \
                        RTIT_CTL_DISRETC        | \
                        RTIT_CTL_CYC_PSB        | \
                        RTIT_CTL_MTC)

static bool pt_event_valid(struct perf_event *event)
{
        u64 config = event->attr.config;
        u64 allowed, requested;

        if ((config & PT_CONFIG_MASK) != config)
                return false;

        if (config & RTIT_CTL_CYC_PSB) {
                if (!pt_cap_get(PT_CAP_psb_cyc))
                        return false;

                allowed = pt_cap_get(PT_CAP_psb_periods);
                requested = (config & RTIT_CTL_PSB_FREQ) >>
                        RTIT_CTL_PSB_FREQ_OFFSET;
                if (requested && (!(allowed & BIT(requested))))
                        return false;

                allowed = pt_cap_get(PT_CAP_cycle_thresholds);
                requested = (config & RTIT_CTL_CYC_THRESH) >>
                        RTIT_CTL_CYC_THRESH_OFFSET;
                if (requested && (!(allowed & BIT(requested))))
                        return false;
        }

        if (config & RTIT_CTL_MTC) {
                /*
                 * In the unlikely case that CPUID lists valid mtc periods,
                 * but not the mtc capability, drop out here.
                 *
                 * Spec says that setting mtc period bits while mtc bit in
                 * CPUID is 0 will #GP, so better safe than sorry.
                 */
                if (!pt_cap_get(PT_CAP_mtc))
                        return false;

                allowed = pt_cap_get(PT_CAP_mtc_periods);
                if (!allowed)
                        return false;

                requested = (config & RTIT_CTL_MTC_RANGE) >>
                        RTIT_CTL_MTC_RANGE_OFFSET;

                if (!(allowed & BIT(requested)))
                        return false;
        }

        return true;
}
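
/*
 * Example of the validation above (values hypothetical): a user asks for
 * psb_period=2, i.e. config bits 24-27 hold 2.  If the psb_periods
 * capability reads 0x3 (only encodings 0 and 1 supported), then
 *
 *      allowed   = 0x3
 *      requested = 2
 *      allowed & BIT(2) == 0  =>  pt_event_valid() returns false
 *
 * and the perf_event_open() syscall fails with -EINVAL in pt_event_init().
 */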

/*
 * PT configuration helpers
 * These are all CPU-affine and operate on the local PT
 */

static void pt_config(struct perf_event *event)
{
        u64 reg;

        if (!event->hw.itrace_started) {
                event->hw.itrace_started = 1;
                wrmsrl(MSR_IA32_RTIT_STATUS, 0);
        }

        reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;

        if (!event->attr.exclude_kernel)
                reg |= RTIT_CTL_OS;
        if (!event->attr.exclude_user)
                reg |= RTIT_CTL_USR;

        reg |= (event->attr.config & PT_CONFIG_MASK);

        event->hw.config = reg;
        wrmsrl(MSR_IA32_RTIT_CTL, reg);
}

static void pt_config_stop(struct perf_event *event)
{
        u64 ctl = READ_ONCE(event->hw.config);

        /* may already be stopped by a PMI */
        if (!(ctl & RTIT_CTL_TRACEEN))
                return;

        ctl &= ~RTIT_CTL_TRACEEN;
        wrmsrl(MSR_IA32_RTIT_CTL, ctl);

        WRITE_ONCE(event->hw.config, ctl);

        /*
         * A wrmsr that disables trace generation serializes other PT
         * registers and causes all data packets to be written to memory,
         * but a fence is required for the data to become globally visible.
         *
         * The below WMB, separating the data store and the aux_head store,
         * matches the consumer's RMB that separates the aux_head load and
         * the data load.
         */
        wmb();
}

static void pt_config_buffer(void *buf, unsigned int topa_idx,
                             unsigned int output_off)
{
        u64 reg;

        wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));

        /* bits 6:0 read as ones; 31:7 is the table offset, 63:32 the output offset */
        reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);

        wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
}
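
/*
 * Layout of MSR_IA32_RTIT_OUTPUT_MASK as programmed above (per the SDM):
 * bits 6:0 always read as ones, bits 31:7 hold the index of the current
 * ToPA entry (its byte offset in the table divided by 8, hence the << 7)
 * and bits 63:32 hold the offset within the current output region.  For
 * instance, topa_idx == 2 and output_off == 0x1000 yield
 *
 *      reg = 0x7f | (2 << 7) | (0x1000ULL << 32) = 0x000010000000017f
 */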

/*
 * Keep ToPA table-related metadata on the same page as the actual table,
 * taking up a few words from the top
 */

#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)

/**
 * struct topa - page-sized ToPA table with metadata at the top
 * @table:      actual ToPA table entries, as understood by PT hardware
 * @list:       linkage to struct pt_buffer's list of tables
 * @phys:       physical address of this page
 * @offset:     offset of the first entry in this table in the buffer
 * @size:       total size of all entries in this table
 * @last:       index of the last initialized entry in this table
 */
struct topa {
        struct topa_entry       table[TENTS_PER_PAGE];
        struct list_head        list;
        u64                     phys;
        u64                     offset;
        size_t                  size;
        int                     last;
};
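
/*
 * Size check, assuming 4K pages and 8-byte ToPA entries:
 *
 *      TENTS_PER_PAGE = (4096 - 40) / 8 - 1 = 506
 *
 * so table[] occupies 506 * 8 = 4048 bytes, and the "- 40" plus the "- 1"
 * spare entry leave 48 bytes at the top of the page, enough for the
 * list/phys/offset/size/last metadata above on a 64-bit build.  The
 * BUILD_BUG_ON() in pt_init() verifies that the result fits in a page.
 */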

/* make -1 stand for the last table entry */
#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])

/**
 * topa_alloc() - allocate page-sized ToPA table
 * @cpu:        CPU on which to allocate.
 * @gfp:        Allocation flags.
 *
 * Return:      On success, the pointer to the ToPA table page; NULL otherwise.
 */
static struct topa *topa_alloc(int cpu, gfp_t gfp)
{
        int node = cpu_to_node(cpu);
        struct topa *topa;
        struct page *p;

        p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
        if (!p)
                return NULL;

        topa = page_address(p);
        topa->last = 0;
        topa->phys = page_to_phys(p);

        /*
         * In case of single-entry ToPA, always put the self-referencing END
         * link as the 2nd entry in the table
         */
        if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
                TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
                TOPA_ENTRY(topa, 1)->end = 1;
        }

        return topa;
}

/**
 * topa_free() - free a page-sized ToPA table
 * @topa:       Table to deallocate.
 */
static void topa_free(struct topa *topa)
{
        free_page((unsigned long)topa);
}

/**
 * topa_insert_table() - insert a ToPA table into a buffer
 * @buf:         PT buffer that's being extended.
 * @topa:        New topa table to be inserted.
 *
 * If it's the first table in this buffer, set up buffer's pointers
 * accordingly; otherwise, add an END=1 link entry pointing to @topa to the
 * current "last" table and adjust the last table pointer to @topa.
 */
static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
{
        struct topa *last = buf->last;

        list_add_tail(&topa->list, &buf->tables);

        if (!buf->first) {
                buf->first = buf->last = buf->cur = topa;
                return;
        }

        topa->offset = last->offset + last->size;
        buf->last = topa;

        if (!pt_cap_get(PT_CAP_topa_multiple_entries))
                return;

        BUG_ON(last->last != TENTS_PER_PAGE - 1);

        TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
        TOPA_ENTRY(last, -1)->end = 1;
}

/**
 * topa_table_full() - check if a ToPA table is filled up
 * @topa:       ToPA table.
 */
static bool topa_table_full(struct topa *topa)
{
        /* single-entry ToPA is a special case */
        if (!pt_cap_get(PT_CAP_topa_multiple_entries))
                return !!topa->last;

        return topa->last == TENTS_PER_PAGE - 1;
}

/**
 * topa_insert_pages() - create a list of ToPA tables
 * @buf:        PT buffer being initialized.
 * @gfp:        Allocation flags.
 *
 * This initializes a list of ToPA tables with entries from
 * the data_pages provided by rb_alloc_aux().
 *
 * Return:      0 on success or error code.
 */
static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
{
        struct topa *topa = buf->last;
        int order = 0;
        struct page *p;

        p = virt_to_page(buf->data_pages[buf->nr_pages]);
        if (PagePrivate(p))
                order = page_private(p);

        if (topa_table_full(topa)) {
                topa = topa_alloc(buf->cpu, gfp);
                if (!topa)
                        return -ENOMEM;

                topa_insert_table(buf, topa);
        }

        TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
        TOPA_ENTRY(topa, -1)->size = order;
        if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
                TOPA_ENTRY(topa, -1)->intr = 1;
                TOPA_ENTRY(topa, -1)->stop = 1;
        }

        topa->last++;
        topa->size += sizes(order);

        buf->nr_pages += 1ul << order;

        return 0;
}

/**
 * pt_topa_dump() - print ToPA tables and their entries
 * @buf:        PT buffer.
 */
static void pt_topa_dump(struct pt_buffer *buf)
{
        struct topa *topa;

        list_for_each_entry(topa, &buf->tables, list) {
                int i;

                pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
                         topa->phys, topa->offset, topa->size);
                for (i = 0; i < TENTS_PER_PAGE; i++) {
                        pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
                                 &topa->table[i],
                                 (unsigned long)topa->table[i].base << TOPA_SHIFT,
                                 sizes(topa->table[i].size),
                                 topa->table[i].end ?  'E' : ' ',
                                 topa->table[i].intr ? 'I' : ' ',
                                 topa->table[i].stop ? 'S' : ' ',
                                 *(u64 *)&topa->table[i]);
                        if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
                             topa->table[i].stop) ||
                            topa->table[i].end)
                                break;
                }
        }
}

/**
 * pt_buffer_advance() - advance to the next output region
 * @buf:        PT buffer.
 *
 * Advance the current pointers in the buffer to the next ToPA entry.
 */
static void pt_buffer_advance(struct pt_buffer *buf)
{
        buf->output_off = 0;
        buf->cur_idx++;

        if (buf->cur_idx == buf->cur->last) {
                if (buf->cur == buf->last)
                        buf->cur = buf->first;
                else
                        buf->cur = list_entry(buf->cur->list.next, struct topa,
                                              list);
                buf->cur_idx = 0;
        }
}

/**
 * pt_update_head() - calculate current offsets and sizes
 * @pt:         Per-cpu pt context.
 *
 * Update buffer's current write pointer position and data size.
 */
static void pt_update_head(struct pt *pt)
{
        struct pt_buffer *buf = perf_get_aux(&pt->handle);
        u64 topa_idx, base, old;

        /* offset of this table in the buffer, plus offset within the region */
        base = buf->cur->offset + buf->output_off;

        /* add the sizes of all regions in this table before the current one */
        for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
                base += sizes(buf->cur->table[topa_idx].size);

        if (buf->snapshot) {
                local_set(&buf->data_size, base);
        } else {
                old = (local64_xchg(&buf->head, base) &
                       ((buf->nr_pages << PAGE_SHIFT) - 1));
                if (base < old)
                        base += buf->nr_pages << PAGE_SHIFT;

                local_add(base - old, &buf->data_size);
        }
}
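
/*
 * Wrap-around example for the non-snapshot path above (buffer size
 * hypothetical): with a 64-page (256KB) buffer, if the previous head was
 * 0x3f000 and the newly computed base is 0x1000, the hardware has wrapped,
 * so base is bumped by nr_pages << PAGE_SHIFT (0x40000) and data_size
 * grows by 0x41000 - 0x3f000 = 0x2000, i.e. the 8KB actually written.
 */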

/**
 * pt_buffer_region() - obtain current output region's address
 * @buf:        PT buffer.
 */
static void *pt_buffer_region(struct pt_buffer *buf)
{
        return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
}

/**
 * pt_buffer_region_size() - obtain current output region's size
 * @buf:        PT buffer.
 */
static size_t pt_buffer_region_size(struct pt_buffer *buf)
{
        return sizes(buf->cur->table[buf->cur_idx].size);
}

/**
 * pt_handle_status() - take care of possible status conditions
 * @pt:         Per-cpu pt context.
 */
static void pt_handle_status(struct pt *pt)
{
        struct pt_buffer *buf = perf_get_aux(&pt->handle);
        int advance = 0;
        u64 status;

        rdmsrl(MSR_IA32_RTIT_STATUS, status);

        if (status & RTIT_STATUS_ERROR) {
                pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
                pt_topa_dump(buf);
                status &= ~RTIT_STATUS_ERROR;
        }

        if (status & RTIT_STATUS_STOPPED) {
                status &= ~RTIT_STATUS_STOPPED;

                /*
                 * On systems that only do single-entry ToPA, hitting STOP
                 * means we are already losing data; need to let the decoder
                 * know.
                 */
                if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
                    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
                        local_inc(&buf->lost);
                        advance++;
                }
        }

        /*
         * Also, on single-entry ToPA implementations, the interrupt arrives
         * before the output reaches its output region's boundary.
         */
        if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
            pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
                void *head = pt_buffer_region(buf);

                /* everything within this margin needs to be zeroed out */
                memset(head + buf->output_off, 0,
                       pt_buffer_region_size(buf) -
                       buf->output_off);
                advance++;
        }

        if (advance)
                pt_buffer_advance(buf);

        wrmsrl(MSR_IA32_RTIT_STATUS, status);
}

/**
 * pt_read_offset() - translate registers into buffer pointers
 * @buf:        PT buffer.
 *
 * Set buffer's output pointers from MSR values.
 */
static void pt_read_offset(struct pt_buffer *buf)
{
        u64 offset, base_topa;

        rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
        buf->cur = phys_to_virt(base_topa);

        rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
        /* offset within current output region */
        buf->output_off = offset >> 32;
        /* index of current output region within this table */
        buf->cur_idx = (offset & 0xffffff80) >> 7;
}
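
/*
 * Decoding example, mirroring the encoding in pt_config_buffer() (MSR
 * value hypothetical): MSR_IA32_RTIT_OUTPUT_MASK == 0x00001000000001ff
 * splits into output_off = 0x1000 (bits 63:32) and
 * cur_idx = (0x1ff & 0xffffff80) >> 7 = 3, i.e. the hardware is 4KB into
 * the output region described by the 4th entry of the current ToPA table.
 */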

/**
 * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
 * @buf:        PT buffer.
 * @pg:         Page offset in the buffer.
 *
 * When advancing to the next output region (ToPA entry), given a page offset
 * into the buffer, we need to find the offset of the first page in the next
 * region.
 */
static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
{
        struct topa_entry *te = buf->topa_index[pg];

        /* one region */
        if (buf->first == buf->last && buf->first->last == 1)
                return pg;

        do {
                pg++;
                pg &= buf->nr_pages - 1;
        } while (buf->topa_index[pg] == te);

        return pg;
}

/**
 * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
 * @buf:        PT buffer.
 * @handle:     Current output handle.
 *
 * Place the STOP mark to prevent overwriting old data that the consumer
 * hasn't yet collected, and the INT mark to wake up the consumer after a
 * certain fraction of the buffer has filled up. Only needed and sensible
 * for non-snapshot counters.
 *
 * This obviously relies on buf::head to figure out buffer markers, so it has
 * to be called after pt_buffer_reset_offsets() and before the hardware tracing
 * is enabled.
 */
static int pt_buffer_reset_markers(struct pt_buffer *buf,
                                   struct perf_output_handle *handle)
{
        unsigned long head = local64_read(&buf->head);
        unsigned long idx, npages, wakeup;

        /* can't stop in the middle of an output region */
        if (buf->output_off + handle->size + 1 <
            sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
                return -EINVAL;

        /* single-entry ToPA is handled by marking all regions STOP=1 INT=1 */
        if (!pt_cap_get(PT_CAP_topa_multiple_entries))
                return 0;

        /* clear STOP and INT from current entry */
        buf->topa_index[buf->stop_pos]->stop = 0;
        buf->topa_index[buf->stop_pos]->intr = 0;
        buf->topa_index[buf->intr_pos]->intr = 0;

        /* how many pages till the STOP marker */
        npages = handle->size >> PAGE_SHIFT;

        /* if it's on a page boundary, fill up one more page */
        if (!offset_in_page(head + handle->size + 1))
                npages++;

        idx = (head >> PAGE_SHIFT) + npages;
        idx &= buf->nr_pages - 1;
        buf->stop_pos = idx;

        wakeup = handle->wakeup >> PAGE_SHIFT;

        /* in the worst case, wake up the consumer one page before hard stop */
        idx = (head >> PAGE_SHIFT) + npages - 1;
        if (idx > wakeup)
                idx = wakeup;

        idx &= buf->nr_pages - 1;
        buf->intr_pos = idx;

        buf->topa_index[buf->stop_pos]->stop = 1;
        buf->topa_index[buf->stop_pos]->intr = 1;
        buf->topa_index[buf->intr_pos]->intr = 1;

        return 0;
}
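
/*
 * Marker placement example (numbers hypothetical): with an 8-page buffer,
 * head = 0x800 and handle->size = 0x37ff (the code's "+ 1" treats
 * handle->size as one byte short of the usable space):
 *
 *      npages = 0x37ff >> PAGE_SHIFT = 3
 *      head + handle->size + 1 = 0x4000 is page-aligned  =>  npages = 4
 *      stop_pos = ((0x800 >> PAGE_SHIFT) + 4) & 7 = 4
 *
 * so tracing stops before overwriting the page that aux_tail still points
 * into, and the INT marker lands at most one page earlier, clamped by
 * handle->wakeup.
 */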

/**
 * pt_buffer_setup_topa_index() - build topa_index[] table of regions
 * @buf:        PT buffer.
 *
 * topa_index[] references output regions indexed by offset into the
 * buffer for purposes of quick reverse lookup.
 */
static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
{
        struct topa *cur = buf->first, *prev = buf->last;
        struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
                *te_prev = TOPA_ENTRY(prev, prev->last - 1);
        int pg = 0, idx = 0;

        while (pg < buf->nr_pages) {
                int tidx;

                /* pages within one topa entry */
                for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
                        buf->topa_index[pg] = te_prev;

                te_prev = te_cur;

                if (idx == cur->last - 1) {
                        /* advance to next topa table */
                        idx = 0;
                        cur = list_entry(cur->list.next, struct topa, list);
                } else {
                        idx++;
                }
                te_cur = TOPA_ENTRY(cur, idx);
        }
}
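
/*
 * Example of the resulting reverse mapping (layout hypothetical): for data
 * pages of sizes {4K, 16K, 4K}, the three ToPA entries have size fields
 * {0, 2, 0} and topa_index[] covers nr_pages = 6 buffer pages as
 *
 *      topa_index[0]    -> entry 2 (the entry preceding entry 0, wrapping)
 *      topa_index[1..4] -> entry 0
 *      topa_index[5]    -> entry 1
 *
 * Each page deliberately points at the ToPA entry *preceding* its region:
 * that way pt_topa_next_entry() followed by a topa_index[] lookup, as done
 * in pt_buffer_reset_offsets(), yields the entry of the region containing
 * the starting page.
 */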

/**
 * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
 * @buf:        PT buffer.
 * @head:       Write pointer (aux_head) from AUX buffer.
 *
 * Find the ToPA table and entry corresponding to given @head and set buffer's
 * "current" pointers accordingly. This is done after we have obtained the
 * current aux_head position from a successful call to perf_aux_output_begin()
 * to make sure the hardware is writing to the right place.
 *
 * This function modifies buf::{cur,cur_idx,output_off}, which will be
 * programmed into PT MSRs when tracing is enabled, and buf::head and
 * buf::data_size, which are used to determine the INT and STOP markers'
 * locations by a subsequent call to pt_buffer_reset_markers().
 */
static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
{
        int pg;

        if (buf->snapshot)
                head &= (buf->nr_pages << PAGE_SHIFT) - 1;

        pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
        pg = pt_topa_next_entry(buf, pg);

        buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
        buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
                        (unsigned long)buf->cur) / sizeof(struct topa_entry);
        buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);

        local64_set(&buf->head, head);
        local_set(&buf->data_size, 0);
}

/**
 * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
 * @buf:        PT buffer.
 */
static void pt_buffer_fini_topa(struct pt_buffer *buf)
{
        struct topa *topa, *iter;

        list_for_each_entry_safe(topa, iter, &buf->tables, list) {
                /*
                 * right now, this is in free_aux() path only, so
                 * no need to unlink this table from the list
                 */
                topa_free(topa);
        }
}

/**
 * pt_buffer_init_topa() - initialize ToPA table for pt buffer
 * @buf:        PT buffer.
 * @nr_pages:   Number of buffer pages to cover with ToPA entries.
 * @gfp:        Allocation flags.
 */
static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
                               gfp_t gfp)
{
        struct topa *topa;
        int err;

        topa = topa_alloc(buf->cpu, gfp);
        if (!topa)
                return -ENOMEM;

        topa_insert_table(buf, topa);

        while (buf->nr_pages < nr_pages) {
                err = topa_insert_pages(buf, gfp);
                if (err) {
                        pt_buffer_fini_topa(buf);
                        return -ENOMEM;
                }
        }

        pt_buffer_setup_topa_index(buf);

        /* link last table to the first one, unless we're double buffering */
        if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
                TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
                TOPA_ENTRY(buf->last, -1)->end = 1;
        }

        pt_topa_dump(buf);
        return 0;
}

/**
 * pt_buffer_setup_aux() - set up topa tables for a PT buffer
 * @cpu:        CPU on which to allocate, -1 means current.
 * @pages:      Array of pointers to buffer pages passed from perf core.
 * @nr_pages:   Number of pages in the buffer.
 * @snapshot:   If this is a snapshot/overwrite counter.
 *
 * This is a pmu::setup_aux callback that sets up ToPA tables and all the
 * bookkeeping for an AUX buffer.
 *
 * Return:      Our private PT buffer structure.
 */
static void *
pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
{
        struct pt_buffer *buf;
        int node, ret;

        if (!nr_pages)
                return NULL;

        if (cpu == -1)
                cpu = raw_smp_processor_id();
        node = cpu_to_node(cpu);

        buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
                           GFP_KERNEL, node);
        if (!buf)
                return NULL;

        buf->cpu = cpu;
        buf->snapshot = snapshot;
        buf->data_pages = pages;

        INIT_LIST_HEAD(&buf->tables);

        ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
        if (ret) {
                kfree(buf);
                return NULL;
        }

        return buf;
}
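
/*
 * Note on the allocation above: topa_index[] is laid out as a tail array
 * in struct pt_buffer (see pt.h), so offsetof(struct pt_buffer,
 * topa_index[nr_pages]) sizes the header plus one struct topa_entry
 * pointer per buffer page in a single node-local allocation.  E.g. a
 * 64-page AUX buffer costs roughly sizeof(struct pt_buffer) +
 * 64 * sizeof(void *) bytes here, before any ToPA tables are allocated
 * by pt_buffer_init_topa().
 */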

/**
 * pt_buffer_free_aux() - perf AUX deallocation path callback
 * @data:       PT buffer.
 */
static void pt_buffer_free_aux(void *data)
{
        struct pt_buffer *buf = data;

        pt_buffer_fini_topa(buf);
        kfree(buf);
}

/**
 * pt_buffer_is_full() - check if the buffer is full
 * @buf:        PT buffer.
 * @pt:         Per-cpu pt handle.
 *
 * If the user hasn't read data from the output region that aux_head
 * points to, the buffer is considered full: the user needs to read at
 * least this region and update aux_tail to point past it.
 */
static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
{
        if (buf->snapshot)
                return false;

        if (local_read(&buf->data_size) >= pt->handle.size)
                return true;

        return false;
}

/**
 * intel_pt_interrupt() - PT PMI handler
 */
void intel_pt_interrupt(void)
{
        struct pt *pt = this_cpu_ptr(&pt_ctx);
        struct pt_buffer *buf;
        struct perf_event *event = pt->handle.event;

        /*
         * There may be a dangling PT bit in the interrupt status register
         * after PT has been disabled by pt_event_stop(). Make sure we don't
         * do anything (particularly, re-enable) for this event here.
         */
        if (!ACCESS_ONCE(pt->handle_nmi))
                return;

        /*
         * If VMX is on and PT does not support it, don't touch anything.
         */
        if (READ_ONCE(pt->vmx_on))
                return;

        if (!event)
                return;

        pt_config_stop(event);

        buf = perf_get_aux(&pt->handle);
        if (!buf)
                return;

        pt_read_offset(buf);

        pt_handle_status(pt);

        pt_update_head(pt);

        perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
                            local_xchg(&buf->lost, 0));

        if (!event->hw.state) {
                int ret;

                buf = perf_aux_output_begin(&pt->handle, event);
                if (!buf) {
                        event->hw.state = PERF_HES_STOPPED;
                        return;
                }

                pt_buffer_reset_offsets(buf, pt->handle.head);
                /* snapshot counters don't use PMI, so it's safe */
                ret = pt_buffer_reset_markers(buf, &pt->handle);
                if (ret) {
                        perf_aux_output_end(&pt->handle, 0, true);
                        return;
                }

                pt_config_buffer(buf->cur->table, buf->cur_idx,
                                 buf->output_off);
                pt_config(event);
        }
}

void intel_pt_handle_vmx(int on)
{
        struct pt *pt = this_cpu_ptr(&pt_ctx);
        struct perf_event *event;
        unsigned long flags;

        /* PT plays nice with VMX, do nothing */
        if (pt_pmu.vmx)
                return;

        /*
         * VMXON will clear RTIT_CTL.TraceEn; we need to make
         * sure to not try to set it while VMX is on. Disable
         * interrupts to avoid racing with pmu callbacks;
         * concurrent PMI should be handled fine.
         */
        local_irq_save(flags);
        WRITE_ONCE(pt->vmx_on, on);

        if (on) {
                /* prevent pt_config_stop() from writing RTIT_CTL */
                event = pt->handle.event;
                if (event)
                        event->hw.config = 0;
        }
        local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);

/*
 * PMU callbacks
 */

static void pt_event_start(struct perf_event *event, int mode)
{
        struct pt *pt = this_cpu_ptr(&pt_ctx);
        struct pt_buffer *buf = perf_get_aux(&pt->handle);

        if (READ_ONCE(pt->vmx_on))
                return;

        if (!buf || pt_buffer_is_full(buf, pt)) {
                event->hw.state = PERF_HES_STOPPED;
                return;
        }

        ACCESS_ONCE(pt->handle_nmi) = 1;
        event->hw.state = 0;

        pt_config_buffer(buf->cur->table, buf->cur_idx,
                         buf->output_off);
        pt_config(event);
}

static void pt_event_stop(struct perf_event *event, int mode)
{
        struct pt *pt = this_cpu_ptr(&pt_ctx);

        /*
         * Protect against the PMI racing with the disabling wrmsr,
         * see comment in intel_pt_interrupt().
         */
        ACCESS_ONCE(pt->handle_nmi) = 0;

        pt_config_stop(event);

        if (event->hw.state == PERF_HES_STOPPED)
                return;

        event->hw.state = PERF_HES_STOPPED;

        if (mode & PERF_EF_UPDATE) {
                struct pt_buffer *buf = perf_get_aux(&pt->handle);

                if (!buf)
                        return;

                if (WARN_ON_ONCE(pt->handle.event != event))
                        return;

                pt_read_offset(buf);

                pt_handle_status(pt);

                pt_update_head(pt);
        }
}

static void pt_event_del(struct perf_event *event, int mode)
{
        struct pt *pt = this_cpu_ptr(&pt_ctx);
        struct pt_buffer *buf;

        pt_event_stop(event, PERF_EF_UPDATE);

        buf = perf_get_aux(&pt->handle);

        if (buf) {
                if (buf->snapshot)
                        pt->handle.head =
                                local_xchg(&buf->data_size,
                                           buf->nr_pages << PAGE_SHIFT);
                perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
                                    local_xchg(&buf->lost, 0));
        }
}

static int pt_event_add(struct perf_event *event, int mode)
{
        struct pt_buffer *buf;
        struct pt *pt = this_cpu_ptr(&pt_ctx);
        struct hw_perf_event *hwc = &event->hw;
        int ret = -EBUSY;

        if (pt->handle.event)
                goto fail;

        buf = perf_aux_output_begin(&pt->handle, event);
        ret = -EINVAL;
        if (!buf)
                goto fail_stop;

        pt_buffer_reset_offsets(buf, pt->handle.head);
        if (!buf->snapshot) {
                ret = pt_buffer_reset_markers(buf, &pt->handle);
                if (ret)
                        goto fail_end_stop;
        }

        if (mode & PERF_EF_START) {
                pt_event_start(event, 0);
                ret = -EBUSY;
                if (hwc->state == PERF_HES_STOPPED)
                        goto fail_end_stop;
        } else {
                hwc->state = PERF_HES_STOPPED;
        }

        return 0;

fail_end_stop:
        perf_aux_output_end(&pt->handle, 0, true);
fail_stop:
        hwc->state = PERF_HES_STOPPED;
fail:
        return ret;
}

static void pt_event_read(struct perf_event *event)
{
}

static void pt_event_destroy(struct perf_event *event)
{
        x86_del_exclusive(x86_lbr_exclusive_pt);
}

static int pt_event_init(struct perf_event *event)
{
        if (event->attr.type != pt_pmu.pmu.type)
                return -ENOENT;

        if (!pt_event_valid(event))
                return -EINVAL;

        if (x86_add_exclusive(x86_lbr_exclusive_pt))
                return -EBUSY;

        event->destroy = pt_event_destroy;

        return 0;
}

void cpu_emergency_stop_pt(void)
{
        struct pt *pt = this_cpu_ptr(&pt_ctx);

        if (pt->handle.event)
                pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
}

static __init int pt_init(void)
{
        int ret, cpu, prior_warn = 0;

        BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);

        if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
                return -ENODEV;

        get_online_cpus();
        for_each_online_cpu(cpu) {
                u64 ctl;

                ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
                if (!ret && (ctl & RTIT_CTL_TRACEEN))
                        prior_warn++;
        }
        put_online_cpus();

        if (prior_warn) {
                x86_add_exclusive(x86_lbr_exclusive_pt);
                pr_warn("PT is enabled at boot time, doing nothing\n");

                return -EBUSY;
        }

        ret = pt_pmu_hw_init();
        if (ret)
                return ret;

        if (!pt_cap_get(PT_CAP_topa_output)) {
                pr_warn("ToPA output is not supported on this CPU\n");
                return -ENODEV;
        }

        if (!pt_cap_get(PT_CAP_topa_multiple_entries))
                pt_pmu.pmu.capabilities =
                        PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;

        pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
        pt_pmu.pmu.attr_groups  = pt_attr_groups;
        pt_pmu.pmu.task_ctx_nr  = perf_sw_context;
        pt_pmu.pmu.event_init   = pt_event_init;
        pt_pmu.pmu.add          = pt_event_add;
        pt_pmu.pmu.del          = pt_event_del;
        pt_pmu.pmu.start        = pt_event_start;
        pt_pmu.pmu.stop         = pt_event_stop;
        pt_pmu.pmu.read         = pt_event_read;
        pt_pmu.pmu.setup_aux    = pt_buffer_setup_aux;
        pt_pmu.pmu.free_aux     = pt_buffer_free_aux;
        ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);

        return ret;
}
arch_initcall(pt_init);