linux/arch/x86/kernel/cpu/perf_event_intel_pt.c
/*
 * Intel(R) Processor Trace PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
 * Programming Reference:
 * http://software.intel.com/en-us/intel-isa-extensions
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>

#include "perf_event.h"
#include "intel_pt.h"

static DEFINE_PER_CPU(struct pt, pt_ctx);

static struct pt_pmu pt_pmu;

enum cpuid_regs {
        CR_EAX = 0,
        CR_ECX,
        CR_EDX,
        CR_EBX
};

/*
 * Capabilities of Intel PT hardware, such as number of address bits or
 * supported output schemes, are cached and exported to userspace as "caps"
 * attribute group of pt pmu device
 * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
 * relevant bits together with intel_pt traces.
 *
 * These are necessary for both trace decoding (payloads_lip, for example,
 * affects how addresses in IP-related packets are encoded) and event
 * configuration (bitmasks with permitted values for certain bit fields).
 */
#define PT_CAP(_n, _l, _r, _m)                                          \
        [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,        \
                            .reg = _r, .mask = _m }

static struct pt_cap_desc {
        const char      *name;
        u32             leaf;
        u8              reg;
        u32             mask;
} pt_caps[] = {
        PT_CAP(max_subleaf,             0, CR_EAX, 0xffffffff),
        PT_CAP(cr3_filtering,           0, CR_EBX, BIT(0)),
        PT_CAP(topa_output,             0, CR_ECX, BIT(0)),
        PT_CAP(topa_multiple_entries,   0, CR_ECX, BIT(1)),
        PT_CAP(payloads_lip,            0, CR_ECX, BIT(31)),
};

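/*
 * Extract one capability from the cached CPUID data: mask out the relevant
 * register and shift the field down to bit 0, so single-bit capabilities
 * read as 0/1 and multi-bit fields (e.g. max_subleaf) as their full value.
 */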
static u32 pt_cap_get(enum pt_capabilities cap)
{
        struct pt_cap_desc *cd = &pt_caps[cap];
        u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
        unsigned int shift = __ffs(cd->mask);

        return (c & cd->mask) >> shift;
}

static ssize_t pt_cap_show(struct device *cdev,
                           struct device_attribute *attr,
                           char *buf)
{
        struct dev_ext_attribute *ea =
                container_of(attr, struct dev_ext_attribute, attr);
        enum pt_capabilities cap = (long)ea->var;

        return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
}

static struct attribute_group pt_cap_group = {
        .name   = "caps",
};

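/*
 * These format attributes expose perf_event_attr.config bits 10 and 11,
 * which the driver forwards into RTIT_CTL (TSCEn and DisRETC; see
 * PT_CONFIG_MASK below). For illustration, something along the lines of
 * "perf record -e intel_pt/tsc=1,noretcomp=1/u ..." would set them from
 * userspace.
 */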
PMU_FORMAT_ATTR(tsc,            "config:10"     );
PMU_FORMAT_ATTR(noretcomp,      "config:11"     );

static struct attribute *pt_formats_attr[] = {
        &format_attr_tsc.attr,
        &format_attr_noretcomp.attr,
        NULL,
};

static struct attribute_group pt_format_group = {
        .name   = "format",
        .attrs  = pt_formats_attr,
};

static const struct attribute_group *pt_attr_groups[] = {
        &pt_cap_group,
        &pt_format_group,
        NULL,
};

static int __init pt_pmu_hw_init(void)
{
        struct dev_ext_attribute *de_attrs;
        struct attribute **attrs;
        size_t size;
        int ret;
        long i;

        attrs = NULL;
        ret = -ENODEV;
        if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
                goto fail;

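        /*
         * Intel PT capabilities are enumerated in CPUID leaf 0x14 (20
         * decimal); cache all sub-leaves so pt_cap_get() can pick out
         * individual fields later.
         */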
        for (i = 0; i < PT_CPUID_LEAVES; i++) {
                cpuid_count(20, i,
                            &pt_pmu.caps[CR_EAX + i*4],
                            &pt_pmu.caps[CR_EBX + i*4],
                            &pt_pmu.caps[CR_ECX + i*4],
                            &pt_pmu.caps[CR_EDX + i*4]);
        }

        ret = -ENOMEM;
        size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
        attrs = kzalloc(size, GFP_KERNEL);
        if (!attrs)
                goto fail;

        size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
        de_attrs = kzalloc(size, GFP_KERNEL);
        if (!de_attrs)
                goto fail;

        for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
                struct dev_ext_attribute *de_attr = de_attrs + i;

                de_attr->attr.attr.name = pt_caps[i].name;

                sysfs_attr_init(&de_attr->attr.attr);

                de_attr->attr.attr.mode         = S_IRUGO;
                de_attr->attr.show              = pt_cap_show;
                de_attr->var                    = (void *)i;

                attrs[i] = &de_attr->attr.attr;
        }

        pt_cap_group.attrs = attrs;

        return 0;

fail:
        kfree(attrs);

        return ret;
}

#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)

static bool pt_event_valid(struct perf_event *event)
{
        u64 config = event->attr.config;

        if ((config & PT_CONFIG_MASK) != config)
                return false;

        return true;
}

/*
 * PT configuration helpers
 * These all are cpu affine and operate on a local PT
 */

static bool pt_is_running(void)
{
        u64 ctl;

        rdmsrl(MSR_IA32_RTIT_CTL, ctl);

        return !!(ctl & RTIT_CTL_TRACEEN);
}

static void pt_config(struct perf_event *event)
{
        u64 reg;

        reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;

        if (!event->attr.exclude_kernel)
                reg |= RTIT_CTL_OS;
        if (!event->attr.exclude_user)
                reg |= RTIT_CTL_USR;

        reg |= (event->attr.config & PT_CONFIG_MASK);

        wrmsrl(MSR_IA32_RTIT_CTL, reg);
}

static void pt_config_start(bool start)
{
        u64 ctl;

        rdmsrl(MSR_IA32_RTIT_CTL, ctl);
        if (start)
                ctl |= RTIT_CTL_TRACEEN;
        else
                ctl &= ~RTIT_CTL_TRACEEN;
        wrmsrl(MSR_IA32_RTIT_CTL, ctl);

        /*
         * A wrmsr that disables trace generation serializes other PT
         * registers and causes all data packets to be written to memory,
         * but a fence is required for the data to become globally visible.
         *
         * The below WMB, separating the data store and the aux_head store,
         * matches the consumer's RMB that separates the aux_head load and
         * the data load.
         */
        if (!start)
                wmb();
}

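/*
 * Program the output MSRs for ToPA mode: RTIT_OUTPUT_BASE points at the
 * current ToPA table, while RTIT_OUTPUT_MASK holds the index of the current
 * table entry in bits 31:7 and the byte offset within that entry's output
 * region in bits 63:32; the low 7 bits read as ones in ToPA mode, hence the
 * 0x7f below.
 */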
static void pt_config_buffer(void *buf, unsigned int topa_idx,
                             unsigned int output_off)
{
        u64 reg;

        wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));

        reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);

        wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
}

/*
 * Keep ToPA table-related metadata on the same page as the actual table,
 * taking up a few words from the top
 */

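/*
 * The slack below (40 bytes plus the one entry subtracted at the end) is
 * roughly what leaves room for struct topa's bookkeeping fields; the
 * BUILD_BUG_ON() in pt_init() catches the case where struct topa outgrows
 * a page.
 */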
#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)

/**
 * struct topa - page-sized ToPA table with metadata at the top
 * @table:      actual ToPA table entries, as understood by PT hardware
 * @list:       linkage to struct pt_buffer's list of tables
 * @phys:       physical address of this page
 * @offset:     offset of the first entry in this table in the buffer
 * @size:       total size of all entries in this table
 * @last:       index of the last initialized entry in this table
 */
struct topa {
        struct topa_entry       table[TENTS_PER_PAGE];
        struct list_head        list;
        u64                     phys;
        u64                     offset;
        size_t                  size;
        int                     last;
};

/* make -1 stand for the last table entry */
#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])

/**
 * topa_alloc() - allocate page-sized ToPA table
 * @cpu:        CPU on which to allocate.
 * @gfp:        Allocation flags.
 *
 * Return:      On success, return the pointer to ToPA table page.
 */
static struct topa *topa_alloc(int cpu, gfp_t gfp)
{
        int node = cpu_to_node(cpu);
        struct topa *topa;
        struct page *p;

        p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
        if (!p)
                return NULL;

        topa = page_address(p);
        topa->last = 0;
        topa->phys = page_to_phys(p);

        /*
         * In case of single-entry ToPA, always put the self-referencing END
         * link as the 2nd entry in the table
         */
        if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
                TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
                TOPA_ENTRY(topa, 1)->end = 1;
        }

        return topa;
}

/**
 * topa_free() - free a page-sized ToPA table
 * @topa:       Table to deallocate.
 */
static void topa_free(struct topa *topa)
{
        free_page((unsigned long)topa);
}

/**
 * topa_insert_table() - insert a ToPA table into a buffer
 * @buf:         PT buffer that's being extended.
 * @topa:        New topa table to be inserted.
 *
 * If it's the first table in this buffer, set up buffer's pointers
 * accordingly; otherwise, add an END=1 link entry pointing to @topa to the
 * current "last" table and adjust the last table pointer to @topa.
 */
static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
{
        struct topa *last = buf->last;

        list_add_tail(&topa->list, &buf->tables);

        if (!buf->first) {
                buf->first = buf->last = buf->cur = topa;
                return;
        }

        topa->offset = last->offset + last->size;
        buf->last = topa;

        if (!pt_cap_get(PT_CAP_topa_multiple_entries))
                return;

        BUG_ON(last->last != TENTS_PER_PAGE - 1);

        TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
        TOPA_ENTRY(last, -1)->end = 1;
}

/**
 * topa_table_full() - check if a ToPA table is filled up
 * @topa:       ToPA table.
 */
static bool topa_table_full(struct topa *topa)
{
        /* single-entry ToPA is a special case */
        if (!pt_cap_get(PT_CAP_topa_multiple_entries))
                return !!topa->last;

        return topa->last == TENTS_PER_PAGE - 1;
}

/**
 * topa_insert_pages() - create a list of ToPA tables
 * @buf:        PT buffer being initialized.
 * @gfp:        Allocation flags.
 *
 * This initializes a list of ToPA tables with entries from
 * the data_pages provided by rb_alloc_aux().
 *
 * Return:      0 on success or error code.
 */
static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
{
        struct topa *topa = buf->last;
        int order = 0;
        struct page *p;

        p = virt_to_page(buf->data_pages[buf->nr_pages]);
        if (PagePrivate(p))
                order = page_private(p);

        if (topa_table_full(topa)) {
                topa = topa_alloc(buf->cpu, gfp);
                if (!topa)
                        return -ENOMEM;

                topa_insert_table(buf, topa);
        }

        TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
        TOPA_ENTRY(topa, -1)->size = order;
        if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
                TOPA_ENTRY(topa, -1)->intr = 1;
                TOPA_ENTRY(topa, -1)->stop = 1;
        }

        topa->last++;
        topa->size += sizes(order);

        buf->nr_pages += 1ul << order;

        return 0;
}

/**
 * pt_topa_dump() - print ToPA tables and their entries
 * @buf:        PT buffer.
 */
static void pt_topa_dump(struct pt_buffer *buf)
{
        struct topa *topa;

        list_for_each_entry(topa, &buf->tables, list) {
                int i;

                pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
                         topa->phys, topa->offset, topa->size);
                for (i = 0; i < TENTS_PER_PAGE; i++) {
                        pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
                                 &topa->table[i],
                                 (unsigned long)topa->table[i].base << TOPA_SHIFT,
                                 sizes(topa->table[i].size),
                                 topa->table[i].end ?  'E' : ' ',
                                 topa->table[i].intr ? 'I' : ' ',
                                 topa->table[i].stop ? 'S' : ' ',
                                 *(u64 *)&topa->table[i]);
                        if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
                             topa->table[i].stop) ||
                            topa->table[i].end)
                                break;
                }
        }
}

/**
 * pt_buffer_advance() - advance to the next output region
 * @buf:        PT buffer.
 *
 * Advance the current pointers in the buffer to the next ToPA entry.
 */
static void pt_buffer_advance(struct pt_buffer *buf)
{
        buf->output_off = 0;
        buf->cur_idx++;

        if (buf->cur_idx == buf->cur->last) {
                if (buf->cur == buf->last)
                        buf->cur = buf->first;
                else
                        buf->cur = list_entry(buf->cur->list.next, struct topa,
                                              list);
                buf->cur_idx = 0;
        }
}

/**
 * pt_update_head() - calculate current offsets and sizes
 * @pt:         Per-cpu pt context.
 *
 * Update buffer's current write pointer position and data size.
 */
static void pt_update_head(struct pt *pt)
{
        struct pt_buffer *buf = perf_get_aux(&pt->handle);
        u64 topa_idx, base, old;

        /* offset of this table in the buffer, plus offset within the current region */
        base = buf->cur->offset + buf->output_off;

        /* add the sizes of all regions in this table that precede the current one */
        for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
                base += sizes(buf->cur->table[topa_idx].size);

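        /*
         * In snapshot mode the data size is simply the current write offset;
         * otherwise, compare the new write pointer with the previous one and
         * allow for one wrap-around of the buffer in between, so that
         * data_size grows by the amount actually written.
         */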
        if (buf->snapshot) {
                local_set(&buf->data_size, base);
        } else {
                old = (local64_xchg(&buf->head, base) &
                       ((buf->nr_pages << PAGE_SHIFT) - 1));
                if (base < old)
                        base += buf->nr_pages << PAGE_SHIFT;

                local_add(base - old, &buf->data_size);
        }
}

/**
 * pt_buffer_region() - obtain current output region's address
 * @buf:        PT buffer.
 */
static void *pt_buffer_region(struct pt_buffer *buf)
{
        return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
}

/**
 * pt_buffer_region_size() - obtain current output region's size
 * @buf:        PT buffer.
 */
static size_t pt_buffer_region_size(struct pt_buffer *buf)
{
        return sizes(buf->cur->table[buf->cur_idx].size);
}

/**
 * pt_handle_status() - take care of possible status conditions
 * @pt:         Per-cpu pt context.
 */
static void pt_handle_status(struct pt *pt)
{
        struct pt_buffer *buf = perf_get_aux(&pt->handle);
        int advance = 0;
        u64 status;

        rdmsrl(MSR_IA32_RTIT_STATUS, status);

        if (status & RTIT_STATUS_ERROR) {
                pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
                pt_topa_dump(buf);
                status &= ~RTIT_STATUS_ERROR;
        }

        if (status & RTIT_STATUS_STOPPED) {
                status &= ~RTIT_STATUS_STOPPED;

                /*
                 * On systems that only do single-entry ToPA, hitting STOP
                 * means we are already losing data; need to let the decoder
                 * know.
                 */
                if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
                    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
                        local_inc(&buf->lost);
                        advance++;
                }
        }

        /*
         * Also, on single-entry ToPA implementations, the interrupt will
         * come before the output reaches its output region's boundary.
         */
        if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
            pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
                void *head = pt_buffer_region(buf);

                /* everything within this margin needs to be zeroed out */
                memset(head + buf->output_off, 0,
                       pt_buffer_region_size(buf) -
                       buf->output_off);
                advance++;
        }

        if (advance)
                pt_buffer_advance(buf);

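        /* Write the status back with the handled conditions cleared. */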
        wrmsrl(MSR_IA32_RTIT_STATUS, status);
}

/**
 * pt_read_offset() - translate registers into buffer pointers
 * @buf:        PT buffer.
 *
 * Set buffer's output pointers from MSR values.
 */
static void pt_read_offset(struct pt_buffer *buf)
{
        u64 offset, base_topa;

        rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
        buf->cur = phys_to_virt(base_topa);

        rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
        /* offset within current output region */
        buf->output_off = offset >> 32;
        /* index of current output region within this table */
        buf->cur_idx = (offset & 0xffffff80) >> 7;
}

/**
 * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
 * @buf:        PT buffer.
 * @pg:         Page offset in the buffer.
 *
 * When advancing to the next output region (ToPA entry), given a page offset
 * into the buffer, we need to find the offset of the first page in the next
 * region.
 */
static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
{
        struct topa_entry *te = buf->topa_index[pg];

        /* one region */
        if (buf->first == buf->last && buf->first->last == 1)
                return pg;

        do {
                pg++;
                pg &= buf->nr_pages - 1;
        } while (buf->topa_index[pg] == te);

        return pg;
}

/**
 * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
 * @buf:        PT buffer.
 * @handle:     Current output handle.
 *
 * Place INT and STOP marks to prevent overwriting old data that the consumer
 * hasn't yet collected.
 */
static int pt_buffer_reset_markers(struct pt_buffer *buf,
                                   struct perf_output_handle *handle)

{
        unsigned long head = local64_read(&buf->head);
        unsigned long idx, npages, wakeup;

        if (buf->snapshot)
                return 0;

        /* can't stop in the middle of an output region */
        if (buf->output_off + handle->size + 1 <
            sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
                return -EINVAL;


        /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
        if (!pt_cap_get(PT_CAP_topa_multiple_entries))
                return 0;

        /* clear STOP and INT from current entry */
        buf->topa_index[buf->stop_pos]->stop = 0;
        buf->topa_index[buf->intr_pos]->intr = 0;

        /* how many pages till the STOP marker */
        npages = handle->size >> PAGE_SHIFT;

        /* if it's on a page boundary, fill up one more page */
        if (!offset_in_page(head + handle->size + 1))
                npages++;

        idx = (head >> PAGE_SHIFT) + npages;
        idx &= buf->nr_pages - 1;
        buf->stop_pos = idx;

        wakeup = handle->wakeup >> PAGE_SHIFT;

        /* in the worst case, wake up the consumer one page before hard stop */
        idx = (head >> PAGE_SHIFT) + npages - 1;
        if (idx > wakeup)
                idx = wakeup;

        idx &= buf->nr_pages - 1;
        buf->intr_pos = idx;

        buf->topa_index[buf->stop_pos]->stop = 1;
        buf->topa_index[buf->intr_pos]->intr = 1;

        return 0;
}

/**
 * pt_buffer_setup_topa_index() - build topa_index[] table of regions
 * @buf:        PT buffer.
 *
 * topa_index[] references output regions indexed by offset into the
 * buffer for purposes of quick reverse lookup.
 */
static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
{
        struct topa *cur = buf->first, *prev = buf->last;
        struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
                *te_prev = TOPA_ENTRY(prev, prev->last - 1);
        int pg = 0, idx = 0, ntopa = 0;

        while (pg < buf->nr_pages) {
                int tidx;

                /* pages within one topa entry */
                for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
                        buf->topa_index[pg] = te_prev;

                te_prev = te_cur;

                if (idx == cur->last - 1) {
                        /* advance to next topa table */
                        idx = 0;
                        cur = list_entry(cur->list.next, struct topa, list);
                        ntopa++;
                } else
                        idx++;
                te_cur = TOPA_ENTRY(cur, idx);
        }

}

/**
 * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
 * @buf:        PT buffer.
 * @head:       Write pointer (aux_head) from AUX buffer.
 *
 * Find the ToPA table and entry corresponding to given @head and set buffer's
 * "current" pointers accordingly.
 */
static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
{
        int pg;

        if (buf->snapshot)
                head &= (buf->nr_pages << PAGE_SHIFT) - 1;

        pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
        pg = pt_topa_next_entry(buf, pg);

        buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
        buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
                        (unsigned long)buf->cur) / sizeof(struct topa_entry);
        buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);

        local64_set(&buf->head, head);
        local_set(&buf->data_size, 0);
}

/**
 * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
 * @buf:        PT buffer.
 */
static void pt_buffer_fini_topa(struct pt_buffer *buf)
{
        struct topa *topa, *iter;

        list_for_each_entry_safe(topa, iter, &buf->tables, list) {
                /*
                 * right now, this is in free_aux() path only, so
                 * no need to unlink this table from the list
                 */
                topa_free(topa);
        }
}

/**
 * pt_buffer_init_topa() - initialize ToPA table for pt buffer
 * @buf:        PT buffer.
 * @nr_pages:   Number of buffer pages to cover with ToPA entries.
 * @gfp:        Allocation flags.
 */
static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
                               gfp_t gfp)
{
        struct topa *topa;
        int err;

        topa = topa_alloc(buf->cpu, gfp);
        if (!topa)
                return -ENOMEM;

        topa_insert_table(buf, topa);

        while (buf->nr_pages < nr_pages) {
                err = topa_insert_pages(buf, gfp);
                if (err) {
                        pt_buffer_fini_topa(buf);
                        return -ENOMEM;
                }
        }

        pt_buffer_setup_topa_index(buf);

        /* link last table to the first one, unless we're double buffering */
        if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
                TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
                TOPA_ENTRY(buf->last, -1)->end = 1;
        }

        pt_topa_dump(buf);
        return 0;
}

/**
 * pt_buffer_setup_aux() - set up topa tables for a PT buffer
 * @cpu:        CPU on which to allocate; -1 means current.
 * @pages:      Array of pointers to buffer pages passed from perf core.
 * @nr_pages:   Number of pages in the buffer.
 * @snapshot:   If this is a snapshot/overwrite counter.
 *
 * This is a pmu::setup_aux callback that sets up ToPA tables and all the
 * bookkeeping for an AUX buffer.
 *
 * Return:      Our private PT buffer structure.
 */
static void *
pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
{
        struct pt_buffer *buf;
        int node, ret;

        if (!nr_pages)
                return NULL;

        if (cpu == -1)
                cpu = raw_smp_processor_id();
        node = cpu_to_node(cpu);

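        /*
         * topa_index[] is a variable-sized array at the end of struct
         * pt_buffer, one entry per buffer page, hence the offsetof()-based
         * allocation size.
         */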
        buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
                           GFP_KERNEL, node);
        if (!buf)
                return NULL;

        buf->cpu = cpu;
        buf->snapshot = snapshot;
        buf->data_pages = pages;

        INIT_LIST_HEAD(&buf->tables);

        ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
        if (ret) {
                kfree(buf);
                return NULL;
        }

        return buf;
}

/**
 * pt_buffer_free_aux() - perf AUX deallocation path callback
 * @data:       PT buffer.
 */
static void pt_buffer_free_aux(void *data)
{
        struct pt_buffer *buf = data;

        pt_buffer_fini_topa(buf);
        kfree(buf);
}

/**
 * pt_buffer_is_full() - check if the buffer is full
 * @buf:        PT buffer.
 * @pt:         Per-cpu pt handle.
 *
 * If the user hasn't read data from the output region that aux_head
 * points to, the buffer is considered full: the user needs to read at
 * least this region and update aux_tail to point past it.
 */
static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
{
        if (buf->snapshot)
                return false;

        if (local_read(&buf->data_size) >= pt->handle.size)
                return true;

        return false;
}

/**
 * intel_pt_interrupt() - PT PMI handler
 */
void intel_pt_interrupt(void)
{
        struct pt *pt = this_cpu_ptr(&pt_ctx);
        struct pt_buffer *buf;
        struct perf_event *event = pt->handle.event;

        /*
         * There may be a dangling PT bit in the interrupt status register
         * after PT has been disabled by pt_event_stop(). Make sure we don't
         * do anything (particularly, re-enable) for this event here.
         */
        if (!ACCESS_ONCE(pt->handle_nmi))
                return;

        pt_config_start(false);

        if (!event)
                return;

        buf = perf_get_aux(&pt->handle);
        if (!buf)
                return;

        pt_read_offset(buf);

        pt_handle_status(pt);

        pt_update_head(pt);

        perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
                            local_xchg(&buf->lost, 0));

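        /* Unless the event has been stopped, re-arm a fresh handle and resume tracing. */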
        if (!event->hw.state) {
                int ret;

                buf = perf_aux_output_begin(&pt->handle, event);
                if (!buf) {
                        event->hw.state = PERF_HES_STOPPED;
                        return;
                }

                pt_buffer_reset_offsets(buf, pt->handle.head);
                ret = pt_buffer_reset_markers(buf, &pt->handle);
                if (ret) {
                        perf_aux_output_end(&pt->handle, 0, true);
                        return;
                }

                pt_config_buffer(buf->cur->table, buf->cur_idx,
                                 buf->output_off);
                wrmsrl(MSR_IA32_RTIT_STATUS, 0);
                pt_config(event);
        }
}

/*
 * PMU callbacks
 */

static void pt_event_start(struct perf_event *event, int mode)
{
        struct pt *pt = this_cpu_ptr(&pt_ctx);
        struct pt_buffer *buf = perf_get_aux(&pt->handle);

        if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
                event->hw.state = PERF_HES_STOPPED;
                return;
        }

        ACCESS_ONCE(pt->handle_nmi) = 1;
        event->hw.state = 0;

        pt_config_buffer(buf->cur->table, buf->cur_idx,
                         buf->output_off);
        wrmsrl(MSR_IA32_RTIT_STATUS, 0);
        pt_config(event);
}

static void pt_event_stop(struct perf_event *event, int mode)
{
        struct pt *pt = this_cpu_ptr(&pt_ctx);

        /*
         * Protect against the PMI racing with disabling wrmsr,
         * see comment in intel_pt_interrupt().
         */
        ACCESS_ONCE(pt->handle_nmi) = 0;
        pt_config_start(false);

        if (event->hw.state == PERF_HES_STOPPED)
                return;

        event->hw.state = PERF_HES_STOPPED;

        if (mode & PERF_EF_UPDATE) {
                struct pt *pt = this_cpu_ptr(&pt_ctx);
                struct pt_buffer *buf = perf_get_aux(&pt->handle);

                if (!buf)
                        return;

                if (WARN_ON_ONCE(pt->handle.event != event))
                        return;

                pt_read_offset(buf);

                pt_handle_status(pt);

                pt_update_head(pt);
        }
}

static void pt_event_del(struct perf_event *event, int mode)
{
        struct pt *pt = this_cpu_ptr(&pt_ctx);
        struct pt_buffer *buf;

        pt_event_stop(event, PERF_EF_UPDATE);

        buf = perf_get_aux(&pt->handle);

        if (buf) {
                if (buf->snapshot)
                        pt->handle.head =
                                local_xchg(&buf->data_size,
                                           buf->nr_pages << PAGE_SHIFT);
                perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
                                    local_xchg(&buf->lost, 0));
        }
}

static int pt_event_add(struct perf_event *event, int mode)
{
        struct pt_buffer *buf;
        struct pt *pt = this_cpu_ptr(&pt_ctx);
        struct hw_perf_event *hwc = &event->hw;
        int ret = -EBUSY;

        if (pt->handle.event)
                goto fail;

        buf = perf_aux_output_begin(&pt->handle, event);
        ret = -EINVAL;
        if (!buf)
                goto fail_stop;

        pt_buffer_reset_offsets(buf, pt->handle.head);
        if (!buf->snapshot) {
                ret = pt_buffer_reset_markers(buf, &pt->handle);
                if (ret)
                        goto fail_end_stop;
        }

        if (mode & PERF_EF_START) {
                pt_event_start(event, 0);
                ret = -EBUSY;
                if (hwc->state == PERF_HES_STOPPED)
                        goto fail_end_stop;
        } else {
                hwc->state = PERF_HES_STOPPED;
        }

        return 0;

fail_end_stop:
        perf_aux_output_end(&pt->handle, 0, true);
fail_stop:
        hwc->state = PERF_HES_STOPPED;
fail:
        return ret;
}

static void pt_event_read(struct perf_event *event)
{
}

static void pt_event_destroy(struct perf_event *event)
{
        x86_del_exclusive(x86_lbr_exclusive_pt);
}

static int pt_event_init(struct perf_event *event)
{
        if (event->attr.type != pt_pmu.pmu.type)
                return -ENOENT;

        if (!pt_event_valid(event))
                return -EINVAL;

        if (x86_add_exclusive(x86_lbr_exclusive_pt))
                return -EBUSY;

        event->destroy = pt_event_destroy;

        return 0;
}

static __init int pt_init(void)
{
        int ret, cpu, prior_warn = 0;

        BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
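        /*
         * If any CPU already has TraceEn set at this point, some other agent
         * owns the PT hardware; mark it exclusive-taken and back off.
         */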
        get_online_cpus();
        for_each_online_cpu(cpu) {
                u64 ctl;

                ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
                if (!ret && (ctl & RTIT_CTL_TRACEEN))
                        prior_warn++;
        }
        put_online_cpus();

        if (prior_warn) {
                x86_add_exclusive(x86_lbr_exclusive_pt);
                pr_warn("PT is enabled at boot time, doing nothing\n");

                return -EBUSY;
        }

        ret = pt_pmu_hw_init();
        if (ret)
                return ret;

        if (!pt_cap_get(PT_CAP_topa_output)) {
                pr_warn("ToPA output is not supported on this CPU\n");
                return -ENODEV;
        }

        if (!pt_cap_get(PT_CAP_topa_multiple_entries))
                pt_pmu.pmu.capabilities =
                        PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;

        pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
        pt_pmu.pmu.attr_groups  = pt_attr_groups;
        pt_pmu.pmu.task_ctx_nr  = perf_sw_context;
        pt_pmu.pmu.event_init   = pt_event_init;
        pt_pmu.pmu.add          = pt_event_add;
        pt_pmu.pmu.del          = pt_event_del;
        pt_pmu.pmu.start        = pt_event_start;
        pt_pmu.pmu.stop         = pt_event_stop;
        pt_pmu.pmu.read         = pt_event_read;
        pt_pmu.pmu.setup_aux    = pt_buffer_setup_aux;
        pt_pmu.pmu.free_aux     = pt_buffer_free_aux;
        ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);

        return ret;
}

module_init(pt_init);