linux/arch/x86/events/intel/pt.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Intel(R) Processor Trace PMU driver for perf
   4 * Copyright (c) 2013-2014, Intel Corporation.
   5 *
   6 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
   7 * Programming Reference:
   8 * http://software.intel.com/en-us/intel-isa-extensions
   9 */
  10
  11#undef DEBUG
  12
  13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  14
  15#include <linux/types.h>
  16#include <linux/slab.h>
  17#include <linux/device.h>
  18
  19#include <asm/perf_event.h>
  20#include <asm/insn.h>
  21#include <asm/io.h>
  22#include <asm/intel_pt.h>
  23#include <asm/intel-family.h>
  24
  25#include "../perf_event.h"
  26#include "pt.h"
  27
  28static DEFINE_PER_CPU(struct pt, pt_ctx);
  29
  30static struct pt_pmu pt_pmu;
  31
  32/*
  33 * Capabilities of Intel PT hardware, such as number of address bits or
  34 * supported output schemes, are cached and exported to userspace as "caps"
  35 * attribute group of pt pmu device
  36 * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
  37 * relevant bits together with intel_pt traces.
  38 *
   39 * These are necessary both for trace decoding (payloads_lip, the address
   40 * width encoded in IP-related packets) and for event configuration (bitmasks with
  41 * permitted values for certain bit fields).
  42 */
  43#define PT_CAP(_n, _l, _r, _m)                                          \
  44        [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,        \
  45                            .reg = _r, .mask = _m }
  46
  47static struct pt_cap_desc {
  48        const char      *name;
  49        u32             leaf;
  50        u8              reg;
  51        u32             mask;
  52} pt_caps[] = {
  53        PT_CAP(max_subleaf,             0, CPUID_EAX, 0xffffffff),
  54        PT_CAP(cr3_filtering,           0, CPUID_EBX, BIT(0)),
  55        PT_CAP(psb_cyc,                 0, CPUID_EBX, BIT(1)),
  56        PT_CAP(ip_filtering,            0, CPUID_EBX, BIT(2)),
  57        PT_CAP(mtc,                     0, CPUID_EBX, BIT(3)),
  58        PT_CAP(ptwrite,                 0, CPUID_EBX, BIT(4)),
  59        PT_CAP(power_event_trace,       0, CPUID_EBX, BIT(5)),
  60        PT_CAP(topa_output,             0, CPUID_ECX, BIT(0)),
  61        PT_CAP(topa_multiple_entries,   0, CPUID_ECX, BIT(1)),
  62        PT_CAP(single_range_output,     0, CPUID_ECX, BIT(2)),
  63        PT_CAP(output_subsys,           0, CPUID_ECX, BIT(3)),
  64        PT_CAP(payloads_lip,            0, CPUID_ECX, BIT(31)),
  65        PT_CAP(num_address_ranges,      1, CPUID_EAX, 0x3),
  66        PT_CAP(mtc_periods,             1, CPUID_EAX, 0xffff0000),
  67        PT_CAP(cycle_thresholds,        1, CPUID_EBX, 0xffff),
  68        PT_CAP(psb_periods,             1, CPUID_EBX, 0xffff0000),
  69};
  70
  71u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
  72{
  73        struct pt_cap_desc *cd = &pt_caps[capability];
  74        u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
  75        unsigned int shift = __ffs(cd->mask);
  76
  77        return (c & cd->mask) >> shift;
  78}
  79EXPORT_SYMBOL_GPL(intel_pt_validate_cap);
  80
  81u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
  82{
  83        return intel_pt_validate_cap(pt_pmu.caps, cap);
  84}
  85EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);
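
     /*
      * For illustration: with the pt_caps[] table above, a capability lookup
      * boils down to a mask-and-shift on the cached CPUID words. E.g. for
      * psb_periods (leaf 1, CPUID_EBX, mask 0xffff0000):
      *
      *     u32 ebx = pt_pmu.caps[1 * PT_CPUID_REGS_NUM + CPUID_EBX];
      *     u32 psb_periods = (ebx & 0xffff0000) >> 16;
      *
      * which is what intel_pt_validate_hw_cap(PT_CAP_psb_periods) returns
      * (16 being __ffs(0xffff0000)).
      */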
  86
  87static ssize_t pt_cap_show(struct device *cdev,
  88                           struct device_attribute *attr,
  89                           char *buf)
  90{
  91        struct dev_ext_attribute *ea =
  92                container_of(attr, struct dev_ext_attribute, attr);
  93        enum pt_capabilities cap = (long)ea->var;
  94
  95        return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap));
  96}
  97
  98static struct attribute_group pt_cap_group __ro_after_init = {
  99        .name   = "caps",
 100};
 101
 102PMU_FORMAT_ATTR(pt,             "config:0"      );
 103PMU_FORMAT_ATTR(cyc,            "config:1"      );
 104PMU_FORMAT_ATTR(pwr_evt,        "config:4"      );
 105PMU_FORMAT_ATTR(fup_on_ptw,     "config:5"      );
 106PMU_FORMAT_ATTR(mtc,            "config:9"      );
 107PMU_FORMAT_ATTR(tsc,            "config:10"     );
 108PMU_FORMAT_ATTR(noretcomp,      "config:11"     );
 109PMU_FORMAT_ATTR(ptw,            "config:12"     );
 110PMU_FORMAT_ATTR(branch,         "config:13"     );
 111PMU_FORMAT_ATTR(mtc_period,     "config:14-17"  );
 112PMU_FORMAT_ATTR(cyc_thresh,     "config:19-22"  );
 113PMU_FORMAT_ATTR(psb_period,     "config:24-27"  );
 114
 115static struct attribute *pt_formats_attr[] = {
 116        &format_attr_pt.attr,
 117        &format_attr_cyc.attr,
 118        &format_attr_pwr_evt.attr,
 119        &format_attr_fup_on_ptw.attr,
 120        &format_attr_mtc.attr,
 121        &format_attr_tsc.attr,
 122        &format_attr_noretcomp.attr,
 123        &format_attr_ptw.attr,
 124        &format_attr_branch.attr,
 125        &format_attr_mtc_period.attr,
 126        &format_attr_cyc_thresh.attr,
 127        &format_attr_psb_period.attr,
 128        NULL,
 129};
 130
 131static struct attribute_group pt_format_group = {
 132        .name   = "format",
 133        .attrs  = pt_formats_attr,
 134};
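
     /*
      * For illustration, the format attributes above define how userspace
      * encodes perf_event_attr::config; a hypothetical event enabling CYC
      * packets with cyc_thresh == 4 and psb_period == 5 would use:
      *
      *     attr.config = (1ULL << 1)      // cyc         (config:1)
      *                 | (4ULL << 19)     // cyc_thresh  (config:19-22)
      *                 | (5ULL << 24);    // psb_period  (config:24-27)
      *
      * roughly what the perf tool's "intel_pt/cyc,cyc_thresh=4,psb_period=5/"
      * event syntax would produce.
      */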
 135
 136static ssize_t
 137pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
 138                    char *page)
 139{
 140        struct perf_pmu_events_attr *pmu_attr =
 141                container_of(attr, struct perf_pmu_events_attr, attr);
 142
 143        switch (pmu_attr->id) {
 144        case 0:
 145                return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
 146        case 1:
 147                return sprintf(page, "%u:%u\n",
 148                               pt_pmu.tsc_art_num,
 149                               pt_pmu.tsc_art_den);
 150        default:
 151                break;
 152        }
 153
 154        return -EINVAL;
 155}
 156
 157PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
 158               pt_timing_attr_show);
 159PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
 160               pt_timing_attr_show);
 161
 162static struct attribute *pt_timing_attr[] = {
 163        &timing_attr_max_nonturbo_ratio.attr.attr,
 164        &timing_attr_tsc_art_ratio.attr.attr,
 165        NULL,
 166};
 167
 168static struct attribute_group pt_timing_group = {
 169        .attrs  = pt_timing_attr,
 170};
 171
 172static const struct attribute_group *pt_attr_groups[] = {
 173        &pt_cap_group,
 174        &pt_format_group,
 175        &pt_timing_group,
 176        NULL,
 177};
 178
 179static int __init pt_pmu_hw_init(void)
 180{
 181        struct dev_ext_attribute *de_attrs;
 182        struct attribute **attrs;
 183        size_t size;
 184        u64 reg;
 185        int ret;
 186        long i;
 187
 188        rdmsrl(MSR_PLATFORM_INFO, reg);
 189        pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;
 190
 191        /*
  192         * If available, read the TSC to core crystal clock ratio;
  193         * otherwise, a numerator of zero stands for "not enumerated",
  194         * as per the SDM.
 195         */
 196        if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
 197                u32 eax, ebx, ecx, edx;
 198
 199                cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
 200
 201                pt_pmu.tsc_art_num = ebx;
 202                pt_pmu.tsc_art_den = eax;
 203        }
 204
 205        /* model-specific quirks */
 206        switch (boot_cpu_data.x86_model) {
 207        case INTEL_FAM6_BROADWELL:
 208        case INTEL_FAM6_BROADWELL_D:
 209        case INTEL_FAM6_BROADWELL_G:
 210        case INTEL_FAM6_BROADWELL_X:
 211                /* not setting BRANCH_EN will #GP, erratum BDM106 */
 212                pt_pmu.branch_en_always_on = true;
 213                break;
 214        default:
 215                break;
 216        }
 217
 218        if (boot_cpu_has(X86_FEATURE_VMX)) {
 219                /*
 220                 * Intel SDM, 36.5 "Tracing post-VMXON" says that
 221                 * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
 222                 * post-VMXON.
 223                 */
 224                rdmsrl(MSR_IA32_VMX_MISC, reg);
 225                if (reg & BIT(14))
 226                        pt_pmu.vmx = true;
 227        }
 228
 229        attrs = NULL;
 230
 231        for (i = 0; i < PT_CPUID_LEAVES; i++) {
 232                cpuid_count(20, i,
 233                            &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM],
 234                            &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM],
 235                            &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM],
 236                            &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]);
 237        }
 238
 239        ret = -ENOMEM;
 240        size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
 241        attrs = kzalloc(size, GFP_KERNEL);
 242        if (!attrs)
 243                goto fail;
 244
 245        size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
 246        de_attrs = kzalloc(size, GFP_KERNEL);
 247        if (!de_attrs)
 248                goto fail;
 249
 250        for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
 251                struct dev_ext_attribute *de_attr = de_attrs + i;
 252
 253                de_attr->attr.attr.name = pt_caps[i].name;
 254
 255                sysfs_attr_init(&de_attr->attr.attr);
 256
 257                de_attr->attr.attr.mode         = S_IRUGO;
 258                de_attr->attr.show              = pt_cap_show;
 259                de_attr->var                    = (void *)i;
 260
 261                attrs[i] = &de_attr->attr.attr;
 262        }
 263
 264        pt_cap_group.attrs = attrs;
 265
 266        return 0;
 267
 268fail:
 269        kfree(attrs);
 270
 271        return ret;
 272}
 273
 274#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC     | \
 275                          RTIT_CTL_CYC_THRESH   | \
 276                          RTIT_CTL_PSB_FREQ)
 277
 278#define RTIT_CTL_MTC    (RTIT_CTL_MTC_EN        | \
 279                         RTIT_CTL_MTC_RANGE)
 280
 281#define RTIT_CTL_PTW    (RTIT_CTL_PTW_EN        | \
 282                         RTIT_CTL_FUP_ON_PTW)
 283
 284/*
 285 * Bit 0 (TraceEn) in the attr.config is meaningless as the
 286 * corresponding bit in the RTIT_CTL can only be controlled
 287 * by the driver; therefore, repurpose it to mean: pass
 288 * through the bit that was previously assumed to be always
 289 * on for PT, thereby allowing the user to *not* set it if
 290 * they so wish. See also pt_event_valid() and pt_config().
 291 */
 292#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN
 293
 294#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN        | \
 295                        RTIT_CTL_TSC_EN         | \
 296                        RTIT_CTL_DISRETC        | \
 297                        RTIT_CTL_BRANCH_EN      | \
 298                        RTIT_CTL_CYC_PSB        | \
 299                        RTIT_CTL_MTC            | \
 300                        RTIT_CTL_PWR_EVT_EN     | \
 301                        RTIT_CTL_FUP_ON_PTW     | \
 302                        RTIT_CTL_PTW_EN)
 303
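     /*
      * pt_event_valid() - check that attr.config only requests features that
      * the hardware advertises in its PT capabilities.
      *
      * A rough sketch of the combinations of the PASSTHROUGH bit (bit 0) and
      * BRANCH_EN (bit 13) that survive the checks below, assuming all other
      * config bits are valid:
      *
      *     bit 0   bit 13  result
      *       0       0     valid; pt_config() forces BRANCH_EN on
      *       0       1     invalid (event creation fails with -EINVAL)
      *       1       0     valid, unless branch_en_always_on (BDM106)
      *       1       1     valid; BRANCH_EN is passed through as given
      */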
 304static bool pt_event_valid(struct perf_event *event)
 305{
 306        u64 config = event->attr.config;
 307        u64 allowed, requested;
 308
 309        if ((config & PT_CONFIG_MASK) != config)
 310                return false;
 311
 312        if (config & RTIT_CTL_CYC_PSB) {
 313                if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc))
 314                        return false;
 315
 316                allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods);
 317                requested = (config & RTIT_CTL_PSB_FREQ) >>
 318                        RTIT_CTL_PSB_FREQ_OFFSET;
 319                if (requested && (!(allowed & BIT(requested))))
 320                        return false;
 321
 322                allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds);
 323                requested = (config & RTIT_CTL_CYC_THRESH) >>
 324                        RTIT_CTL_CYC_THRESH_OFFSET;
 325                if (requested && (!(allowed & BIT(requested))))
 326                        return false;
 327        }
 328
 329        if (config & RTIT_CTL_MTC) {
 330                /*
 331                 * In the unlikely case that CPUID lists valid mtc periods,
 332                 * but not the mtc capability, drop out here.
 333                 *
 334                 * Spec says that setting mtc period bits while mtc bit in
 335                 * CPUID is 0 will #GP, so better safe than sorry.
 336                 */
 337                if (!intel_pt_validate_hw_cap(PT_CAP_mtc))
 338                        return false;
 339
 340                allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods);
 341                if (!allowed)
 342                        return false;
 343
 344                requested = (config & RTIT_CTL_MTC_RANGE) >>
 345                        RTIT_CTL_MTC_RANGE_OFFSET;
 346
 347                if (!(allowed & BIT(requested)))
 348                        return false;
 349        }
 350
 351        if (config & RTIT_CTL_PWR_EVT_EN &&
 352            !intel_pt_validate_hw_cap(PT_CAP_power_event_trace))
 353                return false;
 354
 355        if (config & RTIT_CTL_PTW) {
 356                if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite))
 357                        return false;
 358
 359                /* FUPonPTW without PTW doesn't make sense */
 360                if ((config & RTIT_CTL_FUP_ON_PTW) &&
 361                    !(config & RTIT_CTL_PTW_EN))
 362                        return false;
 363        }
 364
 365        /*
 366         * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
  367 * clears the assumption that BranchEn must always be enabled,
 368         * as was the case with the first implementation of PT.
 369         * If this bit is not set, the legacy behavior is preserved
 370         * for compatibility with the older userspace.
 371         *
 372         * Re-using bit 0 for this purpose is fine because it is never
 373         * directly set by the user; previous attempts at setting it in
 374         * the attr.config resulted in -EINVAL.
 375         */
 376        if (config & RTIT_CTL_PASSTHROUGH) {
 377                /*
 378                 * Disallow not setting BRANCH_EN where BRANCH_EN is
 379                 * always required.
 380                 */
 381                if (pt_pmu.branch_en_always_on &&
 382                    !(config & RTIT_CTL_BRANCH_EN))
 383                        return false;
 384        } else {
 385                /*
 386                 * Disallow BRANCH_EN without the PASSTHROUGH.
 387                 */
 388                if (config & RTIT_CTL_BRANCH_EN)
 389                        return false;
 390        }
 391
 392        return true;
 393}
 394
 395/*
 396 * PT configuration helpers
 397 * These all are cpu affine and operate on a local PT
 398 */
 399
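     /*
      * Enable tracing on the local CPU by setting TraceEn in the event's
      * cached RTIT_CTL value. If the CPU is currently in VMX operation, the
      * MSR write is skipped and the trace is flagged PARTIAL instead, since
      * it will contain a gap; see intel_pt_handle_vmx().
      */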
 400static void pt_config_start(struct perf_event *event)
 401{
 402        struct pt *pt = this_cpu_ptr(&pt_ctx);
 403        u64 ctl = event->hw.config;
 404
 405        ctl |= RTIT_CTL_TRACEEN;
 406        if (READ_ONCE(pt->vmx_on))
 407                perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
 408        else
 409                wrmsrl(MSR_IA32_RTIT_CTL, ctl);
 410
 411        WRITE_ONCE(event->hw.config, ctl);
 412}
 413
 414/* Address ranges and their corresponding msr configuration registers */
 415static const struct pt_address_range {
 416        unsigned long   msr_a;
 417        unsigned long   msr_b;
 418        unsigned int    reg_off;
 419} pt_address_ranges[] = {
 420        {
 421                .msr_a   = MSR_IA32_RTIT_ADDR0_A,
 422                .msr_b   = MSR_IA32_RTIT_ADDR0_B,
 423                .reg_off = RTIT_CTL_ADDR0_OFFSET,
 424        },
 425        {
 426                .msr_a   = MSR_IA32_RTIT_ADDR1_A,
 427                .msr_b   = MSR_IA32_RTIT_ADDR1_B,
 428                .reg_off = RTIT_CTL_ADDR1_OFFSET,
 429        },
 430        {
 431                .msr_a   = MSR_IA32_RTIT_ADDR2_A,
 432                .msr_b   = MSR_IA32_RTIT_ADDR2_B,
 433                .reg_off = RTIT_CTL_ADDR2_OFFSET,
 434        },
 435        {
 436                .msr_a   = MSR_IA32_RTIT_ADDR3_A,
 437                .msr_b   = MSR_IA32_RTIT_ADDR3_B,
 438                .reg_off = RTIT_CTL_ADDR3_OFFSET,
 439        }
 440};
 441
 442static u64 pt_config_filters(struct perf_event *event)
 443{
 444        struct pt_filters *filters = event->hw.addr_filters;
 445        struct pt *pt = this_cpu_ptr(&pt_ctx);
 446        unsigned int range = 0;
 447        u64 rtit_ctl = 0;
 448
 449        if (!filters)
 450                return 0;
 451
 452        perf_event_addr_filters_sync(event);
 453
 454        for (range = 0; range < filters->nr_filters; range++) {
 455                struct pt_filter *filter = &filters->filter[range];
 456
 457                /*
 458                 * Note, if the range has zero start/end addresses due
 459                 * to its dynamic object not being loaded yet, we just
  460                 * go ahead and program a zeroed range, which will simply
 461                 * produce no data. Note^2: if executable code at 0x0
 462                 * is a concern, we can set up an "invalid" configuration
 463                 * such as msr_b < msr_a.
 464                 */
 465
 466                /* avoid redundant msr writes */
 467                if (pt->filters.filter[range].msr_a != filter->msr_a) {
 468                        wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
 469                        pt->filters.filter[range].msr_a = filter->msr_a;
 470                }
 471
 472                if (pt->filters.filter[range].msr_b != filter->msr_b) {
 473                        wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
 474                        pt->filters.filter[range].msr_b = filter->msr_b;
 475                }
 476
 477                rtit_ctl |= filter->config << pt_address_ranges[range].reg_off;
 478        }
 479
 480        return rtit_ctl;
 481}
 482
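     /*
      * Build the full RTIT_CTL value for @event: address filter bits, ToPA
      * vs. single-range output, BRANCH_EN handling (see the comment inside),
      * OS/USR filtering from the exclude_* attributes, plus the remaining
      * bits taken verbatim from attr.config; then cache it in hw.config and
      * start the trace.
      */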
 483static void pt_config(struct perf_event *event)
 484{
 485        struct pt *pt = this_cpu_ptr(&pt_ctx);
 486        struct pt_buffer *buf = perf_get_aux(&pt->handle);
 487        u64 reg;
 488
 489        /* First round: clear STATUS, in particular the PSB byte counter. */
 490        if (!event->hw.config) {
 491                perf_event_itrace_started(event);
 492                wrmsrl(MSR_IA32_RTIT_STATUS, 0);
 493        }
 494
 495        reg = pt_config_filters(event);
 496        reg |= RTIT_CTL_TRACEEN;
 497        if (!buf->single)
 498                reg |= RTIT_CTL_TOPA;
 499
 500        /*
 501         * Previously, we had BRANCH_EN on by default, but now that PT has
 502         * grown features outside of branch tracing, it is useful to allow
 503         * the user to disable it. Setting bit 0 in the event's attr.config
 504         * allows BRANCH_EN to pass through instead of being always on. See
 505         * also the comment in pt_event_valid().
 506         */
 507        if (event->attr.config & BIT(0)) {
 508                reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
 509        } else {
 510                reg |= RTIT_CTL_BRANCH_EN;
 511        }
 512
 513        if (!event->attr.exclude_kernel)
 514                reg |= RTIT_CTL_OS;
 515        if (!event->attr.exclude_user)
 516                reg |= RTIT_CTL_USR;
 517
 518        reg |= (event->attr.config & PT_CONFIG_MASK);
 519
 520        event->hw.config = reg;
 521        pt_config_start(event);
 522}
 523
 524static void pt_config_stop(struct perf_event *event)
 525{
 526        struct pt *pt = this_cpu_ptr(&pt_ctx);
 527        u64 ctl = READ_ONCE(event->hw.config);
 528
 529        /* may be already stopped by a PMI */
 530        if (!(ctl & RTIT_CTL_TRACEEN))
 531                return;
 532
 533        ctl &= ~RTIT_CTL_TRACEEN;
 534        if (!READ_ONCE(pt->vmx_on))
 535                wrmsrl(MSR_IA32_RTIT_CTL, ctl);
 536
 537        WRITE_ONCE(event->hw.config, ctl);
 538
 539        /*
 540         * A wrmsr that disables trace generation serializes other PT
 541         * registers and causes all data packets to be written to memory,
 542         * but a fence is required for the data to become globally visible.
 543         *
  544         * The WMB below, separating the data store and the aux_head store, matches
 545         * the consumer's RMB that separates aux_head load and data load.
 546         */
 547        wmb();
 548}
 549
 550/**
 551 * struct topa - ToPA metadata
 552 * @list:       linkage to struct pt_buffer's list of tables
 553 * @offset:     offset of the first entry in this table in the buffer
 554 * @size:       total size of all entries in this table
 555 * @last:       index of the last initialized entry in this table
 556 * @z_count:    how many times the first entry repeats
 557 */
 558struct topa {
 559        struct list_head        list;
 560        u64                     offset;
 561        size_t                  size;
 562        int                     last;
 563        unsigned int            z_count;
 564};
 565
 566/*
 567 * Keep ToPA table-related metadata on the same page as the actual table,
 568 * taking up a few words from the top
 569 */
 570
 571#define TENTS_PER_PAGE  \
 572        ((PAGE_SIZE - sizeof(struct topa)) / sizeof(struct topa_entry))
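
     /*
      * For a feel of the numbers: on x86_64 with 4KiB pages, assuming an
      * 8-byte struct topa_entry and the 40-byte struct topa above, this
      * works out to (4096 - 40) / 8 = 507 entries per table.
      */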
 573
 574/**
 575 * struct topa_page - page-sized ToPA table with metadata at the top
 576 * @table:      actual ToPA table entries, as understood by PT hardware
 577 * @topa:       metadata
 578 */
 579struct topa_page {
 580        struct topa_entry       table[TENTS_PER_PAGE];
 581        struct topa             topa;
 582};
 583
 584static inline struct topa_page *topa_to_page(struct topa *topa)
 585{
 586        return container_of(topa, struct topa_page, topa);
 587}
 588
 589static inline struct topa_page *topa_entry_to_page(struct topa_entry *te)
 590{
 591        return (struct topa_page *)((unsigned long)te & PAGE_MASK);
 592}
 593
 594static inline phys_addr_t topa_pfn(struct topa *topa)
 595{
 596        return PFN_DOWN(virt_to_phys(topa_to_page(topa)));
 597}
 598
 599/* make -1 stand for the last table entry */
 600#define TOPA_ENTRY(t, i)                                \
 601        ((i) == -1                                      \
 602                ? &topa_to_page(t)->table[(t)->last]    \
 603                : &topa_to_page(t)->table[(i)])
 604#define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size))
 605#define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size)
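
     /*
      * The size field of a ToPA entry is the log2 of the output region size
      * in pages: an entry with ->size == 2, for example, describes a
      * TOPA_ENTRY_PAGES() == 4 page (16KiB with 4KiB pages) output region.
      */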
 606
 607static void pt_config_buffer(struct pt_buffer *buf)
 608{
 609        struct pt *pt = this_cpu_ptr(&pt_ctx);
 610        u64 reg, mask;
 611        void *base;
 612
 613        if (buf->single) {
 614                base = buf->data_pages[0];
 615                mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7;
 616        } else {
 617                base = topa_to_page(buf->cur)->table;
 618                mask = (u64)buf->cur_idx;
 619        }
 620
 621        reg = virt_to_phys(base);
 622        if (pt->output_base != reg) {
 623                pt->output_base = reg;
 624                wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, reg);
 625        }
 626
 627        reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
 628        if (pt->output_mask != reg) {
 629                pt->output_mask = reg;
 630                wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
 631        }
 632}
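
     /*
      * The RTIT_OUTPUT_MASK value programmed by pt_config_buffer() above
      * packs three fields, which pt_read_offset() later unpacks the same way:
      *
      *     bits 6:0        always set (0x7f)
      *     bits 31:7       ToPA table index, or the size mask for single range
      *     bits 63:32      offset within the current output region
      */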
 633
 634/**
 635 * topa_alloc() - allocate page-sized ToPA table
 636 * @cpu:        CPU on which to allocate.
 637 * @gfp:        Allocation flags.
 638 *
 639 * Return:      On success, return the pointer to ToPA table page.
 640 */
 641static struct topa *topa_alloc(int cpu, gfp_t gfp)
 642{
 643        int node = cpu_to_node(cpu);
 644        struct topa_page *tp;
 645        struct page *p;
 646
 647        p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
 648        if (!p)
 649                return NULL;
 650
 651        tp = page_address(p);
 652        tp->topa.last = 0;
 653
 654        /*
  655         * In case of single-entry ToPA, always put the self-referencing END
 656         * link as the 2nd entry in the table
 657         */
 658        if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
 659                TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT;
 660                TOPA_ENTRY(&tp->topa, 1)->end = 1;
 661        }
 662
 663        return &tp->topa;
 664}
 665
 666/**
 667 * topa_free() - free a page-sized ToPA table
 668 * @topa:       Table to deallocate.
 669 */
 670static void topa_free(struct topa *topa)
 671{
 672        free_page((unsigned long)topa);
 673}
 674
 675/**
 676 * topa_insert_table() - insert a ToPA table into a buffer
 677 * @buf:         PT buffer that's being extended.
 678 * @topa:        New topa table to be inserted.
 679 *
 680 * If it's the first table in this buffer, set up buffer's pointers
  681 * accordingly; otherwise, add an END=1 link entry pointing to @topa to the
  682 * current "last" table and adjust the last table pointer to @topa.
 683 */
 684static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
 685{
 686        struct topa *last = buf->last;
 687
 688        list_add_tail(&topa->list, &buf->tables);
 689
 690        if (!buf->first) {
 691                buf->first = buf->last = buf->cur = topa;
 692                return;
 693        }
 694
 695        topa->offset = last->offset + last->size;
 696        buf->last = topa;
 697
 698        if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
 699                return;
 700
 701        BUG_ON(last->last != TENTS_PER_PAGE - 1);
 702
 703        TOPA_ENTRY(last, -1)->base = topa_pfn(topa);
 704        TOPA_ENTRY(last, -1)->end = 1;
 705}
 706
 707/**
 708 * topa_table_full() - check if a ToPA table is filled up
 709 * @topa:       ToPA table.
 710 */
 711static bool topa_table_full(struct topa *topa)
 712{
 713        /* single-entry ToPA is a special case */
 714        if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
 715                return !!topa->last;
 716
 717        return topa->last == TENTS_PER_PAGE - 1;
 718}
 719
 720/**
 721 * topa_insert_pages() - create a list of ToPA tables
 722 * @buf:        PT buffer being initialized.
  723 * @cpu:        CPU on which to allocate a new ToPA table, if needed.
      * @gfp:        Allocation flags.
 724 *
 725 * This initializes a list of ToPA tables with entries from
 726 * the data_pages provided by rb_alloc_aux().
 727 *
 728 * Return:      0 on success or error code.
 729 */
 730static int topa_insert_pages(struct pt_buffer *buf, int cpu, gfp_t gfp)
 731{
 732        struct topa *topa = buf->last;
 733        int order = 0;
 734        struct page *p;
 735
 736        p = virt_to_page(buf->data_pages[buf->nr_pages]);
 737        if (PagePrivate(p))
 738                order = page_private(p);
 739
 740        if (topa_table_full(topa)) {
 741                topa = topa_alloc(cpu, gfp);
 742                if (!topa)
 743                        return -ENOMEM;
 744
 745                topa_insert_table(buf, topa);
 746        }
 747
 748        if (topa->z_count == topa->last - 1) {
 749                if (order == TOPA_ENTRY(topa, topa->last - 1)->size)
 750                        topa->z_count++;
 751        }
 752
 753        TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
 754        TOPA_ENTRY(topa, -1)->size = order;
 755        if (!buf->snapshot &&
 756            !intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
 757                TOPA_ENTRY(topa, -1)->intr = 1;
 758                TOPA_ENTRY(topa, -1)->stop = 1;
 759        }
 760
 761        topa->last++;
 762        topa->size += sizes(order);
 763
 764        buf->nr_pages += 1ul << order;
 765
 766        return 0;
 767}
 768
 769/**
 770 * pt_topa_dump() - print ToPA tables and their entries
 771 * @buf:        PT buffer.
 772 */
 773static void pt_topa_dump(struct pt_buffer *buf)
 774{
 775        struct topa *topa;
 776
 777        list_for_each_entry(topa, &buf->tables, list) {
 778                struct topa_page *tp = topa_to_page(topa);
 779                int i;
 780
 781                pr_debug("# table @%p, off %llx size %zx\n", tp->table,
 782                         topa->offset, topa->size);
 783                for (i = 0; i < TENTS_PER_PAGE; i++) {
 784                        pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
 785                                 &tp->table[i],
 786                                 (unsigned long)tp->table[i].base << TOPA_SHIFT,
 787                                 sizes(tp->table[i].size),
 788                                 tp->table[i].end ?  'E' : ' ',
 789                                 tp->table[i].intr ? 'I' : ' ',
 790                                 tp->table[i].stop ? 'S' : ' ',
 791                                 *(u64 *)&tp->table[i]);
 792                        if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
 793                             tp->table[i].stop) ||
 794                            tp->table[i].end)
 795                                break;
 796                        if (!i && topa->z_count)
 797                                i += topa->z_count;
 798                }
 799        }
 800}
 801
 802/**
 803 * pt_buffer_advance() - advance to the next output region
 804 * @buf:        PT buffer.
 805 *
 806 * Advance the current pointers in the buffer to the next ToPA entry.
 807 */
 808static void pt_buffer_advance(struct pt_buffer *buf)
 809{
 810        buf->output_off = 0;
 811        buf->cur_idx++;
 812
 813        if (buf->cur_idx == buf->cur->last) {
 814                if (buf->cur == buf->last)
 815                        buf->cur = buf->first;
 816                else
 817                        buf->cur = list_entry(buf->cur->list.next, struct topa,
 818                                              list);
 819                buf->cur_idx = 0;
 820        }
 821}
 822
 823/**
 824 * pt_update_head() - calculate current offsets and sizes
 825 * @pt:         Per-cpu pt context.
 826 *
 827 * Update buffer's current write pointer position and data size.
 828 */
 829static void pt_update_head(struct pt *pt)
 830{
 831        struct pt_buffer *buf = perf_get_aux(&pt->handle);
 832        u64 topa_idx, base, old;
 833
 834        if (buf->single) {
 835                local_set(&buf->data_size, buf->output_off);
 836                return;
 837        }
 838
 839        /* offset of the first region in this table from the beginning of buf */
 840        base = buf->cur->offset + buf->output_off;
 841
 842        /* offset of the current output region within this table */
 843        for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
 844                base += TOPA_ENTRY_SIZE(buf->cur, topa_idx);
 845
 846        if (buf->snapshot) {
 847                local_set(&buf->data_size, base);
 848        } else {
 849                old = (local64_xchg(&buf->head, base) &
 850                       ((buf->nr_pages << PAGE_SHIFT) - 1));
 851                if (base < old)
 852                        base += buf->nr_pages << PAGE_SHIFT;
 853
 854                local_add(base - old, &buf->data_size);
 855        }
 856}
 857
 858/**
 859 * pt_buffer_region() - obtain current output region's address
 860 * @buf:        PT buffer.
 861 */
 862static void *pt_buffer_region(struct pt_buffer *buf)
 863{
 864        return phys_to_virt(TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT);
 865}
 866
 867/**
 868 * pt_buffer_region_size() - obtain current output region's size
 869 * @buf:        PT buffer.
 870 */
 871static size_t pt_buffer_region_size(struct pt_buffer *buf)
 872{
 873        return TOPA_ENTRY_SIZE(buf->cur, buf->cur_idx);
 874}
 875
 876/**
 877 * pt_handle_status() - take care of possible status conditions
 878 * @pt:         Per-cpu pt context.
 879 */
 880static void pt_handle_status(struct pt *pt)
 881{
 882        struct pt_buffer *buf = perf_get_aux(&pt->handle);
 883        int advance = 0;
 884        u64 status;
 885
 886        rdmsrl(MSR_IA32_RTIT_STATUS, status);
 887
 888        if (status & RTIT_STATUS_ERROR) {
 889                pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
 890                pt_topa_dump(buf);
 891                status &= ~RTIT_STATUS_ERROR;
 892        }
 893
 894        if (status & RTIT_STATUS_STOPPED) {
 895                status &= ~RTIT_STATUS_STOPPED;
 896
 897                /*
 898                 * On systems that only do single-entry ToPA, hitting STOP
 899                 * means we are already losing data; need to let the decoder
 900                 * know.
 901                 */
 902                if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) ||
 903                    buf->output_off == pt_buffer_region_size(buf)) {
 904                        perf_aux_output_flag(&pt->handle,
 905                                             PERF_AUX_FLAG_TRUNCATED);
 906                        advance++;
 907                }
 908        }
 909
 910        /*
  911         * Also, on single-entry ToPA implementations, the interrupt will come
 912         * before the output reaches its output region's boundary.
 913         */
 914        if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
 915            !buf->snapshot &&
 916            pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
 917                void *head = pt_buffer_region(buf);
 918
 919                /* everything within this margin needs to be zeroed out */
 920                memset(head + buf->output_off, 0,
 921                       pt_buffer_region_size(buf) -
 922                       buf->output_off);
 923                advance++;
 924        }
 925
 926        if (advance)
 927                pt_buffer_advance(buf);
 928
 929        wrmsrl(MSR_IA32_RTIT_STATUS, status);
 930}
 931
 932/**
 933 * pt_read_offset() - translate registers into buffer pointers
 934 * @buf:        PT buffer.
 935 *
 936 * Set buffer's output pointers from MSR values.
 937 */
 938static void pt_read_offset(struct pt_buffer *buf)
 939{
 940        struct pt *pt = this_cpu_ptr(&pt_ctx);
 941        struct topa_page *tp;
 942
 943        if (!buf->single) {
 944                rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base);
 945                tp = phys_to_virt(pt->output_base);
 946                buf->cur = &tp->topa;
 947        }
 948
 949        rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask);
 950        /* offset within current output region */
 951        buf->output_off = pt->output_mask >> 32;
 952        /* index of current output region within this table */
 953        if (!buf->single)
 954                buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7;
 955}
 956
 957static struct topa_entry *
 958pt_topa_entry_for_page(struct pt_buffer *buf, unsigned int pg)
 959{
 960        struct topa_page *tp;
 961        struct topa *topa;
 962        unsigned int idx, cur_pg = 0, z_pg = 0, start_idx = 0;
 963
 964        /*
 965         * Indicates a bug in the caller.
 966         */
 967        if (WARN_ON_ONCE(pg >= buf->nr_pages))
 968                return NULL;
 969
 970        /*
 971         * First, find the ToPA table where @pg fits. With high
 972         * order allocations, there shouldn't be many of these.
 973         */
 974        list_for_each_entry(topa, &buf->tables, list) {
 975                if (topa->offset + topa->size > pg << PAGE_SHIFT)
 976                        goto found;
 977        }
 978
 979        /*
 980         * Hitting this means we have a problem in the ToPA
 981         * allocation code.
 982         */
 983        WARN_ON_ONCE(1);
 984
 985        return NULL;
 986
 987found:
 988        /*
 989         * Indicates a problem in the ToPA allocation code.
 990         */
 991        if (WARN_ON_ONCE(topa->last == -1))
 992                return NULL;
 993
 994        tp = topa_to_page(topa);
 995        cur_pg = PFN_DOWN(topa->offset);
 996        if (topa->z_count) {
 997                z_pg = TOPA_ENTRY_PAGES(topa, 0) * (topa->z_count + 1);
 998                start_idx = topa->z_count + 1;
 999        }
1000
1001        /*
1002         * Multiple entries at the beginning of the table have the same size,
1003         * ideally all of them; if @pg falls there, the search is done.
1004         */
1005        if (pg >= cur_pg && pg < cur_pg + z_pg) {
1006                idx = (pg - cur_pg) / TOPA_ENTRY_PAGES(topa, 0);
1007                return &tp->table[idx];
1008        }
1009
1010        /*
1011         * Otherwise, slow path: iterate through the remaining entries.
1012         */
1013        for (idx = start_idx, cur_pg += z_pg; idx < topa->last; idx++) {
1014                if (cur_pg + TOPA_ENTRY_PAGES(topa, idx) > pg)
1015                        return &tp->table[idx];
1016
1017                cur_pg += TOPA_ENTRY_PAGES(topa, idx);
1018        }
1019
1020        /*
 1021         * This means we couldn't find a ToPA entry in this table that matches @pg.
1022         */
1023        WARN_ON_ONCE(1);
1024
1025        return NULL;
1026}
1027
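     /**
      * pt_topa_prev_entry() - find the ToPA entry preceding a given one
      * @buf:        PT buffer.
      * @te:         ToPA entry.
      *
      * Return the entry before @te, wrapping around from the first entry of
      * a table to the last data entry of the previous table (or of the last
      * table, if @te sits in the buffer's first table).
      */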
1028static struct topa_entry *
1029pt_topa_prev_entry(struct pt_buffer *buf, struct topa_entry *te)
1030{
1031        unsigned long table = (unsigned long)te & ~(PAGE_SIZE - 1);
1032        struct topa_page *tp;
1033        struct topa *topa;
1034
1035        tp = (struct topa_page *)table;
1036        if (tp->table != te)
1037                return --te;
1038
1039        topa = &tp->topa;
1040        if (topa == buf->first)
1041                topa = buf->last;
1042        else
1043                topa = list_prev_entry(topa, list);
1044
1045        tp = topa_to_page(topa);
1046
1047        return &tp->table[topa->last - 1];
1048}
1049
1050/**
1051 * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
1052 * @buf:        PT buffer.
1053 * @handle:     Current output handle.
1054 *
1055 * Place INT and STOP marks to prevent overwriting old data that the consumer
 1056 * hasn't yet collected and to wake up the consumer after a certain fraction of
1057 * the buffer has filled up. Only needed and sensible for non-snapshot counters.
1058 *
1059 * This obviously relies on buf::head to figure out buffer markers, so it has
1060 * to be called after pt_buffer_reset_offsets() and before the hardware tracing
1061 * is enabled.
1062 */
1063static int pt_buffer_reset_markers(struct pt_buffer *buf,
1064                                   struct perf_output_handle *handle)
1065
1066{
1067        unsigned long head = local64_read(&buf->head);
1068        unsigned long idx, npages, wakeup;
1069
1070        if (buf->single)
1071                return 0;
1072
1073        /* can't stop in the middle of an output region */
1074        if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) {
1075                perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
1076                return -EINVAL;
1077        }
1078
1080        /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
1081        if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
1082                return 0;
1083
1084        /* clear STOP and INT from current entry */
1085        if (buf->stop_te) {
1086                buf->stop_te->stop = 0;
1087                buf->stop_te->intr = 0;
1088        }
1089
1090        if (buf->intr_te)
1091                buf->intr_te->intr = 0;
1092
1093        /* how many pages till the STOP marker */
1094        npages = handle->size >> PAGE_SHIFT;
1095
1096        /* if it's on a page boundary, fill up one more page */
1097        if (!offset_in_page(head + handle->size + 1))
1098                npages++;
1099
1100        idx = (head >> PAGE_SHIFT) + npages;
1101        idx &= buf->nr_pages - 1;
1102
1103        if (idx != buf->stop_pos) {
1104                buf->stop_pos = idx;
1105                buf->stop_te = pt_topa_entry_for_page(buf, idx);
1106                buf->stop_te = pt_topa_prev_entry(buf, buf->stop_te);
1107        }
1108
1109        wakeup = handle->wakeup >> PAGE_SHIFT;
1110
1111        /* in the worst case, wake up the consumer one page before hard stop */
1112        idx = (head >> PAGE_SHIFT) + npages - 1;
1113        if (idx > wakeup)
1114                idx = wakeup;
1115
1116        idx &= buf->nr_pages - 1;
1117        if (idx != buf->intr_pos) {
1118                buf->intr_pos = idx;
1119                buf->intr_te = pt_topa_entry_for_page(buf, idx);
1120                buf->intr_te = pt_topa_prev_entry(buf, buf->intr_te);
1121        }
1122
1123        buf->stop_te->stop = 1;
1124        buf->stop_te->intr = 1;
1125        buf->intr_te->intr = 1;
1126
1127        return 0;
1128}
1129
1130/**
1131 * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
1132 * @buf:        PT buffer.
1133 * @head:       Write pointer (aux_head) from AUX buffer.
1134 *
1135 * Find the ToPA table and entry corresponding to given @head and set buffer's
1136 * "current" pointers accordingly. This is done after we have obtained the
1137 * current aux_head position from a successful call to perf_aux_output_begin()
1138 * to make sure the hardware is writing to the right place.
1139 *
1140 * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
1141 * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
1142 * which are used to determine INT and STOP markers' locations by a subsequent
1143 * call to pt_buffer_reset_markers().
1144 */
1145static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
1146{
1147        struct topa_page *cur_tp;
1148        struct topa_entry *te;
1149        int pg;
1150
1151        if (buf->snapshot)
1152                head &= (buf->nr_pages << PAGE_SHIFT) - 1;
1153
1154        if (!buf->single) {
1155                pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
1156                te = pt_topa_entry_for_page(buf, pg);
1157
1158                cur_tp = topa_entry_to_page(te);
1159                buf->cur = &cur_tp->topa;
1160                buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
1161                buf->output_off = head & (pt_buffer_region_size(buf) - 1);
1162        } else {
1163                buf->output_off = head;
1164        }
1165
1166        local64_set(&buf->head, head);
1167        local_set(&buf->data_size, 0);
1168}
1169
1170/**
1171 * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
1172 * @buf:        PT buffer.
1173 */
1174static void pt_buffer_fini_topa(struct pt_buffer *buf)
1175{
1176        struct topa *topa, *iter;
1177
1178        if (buf->single)
1179                return;
1180
1181        list_for_each_entry_safe(topa, iter, &buf->tables, list) {
1182                /*
1183                 * right now, this is in free_aux() path only, so
1184                 * no need to unlink this table from the list
1185                 */
1186                topa_free(topa);
1187        }
1188}
1189
1190/**
1191 * pt_buffer_init_topa() - initialize ToPA table for pt buffer
1192 * @buf:        PT buffer.
 1193 * @cpu:        CPU on which to allocate the ToPA tables.
 1194 * @nr_pages:   Number of pages to cover with ToPA entries.
      * @gfp:        Allocation flags.
1195 */
1196static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
1197                               unsigned long nr_pages, gfp_t gfp)
1198{
1199        struct topa *topa;
1200        int err;
1201
1202        topa = topa_alloc(cpu, gfp);
1203        if (!topa)
1204                return -ENOMEM;
1205
1206        topa_insert_table(buf, topa);
1207
1208        while (buf->nr_pages < nr_pages) {
1209                err = topa_insert_pages(buf, cpu, gfp);
1210                if (err) {
1211                        pt_buffer_fini_topa(buf);
1212                        return -ENOMEM;
1213                }
1214        }
1215
1216        /* link last table to the first one, unless we're double buffering */
1217        if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
1218                TOPA_ENTRY(buf->last, -1)->base = topa_pfn(buf->first);
1219                TOPA_ENTRY(buf->last, -1)->end = 1;
1220        }
1221
1222        pt_topa_dump(buf);
1223        return 0;
1224}
1225
1226static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
1227{
1228        struct page *p = virt_to_page(buf->data_pages[0]);
1229        int ret = -ENOTSUPP, order = 0;
1230
1231        /*
1232         * We can use single range output mode
1233         * + in snapshot mode, where we don't need interrupts;
1234         * + if the hardware supports it;
1235         * + if the entire buffer is one contiguous allocation.
1236         */
1237        if (!buf->snapshot)
1238                goto out;
1239
1240        if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output))
1241                goto out;
1242
1243        if (PagePrivate(p))
1244                order = page_private(p);
1245
1246        if (1 << order != nr_pages)
1247                goto out;
1248
1249        buf->single = true;
1250        buf->nr_pages = nr_pages;
1251        ret = 0;
1252out:
1253        return ret;
1254}
1255
1256/**
1257 * pt_buffer_setup_aux() - set up topa tables for a PT buffer
 1258 * @event:      PT event; allocations use event->cpu, or the current CPU when it is -1.
1259 * @pages:      Array of pointers to buffer pages passed from perf core.
1260 * @nr_pages:   Number of pages in the buffer.
1261 * @snapshot:   If this is a snapshot/overwrite counter.
1262 *
1263 * This is a pmu::setup_aux callback that sets up ToPA tables and all the
1264 * bookkeeping for an AUX buffer.
1265 *
1266 * Return:      Our private PT buffer structure.
1267 */
1268static void *
1269pt_buffer_setup_aux(struct perf_event *event, void **pages,
1270                    int nr_pages, bool snapshot)
1271{
1272        struct pt_buffer *buf;
1273        int node, ret, cpu = event->cpu;
1274
1275        if (!nr_pages)
1276                return NULL;
1277
1278        /*
1279         * Only support AUX sampling in snapshot mode, where we don't
1280         * generate NMIs.
1281         */
1282        if (event->attr.aux_sample_size && !snapshot)
1283                return NULL;
1284
1285        if (cpu == -1)
1286                cpu = raw_smp_processor_id();
1287        node = cpu_to_node(cpu);
1288
1289        buf = kzalloc_node(sizeof(struct pt_buffer), GFP_KERNEL, node);
1290        if (!buf)
1291                return NULL;
1292
1293        buf->snapshot = snapshot;
1294        buf->data_pages = pages;
1295        buf->stop_pos = -1;
1296        buf->intr_pos = -1;
1297
1298        INIT_LIST_HEAD(&buf->tables);
1299
1300        ret = pt_buffer_try_single(buf, nr_pages);
1301        if (!ret)
1302                return buf;
1303
1304        ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL);
1305        if (ret) {
1306                kfree(buf);
1307                return NULL;
1308        }
1309
1310        return buf;
1311}
1312
1313/**
1314 * pt_buffer_free_aux() - perf AUX deallocation path callback
1315 * @data:       PT buffer.
1316 */
1317static void pt_buffer_free_aux(void *data)
1318{
1319        struct pt_buffer *buf = data;
1320
1321        pt_buffer_fini_topa(buf);
1322        kfree(buf);
1323}
1324
1325static int pt_addr_filters_init(struct perf_event *event)
1326{
1327        struct pt_filters *filters;
1328        int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
1329
1330        if (!intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
1331                return 0;
1332
1333        filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
1334        if (!filters)
1335                return -ENOMEM;
1336
1337        if (event->parent)
1338                memcpy(filters, event->parent->hw.addr_filters,
1339                       sizeof(*filters));
1340
1341        event->hw.addr_filters = filters;
1342
1343        return 0;
1344}
1345
1346static void pt_addr_filters_fini(struct perf_event *event)
1347{
1348        kfree(event->hw.addr_filters);
1349        event->hw.addr_filters = NULL;
1350}
1351
1352static inline bool valid_kernel_ip(unsigned long ip)
1353{
1354        return virt_addr_valid(ip) && kernel_ip(ip);
1355}
1356
1357static int pt_event_addr_filters_validate(struct list_head *filters)
1358{
1359        struct perf_addr_filter *filter;
1360        int range = 0;
1361
1362        list_for_each_entry(filter, filters, entry) {
1363                /*
1364                 * PT doesn't support single address triggers and
1365                 * 'start' filters.
1366                 */
1367                if (!filter->size ||
1368                    filter->action == PERF_ADDR_FILTER_ACTION_START)
1369                        return -EOPNOTSUPP;
1370
1371                if (!filter->path.dentry) {
1372                        if (!valid_kernel_ip(filter->offset))
1373                                return -EINVAL;
1374
1375                        if (!valid_kernel_ip(filter->offset + filter->size))
1376                                return -EINVAL;
1377                }
1378
1379                if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
1380                        return -EOPNOTSUPP;
1381        }
1382
1383        return 0;
1384}
1385
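     /**
      * pt_event_addr_filters_sync() - translate core filters into PT MSR values
      * @event:      PT event.
      *
      * Convert the perf core's address filter ranges into the msr_a/msr_b
      * values and per-range config selectors (1 for a FILTER action, 2 for a
      * STOP action) that pt_config_filters() will program into the hardware.
      */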
1386static void pt_event_addr_filters_sync(struct perf_event *event)
1387{
1388        struct perf_addr_filters_head *head = perf_event_addr_filters(event);
1389        unsigned long msr_a, msr_b;
1390        struct perf_addr_filter_range *fr = event->addr_filter_ranges;
1391        struct pt_filters *filters = event->hw.addr_filters;
1392        struct perf_addr_filter *filter;
1393        int range = 0;
1394
1395        if (!filters)
1396                return;
1397
1398        list_for_each_entry(filter, &head->list, entry) {
1399                if (filter->path.dentry && !fr[range].start) {
1400                        msr_a = msr_b = 0;
1401                } else {
1402                        /* apply the offset */
1403                        msr_a = fr[range].start;
1404                        msr_b = msr_a + fr[range].size - 1;
1405                }
1406
1407                filters->filter[range].msr_a  = msr_a;
1408                filters->filter[range].msr_b  = msr_b;
1409                if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER)
1410                        filters->filter[range].config = 1;
1411                else
1412                        filters->filter[range].config = 2;
1413                range++;
1414        }
1415
1416        filters->nr_filters = range;
1417}
1418
1419/**
1420 * intel_pt_interrupt() - PT PMI handler
1421 */
1422void intel_pt_interrupt(void)
1423{
1424        struct pt *pt = this_cpu_ptr(&pt_ctx);
1425        struct pt_buffer *buf;
1426        struct perf_event *event = pt->handle.event;
1427
1428        /*
1429         * There may be a dangling PT bit in the interrupt status register
1430         * after PT has been disabled by pt_event_stop(). Make sure we don't
1431         * do anything (particularly, re-enable) for this event here.
1432         */
1433        if (!READ_ONCE(pt->handle_nmi))
1434                return;
1435
1436        if (!event)
1437                return;
1438
1439        pt_config_stop(event);
1440
1441        buf = perf_get_aux(&pt->handle);
1442        if (!buf)
1443                return;
1444
1445        pt_read_offset(buf);
1446
1447        pt_handle_status(pt);
1448
1449        pt_update_head(pt);
1450
1451        perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
1452
1453        if (!event->hw.state) {
1454                int ret;
1455
1456                buf = perf_aux_output_begin(&pt->handle, event);
1457                if (!buf) {
1458                        event->hw.state = PERF_HES_STOPPED;
1459                        return;
1460                }
1461
1462                pt_buffer_reset_offsets(buf, pt->handle.head);
1463                /* snapshot counters don't use PMI, so it's safe */
1464                ret = pt_buffer_reset_markers(buf, &pt->handle);
1465                if (ret) {
1466                        perf_aux_output_end(&pt->handle, 0);
1467                        return;
1468                }
1469
1470                pt_config_buffer(buf);
1471                pt_config_start(event);
1472        }
1473}
1474
1475void intel_pt_handle_vmx(int on)
1476{
1477        struct pt *pt = this_cpu_ptr(&pt_ctx);
1478        struct perf_event *event;
1479        unsigned long flags;
1480
1481        /* PT plays nice with VMX, do nothing */
1482        if (pt_pmu.vmx)
1483                return;
1484
1485        /*
1486         * VMXON will clear RTIT_CTL.TraceEn; we need to make
1487         * sure to not try to set it while VMX is on. Disable
1488         * interrupts to avoid racing with pmu callbacks;
1489         * concurrent PMI should be handled fine.
1490         */
1491        local_irq_save(flags);
1492        WRITE_ONCE(pt->vmx_on, on);
1493
1494        /*
1495         * If an AUX transaction is in progress, it will contain
1496         * gap(s), so flag it PARTIAL to inform the user.
1497         */
1498        event = pt->handle.event;
1499        if (event)
1500                perf_aux_output_flag(&pt->handle,
1501                                     PERF_AUX_FLAG_PARTIAL);
1502
1503        /* Turn PTs back on */
1504        if (!on && event)
1505                wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
1506
1507        local_irq_restore(flags);
1508}
1509EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
1510
1511/*
1512 * PMU callbacks
1513 */
1514
1515static void pt_event_start(struct perf_event *event, int mode)
1516{
1517        struct hw_perf_event *hwc = &event->hw;
1518        struct pt *pt = this_cpu_ptr(&pt_ctx);
1519        struct pt_buffer *buf;
1520
1521        buf = perf_aux_output_begin(&pt->handle, event);
1522        if (!buf)
1523                goto fail_stop;
1524
1525        pt_buffer_reset_offsets(buf, pt->handle.head);
1526        if (!buf->snapshot) {
1527                if (pt_buffer_reset_markers(buf, &pt->handle))
1528                        goto fail_end_stop;
1529        }
1530
1531        WRITE_ONCE(pt->handle_nmi, 1);
1532        hwc->state = 0;
1533
1534        pt_config_buffer(buf);
1535        pt_config(event);
1536
1537        return;
1538
1539fail_end_stop:
1540        perf_aux_output_end(&pt->handle, 0);
1541fail_stop:
1542        hwc->state = PERF_HES_STOPPED;
1543}
1544
1545static void pt_event_stop(struct perf_event *event, int mode)
1546{
1547        struct pt *pt = this_cpu_ptr(&pt_ctx);
1548
1549        /*
1550         * Protect against the PMI racing with disabling wrmsr,
1551         * see comment in intel_pt_interrupt().
1552         */
1553        WRITE_ONCE(pt->handle_nmi, 0);
1554
1555        pt_config_stop(event);
1556
1557        if (event->hw.state == PERF_HES_STOPPED)
1558                return;
1559
1560        event->hw.state = PERF_HES_STOPPED;
1561
1562        if (mode & PERF_EF_UPDATE) {
1563                struct pt_buffer *buf = perf_get_aux(&pt->handle);
1564
1565                if (!buf)
1566                        return;
1567
1568                if (WARN_ON_ONCE(pt->handle.event != event))
1569                        return;
1570
1571                pt_read_offset(buf);
1572
1573                pt_handle_status(pt);
1574
1575                pt_update_head(pt);
1576
1577                if (buf->snapshot)
1578                        pt->handle.head =
1579                                local_xchg(&buf->data_size,
1580                                           buf->nr_pages << PAGE_SHIFT);
1581                perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
1582        }
1583}
1584
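     /**
      * pt_event_snapshot_aux() - pmu::snapshot_aux callback
      * @event:      PT event.
      * @handle:     Output handle of the sample being written.
      * @size:       Number of bytes requested for the sample.
      *
      * Briefly pause tracing if it is running, update the buffer head, copy
      * the most recent @size bytes of trace data into the perf sample, then
      * resume tracing. Only used for snapshot mode events, where no PMI is
      * involved; see pt_buffer_setup_aux().
      */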
1585static long pt_event_snapshot_aux(struct perf_event *event,
1586                                  struct perf_output_handle *handle,
1587                                  unsigned long size)
1588{
1589        struct pt *pt = this_cpu_ptr(&pt_ctx);
1590        struct pt_buffer *buf = perf_get_aux(&pt->handle);
1591        unsigned long from = 0, to;
1592        long ret;
1593
1594        if (WARN_ON_ONCE(!buf))
1595                return 0;
1596
1597        /*
1598         * Sampling is only allowed on snapshot events;
1599         * see pt_buffer_setup_aux().
1600         */
1601        if (WARN_ON_ONCE(!buf->snapshot))
1602                return 0;
1603
1604        /*
1605         * Here, handle_nmi tells us if the tracing is on
1606         */
1607        if (READ_ONCE(pt->handle_nmi))
1608                pt_config_stop(event);
1609
1610        pt_read_offset(buf);
1611        pt_update_head(pt);
1612
1613        to = local_read(&buf->data_size);
1614        if (to < size)
1615                from = buf->nr_pages << PAGE_SHIFT;
1616        from += to - size;
1617
1618        ret = perf_output_copy_aux(&pt->handle, handle, from, to);
1619
1620        /*
1621         * If the tracing was on when we turned up, restart it.
1622         * Compiler barrier not needed as we couldn't have been
1623         * preempted by anything that touches pt->handle_nmi.
1624         */
1625        if (pt->handle_nmi)
1626                pt_config_start(event);
1627
1628        return ret;
1629}
1630
1631static void pt_event_del(struct perf_event *event, int mode)
1632{
1633        pt_event_stop(event, PERF_EF_UPDATE);
1634}
1635
1636static int pt_event_add(struct perf_event *event, int mode)
1637{
1638        struct pt *pt = this_cpu_ptr(&pt_ctx);
1639        struct hw_perf_event *hwc = &event->hw;
1640        int ret = -EBUSY;
1641
1642        if (pt->handle.event)
1643                goto fail;
1644
1645        if (mode & PERF_EF_START) {
1646                pt_event_start(event, 0);
1647                ret = -EINVAL;
1648                if (hwc->state == PERF_HES_STOPPED)
1649                        goto fail;
1650        } else {
1651                hwc->state = PERF_HES_STOPPED;
1652        }
1653
1654        ret = 0;
1655fail:
1656
1657        return ret;
1658}
1659
1660static void pt_event_read(struct perf_event *event)
1661{
1662}
1663
1664static void pt_event_destroy(struct perf_event *event)
1665{
1666        pt_addr_filters_fini(event);
1667        x86_del_exclusive(x86_lbr_exclusive_pt);
1668}
1669
1670static int pt_event_init(struct perf_event *event)
1671{
1672        if (event->attr.type != pt_pmu.pmu.type)
1673                return -ENOENT;
1674
1675        if (!pt_event_valid(event))
1676                return -EINVAL;
1677
1678        if (x86_add_exclusive(x86_lbr_exclusive_pt))
1679                return -EBUSY;
1680
1681        if (pt_addr_filters_init(event)) {
1682                x86_del_exclusive(x86_lbr_exclusive_pt);
1683                return -ENOMEM;
1684        }
1685
1686        event->destroy = pt_event_destroy;
1687
1688        return 0;
1689}
1690
1691void cpu_emergency_stop_pt(void)
1692{
1693        struct pt *pt = this_cpu_ptr(&pt_ctx);
1694
1695        if (pt->handle.event)
1696                pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
1697}
1698
1699int is_intel_pt_event(struct perf_event *event)
1700{
1701        return event->pmu == &pt_pmu.pmu;
1702}
1703
1704static __init int pt_init(void)
1705{
1706        int ret, cpu, prior_warn = 0;
1707
1708        BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
1709
1710        if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
1711                return -ENODEV;
1712
1713        get_online_cpus();
1714        for_each_online_cpu(cpu) {
1715                u64 ctl;
1716
1717                ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
1718                if (!ret && (ctl & RTIT_CTL_TRACEEN))
1719                        prior_warn++;
1720        }
1721        put_online_cpus();
1722
1723        if (prior_warn) {
1724                x86_add_exclusive(x86_lbr_exclusive_pt);
1725                pr_warn("PT is enabled at boot time, doing nothing\n");
1726
1727                return -EBUSY;
1728        }
1729
1730        ret = pt_pmu_hw_init();
1731        if (ret)
1732                return ret;
1733
1734        if (!intel_pt_validate_hw_cap(PT_CAP_topa_output)) {
1735                pr_warn("ToPA output is not supported on this CPU\n");
1736                return -ENODEV;
1737        }
1738
1739        if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
1740                pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG;
1741
1742        pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
1743        pt_pmu.pmu.attr_groups           = pt_attr_groups;
1744        pt_pmu.pmu.task_ctx_nr           = perf_sw_context;
1745        pt_pmu.pmu.event_init            = pt_event_init;
1746        pt_pmu.pmu.add                   = pt_event_add;
1747        pt_pmu.pmu.del                   = pt_event_del;
1748        pt_pmu.pmu.start                 = pt_event_start;
1749        pt_pmu.pmu.stop                  = pt_event_stop;
1750        pt_pmu.pmu.snapshot_aux          = pt_event_snapshot_aux;
1751        pt_pmu.pmu.read                  = pt_event_read;
1752        pt_pmu.pmu.setup_aux             = pt_buffer_setup_aux;
1753        pt_pmu.pmu.free_aux              = pt_buffer_free_aux;
1754        pt_pmu.pmu.addr_filters_sync     = pt_event_addr_filters_sync;
1755        pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
1756        pt_pmu.pmu.nr_addr_filters       =
1757                intel_pt_validate_hw_cap(PT_CAP_num_address_ranges);
1758
1759        ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
1760
1761        return ret;
1762}
1763arch_initcall(pt_init);
1764