linux/arch/x86/kernel/cpu/perf_event_amd_ibs.c
   1/*
   2 * Performance events - AMD IBS
   3 *
   4 *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
   5 *
   6 *  For licencing details see kernel-base/COPYING
   7 */
   8
   9#include <linux/perf_event.h>
  10#include <linux/module.h>
  11#include <linux/pci.h>
  12#include <linux/ptrace.h>
  13
  14#include <asm/apic.h>
  15
  16#include "perf_event.h"
  17
  18static u32 ibs_caps;
  19
  20#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
  21
  22#include <linux/kprobes.h>
  23#include <linux/hardirq.h>
  24
  25#include <asm/nmi.h>
  26
  27#define IBS_FETCH_CONFIG_MASK   (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
  28#define IBS_OP_CONFIG_MASK      IBS_OP_MAX_CNT
  29
  30enum ibs_states {
  31        IBS_ENABLED     = 0,
  32        IBS_STARTED     = 1,
  33        IBS_STOPPING    = 2,
  34
  35        IBS_MAX_STATES,
  36};
  37
  38struct cpu_perf_ibs {
  39        struct perf_event       *event;
  40        unsigned long           state[BITS_TO_LONGS(IBS_MAX_STATES)];
  41};
  42
  43struct perf_ibs {
  44        struct pmu      pmu;
  45        unsigned int    msr;
  46        u64             config_mask;
  47        u64             cnt_mask;
  48        u64             enable_mask;
  49        u64             valid_mask;
  50        u64             max_period;
  51        unsigned long   offset_mask[1];
  52        int             offset_max;
  53        struct cpu_perf_ibs __percpu *pcpu;
  54        u64             (*get_count)(u64 config);
  55};
  56
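/*
 * Descriptive note (not in the original file): the raw sample handed to
 * perf starts at 'caps' and continues into 'regs', relying on the
 * fields being laid out contiguously; 'data' overlays 'caps' so the
 * whole buffer can be addressed as one u32 array.
 */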
  57struct perf_ibs_data {
  58        u32             size;
  59        union {
  60                u32     data[0];        /* data buffer starts here */
  61                u32     caps;
  62        };
  63        u64             regs[MSR_AMD64_IBS_REG_COUNT_MAX];
  64};
  65
  66static int
  67perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
  68{
  69        s64 left = local64_read(&hwc->period_left);
  70        s64 period = hwc->sample_period;
  71        int overflow = 0;
  72
  73        /*
  74         * If we are way outside a reasonable range then just skip forward:
  75         */
  76        if (unlikely(left <= -period)) {
  77                left = period;
  78                local64_set(&hwc->period_left, left);
  79                hwc->last_period = period;
  80                overflow = 1;
  81        }
  82
  83        if (unlikely(left < (s64)min)) {
  84                left += period;
  85                local64_set(&hwc->period_left, left);
  86                hwc->last_period = period;
  87                overflow = 1;
  88        }
  89
  90        /*
   91         * If the hw period that triggers the sw overflow is too short
   92         * we might still be in the irq handler when it triggers again,
   93         * which biases the results. Thus we shorten the next-to-last
   94         * period and set the last period to the max period.
  95         */
  96        if (left > max) {
  97                left -= max;
  98                if (left > max)
  99                        left = max;
 100                else if (left < min)
 101                        left = min;
 102        }
 103
 104        *hw_period = (u64)left;
 105
 106        return overflow;
 107}
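/*
 * Illustrative sketch, not part of the original file: exercising the
 * clamping above with made-up numbers. With min = 0x10 and max = 0x100,
 * a pending period_left of 0x180 yields a shortened next-to-last hw
 * period of 0x80; once those events are counted, roughly 0x100 remains,
 * which fits in max and becomes the last, full-length period.
 */
static __maybe_unused void perf_event_set_period_example(void)
{
	struct hw_perf_event hwc = { .sample_period = 0x180 };
	u64 hw_period;

	hwc.last_period = hwc.sample_period;
	local64_set(&hwc.period_left, 0x180);

	perf_event_set_period(&hwc, 0x10, 0x100, &hw_period);
	/* hw_period is now 0x80; no sw overflow was signalled */
}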
 108
  109static int
 110perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
 111{
 112        struct hw_perf_event *hwc = &event->hw;
 113        int shift = 64 - width;
 114        u64 prev_raw_count;
 115        u64 delta;
 116
 117        /*
 118         * Careful: an NMI might modify the previous event value.
 119         *
 120         * Our tactic to handle this is to first atomically read and
 121         * exchange a new raw count - then add that new-prev delta
 122         * count to the generic event atomically:
 123         */
 124        prev_raw_count = local64_read(&hwc->prev_count);
 125        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 126                                        new_raw_count) != prev_raw_count)
 127                return 0;
 128
 129        /*
 130         * Now we have the new raw value and have updated the prev
 131         * timestamp already. We can now calculate the elapsed delta
 132         * (event-)time and add that to the generic event.
 133         *
 134         * Careful, not all hw sign-extends above the physical width
 135         * of the count.
 136         */
 137        delta = (new_raw_count << shift) - (prev_raw_count << shift);
 138        delta >>= shift;
 139
 140        local64_add(delta, &event->count);
 141        local64_sub(delta, &hwc->period_left);
 142
 143        return 1;
 144}
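/*
 * Illustrative sketch, not part of the original file: the shift pair in
 * perf_event_try_update() handles counters narrower than 64 bits. Both
 * values are shifted up so the counter's msb becomes bit 63, the
 * subtraction then wraps correctly in two's complement, and shifting
 * back down yields the true unsigned delta even across a rollover.
 */
static __maybe_unused u64 perf_delta_example(u64 prev_raw, u64 new_raw, int width)
{
	int shift = 64 - width;		/* e.g. width = 48 */

	return ((new_raw << shift) - (prev_raw << shift)) >> shift;
}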
 145
 146static struct perf_ibs perf_ibs_fetch;
 147static struct perf_ibs perf_ibs_op;
 148
 149static struct perf_ibs *get_ibs_pmu(int type)
 150{
 151        if (perf_ibs_fetch.pmu.type == type)
 152                return &perf_ibs_fetch;
 153        if (perf_ibs_op.pmu.type == type)
 154                return &perf_ibs_op;
 155        return NULL;
 156}
 157
 158/*
 159 * Use IBS for precise event sampling:
 160 *
 161 *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
 162 *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
 163 *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
 164 *
 165 * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
 166 * MSRC001_1033) is used to select either cycle or micro-ops counting
 167 * mode.
 168 *
  169 * The rip of IBS samples has skid 0. Thus, IBS supports precise
  170 * levels 1 and 2 and PERF_EFLAGS_EXACT is set. In rare cases the
  171 * rip is invalid because IBS was not able to record it correctly.
  172 * We then clear PERF_EFLAGS_EXACT and take the rip from pt_regs.
 173 *
 174 */
 175static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
 176{
 177        switch (event->attr.precise_ip) {
 178        case 0:
 179                return -ENOENT;
 180        case 1:
 181        case 2:
 182                break;
 183        default:
 184                return -EOPNOTSUPP;
 185        }
 186
 187        switch (event->attr.type) {
 188        case PERF_TYPE_HARDWARE:
 189                switch (event->attr.config) {
 190                case PERF_COUNT_HW_CPU_CYCLES:
 191                        *config = 0;
 192                        return 0;
 193                }
 194                break;
 195        case PERF_TYPE_RAW:
 196                switch (event->attr.config) {
 197                case 0x0076:
 198                        *config = 0;
 199                        return 0;
 200                case 0x00C1:
 201                        *config = IBS_OP_CNT_CTL;
 202                        return 0;
 203                }
 204                break;
 205        default:
 206                return -ENOENT;
 207        }
 208
 209        return -EOPNOTSUPP;
 210}
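/*
 * Illustrative sketch, not part of the original file: an attribute set
 * that the mapping above accepts. A hardware cpu-cycles event with
 * precise_ip 1 or 2 is translated into an IBS op event in cycle
 * counting mode (*config = 0); raw event 0x00C1 with :p would select
 * micro-op counting mode via IBS_OP_CNT_CTL instead.
 */
static __maybe_unused struct perf_event_attr ibs_precise_cycles_attr = {
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_CPU_CYCLES,
	.sample_period	= 100000,
	.precise_ip	= 2,
};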
 211
 212static const struct perf_event_attr ibs_notsupp = {
 213        .exclude_user   = 1,
 214        .exclude_kernel = 1,
 215        .exclude_hv     = 1,
 216        .exclude_idle   = 1,
 217        .exclude_host   = 1,
 218        .exclude_guest  = 1,
 219};
 220
 221static int perf_ibs_init(struct perf_event *event)
 222{
 223        struct hw_perf_event *hwc = &event->hw;
 224        struct perf_ibs *perf_ibs;
 225        u64 max_cnt, config;
 226        int ret;
 227
 228        perf_ibs = get_ibs_pmu(event->attr.type);
 229        if (perf_ibs) {
 230                config = event->attr.config;
 231        } else {
 232                perf_ibs = &perf_ibs_op;
 233                ret = perf_ibs_precise_event(event, &config);
 234                if (ret)
 235                        return ret;
 236        }
 237
 238        if (event->pmu != &perf_ibs->pmu)
 239                return -ENOENT;
 240
 241        if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
 242                return -EINVAL;
 243
 244        if (config & ~perf_ibs->config_mask)
 245                return -EINVAL;
 246
 247        if (hwc->sample_period) {
 248                if (config & perf_ibs->cnt_mask)
  249                        /* a raw max_cnt must not be set as well */
 250                        return -EINVAL;
 251                if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
 252                        /*
  253                         * the lower 4 bits cannot be set in the ibs max
  254                         * cnt, but we allow it in case we later adjust
  255                         * the sample period to set a frequency.
 256                         */
 257                        return -EINVAL;
 258                hwc->sample_period &= ~0x0FULL;
 259                if (!hwc->sample_period)
 260                        hwc->sample_period = 0x10;
 261        } else {
 262                max_cnt = config & perf_ibs->cnt_mask;
 263                config &= ~perf_ibs->cnt_mask;
 264                event->attr.sample_period = max_cnt << 4;
 265                hwc->sample_period = event->attr.sample_period;
 266        }
 267
 268        if (!hwc->sample_period)
 269                return -EINVAL;
 270
 271        /*
 272         * If we modify hwc->sample_period, we also need to update
 273         * hwc->last_period and hwc->period_left.
 274         */
 275        hwc->last_period = hwc->sample_period;
 276        local64_set(&hwc->period_left, hwc->sample_period);
 277
 278        hwc->config_base = perf_ibs->msr;
 279        hwc->config = config;
 280
 281        return 0;
 282}
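/*
 * Illustrative note, not part of the original file: periods are kept as
 * multiples of 16 because the hardware MaxCnt field counts in units of
 * 16; a period's low 4 bits are masked off, with 0x10 as the floor. A
 * counting-style raw config that encodes max_cnt = 0x123 gets a derived
 * sample_period of 0x123 << 4 = 0x1230.
 */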
 283
 284static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
 285                               struct hw_perf_event *hwc, u64 *period)
 286{
 287        int overflow;
 288
 289        /* ignore lower 4 bits in min count: */
 290        overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
 291        local64_set(&hwc->prev_count, 0);
 292
 293        return overflow;
 294}
 295
 296static u64 get_ibs_fetch_count(u64 config)
 297{
 298        return (config & IBS_FETCH_CNT) >> 12;
 299}
 300
 301static u64 get_ibs_op_count(u64 config)
 302{
 303        u64 count = 0;
 304
 305        if (config & IBS_OP_VAL)
 306                count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
 307
 308        if (ibs_caps & IBS_CAPS_RDWROPCNT)
 309                count += (config & IBS_OP_CUR_CNT) >> 32;
 310
 311        return count;
 312}
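/*
 * Illustrative note, not part of the original file: decoding a made-up
 * IbsOpCtl value with the helper above. With IBS_OP_VAL set, a MaxCnt
 * field of 0x1000 contributes the rolled-over period 0x1000 << 4 =
 * 0x10000 events; if the cpu supports IBS_CAPS_RDWROPCNT, a CurCnt of
 * 0x12 (bits 51:32) adds the partial count, so get_ibs_op_count()
 * returns 0x10012.
 */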
 313
 314static void
 315perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
 316                      u64 *config)
 317{
 318        u64 count = perf_ibs->get_count(*config);
 319
 320        /*
 321         * Set width to 64 since we do not overflow on max width but
 322         * instead on max count. In perf_ibs_set_period() we clear
 323         * prev count manually on overflow.
 324         */
 325        while (!perf_event_try_update(event, count, 64)) {
 326                rdmsrl(event->hw.config_base, *config);
 327                count = perf_ibs->get_count(*config);
 328        }
 329}
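/*
 * Descriptive note (not in the original file): if an NMI changes
 * prev_count between the read and the cmpxchg inside
 * perf_event_try_update(), the update above is retried with a freshly
 * read control register so no count is lost or applied twice.
 */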
 330
 331static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
 332                                         struct hw_perf_event *hwc, u64 config)
 333{
 334        wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
 335}
 336
 337/*
 338 * Erratum #420 Instruction-Based Sampling Engine May Generate
 339 * Interrupt that Cannot Be Cleared:
 340 *
 341 * Must clear counter mask first, then clear the enable bit. See
 342 * Revision Guide for AMD Family 10h Processors, Publication #41322.
 343 */
 344static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
 345                                          struct hw_perf_event *hwc, u64 config)
 346{
 347        config &= ~perf_ibs->cnt_mask;
 348        wrmsrl(hwc->config_base, config);
 349        config &= ~perf_ibs->enable_mask;
 350        wrmsrl(hwc->config_base, config);
 351}
 352
 353/*
  354 * We cannot restore the ibs pmu state, so we always need to update
  355 * the event while stopping it and then reset the state when starting
  356 * again. Thus we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags
  357 * in perf_ibs_start()/perf_ibs_stop() and instead always do it.
 358 */
 359static void perf_ibs_start(struct perf_event *event, int flags)
 360{
 361        struct hw_perf_event *hwc = &event->hw;
 362        struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
 363        struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
 364        u64 period;
 365
 366        if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
 367                return;
 368
 369        WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
 370        hwc->state = 0;
 371
 372        perf_ibs_set_period(perf_ibs, hwc, &period);
 373        set_bit(IBS_STARTED, pcpu->state);
 374        perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
 375
 376        perf_event_update_userpage(event);
 377}
 378
 379static void perf_ibs_stop(struct perf_event *event, int flags)
 380{
 381        struct hw_perf_event *hwc = &event->hw;
 382        struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
 383        struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
 384        u64 config;
 385        int stopping;
 386
 387        stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
 388
 389        if (!stopping && (hwc->state & PERF_HES_UPTODATE))
 390                return;
 391
 392        rdmsrl(hwc->config_base, config);
 393
 394        if (stopping) {
 395                set_bit(IBS_STOPPING, pcpu->state);
 396                perf_ibs_disable_event(perf_ibs, hwc, config);
 397                WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
 398                hwc->state |= PERF_HES_STOPPED;
 399        }
 400
 401        if (hwc->state & PERF_HES_UPTODATE)
 402                return;
 403
 404        /*
  405         * Clear the valid bit so the update does not count rollovers;
  406         * rollovers are only accounted in the irq handler.
 407         */
 408        config &= ~perf_ibs->valid_mask;
 409
 410        perf_ibs_event_update(perf_ibs, event, &config);
 411        hwc->state |= PERF_HES_UPTODATE;
 412}
 413
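/*
 * Descriptive note (not in the original file): at most one event per
 * IBS flavour can be active on a cpu, so the IBS_ENABLED bit in the
 * per-cpu state acts as the scheduling resource; adding a second event
 * of the same flavour fails with -ENOSPC.
 */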
 414static int perf_ibs_add(struct perf_event *event, int flags)
 415{
 416        struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
 417        struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
 418
 419        if (test_and_set_bit(IBS_ENABLED, pcpu->state))
 420                return -ENOSPC;
 421
 422        event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 423
 424        pcpu->event = event;
 425
 426        if (flags & PERF_EF_START)
 427                perf_ibs_start(event, PERF_EF_RELOAD);
 428
 429        return 0;
 430}
 431
 432static void perf_ibs_del(struct perf_event *event, int flags)
 433{
 434        struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
 435        struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
 436
 437        if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
 438                return;
 439
 440        perf_ibs_stop(event, PERF_EF_UPDATE);
 441
 442        pcpu->event = NULL;
 443
 444        perf_event_update_userpage(event);
 445}
 446
 447static void perf_ibs_read(struct perf_event *event) { }
 448
 449static struct perf_ibs perf_ibs_fetch = {
 450        .pmu = {
 451                .task_ctx_nr    = perf_invalid_context,
 452
 453                .event_init     = perf_ibs_init,
 454                .add            = perf_ibs_add,
 455                .del            = perf_ibs_del,
 456                .start          = perf_ibs_start,
 457                .stop           = perf_ibs_stop,
 458                .read           = perf_ibs_read,
 459        },
 460        .msr                    = MSR_AMD64_IBSFETCHCTL,
 461        .config_mask            = IBS_FETCH_CONFIG_MASK,
 462        .cnt_mask               = IBS_FETCH_MAX_CNT,
 463        .enable_mask            = IBS_FETCH_ENABLE,
 464        .valid_mask             = IBS_FETCH_VAL,
 465        .max_period             = IBS_FETCH_MAX_CNT << 4,
 466        .offset_mask            = { MSR_AMD64_IBSFETCH_REG_MASK },
 467        .offset_max             = MSR_AMD64_IBSFETCH_REG_COUNT,
 468
 469        .get_count              = get_ibs_fetch_count,
 470};
 471
 472static struct perf_ibs perf_ibs_op = {
 473        .pmu = {
 474                .task_ctx_nr    = perf_invalid_context,
 475
 476                .event_init     = perf_ibs_init,
 477                .add            = perf_ibs_add,
 478                .del            = perf_ibs_del,
 479                .start          = perf_ibs_start,
 480                .stop           = perf_ibs_stop,
 481                .read           = perf_ibs_read,
 482        },
 483        .msr                    = MSR_AMD64_IBSOPCTL,
 484        .config_mask            = IBS_OP_CONFIG_MASK,
 485        .cnt_mask               = IBS_OP_MAX_CNT,
 486        .enable_mask            = IBS_OP_ENABLE,
 487        .valid_mask             = IBS_OP_VAL,
 488        .max_period             = IBS_OP_MAX_CNT << 4,
 489        .offset_mask            = { MSR_AMD64_IBSOP_REG_MASK },
 490        .offset_max             = MSR_AMD64_IBSOP_REG_COUNT,
 491
 492        .get_count              = get_ibs_op_count,
 493};
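/*
 * Descriptive note (not in the original file): for both flavours
 * max_period is the MaxCnt field scaled by 16, which mirrors the
 * "period >> 4" value written back into the control MSR by
 * perf_ibs_enable_event() and the irq handler.
 */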
 494
 495static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 496{
 497        struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
 498        struct perf_event *event = pcpu->event;
 499        struct hw_perf_event *hwc = &event->hw;
 500        struct perf_sample_data data;
 501        struct perf_raw_record raw;
 502        struct pt_regs regs;
 503        struct perf_ibs_data ibs_data;
 504        int offset, size, check_rip, offset_max, throttle = 0;
 505        unsigned int msr;
 506        u64 *buf, *config, period;
 507
 508        if (!test_bit(IBS_STARTED, pcpu->state)) {
 509                /*
 510                 * Catch spurious interrupts after stopping IBS: After
  511                 * disabling IBS there could still be incoming NMIs
  512                 * with samples that even have the valid bit cleared.
  513                 * Mark all these NMIs as handled.
 514                 */
 515                return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
 516        }
 517
 518        msr = hwc->config_base;
 519        buf = ibs_data.regs;
 520        rdmsrl(msr, *buf);
 521        if (!(*buf++ & perf_ibs->valid_mask))
 522                return 0;
 523
 524        config = &ibs_data.regs[0];
 525        perf_ibs_event_update(perf_ibs, event, config);
 526        perf_sample_data_init(&data, 0, hwc->last_period);
 527        if (!perf_ibs_set_period(perf_ibs, hwc, &period))
 528                goto out;       /* no sw counter overflow */
 529
 530        ibs_data.caps = ibs_caps;
 531        size = 1;
 532        offset = 1;
 533        check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
 534        if (event->attr.sample_type & PERF_SAMPLE_RAW)
 535                offset_max = perf_ibs->offset_max;
 536        else if (check_rip)
 537                offset_max = 2;
 538        else
 539                offset_max = 1;
 540        do {
 541                rdmsrl(msr + offset, *buf++);
 542                size++;
 543                offset = find_next_bit(perf_ibs->offset_mask,
 544                                       perf_ibs->offset_max,
 545                                       offset + 1);
 546        } while (offset < offset_max);
 547        ibs_data.size = sizeof(u64) * size;
 548
 549        regs = *iregs;
 550        if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
 551                regs.flags &= ~PERF_EFLAGS_EXACT;
 552        } else {
 553                set_linear_ip(&regs, ibs_data.regs[1]);
 554                regs.flags |= PERF_EFLAGS_EXACT;
 555        }
 556
 557        if (event->attr.sample_type & PERF_SAMPLE_RAW) {
 558                raw.size = sizeof(u32) + ibs_data.size;
 559                raw.data = ibs_data.data;
 560                data.raw = &raw;
 561        }
 562
 563        throttle = perf_event_overflow(event, &data, &regs);
 564out:
 565        if (throttle)
 566                perf_ibs_disable_event(perf_ibs, hwc, *config);
 567        else
 568                perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
 569
 570        perf_event_update_userpage(event);
 571
 572        return 1;
 573}
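/*
 * Illustrative sketch, not part of the original file: the register
 * window walked by the rdmsrl() loop above. Starting at
 * hwc->config_base, every bit set in offset_mask selects one more MSR
 * at config_base + offset, so a PERF_SAMPLE_RAW sample consists of the
 * control register followed by each implemented IBS data register of
 * that flavour.
 */
static __maybe_unused int perf_ibs_count_sample_regs(struct perf_ibs *perf_ibs)
{
	int offset, n = 1;	/* the control register itself */

	for (offset = find_next_bit(perf_ibs->offset_mask,
				    perf_ibs->offset_max, 1);
	     offset < perf_ibs->offset_max;
	     offset = find_next_bit(perf_ibs->offset_mask,
				    perf_ibs->offset_max, offset + 1))
		n++;

	return n;
}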
 574
 575static int __kprobes
 576perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 577{
 578        int handled = 0;
 579
 580        handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
 581        handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
 582
 583        if (handled)
 584                inc_irq_stat(apic_perf_irqs);
 585
 586        return handled;
 587}
 588
 589static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
 590{
 591        struct cpu_perf_ibs __percpu *pcpu;
 592        int ret;
 593
 594        pcpu = alloc_percpu(struct cpu_perf_ibs);
 595        if (!pcpu)
 596                return -ENOMEM;
 597
 598        perf_ibs->pcpu = pcpu;
 599
 600        ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
 601        if (ret) {
 602                perf_ibs->pcpu = NULL;
 603                free_percpu(pcpu);
 604        }
 605
 606        return ret;
 607}
 608
 609static __init int perf_event_ibs_init(void)
 610{
 611        if (!ibs_caps)
 612                return -ENODEV; /* ibs not supported by the cpu */
 613
 614        perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
 615        if (ibs_caps & IBS_CAPS_OPCNT)
 616                perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
 617        perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
 618        register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
 619        printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
 620
 621        return 0;
 622}
 623
 624#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
 625
 626static __init int perf_event_ibs_init(void) { return 0; }
 627
 628#endif
 629
 630/* IBS - apic initialization, for perf and oprofile */
 631
 632static __init u32 __get_ibs_caps(void)
 633{
 634        u32 caps;
 635        unsigned int max_level;
 636
 637        if (!boot_cpu_has(X86_FEATURE_IBS))
 638                return 0;
 639
 640        /* check IBS cpuid feature flags */
 641        max_level = cpuid_eax(0x80000000);
 642        if (max_level < IBS_CPUID_FEATURES)
 643                return IBS_CAPS_DEFAULT;
 644
 645        caps = cpuid_eax(IBS_CPUID_FEATURES);
 646        if (!(caps & IBS_CAPS_AVAIL))
 647                /* cpuid flags not valid */
 648                return IBS_CAPS_DEFAULT;
 649
 650        return caps;
 651}
 652
 653u32 get_ibs_caps(void)
 654{
 655        return ibs_caps;
 656}
 657
 658EXPORT_SYMBOL(get_ibs_caps);
 659
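/*
 * Descriptive note (not in the original file): setup_APIC_eilvt()
 * returns 0 only if the requested extended-LVT offset is unused or
 * already configured compatibly, so probing with a masked NMI entry
 * both tests and reserves the offset; put_eilvt() releases it again by
 * programming the entry back to zero.
 */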
 660static inline int get_eilvt(int offset)
 661{
 662        return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
 663}
 664
 665static inline int put_eilvt(int offset)
 666{
 667        return !setup_APIC_eilvt(offset, 0, 0, 1);
 668}
 669
 670/*
 671 * Check and reserve APIC extended interrupt LVT offset for IBS if available.
 672 */
 673static inline int ibs_eilvt_valid(void)
 674{
 675        int offset;
 676        u64 val;
 677        int valid = 0;
 678
 679        preempt_disable();
 680
 681        rdmsrl(MSR_AMD64_IBSCTL, val);
 682        offset = val & IBSCTL_LVT_OFFSET_MASK;
 683
 684        if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
 685                pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
 686                       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
 687                goto out;
 688        }
 689
 690        if (!get_eilvt(offset)) {
 691                pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
 692                       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
 693                goto out;
 694        }
 695
 696        valid = 1;
 697out:
 698        preempt_enable();
 699
 700        return valid;
 701}
 702
 703static int setup_ibs_ctl(int ibs_eilvt_off)
 704{
 705        struct pci_dev *cpu_cfg;
 706        int nodes;
 707        u32 value = 0;
 708
 709        nodes = 0;
 710        cpu_cfg = NULL;
 711        do {
 712                cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
 713                                         PCI_DEVICE_ID_AMD_10H_NB_MISC,
 714                                         cpu_cfg);
 715                if (!cpu_cfg)
 716                        break;
 717                ++nodes;
 718                pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
 719                                       | IBSCTL_LVT_OFFSET_VALID);
 720                pci_read_config_dword(cpu_cfg, IBSCTL, &value);
 721                if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
 722                        pci_dev_put(cpu_cfg);
 723                        printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
 724                               "IBSCTL = 0x%08x\n", value);
 725                        return -EINVAL;
 726                }
 727        } while (1);
 728
 729        if (!nodes) {
 730                printk(KERN_DEBUG "No CPU node configured for IBS\n");
 731                return -ENODEV;
 732        }
 733
 734        return 0;
 735}
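/*
 * Descriptive note (not in the original file): the loop above writes
 * the chosen LVT offset, together with the valid bit, into the IBSCTL
 * register of every northbridge (one per node) and verifies it by
 * reading the value back.
 */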
 736
 737/*
 738 * This runs only on the current cpu. We try to find an LVT offset and
  739 * set up the local APIC. For this we must disable preemption. On
  740 * success we initialize all nodes with this offset. This then updates
  741 * the offset in the per-node IBS_CTL msr. The per-core APIC setup of
  742 * the IBS interrupt vector is handled by perf_ibs_cpu_notifier, which
  743 * uses the new offset.
 744 */
 745static int force_ibs_eilvt_setup(void)
 746{
 747        int offset;
 748        int ret;
 749
 750        preempt_disable();
  751        /* find the next available EILVT entry, skip offset 0 */
 752        for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
 753                if (get_eilvt(offset))
 754                        break;
 755        }
 756        preempt_enable();
 757
 758        if (offset == APIC_EILVT_NR_MAX) {
 759                printk(KERN_DEBUG "No EILVT entry available\n");
 760                return -EBUSY;
 761        }
 762
 763        ret = setup_ibs_ctl(offset);
 764        if (ret)
 765                goto out;
 766
 767        if (!ibs_eilvt_valid()) {
 768                ret = -EFAULT;
 769                goto out;
 770        }
 771
 772        pr_info("IBS: LVT offset %d assigned\n", offset);
 773
 774        return 0;
 775out:
 776        preempt_disable();
 777        put_eilvt(offset);
 778        preempt_enable();
 779        return ret;
 780}
 781
 782static inline int get_ibs_lvt_offset(void)
 783{
 784        u64 val;
 785
 786        rdmsrl(MSR_AMD64_IBSCTL, val);
 787        if (!(val & IBSCTL_LVT_OFFSET_VALID))
 788                return -EINVAL;
 789
 790        return val & IBSCTL_LVT_OFFSET_MASK;
 791}
 792
 793static void setup_APIC_ibs(void *dummy)
 794{
 795        int offset;
 796
 797        offset = get_ibs_lvt_offset();
 798        if (offset < 0)
 799                goto failed;
 800
 801        if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
 802                return;
 803failed:
 804        pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
 805                smp_processor_id());
 806}
 807
 808static void clear_APIC_ibs(void *dummy)
 809{
 810        int offset;
 811
 812        offset = get_ibs_lvt_offset();
 813        if (offset >= 0)
 814                setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
 815}
 816
 817static int __cpuinit
 818perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 819{
 820        switch (action & ~CPU_TASKS_FROZEN) {
 821        case CPU_STARTING:
 822                setup_APIC_ibs(NULL);
 823                break;
 824        case CPU_DYING:
 825                clear_APIC_ibs(NULL);
 826                break;
 827        default:
 828                break;
 829        }
 830
 831        return NOTIFY_OK;
 832}
 833
 834static __init int amd_ibs_init(void)
 835{
 836        u32 caps;
 837        int ret = -EINVAL;
 838
 839        caps = __get_ibs_caps();
 840        if (!caps)
 841                return -ENODEV; /* ibs not supported by the cpu */
 842
 843        /*
 844         * Force LVT offset assignment for family 10h: The offsets are
 845         * not assigned by the BIOS for this family, so the OS is
 846         * responsible for doing it. If the OS assignment fails, fall
  847         * back to the BIOS settings and try to set them up.
 848         */
 849        if (boot_cpu_data.x86 == 0x10)
 850                force_ibs_eilvt_setup();
 851
 852        if (!ibs_eilvt_valid())
 853                goto out;
 854
 855        get_online_cpus();
 856        ibs_caps = caps;
 857        /* make ibs_caps visible to other cpus: */
 858        smp_mb();
 859        perf_cpu_notifier(perf_ibs_cpu_notifier);
 860        smp_call_function(setup_APIC_ibs, NULL, 1);
 861        put_online_cpus();
 862
 863        ret = perf_event_ibs_init();
 864out:
 865        if (ret)
 866                pr_err("Failed to setup IBS, %d\n", ret);
 867        return ret;
 868}
 869
 870/* Since we need the pci subsystem to init ibs we can't do this earlier: */
 871device_initcall(amd_ibs_init);
 872