linux/arch/x86/events/intel/rapl.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Support Intel RAPL energy consumption counters
   4 * Copyright (C) 2013 Google, Inc., Stephane Eranian
   5 *
   6 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
   7 * section 14.7.1 (September 2013)
   8 *
 * RAPL provides more controls than just reporting energy consumption
 * however here we only expose the energy consumption free running
 * counters (pp0, pkg, dram, gpu, psys).
  12 *
  13 * Each of those counters increments in a power unit defined by the
  14 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
  15 * but it can vary.
  16 *
  17 * Counter to rapl events mappings:
  18 *
  19 *  pp0 counter: consumption of all physical cores (power plane 0)
  20 *        event: rapl_energy_cores
  21 *    perf code: 0x1
  22 *
  23 *  pkg counter: consumption of the whole processor package
  24 *        event: rapl_energy_pkg
  25 *    perf code: 0x2
  26 *
  27 * dram counter: consumption of the dram domain (servers only)
  28 *        event: rapl_energy_dram
  29 *    perf code: 0x3
  30 *
  31 * gpu counter: consumption of the builtin-gpu domain (client only)
  32 *        event: rapl_energy_gpu
  33 *    perf code: 0x4
  34 *
  35 *  psys counter: consumption of the builtin-psys domain (client only)
  36 *        event: rapl_energy_psys
  37 *    perf code: 0x5
  38 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
  41 *
  42 * The events only support system-wide mode counting. There is no
  43 * sampling support because it does not make sense and is not
  44 * supported by the RAPL hardware.
  45 *
  46 * Because we want to avoid floating-point operations in the kernel,
  47 * the events are all reported in fixed point arithmetic (32.32).
  48 * Tools must adjust the counts to convert them to Watts using
  49 * the duration of the measurement. Tools may use a function such as
  50 * ldexp(raw_count, -32);
  51 */
  52
  53#define pr_fmt(fmt) "RAPL PMU: " fmt
  54
  55#include <linux/module.h>
  56#include <linux/slab.h>
  57#include <linux/perf_event.h>
  58#include <linux/nospec.h>
  59#include <asm/cpu_device_id.h>
  60#include <asm/intel-family.h>
  61#include "../perf_event.h"
  62#include "../probe.h"
  63
  64MODULE_LICENSE("GPL");
  65
  66/*
  67 * RAPL energy status counters
  68 */
  69enum perf_rapl_events {
  70        PERF_RAPL_PP0 = 0,              /* all cores */
  71        PERF_RAPL_PKG,                  /* entire package */
  72        PERF_RAPL_RAM,                  /* DRAM */
  73        PERF_RAPL_PP1,                  /* gpu */
  74        PERF_RAPL_PSYS,                 /* psys */
  75
  76        PERF_RAPL_MAX,
  77        NR_RAPL_DOMAINS = PERF_RAPL_MAX,
  78};
  79
  80static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
  81        "pp0-core",
  82        "package",
  83        "dram",
  84        "pp1-gpu",
  85        "psys",
  86};
  87
  88/*
  89 * event code: LSB 8 bits, passed in attr->config
  90 * any other bit is reserved
  91 */
  92#define RAPL_EVENT_MASK 0xFFULL
  93
  94#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)           \
  95static ssize_t __rapl_##_var##_show(struct kobject *kobj,       \
  96                                struct kobj_attribute *attr,    \
  97                                char *page)                     \
  98{                                                               \
  99        BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
 100        return sprintf(page, _format "\n");                     \
 101}                                                               \
 102static struct kobj_attribute format_attr_##_var =               \
 103        __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
 104
 105#define RAPL_CNTR_WIDTH 32
 106
 107#define RAPL_EVENT_ATTR_STR(_name, v, str)                                      \
 108static struct perf_pmu_events_attr event_attr_##v = {                           \
 109        .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),     \
 110        .id             = 0,                                                    \
 111        .event_str      = str,                                                  \
 112};
 113
 114struct rapl_pmu {
 115        raw_spinlock_t          lock;
 116        int                     n_active;
 117        int                     cpu;
 118        struct list_head        active_list;
 119        struct pmu              *pmu;
 120        ktime_t                 timer_interval;
 121        struct hrtimer          hrtimer;
 122};
 123
 124struct rapl_pmus {
 125        struct pmu              pmu;
 126        unsigned int            maxdie;
 127        struct rapl_pmu         *pmus[];
 128};
 129
 130struct rapl_model {
 131        unsigned long   events;
 132        bool            apply_quirk;
 133};
 134
 135 /* 1/2^hw_unit Joule */
 136static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
 137static struct rapl_pmus *rapl_pmus;
 138static cpumask_t rapl_cpu_mask;
 139static unsigned int rapl_cntr_mask;
 140static u64 rapl_timer_ms;
 141static struct perf_msr rapl_msrs[];
 142
 143static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
 144{
 145        unsigned int dieid = topology_logical_die_id(cpu);
 146
 147        /*
 148         * The unsigned check also catches the '-1' return value for non
 149         * existent mappings in the topology map.
 150         */
 151        return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
 152}
 153
 154static inline u64 rapl_read_counter(struct perf_event *event)
 155{
 156        u64 raw;
 157        rdmsrl(event->hw.event_base, raw);
 158        return raw;
 159}
 160
 161static inline u64 rapl_scale(u64 v, int cfg)
 162{
 163        if (cfg > NR_RAPL_DOMAINS) {
 164                pr_warn("Invalid domain %d, failed to scale data\n", cfg);
 165                return v;
 166        }
 167        /*
 168         * scale delta to smallest unit (1/2^32)
 169         * users must then scale back: count * 1/(1e9*2^32) to get Joules
 170         * or use ldexp(count, -32).
 171         * Watts = Joules/Time delta
 172         */
 173        return v << (32 - rapl_hw_unit[cfg - 1]);
 174}
 175
 176static u64 rapl_event_update(struct perf_event *event)
 177{
 178        struct hw_perf_event *hwc = &event->hw;
 179        u64 prev_raw_count, new_raw_count;
 180        s64 delta, sdelta;
 181        int shift = RAPL_CNTR_WIDTH;
 182
 183again:
 184        prev_raw_count = local64_read(&hwc->prev_count);
 185        rdmsrl(event->hw.event_base, new_raw_count);
 186
 187        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 188                            new_raw_count) != prev_raw_count) {
 189                cpu_relax();
 190                goto again;
 191        }
 192
 193        /*
 194         * Now we have the new raw value and have updated the prev
 195         * timestamp already. We can now calculate the elapsed delta
 196         * (event-)time and add that to the generic event.
 197         *
 198         * Careful, not all hw sign-extends above the physical width
 199         * of the count.
 200         */
 201        delta = (new_raw_count << shift) - (prev_raw_count << shift);
 202        delta >>= shift;
 203
 204        sdelta = rapl_scale(delta, event->hw.config);
 205
 206        local64_add(sdelta, &event->count);
 207
 208        return new_raw_count;
 209}
 210
 211static void rapl_start_hrtimer(struct rapl_pmu *pmu)
 212{
 213       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
 214                     HRTIMER_MODE_REL_PINNED);
 215}
 216
 217static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
 218{
 219        struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
 220        struct perf_event *event;
 221        unsigned long flags;
 222
 223        if (!pmu->n_active)
 224                return HRTIMER_NORESTART;
 225
 226        raw_spin_lock_irqsave(&pmu->lock, flags);
 227
 228        list_for_each_entry(event, &pmu->active_list, active_entry)
 229                rapl_event_update(event);
 230
 231        raw_spin_unlock_irqrestore(&pmu->lock, flags);
 232
 233        hrtimer_forward_now(hrtimer, pmu->timer_interval);
 234
 235        return HRTIMER_RESTART;
 236}
 237
 238static void rapl_hrtimer_init(struct rapl_pmu *pmu)
 239{
 240        struct hrtimer *hr = &pmu->hrtimer;
 241
 242        hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 243        hr->function = rapl_hrtimer_handle;
 244}
 245
 246static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
 247                                   struct perf_event *event)
 248{
 249        if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
 250                return;
 251
 252        event->hw.state = 0;
 253
 254        list_add_tail(&event->active_entry, &pmu->active_list);
 255
 256        local64_set(&event->hw.prev_count, rapl_read_counter(event));
 257
 258        pmu->n_active++;
 259        if (pmu->n_active == 1)
 260                rapl_start_hrtimer(pmu);
 261}
 262
 263static void rapl_pmu_event_start(struct perf_event *event, int mode)
 264{
 265        struct rapl_pmu *pmu = event->pmu_private;
 266        unsigned long flags;
 267
 268        raw_spin_lock_irqsave(&pmu->lock, flags);
 269        __rapl_pmu_event_start(pmu, event);
 270        raw_spin_unlock_irqrestore(&pmu->lock, flags);
 271}
 272
 273static void rapl_pmu_event_stop(struct perf_event *event, int mode)
 274{
 275        struct rapl_pmu *pmu = event->pmu_private;
 276        struct hw_perf_event *hwc = &event->hw;
 277        unsigned long flags;
 278
 279        raw_spin_lock_irqsave(&pmu->lock, flags);
 280
 281        /* mark event as deactivated and stopped */
 282        if (!(hwc->state & PERF_HES_STOPPED)) {
 283                WARN_ON_ONCE(pmu->n_active <= 0);
 284                pmu->n_active--;
 285                if (pmu->n_active == 0)
 286                        hrtimer_cancel(&pmu->hrtimer);
 287
 288                list_del(&event->active_entry);
 289
 290                WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
 291                hwc->state |= PERF_HES_STOPPED;
 292        }
 293
 294        /* check if update of sw counter is necessary */
 295        if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
 296                /*
 297                 * Drain the remaining delta count out of a event
 298                 * that we are disabling:
 299                 */
 300                rapl_event_update(event);
 301                hwc->state |= PERF_HES_UPTODATE;
 302        }
 303
 304        raw_spin_unlock_irqrestore(&pmu->lock, flags);
 305}
 306
 307static int rapl_pmu_event_add(struct perf_event *event, int mode)
 308{
 309        struct rapl_pmu *pmu = event->pmu_private;
 310        struct hw_perf_event *hwc = &event->hw;
 311        unsigned long flags;
 312
 313        raw_spin_lock_irqsave(&pmu->lock, flags);
 314
 315        hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 316
 317        if (mode & PERF_EF_START)
 318                __rapl_pmu_event_start(pmu, event);
 319
 320        raw_spin_unlock_irqrestore(&pmu->lock, flags);
 321
 322        return 0;
 323}
 324
 325static void rapl_pmu_event_del(struct perf_event *event, int flags)
 326{
 327        rapl_pmu_event_stop(event, PERF_EF_UPDATE);
 328}
 329
 330static int rapl_pmu_event_init(struct perf_event *event)
 331{
 332        u64 cfg = event->attr.config & RAPL_EVENT_MASK;
 333        int bit, ret = 0;
 334        struct rapl_pmu *pmu;
 335
 336        /* only look at RAPL events */
 337        if (event->attr.type != rapl_pmus->pmu.type)
 338                return -ENOENT;
 339
 340        /* check only supported bits are set */
 341        if (event->attr.config & ~RAPL_EVENT_MASK)
 342                return -EINVAL;
 343
 344        if (event->cpu < 0)
 345                return -EINVAL;
 346
 347        event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
 348
 349        if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
 350                return -EINVAL;
 351
 352        cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
 353        bit = cfg - 1;
 354
 355        /* check event supported */
 356        if (!(rapl_cntr_mask & (1 << bit)))
 357                return -EINVAL;
 358
 359        /* unsupported modes and filters */
 360        if (event->attr.sample_period) /* no sampling */
 361                return -EINVAL;
 362
 363        /* must be done before validate_group */
 364        pmu = cpu_to_rapl_pmu(event->cpu);
 365        if (!pmu)
 366                return -EINVAL;
 367        event->cpu = pmu->cpu;
 368        event->pmu_private = pmu;
 369        event->hw.event_base = rapl_msrs[bit].msr;
 370        event->hw.config = cfg;
 371        event->hw.idx = bit;
 372
 373        return ret;
 374}
 375
 376static void rapl_pmu_event_read(struct perf_event *event)
 377{
 378        rapl_event_update(event);
 379}
 380
 381static ssize_t rapl_get_attr_cpumask(struct device *dev,
 382                                struct device_attribute *attr, char *buf)
 383{
 384        return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
 385}
 386
 387static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
 388
 389static struct attribute *rapl_pmu_attrs[] = {
 390        &dev_attr_cpumask.attr,
 391        NULL,
 392};
 393
 394static struct attribute_group rapl_pmu_attr_group = {
 395        .attrs = rapl_pmu_attrs,
 396};
 397
 398RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
 399RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
 400RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
 401RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
 402RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");
 403
 404RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
 405RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
 406RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
 407RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
 408RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");
 409
 410/*
 411 * we compute in 0.23 nJ increments regardless of MSR
 412 */
 413RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
 414RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
 415RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
 416RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
 417RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");
 418
 419/*
 420 * There are no default events, but we need to create
 421 * "events" group (with empty attrs) before updating
 422 * it with detected events.
 423 */
 424static struct attribute *attrs_empty[] = {
 425        NULL,
 426};
 427
 428static struct attribute_group rapl_pmu_events_group = {
 429        .name = "events",
 430        .attrs = attrs_empty,
 431};
 432
 433DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
 434static struct attribute *rapl_formats_attr[] = {
 435        &format_attr_event.attr,
 436        NULL,
 437};
 438
 439static struct attribute_group rapl_pmu_format_group = {
 440        .name = "format",
 441        .attrs = rapl_formats_attr,
 442};
 443
 444static const struct attribute_group *rapl_attr_groups[] = {
 445        &rapl_pmu_attr_group,
 446        &rapl_pmu_format_group,
 447        &rapl_pmu_events_group,
 448        NULL,
 449};
 450
 451static struct attribute *rapl_events_cores[] = {
 452        EVENT_PTR(rapl_cores),
 453        EVENT_PTR(rapl_cores_unit),
 454        EVENT_PTR(rapl_cores_scale),
 455        NULL,
 456};
 457
 458static struct attribute_group rapl_events_cores_group = {
 459        .name  = "events",
 460        .attrs = rapl_events_cores,
 461};
 462
 463static struct attribute *rapl_events_pkg[] = {
 464        EVENT_PTR(rapl_pkg),
 465        EVENT_PTR(rapl_pkg_unit),
 466        EVENT_PTR(rapl_pkg_scale),
 467        NULL,
 468};
 469
 470static struct attribute_group rapl_events_pkg_group = {
 471        .name  = "events",
 472        .attrs = rapl_events_pkg,
 473};
 474
 475static struct attribute *rapl_events_ram[] = {
 476        EVENT_PTR(rapl_ram),
 477        EVENT_PTR(rapl_ram_unit),
 478        EVENT_PTR(rapl_ram_scale),
 479        NULL,
 480};
 481
 482static struct attribute_group rapl_events_ram_group = {
 483        .name  = "events",
 484        .attrs = rapl_events_ram,
 485};
 486
 487static struct attribute *rapl_events_gpu[] = {
 488        EVENT_PTR(rapl_gpu),
 489        EVENT_PTR(rapl_gpu_unit),
 490        EVENT_PTR(rapl_gpu_scale),
 491        NULL,
 492};
 493
 494static struct attribute_group rapl_events_gpu_group = {
 495        .name  = "events",
 496        .attrs = rapl_events_gpu,
 497};
 498
 499static struct attribute *rapl_events_psys[] = {
 500        EVENT_PTR(rapl_psys),
 501        EVENT_PTR(rapl_psys_unit),
 502        EVENT_PTR(rapl_psys_scale),
 503        NULL,
 504};
 505
 506static struct attribute_group rapl_events_psys_group = {
 507        .name  = "events",
 508        .attrs = rapl_events_psys,
 509};
 510
 511static bool test_msr(int idx, void *data)
 512{
 513        return test_bit(idx, (unsigned long *) data);
 514}
 515
 516static struct perf_msr rapl_msrs[] = {
 517        [PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr },
 518        [PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr },
 519        [PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr },
 520        [PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr },
 521        [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr },
 522};
 523
 524static int rapl_cpu_offline(unsigned int cpu)
 525{
 526        struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
 527        int target;
 528
 529        /* Check if exiting cpu is used for collecting rapl events */
 530        if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
 531                return 0;
 532
 533        pmu->cpu = -1;
 534        /* Find a new cpu to collect rapl events */
 535        target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
 536
 537        /* Migrate rapl events to the new target */
 538        if (target < nr_cpu_ids) {
 539                cpumask_set_cpu(target, &rapl_cpu_mask);
 540                pmu->cpu = target;
 541                perf_pmu_migrate_context(pmu->pmu, cpu, target);
 542        }
 543        return 0;
 544}
 545
 546static int rapl_cpu_online(unsigned int cpu)
 547{
 548        struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
 549        int target;
 550
 551        if (!pmu) {
 552                pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
 553                if (!pmu)
 554                        return -ENOMEM;
 555
 556                raw_spin_lock_init(&pmu->lock);
 557                INIT_LIST_HEAD(&pmu->active_list);
 558                pmu->pmu = &rapl_pmus->pmu;
 559                pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
 560                rapl_hrtimer_init(pmu);
 561
 562                rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
 563        }
 564
 565        /*
 566         * Check if there is an online cpu in the package which collects rapl
 567         * events already.
 568         */
 569        target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
 570        if (target < nr_cpu_ids)
 571                return 0;
 572
 573        cpumask_set_cpu(cpu, &rapl_cpu_mask);
 574        pmu->cpu = cpu;
 575        return 0;
 576}
 577
 578static int rapl_check_hw_unit(bool apply_quirk)
 579{
 580        u64 msr_rapl_power_unit_bits;
 581        int i;
 582
 583        /* protect rdmsrl() to handle virtualization */
 584        if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
 585                return -1;
 586        for (i = 0; i < NR_RAPL_DOMAINS; i++)
 587                rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
 588
 589        /*
 590         * DRAM domain on HSW server and KNL has fixed energy unit which can be
 591         * different than the unit from power unit MSR. See
 592         * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
 593         * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
 594         */
 595        if (apply_quirk)
 596                rapl_hw_unit[PERF_RAPL_RAM] = 16;
 597
 598        /*
 599         * Calculate the timer rate:
 600         * Use reference of 200W for scaling the timeout to avoid counter
 601         * overflows. 200W = 200 Joules/sec
 602         * Divide interval by 2 to avoid lockstep (2 * 100)
 603         * if hw unit is 32, then we use 2 ms 1/200/2
 604         */
 605        rapl_timer_ms = 2;
 606        if (rapl_hw_unit[0] < 32) {
 607                rapl_timer_ms = (1000 / (2 * 100));
 608                rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
 609        }
 610        return 0;
 611}
 612
 613static void __init rapl_advertise(void)
 614{
 615        int i;
 616
 617        pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
 618                hweight32(rapl_cntr_mask), rapl_timer_ms);
 619
 620        for (i = 0; i < NR_RAPL_DOMAINS; i++) {
 621                if (rapl_cntr_mask & (1 << i)) {
 622                        pr_info("hw unit of domain %s 2^-%d Joules\n",
 623                                rapl_domain_names[i], rapl_hw_unit[i]);
 624                }
 625        }
 626}
 627
 628static void cleanup_rapl_pmus(void)
 629{
 630        int i;
 631
 632        for (i = 0; i < rapl_pmus->maxdie; i++)
 633                kfree(rapl_pmus->pmus[i]);
 634        kfree(rapl_pmus);
 635}
 636
 637static const struct attribute_group *rapl_attr_update[] = {
 638        &rapl_events_cores_group,
 639        &rapl_events_pkg_group,
 640        &rapl_events_ram_group,
 641        &rapl_events_gpu_group,
 642        &rapl_events_gpu_group,
 643        NULL,
 644};
 645
 646static int __init init_rapl_pmus(void)
 647{
 648        int maxdie = topology_max_packages() * topology_max_die_per_package();
 649        size_t size;
 650
 651        size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
 652        rapl_pmus = kzalloc(size, GFP_KERNEL);
 653        if (!rapl_pmus)
 654                return -ENOMEM;
 655
 656        rapl_pmus->maxdie               = maxdie;
 657        rapl_pmus->pmu.attr_groups      = rapl_attr_groups;
 658        rapl_pmus->pmu.attr_update      = rapl_attr_update;
 659        rapl_pmus->pmu.task_ctx_nr      = perf_invalid_context;
 660        rapl_pmus->pmu.event_init       = rapl_pmu_event_init;
 661        rapl_pmus->pmu.add              = rapl_pmu_event_add;
 662        rapl_pmus->pmu.del              = rapl_pmu_event_del;
 663        rapl_pmus->pmu.start            = rapl_pmu_event_start;
 664        rapl_pmus->pmu.stop             = rapl_pmu_event_stop;
 665        rapl_pmus->pmu.read             = rapl_pmu_event_read;
 666        rapl_pmus->pmu.module           = THIS_MODULE;
 667        rapl_pmus->pmu.capabilities     = PERF_PMU_CAP_NO_EXCLUDE;
 668        return 0;
 669}
 670
 671#define X86_RAPL_MODEL_MATCH(model, init)       \
 672        { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init }
 673
 674static struct rapl_model model_snb = {
 675        .events         = BIT(PERF_RAPL_PP0) |
 676                          BIT(PERF_RAPL_PKG) |
 677                          BIT(PERF_RAPL_PP1),
 678        .apply_quirk    = false,
 679};
 680
 681static struct rapl_model model_snbep = {
 682        .events         = BIT(PERF_RAPL_PP0) |
 683                          BIT(PERF_RAPL_PKG) |
 684                          BIT(PERF_RAPL_RAM),
 685        .apply_quirk    = false,
 686};
 687
 688static struct rapl_model model_hsw = {
 689        .events         = BIT(PERF_RAPL_PP0) |
 690                          BIT(PERF_RAPL_PKG) |
 691                          BIT(PERF_RAPL_RAM) |
 692                          BIT(PERF_RAPL_PP1),
 693        .apply_quirk    = false,
 694};
 695
 696static struct rapl_model model_hsx = {
 697        .events         = BIT(PERF_RAPL_PP0) |
 698                          BIT(PERF_RAPL_PKG) |
 699                          BIT(PERF_RAPL_RAM),
 700        .apply_quirk    = true,
 701};
 702
 703static struct rapl_model model_knl = {
 704        .events         = BIT(PERF_RAPL_PKG) |
 705                          BIT(PERF_RAPL_RAM),
 706        .apply_quirk    = true,
 707};
 708
 709static struct rapl_model model_skl = {
 710        .events         = BIT(PERF_RAPL_PP0) |
 711                          BIT(PERF_RAPL_PKG) |
 712                          BIT(PERF_RAPL_RAM) |
 713                          BIT(PERF_RAPL_PP1) |
 714                          BIT(PERF_RAPL_PSYS),
 715        .apply_quirk    = false,
 716};
 717
 718static const struct x86_cpu_id rapl_model_match[] __initconst = {
 719        X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE,            model_snb),
 720        X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X,          model_snbep),
 721        X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE,              model_snb),
 722        X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X,            model_snbep),
 723        X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL,                model_hsw),
 724        X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X,              model_hsx),
 725        X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_L,              model_hsw),
 726        X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_G,              model_hsw),
 727        X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL,              model_hsw),
 728        X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_G,            model_hsw),
 729        X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X,            model_hsx),
 730        X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_D,            model_hsx),
 731        X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL,           model_knl),
 732        X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM,           model_knl),
 733        X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L,              model_skl),
 734        X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE,                model_skl),
 735        X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X,              model_hsx),
 736        X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L,             model_skl),
 737        X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE,               model_skl),
 738        X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_L,           model_skl),
 739        X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT,          model_hsw),
 740        X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_D,        model_hsw),
 741        X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS,     model_hsw),
 742        X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L,              model_skl),
 743        X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE,                model_skl),
 744        {},
 745};
 746
 747MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
 748
 749static int __init rapl_pmu_init(void)
 750{
 751        const struct x86_cpu_id *id;
 752        struct rapl_model *rm;
 753        int ret;
 754
 755        id = x86_match_cpu(rapl_model_match);
 756        if (!id)
 757                return -ENODEV;
 758
 759        rm = (struct rapl_model *) id->driver_data;
 760        rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
 761                                        false, (void *) &rm->events);
 762
 763        ret = rapl_check_hw_unit(rm->apply_quirk);
 764        if (ret)
 765                return ret;
 766
 767        ret = init_rapl_pmus();
 768        if (ret)
 769                return ret;
 770
 771        /*
 772         * Install callbacks. Core will call them for each online cpu.
 773         */
 774        ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
 775                                "perf/x86/rapl:online",
 776                                rapl_cpu_online, rapl_cpu_offline);
 777        if (ret)
 778                goto out;
 779
 780        ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
 781        if (ret)
 782                goto out1;
 783
 784        rapl_advertise();
 785        return 0;
 786
 787out1:
 788        cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
 789out:
 790        pr_warn("Initialization failed (%d), disabled\n", ret);
 791        cleanup_rapl_pmus();
 792        return ret;
 793}
 794module_init(rapl_pmu_init);
 795
 796static void __exit intel_rapl_exit(void)
 797{
 798        cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
 799        perf_pmu_unregister(&rapl_pmus->pmu);
 800        cleanup_rapl_pmus();
 801}
 802module_exit(intel_rapl_exit);
 803