linux/drivers/powercap/intel_rapl_common.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Common code for Intel Running Average Power Limit (RAPL) support.
   4 * Copyright (c) 2019, Intel Corporation.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7
   8#include <linux/kernel.h>
   9#include <linux/module.h>
  10#include <linux/list.h>
  11#include <linux/types.h>
  12#include <linux/device.h>
  13#include <linux/slab.h>
  14#include <linux/log2.h>
  15#include <linux/bitmap.h>
  16#include <linux/delay.h>
  17#include <linux/sysfs.h>
  18#include <linux/cpu.h>
  19#include <linux/powercap.h>
  20#include <linux/suspend.h>
  21#include <linux/intel_rapl.h>
  22#include <linux/processor.h>
  23#include <linux/platform_device.h>
  24
  25#include <asm/iosf_mbi.h>
  26#include <asm/cpu_device_id.h>
  27#include <asm/intel-family.h>
  28
  29/* Local defines */
#define MSR_PLATFORM_POWER_LIMIT        0x0000065C

/* bitmasks for RAPL MSRs, used by primitive access functions */
#define ENERGY_STATUS_MASK      0xffffffff

/* Power limit 1 fields (low 32 bits of the limit register) */
#define POWER_LIMIT1_MASK       0x7FFF
#define POWER_LIMIT1_ENABLE     BIT(15)
#define POWER_LIMIT1_CLAMP      BIT(16)

/* Power limit 2 fields (high 32 bits of the limit register) */
#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
/* lock bit used for FW_LOCK on domains that have two power limits */
#define POWER_HIGH_LOCK         BIT_ULL(63)
/* default lock bit used for the FW_LOCK primitive */
#define POWER_LOW_LOCK          BIT(31)

#define TIME_WINDOW1_MASK       (0x7FULL<<17)
#define TIME_WINDOW2_MASK       (0x7FULL<<49)

/* Field positions inside the unit register read by the check_unit helpers */
#define POWER_UNIT_OFFSET       0
#define POWER_UNIT_MASK         0x0F

#define ENERGY_UNIT_OFFSET      0x08
#define ENERGY_UNIT_MASK        0x1F00

#define TIME_UNIT_OFFSET        0x10
#define TIME_UNIT_MASK          0xF0000

/* Fields of the power info register (RAPL_DOMAIN_REG_INFO) */
#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff

#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
#define PP_POLICY_MASK         0x1F

/* Non HW constants */
#define RAPL_PRIMITIVE_DERIVED       BIT(1)     /* not from raw data */
#define RAPL_PRIMITIVE_DUMMY         BIT(2)     /* reads of such a primitive fail with -EINVAL */

#define TIME_WINDOW_MAX_MSEC 40000
#define TIME_WINDOW_MIN_MSEC 250
#define ENERGY_UNIT_SCALE    1000       /* scale from driver unit to powercap unit */
/*
 * Kind of unit conversion rapl_unit_xlate() applies to a primitive's
 * raw register value.
 */
enum unit_type {
        ARBITRARY_UNIT,         /* no translation */
        POWER_UNIT,             /* scaled by the package power_unit */
        ENERGY_UNIT,            /* scaled by the domain or package energy unit */
        TIME_UNIT,              /* converted by rapl_defaults->compute_time_window() */
};
  78
  79/* per domain data, some are optional */
#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)

/* Bits kept in rapl_domain->state */
#define DOMAIN_STATE_INACTIVE           BIT(0)
#define DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
/* when set, the enable and power limit setters refuse with -EACCES */
#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)

/* Constraint names assigned to rpl[0] and rpl[1] by rapl_init_domains() */
static const char pl1_name[] = "long_term";
static const char pl2_name[] = "short_term";

/* Map a powercap zone back to the rapl_domain that embeds it */
#define power_zone_to_rapl_domain(_zone) \
        container_of(_zone, struct rapl_domain, power_zone)
  91
/*
 * Per CPU-model behaviour and constants.
 *
 * @floor_freq_reg_addr: register address handed to iosf_mbi_read()/
 *      iosf_mbi_write() by set_floor_freq_atom(); 0 is rejected there
 * @check_unit: reads the unit register and fills in the package's
 *      energy/power/time units
 * @set_floor_freq: optional hook called from set_domain_enable()
 * @compute_time_window: converts a time window between raw register
 *      encoding and time units; used by rapl_unit_xlate() for TIME_UNIT
 * @dram_domain_energy_unit: when non-zero, energy unit applied to the
 *      DRAM domain instead of the package-wide one
 */
struct rapl_defaults {
        u8 floor_freq_reg_addr;
        int (*check_unit)(struct rapl_package *rp, int cpu);
        void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
        u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
                                    bool to_raw);
        unsigned int dram_domain_energy_unit;
};
/* defaults selected for the running CPU model; assigned outside this view */
static struct rapl_defaults *rapl_defaults;
 101
 102/* Sideband MBI registers */
/* used as floor_freq_reg_addr by rapl_defaults_byt and rapl_defaults_tng */
#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)

/* set in rapl_package->power_limit_irq once the PLN enable state is saved */
#define PACKAGE_PLN_INT_SAVED   BIT(0)
#define MAX_PRIM_NAME (32)
 108
/*
 * Per primitive description. Describes an individual knob so that the
 * access functions can be consolidated into one reader and one writer
 * instead of many inline functions.
 *
 * @name:  primitive name; NULL marks the end of the table
 * @mask:  bits of the register that belong to this primitive
 * @shift: position of the field's least significant bit in the register
 * @id:    which of the domain's registers (rd->regs[]) holds the field
 * @unit:  unit conversion applied when the value is translated
 * @flag:  RAPL_PRIMITIVE_DERIVED / RAPL_PRIMITIVE_DUMMY, or 0
 */
struct rapl_primitive_info {
        const char *name;
        u64 mask;
        int shift;
        enum rapl_domain_reg_id id;
        enum unit_type unit;
        u32 flag;
};
 120
/*
 * Initializer for one struct rapl_primitive_info entry:
 * p = primitive (stringified into .name), m = mask, s = shift,
 * i = register id, u = unit type, f = flags.
 */
#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \
                .name = #p,                     \
                .mask = m,                      \
                .shift = s,                     \
                .id = i,                        \
                .unit = u,                      \
                .flag = f                       \
        }
 129
/* Forward declarations of helpers that are defined further down this file */
static void rapl_init_domains(struct rapl_package *rp);
static int rapl_read_data_raw(struct rapl_domain *rd,
                              enum rapl_primitives prim,
                              bool xlate, u64 *data);
static int rapl_write_data_raw(struct rapl_domain *rd,
                               enum rapl_primitives prim,
                               unsigned long long value);
static u64 rapl_unit_xlate(struct rapl_domain *rd,
                           enum unit_type type, u64 value, int to_raw);
static void package_power_limit_irq_save(struct rapl_package *rp);

static LIST_HEAD(rapl_packages);        /* guarded by CPU hotplug lock */
 142
/*
 * Domain names, indexed by enum rapl_domain_type in rapl_init_domains().
 */
static const char *const rapl_domain_names[] = {
        "package",
        "core",
        "uncore",
        "dram",
        "psys",
};
 150
 151static int get_energy_counter(struct powercap_zone *power_zone,
 152                              u64 *energy_raw)
 153{
 154        struct rapl_domain *rd;
 155        u64 energy_now;
 156
 157        /* prevent CPU hotplug, make sure the RAPL domain does not go
 158         * away while reading the counter.
 159         */
 160        get_online_cpus();
 161        rd = power_zone_to_rapl_domain(power_zone);
 162
 163        if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
 164                *energy_raw = energy_now;
 165                put_online_cpus();
 166
 167                return 0;
 168        }
 169        put_online_cpus();
 170
 171        return -EIO;
 172}
 173
 174static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
 175{
 176        struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
 177
 178        *energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
 179        return 0;
 180}
 181
 182static int release_zone(struct powercap_zone *power_zone)
 183{
 184        struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
 185        struct rapl_package *rp = rd->rp;
 186
 187        /* package zone is the last zone of a package, we can free
 188         * memory here since all children has been unregistered.
 189         */
 190        if (rd->id == RAPL_DOMAIN_PACKAGE) {
 191                kfree(rd);
 192                rp->domains = NULL;
 193        }
 194
 195        return 0;
 196
 197}
 198
 199static int find_nr_power_limit(struct rapl_domain *rd)
 200{
 201        int i, nr_pl = 0;
 202
 203        for (i = 0; i < NR_POWER_LIMITS; i++) {
 204                if (rd->rpl[i].name)
 205                        nr_pl++;
 206        }
 207
 208        return nr_pl;
 209}
 210
 211static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
 212{
 213        struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
 214
 215        if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
 216                return -EACCES;
 217
 218        get_online_cpus();
 219        rapl_write_data_raw(rd, PL1_ENABLE, mode);
 220        if (rapl_defaults->set_floor_freq)
 221                rapl_defaults->set_floor_freq(rd, mode);
 222        put_online_cpus();
 223
 224        return 0;
 225}
 226
 227static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
 228{
 229        struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
 230        u64 val;
 231
 232        if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
 233                *mode = false;
 234                return 0;
 235        }
 236        get_online_cpus();
 237        if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
 238                put_online_cpus();
 239                return -EIO;
 240        }
 241        *mode = val;
 242        put_online_cpus();
 243
 244        return 0;
 245}
 246
/*
 * per RAPL domain ops, in the order of rapl_domain_type.
 * All five entries currently install the same set of callbacks.
 */
static const struct powercap_zone_ops zone_ops[] = {
        /* RAPL_DOMAIN_PACKAGE */
        {
         .get_energy_uj = get_energy_counter,
         .get_max_energy_range_uj = get_max_energy_counter,
         .release = release_zone,
         .set_enable = set_domain_enable,
         .get_enable = get_domain_enable,
         },
        /* RAPL_DOMAIN_PP0 */
        {
         .get_energy_uj = get_energy_counter,
         .get_max_energy_range_uj = get_max_energy_counter,
         .release = release_zone,
         .set_enable = set_domain_enable,
         .get_enable = get_domain_enable,
         },
        /* RAPL_DOMAIN_PP1 */
        {
         .get_energy_uj = get_energy_counter,
         .get_max_energy_range_uj = get_max_energy_counter,
         .release = release_zone,
         .set_enable = set_domain_enable,
         .get_enable = get_domain_enable,
         },
        /* RAPL_DOMAIN_DRAM */
        {
         .get_energy_uj = get_energy_counter,
         .get_max_energy_range_uj = get_max_energy_counter,
         .release = release_zone,
         .set_enable = set_domain_enable,
         .get_enable = get_domain_enable,
         },
        /* RAPL_DOMAIN_PLATFORM */
        {
         .get_energy_uj = get_energy_counter,
         .get_max_energy_range_uj = get_max_energy_counter,
         .release = release_zone,
         .set_enable = set_domain_enable,
         .get_enable = get_domain_enable,
         },
};
 290
 291/*
 292 * Constraint index used by powercap can be different than power limit (PL)
 293 * index in that some  PLs maybe missing due to non-existent MSRs. So we
 294 * need to convert here by finding the valid PLs only (name populated).
 295 */
 296static int contraint_to_pl(struct rapl_domain *rd, int cid)
 297{
 298        int i, j;
 299
 300        for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
 301                if ((rd->rpl[i].name) && j++ == cid) {
 302                        pr_debug("%s: index %d\n", __func__, i);
 303                        return i;
 304                }
 305        }
 306        pr_err("Cannot find matching power limit for constraint %d\n", cid);
 307
 308        return -EINVAL;
 309}
 310
 311static int set_power_limit(struct powercap_zone *power_zone, int cid,
 312                           u64 power_limit)
 313{
 314        struct rapl_domain *rd;
 315        struct rapl_package *rp;
 316        int ret = 0;
 317        int id;
 318
 319        get_online_cpus();
 320        rd = power_zone_to_rapl_domain(power_zone);
 321        id = contraint_to_pl(rd, cid);
 322        if (id < 0) {
 323                ret = id;
 324                goto set_exit;
 325        }
 326
 327        rp = rd->rp;
 328
 329        if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
 330                dev_warn(&power_zone->dev,
 331                         "%s locked by BIOS, monitoring only\n", rd->name);
 332                ret = -EACCES;
 333                goto set_exit;
 334        }
 335
 336        switch (rd->rpl[id].prim_id) {
 337        case PL1_ENABLE:
 338                rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
 339                break;
 340        case PL2_ENABLE:
 341                rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
 342                break;
 343        default:
 344                ret = -EINVAL;
 345        }
 346        if (!ret)
 347                package_power_limit_irq_save(rp);
 348set_exit:
 349        put_online_cpus();
 350        return ret;
 351}
 352
 353static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
 354                                   u64 *data)
 355{
 356        struct rapl_domain *rd;
 357        u64 val;
 358        int prim;
 359        int ret = 0;
 360        int id;
 361
 362        get_online_cpus();
 363        rd = power_zone_to_rapl_domain(power_zone);
 364        id = contraint_to_pl(rd, cid);
 365        if (id < 0) {
 366                ret = id;
 367                goto get_exit;
 368        }
 369
 370        switch (rd->rpl[id].prim_id) {
 371        case PL1_ENABLE:
 372                prim = POWER_LIMIT1;
 373                break;
 374        case PL2_ENABLE:
 375                prim = POWER_LIMIT2;
 376                break;
 377        default:
 378                put_online_cpus();
 379                return -EINVAL;
 380        }
 381        if (rapl_read_data_raw(rd, prim, true, &val))
 382                ret = -EIO;
 383        else
 384                *data = val;
 385
 386get_exit:
 387        put_online_cpus();
 388
 389        return ret;
 390}
 391
 392static int set_time_window(struct powercap_zone *power_zone, int cid,
 393                           u64 window)
 394{
 395        struct rapl_domain *rd;
 396        int ret = 0;
 397        int id;
 398
 399        get_online_cpus();
 400        rd = power_zone_to_rapl_domain(power_zone);
 401        id = contraint_to_pl(rd, cid);
 402        if (id < 0) {
 403                ret = id;
 404                goto set_time_exit;
 405        }
 406
 407        switch (rd->rpl[id].prim_id) {
 408        case PL1_ENABLE:
 409                rapl_write_data_raw(rd, TIME_WINDOW1, window);
 410                break;
 411        case PL2_ENABLE:
 412                rapl_write_data_raw(rd, TIME_WINDOW2, window);
 413                break;
 414        default:
 415                ret = -EINVAL;
 416        }
 417
 418set_time_exit:
 419        put_online_cpus();
 420        return ret;
 421}
 422
 423static int get_time_window(struct powercap_zone *power_zone, int cid,
 424                           u64 *data)
 425{
 426        struct rapl_domain *rd;
 427        u64 val;
 428        int ret = 0;
 429        int id;
 430
 431        get_online_cpus();
 432        rd = power_zone_to_rapl_domain(power_zone);
 433        id = contraint_to_pl(rd, cid);
 434        if (id < 0) {
 435                ret = id;
 436                goto get_time_exit;
 437        }
 438
 439        switch (rd->rpl[id].prim_id) {
 440        case PL1_ENABLE:
 441                ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
 442                break;
 443        case PL2_ENABLE:
 444                ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
 445                break;
 446        default:
 447                put_online_cpus();
 448                return -EINVAL;
 449        }
 450        if (!ret)
 451                *data = val;
 452
 453get_time_exit:
 454        put_online_cpus();
 455
 456        return ret;
 457}
 458
 459static const char *get_constraint_name(struct powercap_zone *power_zone,
 460                                       int cid)
 461{
 462        struct rapl_domain *rd;
 463        int id;
 464
 465        rd = power_zone_to_rapl_domain(power_zone);
 466        id = contraint_to_pl(rd, cid);
 467        if (id >= 0)
 468                return rd->rpl[id].name;
 469
 470        return NULL;
 471}
 472
 473static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
 474{
 475        struct rapl_domain *rd;
 476        u64 val;
 477        int prim;
 478        int ret = 0;
 479
 480        get_online_cpus();
 481        rd = power_zone_to_rapl_domain(power_zone);
 482        switch (rd->rpl[id].prim_id) {
 483        case PL1_ENABLE:
 484                prim = THERMAL_SPEC_POWER;
 485                break;
 486        case PL2_ENABLE:
 487                prim = MAX_POWER;
 488                break;
 489        default:
 490                put_online_cpus();
 491                return -EINVAL;
 492        }
 493        if (rapl_read_data_raw(rd, prim, true, &val))
 494                ret = -EIO;
 495        else
 496                *data = val;
 497
 498        put_online_cpus();
 499
 500        return ret;
 501}
 502
/* Constraint (power limit) callbacks implemented by this driver */
static const struct powercap_zone_constraint_ops constraint_ops = {
        .set_power_limit_uw = set_power_limit,
        .get_power_limit_uw = get_current_power_limit,
        .set_time_window_us = set_time_window,
        .get_time_window_us = get_time_window,
        .get_max_power_uw = get_max_power,
        .get_name = get_constraint_name,
};
 511
 512/* called after domain detection and package level data are set */
 513static void rapl_init_domains(struct rapl_package *rp)
 514{
 515        enum rapl_domain_type i;
 516        enum rapl_domain_reg_id j;
 517        struct rapl_domain *rd = rp->domains;
 518
 519        for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
 520                unsigned int mask = rp->domain_map & (1 << i);
 521
 522                if (!mask)
 523                        continue;
 524
 525                rd->rp = rp;
 526                rd->name = rapl_domain_names[i];
 527                rd->id = i;
 528                rd->rpl[0].prim_id = PL1_ENABLE;
 529                rd->rpl[0].name = pl1_name;
 530                /* some domain may support two power limits */
 531                if (rp->priv->limits[i] == 2) {
 532                        rd->rpl[1].prim_id = PL2_ENABLE;
 533                        rd->rpl[1].name = pl2_name;
 534                }
 535
 536                for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
 537                        rd->regs[j] = rp->priv->regs[i][j];
 538
 539                if (i == RAPL_DOMAIN_DRAM) {
 540                        rd->domain_energy_unit =
 541                            rapl_defaults->dram_domain_energy_unit;
 542                        if (rd->domain_energy_unit)
 543                                pr_info("DRAM domain energy unit %dpj\n",
 544                                        rd->domain_energy_unit);
 545                }
 546                rd++;
 547        }
 548}
 549
 550static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
 551                           u64 value, int to_raw)
 552{
 553        u64 units = 1;
 554        struct rapl_package *rp = rd->rp;
 555        u64 scale = 1;
 556
 557        switch (type) {
 558        case POWER_UNIT:
 559                units = rp->power_unit;
 560                break;
 561        case ENERGY_UNIT:
 562                scale = ENERGY_UNIT_SCALE;
 563                /* per domain unit takes precedence */
 564                if (rd->domain_energy_unit)
 565                        units = rd->domain_energy_unit;
 566                else
 567                        units = rp->energy_unit;
 568                break;
 569        case TIME_UNIT:
 570                return rapl_defaults->compute_time_window(rp, value, to_raw);
 571        case ARBITRARY_UNIT:
 572        default:
 573                return value;
 574        };
 575
 576        if (to_raw)
 577                return div64_u64(value, units) * scale;
 578
 579        value *= units;
 580
 581        return div64_u64(value, scale);
 582}
 583
/*
 * Primitive table, in the order of enum rapl_primitives (it is indexed
 * by primitive in rapl_read_data_raw() and rapl_write_data_raw()).
 * The table is terminated by an entry with a NULL name.
 */
static struct rapl_primitive_info rpi[] = {
        /* name, mask, shift, register id, unit type, flag */
        PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
                            RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
        PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
                            RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
        PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
                            RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
        PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
                            RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
        PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
                            RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
        PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
                            RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
        PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
                            RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
        PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
                            RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
        PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
                            RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
        PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
                            RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
        PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
                            0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
        PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
                            RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
        PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
                            RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
        PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
                            RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
        PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
                            RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
        PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
                            RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
        /* non-hardware */
        PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
                            RAPL_PRIMITIVE_DERIVED),
        /* end-of-table marker: name == NULL */
        {NULL, 0, 0, 0},
};
 624
 625/* Read primitive data based on its related struct rapl_primitive_info.
 626 * if xlate flag is set, return translated data based on data units, i.e.
 627 * time, energy, and power.
 628 * RAPL MSRs are non-architectual and are laid out not consistently across
 629 * domains. Here we use primitive info to allow writing consolidated access
 630 * functions.
 631 * For a given primitive, it is processed by MSR mask and shift. Unit conversion
 632 * is pre-assigned based on RAPL unit MSRs read at init time.
 633 * 63-------------------------- 31--------------------------- 0
 634 * |                           xxxxx (mask)                   |
 635 * |                                |<- shift ----------------|
 636 * 63-------------------------- 31--------------------------- 0
 637 */
 638static int rapl_read_data_raw(struct rapl_domain *rd,
 639                              enum rapl_primitives prim, bool xlate, u64 *data)
 640{
 641        u64 value;
 642        struct rapl_primitive_info *rp = &rpi[prim];
 643        struct reg_action ra;
 644        int cpu;
 645
 646        if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
 647                return -EINVAL;
 648
 649        ra.reg = rd->regs[rp->id];
 650        if (!ra.reg)
 651                return -EINVAL;
 652
 653        cpu = rd->rp->lead_cpu;
 654
 655        /* domain with 2 limits has different bit */
 656        if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
 657                rp->mask = POWER_HIGH_LOCK;
 658                rp->shift = 63;
 659        }
 660        /* non-hardware data are collected by the polling thread */
 661        if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
 662                *data = rd->rdd.primitives[prim];
 663                return 0;
 664        }
 665
 666        ra.mask = rp->mask;
 667
 668        if (rd->rp->priv->read_raw(cpu, &ra)) {
 669                pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
 670                return -EIO;
 671        }
 672
 673        value = ra.value >> rp->shift;
 674
 675        if (xlate)
 676                *data = rapl_unit_xlate(rd, rp->unit, value, 0);
 677        else
 678                *data = value;
 679
 680        return 0;
 681}
 682
 683/* Similar use of primitive info in the read counterpart */
 684static int rapl_write_data_raw(struct rapl_domain *rd,
 685                               enum rapl_primitives prim,
 686                               unsigned long long value)
 687{
 688        struct rapl_primitive_info *rp = &rpi[prim];
 689        int cpu;
 690        u64 bits;
 691        struct reg_action ra;
 692        int ret;
 693
 694        cpu = rd->rp->lead_cpu;
 695        bits = rapl_unit_xlate(rd, rp->unit, value, 1);
 696        bits <<= rp->shift;
 697        bits &= rp->mask;
 698
 699        memset(&ra, 0, sizeof(ra));
 700
 701        ra.reg = rd->regs[rp->id];
 702        ra.mask = rp->mask;
 703        ra.value = bits;
 704
 705        ret = rd->rp->priv->write_raw(cpu, &ra);
 706
 707        return ret;
 708}
 709
 710/*
 711 * Raw RAPL data stored in MSRs are in certain scales. We need to
 712 * convert them into standard units based on the units reported in
 713 * the RAPL unit MSRs. This is specific to CPUs as the method to
 714 * calculate units differ on different CPUs.
 715 * We convert the units to below format based on CPUs.
 716 * i.e.
 717 * energy unit: picoJoules  : Represented in picoJoules by default
 718 * power unit : microWatts  : Represented in milliWatts by default
 719 * time unit  : microseconds: Represented in seconds by default
 720 */
 721static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
 722{
 723        struct reg_action ra;
 724        u32 value;
 725
 726        ra.reg = rp->priv->reg_unit;
 727        ra.mask = ~0;
 728        if (rp->priv->read_raw(cpu, &ra)) {
 729                pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
 730                       rp->priv->reg_unit, cpu);
 731                return -ENODEV;
 732        }
 733
 734        value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
 735        rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
 736
 737        value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
 738        rp->power_unit = 1000000 / (1 << value);
 739
 740        value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
 741        rp->time_unit = 1000000 / (1 << value);
 742
 743        pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
 744                 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
 745
 746        return 0;
 747}
 748
 749static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
 750{
 751        struct reg_action ra;
 752        u32 value;
 753
 754        ra.reg = rp->priv->reg_unit;
 755        ra.mask = ~0;
 756        if (rp->priv->read_raw(cpu, &ra)) {
 757                pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
 758                       rp->priv->reg_unit, cpu);
 759                return -ENODEV;
 760        }
 761
 762        value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
 763        rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
 764
 765        value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
 766        rp->power_unit = (1 << value) * 1000;
 767
 768        value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
 769        rp->time_unit = 1000000 / (1 << value);
 770
 771        pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
 772                 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
 773
 774        return 0;
 775}
 776
 777static void power_limit_irq_save_cpu(void *info)
 778{
 779        u32 l, h = 0;
 780        struct rapl_package *rp = (struct rapl_package *)info;
 781
 782        /* save the state of PLN irq mask bit before disabling it */
 783        rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
 784        if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
 785                rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
 786                rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
 787        }
 788        l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
 789        wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
 790}
 791
 792/* REVISIT:
 793 * When package power limit is set artificially low by RAPL, LVT
 794 * thermal interrupt for package power limit should be ignored
 795 * since we are not really exceeding the real limit. The intention
 796 * is to avoid excessive interrupts while we are trying to save power.
 797 * A useful feature might be routing the package_power_limit interrupt
 798 * to userspace via eventfd. once we have a usecase, this is simple
 799 * to do by adding an atomic notifier.
 800 */
 801
 802static void package_power_limit_irq_save(struct rapl_package *rp)
 803{
 804        if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
 805                return;
 806
 807        smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
 808}
 809
 810/*
 811 * Restore per package power limit interrupt enable state. Called from cpu
 812 * hotplug code on package removal.
 813 */
 814static void package_power_limit_irq_restore(struct rapl_package *rp)
 815{
 816        u32 l, h;
 817
 818        if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
 819                return;
 820
 821        /* irq enable state not saved, nothing to restore */
 822        if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
 823                return;
 824
 825        rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
 826
 827        if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
 828                l |= PACKAGE_THERM_INT_PLN_ENABLE;
 829        else
 830                l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
 831
 832        wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
 833}
 834
 835static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
 836{
 837        int nr_powerlimit = find_nr_power_limit(rd);
 838
 839        /* always enable clamp such that p-state can go below OS requested
 840         * range. power capping priority over guranteed frequency.
 841         */
 842        rapl_write_data_raw(rd, PL1_CLAMP, mode);
 843
 844        /* some domains have pl2 */
 845        if (nr_powerlimit > 1) {
 846                rapl_write_data_raw(rd, PL2_ENABLE, mode);
 847                rapl_write_data_raw(rd, PL2_CLAMP, mode);
 848        }
 849}
 850
 851static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
 852{
 853        static u32 power_ctrl_orig_val;
 854        u32 mdata;
 855
 856        if (!rapl_defaults->floor_freq_reg_addr) {
 857                pr_err("Invalid floor frequency config register\n");
 858                return;
 859        }
 860
 861        if (!power_ctrl_orig_val)
 862                iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
 863                              rapl_defaults->floor_freq_reg_addr,
 864                              &power_ctrl_orig_val);
 865        mdata = power_ctrl_orig_val;
 866        if (enable) {
 867                mdata &= ~(0x7f << 8);
 868                mdata |= 1 << 8;
 869        }
 870        iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
 871                       rapl_defaults->floor_freq_reg_addr, mdata);
 872}
 873
 874static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
 875                                         bool to_raw)
 876{
 877        u64 f, y;               /* fraction and exp. used for time unit */
 878
 879        /*
 880         * Special processing based on 2^Y*(1+F/4), refer
 881         * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
 882         */
 883        if (!to_raw) {
 884                f = (value & 0x60) >> 5;
 885                y = value & 0x1f;
 886                value = (1 << y) * (4 + f) * rp->time_unit / 4;
 887        } else {
 888                do_div(value, rp->time_unit);
 889                y = ilog2(value);
 890                f = div64_u64(4 * (value - (1 << y)), 1 << y);
 891                value = (y & 0x1f) | ((f & 0x3) << 5);
 892        }
 893        return value;
 894}
 895
 896static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
 897                                         bool to_raw)
 898{
 899        /*
 900         * Atom time unit encoding is straight forward val * time_unit,
 901         * where time_unit is default to 1 sec. Never 0.
 902         */
 903        if (!to_raw)
 904                return (value) ? value *= rp->time_unit : rp->time_unit;
 905
 906        value = div64_u64(value, rp->time_unit);
 907
 908        return value;
 909}
 910
/*
 * Per-CPU-model default tables. One of these is attached to each entry of
 * rapl_ids[] and becomes the global rapl_defaults in rapl_init().
 */

/* Core CPUs: core unit check and time window encoding, no IOSF register */
static const struct rapl_defaults rapl_defaults_core = {
        .floor_freq_reg_addr = 0,
        .check_unit = rapl_check_unit_core,
        .set_floor_freq = set_floor_freq_default,
        .compute_time_window = rapl_compute_time_window_core,
};

/*
 * Same callbacks as rapl_defaults_core plus a fixed DRAM domain energy unit.
 * NOTE(review): the unit of 15300 is not visible here - confirm against the
 * code that consumes dram_domain_energy_unit.
 */
static const struct rapl_defaults rapl_defaults_hsw_server = {
        .check_unit = rapl_check_unit_core,
        .set_floor_freq = set_floor_freq_default,
        .compute_time_window = rapl_compute_time_window_core,
        .dram_domain_energy_unit = 15300,
};

/* Atom callbacks; floor frequency is set through the BYT IOSF register */
static const struct rapl_defaults rapl_defaults_byt = {
        .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
        .check_unit = rapl_check_unit_atom,
        .set_floor_freq = set_floor_freq_atom,
        .compute_time_window = rapl_compute_time_window_atom,
};

/* Atom callbacks; floor frequency is set through the TNG IOSF register */
static const struct rapl_defaults rapl_defaults_tng = {
        .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
        .check_unit = rapl_check_unit_atom,
        .set_floor_freq = set_floor_freq_atom,
        .compute_time_window = rapl_compute_time_window_atom,
};

/* Atom unit check and time window, no floor frequency callback */
static const struct rapl_defaults rapl_defaults_ann = {
        .floor_freq_reg_addr = 0,
        .check_unit = rapl_check_unit_atom,
        .set_floor_freq = NULL,
        .compute_time_window = rapl_compute_time_window_atom,
};

/* Atom unit check and time window, no floor frequency callback */
static const struct rapl_defaults rapl_defaults_cht = {
        .floor_freq_reg_addr = 0,
        .check_unit = rapl_check_unit_atom,
        .set_floor_freq = NULL,
        .compute_time_window = rapl_compute_time_window_atom,
};
 952
/*
 * Supported CPU models and the defaults table each one uses. rapl_init()
 * matches the running CPU against this table with x86_match_cpu() and takes
 * the defaults from the matching entry's driver_data.
 * NOTE(review): INTEL_CPU_FAM6() is defined outside this file; presumably it
 * stores the address of its second argument in driver_data - confirm.
 */
static const struct x86_cpu_id rapl_ids[] __initconst = {
        INTEL_CPU_FAM6(SANDYBRIDGE, rapl_defaults_core),
        INTEL_CPU_FAM6(SANDYBRIDGE_X, rapl_defaults_core),

        INTEL_CPU_FAM6(IVYBRIDGE, rapl_defaults_core),
        INTEL_CPU_FAM6(IVYBRIDGE_X, rapl_defaults_core),

        INTEL_CPU_FAM6(HASWELL_CORE, rapl_defaults_core),
        INTEL_CPU_FAM6(HASWELL_ULT, rapl_defaults_core),
        INTEL_CPU_FAM6(HASWELL_GT3E, rapl_defaults_core),
        INTEL_CPU_FAM6(HASWELL_X, rapl_defaults_hsw_server),

        INTEL_CPU_FAM6(BROADWELL_CORE, rapl_defaults_core),
        INTEL_CPU_FAM6(BROADWELL_GT3E, rapl_defaults_core),
        INTEL_CPU_FAM6(BROADWELL_XEON_D, rapl_defaults_core),
        INTEL_CPU_FAM6(BROADWELL_X, rapl_defaults_hsw_server),

        INTEL_CPU_FAM6(SKYLAKE_DESKTOP, rapl_defaults_core),
        INTEL_CPU_FAM6(SKYLAKE_MOBILE, rapl_defaults_core),
        INTEL_CPU_FAM6(SKYLAKE_X, rapl_defaults_hsw_server),
        INTEL_CPU_FAM6(KABYLAKE_MOBILE, rapl_defaults_core),
        INTEL_CPU_FAM6(KABYLAKE_DESKTOP, rapl_defaults_core),
        INTEL_CPU_FAM6(CANNONLAKE_MOBILE, rapl_defaults_core),
        INTEL_CPU_FAM6(ICELAKE_MOBILE, rapl_defaults_core),
        INTEL_CPU_FAM6(ICELAKE_DESKTOP, rapl_defaults_core),
        INTEL_CPU_FAM6(ICELAKE_NNPI, rapl_defaults_core),
        INTEL_CPU_FAM6(ICELAKE_X, rapl_defaults_hsw_server),
        INTEL_CPU_FAM6(ICELAKE_XEON_D, rapl_defaults_hsw_server),

        /* older Atom parts use the Atom defaults, Goldmont onwards use core */
        INTEL_CPU_FAM6(ATOM_SILVERMONT, rapl_defaults_byt),
        INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht),
        INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, rapl_defaults_tng),
        INTEL_CPU_FAM6(ATOM_AIRMONT_MID, rapl_defaults_ann),
        INTEL_CPU_FAM6(ATOM_GOLDMONT, rapl_defaults_core),
        INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, rapl_defaults_core),
        INTEL_CPU_FAM6(ATOM_GOLDMONT_X, rapl_defaults_core),
        INTEL_CPU_FAM6(ATOM_TREMONT_X, rapl_defaults_core),

        INTEL_CPU_FAM6(XEON_PHI_KNL, rapl_defaults_hsw_server),
        INTEL_CPU_FAM6(XEON_PHI_KNM, rapl_defaults_hsw_server),
        {}      /* empty terminating entry */
};

MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
 997
 998/* Read once for all raw primitive data for domains */
 999static void rapl_update_domain_data(struct rapl_package *rp)
1000{
1001        int dmn, prim;
1002        u64 val;
1003
1004        for (dmn = 0; dmn < rp->nr_domains; dmn++) {
1005                pr_debug("update %s domain %s data\n", rp->name,
1006                         rp->domains[dmn].name);
1007                /* exclude non-raw primitives */
1008                for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
1009                        if (!rapl_read_data_raw(&rp->domains[dmn], prim,
1010                                                rpi[prim].unit, &val))
1011                                rp->domains[dmn].rdd.primitives[prim] = val;
1012                }
1013        }
1014
1015}
1016
1017static int rapl_package_register_powercap(struct rapl_package *rp)
1018{
1019        struct rapl_domain *rd;
1020        struct powercap_zone *power_zone = NULL;
1021        int nr_pl, ret;
1022
1023        /* Update the domain data of the new package */
1024        rapl_update_domain_data(rp);
1025
1026        /* first we register package domain as the parent zone */
1027        for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1028                if (rd->id == RAPL_DOMAIN_PACKAGE) {
1029                        nr_pl = find_nr_power_limit(rd);
1030                        pr_debug("register package domain %s\n", rp->name);
1031                        power_zone = powercap_register_zone(&rd->power_zone,
1032                                            rp->priv->control_type, rp->name,
1033                                            NULL, &zone_ops[rd->id], nr_pl,
1034                                            &constraint_ops);
1035                        if (IS_ERR(power_zone)) {
1036                                pr_debug("failed to register power zone %s\n",
1037                                         rp->name);
1038                                return PTR_ERR(power_zone);
1039                        }
1040                        /* track parent zone in per package/socket data */
1041                        rp->power_zone = power_zone;
1042                        /* done, only one package domain per socket */
1043                        break;
1044                }
1045        }
1046        if (!power_zone) {
1047                pr_err("no package domain found, unknown topology!\n");
1048                return -ENODEV;
1049        }
1050        /* now register domains as children of the socket/package */
1051        for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1052                if (rd->id == RAPL_DOMAIN_PACKAGE)
1053                        continue;
1054                /* number of power limits per domain varies */
1055                nr_pl = find_nr_power_limit(rd);
1056                power_zone = powercap_register_zone(&rd->power_zone,
1057                                                    rp->priv->control_type,
1058                                                    rd->name, rp->power_zone,
1059                                                    &zone_ops[rd->id], nr_pl,
1060                                                    &constraint_ops);
1061
1062                if (IS_ERR(power_zone)) {
1063                        pr_debug("failed to register power_zone, %s:%s\n",
1064                                 rp->name, rd->name);
1065                        ret = PTR_ERR(power_zone);
1066                        goto err_cleanup;
1067                }
1068        }
1069        return 0;
1070
1071err_cleanup:
1072        /*
1073         * Clean up previously initialized domains within the package if we
1074         * failed after the first domain setup.
1075         */
1076        while (--rd >= rp->domains) {
1077                pr_debug("unregister %s domain %s\n", rp->name, rd->name);
1078                powercap_unregister_zone(rp->priv->control_type,
1079                                         &rd->power_zone);
1080        }
1081
1082        return ret;
1083}
1084
1085int rapl_add_platform_domain(struct rapl_if_priv *priv)
1086{
1087        struct rapl_domain *rd;
1088        struct powercap_zone *power_zone;
1089        struct reg_action ra;
1090        int ret;
1091
1092        ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1093        ra.mask = ~0;
1094        ret = priv->read_raw(0, &ra);
1095        if (ret || !ra.value)
1096                return -ENODEV;
1097
1098        ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1099        ra.mask = ~0;
1100        ret = priv->read_raw(0, &ra);
1101        if (ret || !ra.value)
1102                return -ENODEV;
1103
1104        rd = kzalloc(sizeof(*rd), GFP_KERNEL);
1105        if (!rd)
1106                return -ENOMEM;
1107
1108        rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
1109        rd->id = RAPL_DOMAIN_PLATFORM;
1110        rd->regs[RAPL_DOMAIN_REG_LIMIT] =
1111            priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1112        rd->regs[RAPL_DOMAIN_REG_STATUS] =
1113            priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1114        rd->rpl[0].prim_id = PL1_ENABLE;
1115        rd->rpl[0].name = pl1_name;
1116        rd->rpl[1].prim_id = PL2_ENABLE;
1117        rd->rpl[1].name = pl2_name;
1118        rd->rp = rapl_find_package_domain(0, priv);
1119
1120        power_zone = powercap_register_zone(&rd->power_zone, priv->control_type,
1121                                            "psys", NULL,
1122                                            &zone_ops[RAPL_DOMAIN_PLATFORM],
1123                                            2, &constraint_ops);
1124
1125        if (IS_ERR(power_zone)) {
1126                kfree(rd);
1127                return PTR_ERR(power_zone);
1128        }
1129
1130        priv->platform_rapl_domain = rd;
1131
1132        return 0;
1133}
1134EXPORT_SYMBOL_GPL(rapl_add_platform_domain);
1135
1136void rapl_remove_platform_domain(struct rapl_if_priv *priv)
1137{
1138        if (priv->platform_rapl_domain) {
1139                powercap_unregister_zone(priv->control_type,
1140                                 &priv->platform_rapl_domain->power_zone);
1141                kfree(priv->platform_rapl_domain);
1142        }
1143}
1144EXPORT_SYMBOL_GPL(rapl_remove_platform_domain);
1145
1146static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
1147{
1148        struct reg_action ra;
1149
1150        switch (domain) {
1151        case RAPL_DOMAIN_PACKAGE:
1152        case RAPL_DOMAIN_PP0:
1153        case RAPL_DOMAIN_PP1:
1154        case RAPL_DOMAIN_DRAM:
1155                ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
1156                break;
1157        case RAPL_DOMAIN_PLATFORM:
1158                /* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
1159                return -EINVAL;
1160        default:
1161                pr_err("invalid domain id %d\n", domain);
1162                return -EINVAL;
1163        }
1164        /* make sure domain counters are available and contains non-zero
1165         * values, otherwise skip it.
1166         */
1167
1168        ra.mask = ~0;
1169        if (rp->priv->read_raw(cpu, &ra) || !ra.value)
1170                return -ENODEV;
1171
1172        return 0;
1173}
1174
1175/*
1176 * Check if power limits are available. Two cases when they are not available:
1177 * 1. Locked by BIOS, in this case we still provide read-only access so that
1178 *    users can see what limit is set by the BIOS.
1179 * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1180 *    exist at all. In this case, we do not show the constraints in powercap.
1181 *
1182 * Called after domains are detected and initialized.
1183 */
1184static void rapl_detect_powerlimit(struct rapl_domain *rd)
1185{
1186        u64 val64;
1187        int i;
1188
1189        /* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
1190        if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
1191                if (val64) {
1192                        pr_info("RAPL %s domain %s locked by BIOS\n",
1193                                rd->rp->name, rd->name);
1194                        rd->state |= DOMAIN_STATE_BIOS_LOCKED;
1195                }
1196        }
1197        /* check if power limit MSR exists, otherwise domain is monitoring only */
1198        for (i = 0; i < NR_POWER_LIMITS; i++) {
1199                int prim = rd->rpl[i].prim_id;
1200
1201                if (rapl_read_data_raw(rd, prim, false, &val64))
1202                        rd->rpl[i].name = NULL;
1203        }
1204}
1205
1206/* Detect active and valid domains for the given CPU, caller must
1207 * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
1208 */
1209static int rapl_detect_domains(struct rapl_package *rp, int cpu)
1210{
1211        struct rapl_domain *rd;
1212        int i;
1213
1214        for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
1215                /* use physical package id to read counters */
1216                if (!rapl_check_domain(cpu, i, rp)) {
1217                        rp->domain_map |= 1 << i;
1218                        pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
1219                }
1220        }
1221        rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
1222        if (!rp->nr_domains) {
1223                pr_debug("no valid rapl domains found in %s\n", rp->name);
1224                return -ENODEV;
1225        }
1226        pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
1227
1228        rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
1229                              GFP_KERNEL);
1230        if (!rp->domains)
1231                return -ENOMEM;
1232
1233        rapl_init_domains(rp);
1234
1235        for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
1236                rapl_detect_powerlimit(rd);
1237
1238        return 0;
1239}
1240
1241/* called from CPU hotplug notifier, hotplug lock held */
1242void rapl_remove_package(struct rapl_package *rp)
1243{
1244        struct rapl_domain *rd, *rd_package = NULL;
1245
1246        package_power_limit_irq_restore(rp);
1247
1248        for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1249                rapl_write_data_raw(rd, PL1_ENABLE, 0);
1250                rapl_write_data_raw(rd, PL1_CLAMP, 0);
1251                if (find_nr_power_limit(rd) > 1) {
1252                        rapl_write_data_raw(rd, PL2_ENABLE, 0);
1253                        rapl_write_data_raw(rd, PL2_CLAMP, 0);
1254                }
1255                if (rd->id == RAPL_DOMAIN_PACKAGE) {
1256                        rd_package = rd;
1257                        continue;
1258                }
1259                pr_debug("remove package, undo power limit on %s: %s\n",
1260                         rp->name, rd->name);
1261                powercap_unregister_zone(rp->priv->control_type,
1262                                         &rd->power_zone);
1263        }
1264        /* do parent zone last */
1265        powercap_unregister_zone(rp->priv->control_type,
1266                                 &rd_package->power_zone);
1267        list_del(&rp->plist);
1268        kfree(rp);
1269}
1270EXPORT_SYMBOL_GPL(rapl_remove_package);
1271
1272/* caller to ensure CPU hotplug lock is held */
1273struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
1274{
1275        int id = topology_logical_die_id(cpu);
1276        struct rapl_package *rp;
1277
1278        list_for_each_entry(rp, &rapl_packages, plist) {
1279                if (rp->id == id
1280                    && rp->priv->control_type == priv->control_type)
1281                        return rp;
1282        }
1283
1284        return NULL;
1285}
1286EXPORT_SYMBOL_GPL(rapl_find_package_domain);
1287
1288/* called from CPU hotplug notifier, hotplug lock held */
1289struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
1290{
1291        int id = topology_logical_die_id(cpu);
1292        struct rapl_package *rp;
1293        struct cpuinfo_x86 *c = &cpu_data(cpu);
1294        int ret;
1295
1296        rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
1297        if (!rp)
1298                return ERR_PTR(-ENOMEM);
1299
1300        /* add the new package to the list */
1301        rp->id = id;
1302        rp->lead_cpu = cpu;
1303        rp->priv = priv;
1304
1305        if (topology_max_die_per_package() > 1)
1306                snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
1307                         "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
1308        else
1309                snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
1310                         c->phys_proc_id);
1311
1312        /* check if the package contains valid domains */
1313        if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
1314                ret = -ENODEV;
1315                goto err_free_package;
1316        }
1317        ret = rapl_package_register_powercap(rp);
1318        if (!ret) {
1319                INIT_LIST_HEAD(&rp->plist);
1320                list_add(&rp->plist, &rapl_packages);
1321                return rp;
1322        }
1323
1324err_free_package:
1325        kfree(rp->domains);
1326        kfree(rp);
1327        return ERR_PTR(ret);
1328}
1329EXPORT_SYMBOL_GPL(rapl_add_package);
1330
1331static void power_limit_state_save(void)
1332{
1333        struct rapl_package *rp;
1334        struct rapl_domain *rd;
1335        int nr_pl, ret, i;
1336
1337        get_online_cpus();
1338        list_for_each_entry(rp, &rapl_packages, plist) {
1339                if (!rp->power_zone)
1340                        continue;
1341                rd = power_zone_to_rapl_domain(rp->power_zone);
1342                nr_pl = find_nr_power_limit(rd);
1343                for (i = 0; i < nr_pl; i++) {
1344                        switch (rd->rpl[i].prim_id) {
1345                        case PL1_ENABLE:
1346                                ret = rapl_read_data_raw(rd,
1347                                                 POWER_LIMIT1, true,
1348                                                 &rd->rpl[i].last_power_limit);
1349                                if (ret)
1350                                        rd->rpl[i].last_power_limit = 0;
1351                                break;
1352                        case PL2_ENABLE:
1353                                ret = rapl_read_data_raw(rd,
1354                                                 POWER_LIMIT2, true,
1355                                                 &rd->rpl[i].last_power_limit);
1356                                if (ret)
1357                                        rd->rpl[i].last_power_limit = 0;
1358                                break;
1359                        }
1360                }
1361        }
1362        put_online_cpus();
1363}
1364
1365static void power_limit_state_restore(void)
1366{
1367        struct rapl_package *rp;
1368        struct rapl_domain *rd;
1369        int nr_pl, i;
1370
1371        get_online_cpus();
1372        list_for_each_entry(rp, &rapl_packages, plist) {
1373                if (!rp->power_zone)
1374                        continue;
1375                rd = power_zone_to_rapl_domain(rp->power_zone);
1376                nr_pl = find_nr_power_limit(rd);
1377                for (i = 0; i < nr_pl; i++) {
1378                        switch (rd->rpl[i].prim_id) {
1379                        case PL1_ENABLE:
1380                                if (rd->rpl[i].last_power_limit)
1381                                        rapl_write_data_raw(rd, POWER_LIMIT1,
1382                                            rd->rpl[i].last_power_limit);
1383                                break;
1384                        case PL2_ENABLE:
1385                                if (rd->rpl[i].last_power_limit)
1386                                        rapl_write_data_raw(rd, POWER_LIMIT2,
1387                                            rd->rpl[i].last_power_limit);
1388                                break;
1389                        }
1390                }
1391        }
1392        put_online_cpus();
1393}
1394
1395static int rapl_pm_callback(struct notifier_block *nb,
1396                            unsigned long mode, void *_unused)
1397{
1398        switch (mode) {
1399        case PM_SUSPEND_PREPARE:
1400                power_limit_state_save();
1401                break;
1402        case PM_POST_SUSPEND:
1403                power_limit_state_restore();
1404                break;
1405        }
1406        return NOTIFY_OK;
1407}
1408
/* PM notifier, registered in rapl_init() and removed in rapl_exit() */
static struct notifier_block rapl_pm_notifier = {
        .notifier_call = rapl_pm_callback,
};

/* "intel_rapl_msr" platform device, created in rapl_init() */
static struct platform_device *rapl_msr_platdev;
1414
1415static int __init rapl_init(void)
1416{
1417        const struct x86_cpu_id *id;
1418        int ret;
1419
1420        id = x86_match_cpu(rapl_ids);
1421        if (!id) {
1422                pr_err("driver does not support CPU family %d model %d\n",
1423                       boot_cpu_data.x86, boot_cpu_data.x86_model);
1424
1425                return -ENODEV;
1426        }
1427
1428        rapl_defaults = (struct rapl_defaults *)id->driver_data;
1429
1430        ret = register_pm_notifier(&rapl_pm_notifier);
1431        if (ret)
1432                return ret;
1433
1434        rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
1435        if (!rapl_msr_platdev) {
1436                ret = -ENOMEM;
1437                goto end;
1438        }
1439
1440        ret = platform_device_add(rapl_msr_platdev);
1441        if (ret)
1442                platform_device_put(rapl_msr_platdev);
1443
1444end:
1445        if (ret)
1446                unregister_pm_notifier(&rapl_pm_notifier);
1447
1448        return ret;
1449}
1450
/*
 * Module exit: undo rapl_init() in reverse order - remove the platform
 * device first, then the PM notifier.
 */
static void __exit rapl_exit(void)
{
        platform_device_unregister(rapl_msr_platdev);
        unregister_pm_notifier(&rapl_pm_notifier);
}
1456
/* rapl_init() is registered with fs_initcall() rather than module_init() */
fs_initcall(rapl_init);
module_exit(rapl_exit);

MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
MODULE_LICENSE("GPL v2");
1463