linux/drivers/cpufreq/intel_pstate.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * intel_pstate.c: Native P state management for Intel processors
   4 *
   5 * (C) Copyright 2012 Intel Corporation
   6 * Author: Dirk Brandewie <dirk.j.brandewie@intel.com>
   7 */
   8
   9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  10
  11#include <linux/kernel.h>
  12#include <linux/kernel_stat.h>
  13#include <linux/module.h>
  14#include <linux/ktime.h>
  15#include <linux/hrtimer.h>
  16#include <linux/tick.h>
  17#include <linux/slab.h>
  18#include <linux/sched/cpufreq.h>
  19#include <linux/list.h>
  20#include <linux/cpu.h>
  21#include <linux/cpufreq.h>
  22#include <linux/sysfs.h>
  23#include <linux/types.h>
  24#include <linux/fs.h>
  25#include <linux/acpi.h>
  26#include <linux/vmalloc.h>
  27#include <linux/pm_qos.h>
  28#include <trace/events/power.h>
  29
  30#include <asm/div64.h>
  31#include <asm/msr.h>
  32#include <asm/cpu_device_id.h>
  33#include <asm/cpufeature.h>
  34#include <asm/intel-family.h>
  35
  36#define INTEL_PSTATE_SAMPLING_INTERVAL  (10 * NSEC_PER_MSEC)
  37
  38#define INTEL_CPUFREQ_TRANSITION_LATENCY        20000
  39#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP      5000
  40#define INTEL_CPUFREQ_TRANSITION_DELAY          500
  41
  42#ifdef CONFIG_ACPI
  43#include <acpi/processor.h>
  44#include <acpi/cppc_acpi.h>
  45#endif
  46
  47#define FRAC_BITS 8
  48#define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
  49#define fp_toint(X) ((X) >> FRAC_BITS)
  50
  51#define ONE_EIGHTH_FP ((int64_t)1 << (FRAC_BITS - 3))
  52
  53#define EXT_BITS 6
  54#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
  55#define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS)
  56#define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS)
  57
  58static inline int32_t mul_fp(int32_t x, int32_t y)
  59{
  60        return ((int64_t)x * (int64_t)y) >> FRAC_BITS;
  61}
  62
  63static inline int32_t div_fp(s64 x, s64 y)
  64{
  65        return div64_s64((int64_t)x << FRAC_BITS, y);
  66}
  67
  68static inline int ceiling_fp(int32_t x)
  69{
  70        int mask, ret;
  71
  72        ret = fp_toint(x);
  73        mask = (1 << FRAC_BITS) - 1;
  74        if (x & mask)
  75                ret += 1;
  76        return ret;
  77}
  78
  79static inline u64 mul_ext_fp(u64 x, u64 y)
  80{
  81        return (x * y) >> EXT_FRAC_BITS;
  82}
  83
  84static inline u64 div_ext_fp(u64 x, u64 y)
  85{
  86        return div64_u64(x << EXT_FRAC_BITS, y);
  87}
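    /*
     * Illustrative example of the fixed-point helpers above (hypothetical
     * values, not part of the driver logic): with FRAC_BITS == 8,
     * int_tofp(3) == 768 and mul_fp(int_tofp(3), int_tofp(2)) == int_tofp(6),
     * since (768 * 512) >> 8 == 1536.  Likewise, div_fp(int_tofp(1),
     * int_tofp(8)) == 32 == ONE_EIGHTH_FP.  The "ext" variants use
     * EXT_FRAC_BITS == 14, so div_ext_fp(3, 4) == 12288 encodes 0.75 and
     * mul_ext_fp(12288, int_ext_tofp(2)) == 24576 encodes 1.5.
     */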
  88
  89/**
  90 * struct sample -      Store performance sample
  91 * @core_avg_perf:      Ratio of APERF/MPERF which is the actual average
  92 *                      performance during last sample period
  93 * @busy_scaled:        Scaled busy value which is used to calculate the next
  94 *                      P state. This can be different from core_avg_perf
  95 *                      to account for CPU idle periods
  96 * @aperf:              Difference of actual performance frequency clock count
  97 *                      read from APERF MSR between last and current sample
  98 * @mperf:              Difference of maximum performance frequency clock count
  99 *                      read from MPERF MSR between last and current sample
 100 * @tsc:                Difference of time stamp counter between last and
 101 *                      current sample
 102 * @time:               Current time from scheduler
 103 *
 104 * This structure is used in the cpudata structure to store performance sample
 105 * data for choosing next P State.
 106 */
 107struct sample {
 108        int32_t core_avg_perf;
 109        int32_t busy_scaled;
 110        u64 aperf;
 111        u64 mperf;
 112        u64 tsc;
 113        u64 time;
 114};
 115
 116/**
 117 * struct pstate_data - Store P state data
 118 * @current_pstate:     Current requested P state
 119 * @min_pstate:         Min P state possible for this platform
 120 * @max_pstate:         Max P state possible for this platform
 121 * @max_pstate_physical: Physical max P state for the processor.
 122 *                      This can be higher than max_pstate, which can be
 123 *                      limited by platform thermal design power limits
 124 * @perf_ctl_scaling:   PERF_CTL P-state to frequency scaling factor
 125 * @scaling:            Scaling factor between performance and frequency
 126 * @turbo_pstate:       Max Turbo P state possible for this platform
 127 * @min_freq:           @min_pstate frequency in cpufreq units
 128 * @max_freq:           @max_pstate frequency in cpufreq units
 129 * @turbo_freq:         @turbo_pstate frequency in cpufreq units
 130 *
 131 * Stores the per cpu model P state limits and current P state.
 132 */
 133struct pstate_data {
 134        int     current_pstate;
 135        int     min_pstate;
 136        int     max_pstate;
 137        int     max_pstate_physical;
 138        int     perf_ctl_scaling;
 139        int     scaling;
 140        int     turbo_pstate;
 141        unsigned int min_freq;
 142        unsigned int max_freq;
 143        unsigned int turbo_freq;
 144};
 145
 146/**
 147 * struct vid_data -    Stores voltage information data
 148 * @min:                VID data for this platform corresponding to
 149 *                      the lowest P state
 150 * @max:                VID data corresponding to the highest P State.
 151 * @turbo:              VID data for turbo P state
 152 * @ratio:              Ratio of (vid max - vid min) /
 153 *                      (max P state - Min P State)
 154 *
 155 * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling).
 156 * This data is used on Atom platforms, where, in addition to the target
 157 * P state, the voltage data needs to be specified to select the next P state.
 158 */
 159struct vid_data {
 160        int min;
 161        int max;
 162        int turbo;
 163        int32_t ratio;
 164};
 165
 166/**
 167 * struct global_params - Global parameters, mostly tunable via sysfs.
 168 * @no_turbo:           Whether or not to use turbo P-states.
 169 * @turbo_disabled:     Whether or not turbo P-states are available at all,
 170 *                      based on the MSR_IA32_MISC_ENABLE value and whether or
 171 *                      not the maximum reported turbo P-state is different from
 172 *                      the maximum reported non-turbo one.
 173 * @turbo_disabled_mf:  The @turbo_disabled value reflected by cpuinfo.max_freq.
 174 * @min_perf_pct:       Minimum capacity limit in percent of the maximum turbo
 175 *                      P-state capacity.
 176 * @max_perf_pct:       Maximum capacity limit in percent of the maximum turbo
 177 *                      P-state capacity.
 178 */
 179struct global_params {
 180        bool no_turbo;
 181        bool turbo_disabled;
 182        bool turbo_disabled_mf;
 183        int max_perf_pct;
 184        int min_perf_pct;
 185};
 186
 187/**
 188 * struct cpudata -     Per CPU instance data storage
 189 * @cpu:                CPU number for this instance data
 190 * @policy:             CPUFreq policy value
 191 * @update_util:        CPUFreq utility callback information
 192 * @update_util_set:    CPUFreq utility callback is set
 193 * @iowait_boost:       iowait-related boost fraction
 194 * @last_update:        Time of the last update.
 195 * @pstate:             Stores P state limits for this CPU
 196 * @vid:                Stores VID limits for this CPU
 197 * @last_sample_time:   Last Sample time
 198 * @aperf_mperf_shift:  APERF vs MPERF counting frequency difference
 199 * @prev_aperf:         Last APERF value read from APERF MSR
 200 * @prev_mperf:         Last MPERF value read from MPERF MSR
 201 * @prev_tsc:           Last timestamp counter (TSC) value
 202 * @prev_cummulative_iowait: IO wait time difference between the last and
 203 *                      current sample
 204 * @sample:             Storage for storing last Sample data
 205 * @min_perf_ratio:     Minimum capacity in terms of PERF or HWP ratios
 206 * @max_perf_ratio:     Maximum capacity in terms of PERF or HWP ratios
 207 * @acpi_perf_data:     Stores ACPI perf information read from _PSS
 208 * @valid_pss_table:    Set to true for valid ACPI _PSS entries found
 209 * @epp_powersave:      Last saved HWP energy performance preference
 210 *                      (EPP) or energy performance bias (EPB), saved
 211 *                      when the policy is switched to performance
 212 * @epp_policy:         Last saved policy used to set EPP/EPB
 213 * @epp_default:        Power on default HWP energy performance
 214 *                      preference/bias
 215 * @epp_cached:         Cached HWP energy-performance preference value
 216 * @hwp_req_cached:     Cached value of the last HWP Request MSR
 217 * @hwp_cap_cached:     Cached value of the last HWP Capabilities MSR
 218 * @last_io_update:     Last time when IO wake flag was set
 219 * @sched_flags:        Store scheduler flags for possible cross CPU update
 220 * @hwp_boost_min:      Last HWP boosted min performance
 221 * @suspended:          Whether or not the driver has been suspended.
 222 *
 223 * This structure stores per CPU instance data for all CPUs.
 224 */
 225struct cpudata {
 226        int cpu;
 227
 228        unsigned int policy;
 229        struct update_util_data update_util;
 230        bool   update_util_set;
 231
 232        struct pstate_data pstate;
 233        struct vid_data vid;
 234
 235        u64     last_update;
 236        u64     last_sample_time;
 237        u64     aperf_mperf_shift;
 238        u64     prev_aperf;
 239        u64     prev_mperf;
 240        u64     prev_tsc;
 241        u64     prev_cummulative_iowait;
 242        struct sample sample;
 243        int32_t min_perf_ratio;
 244        int32_t max_perf_ratio;
 245#ifdef CONFIG_ACPI
 246        struct acpi_processor_performance acpi_perf_data;
 247        bool valid_pss_table;
 248#endif
 249        unsigned int iowait_boost;
 250        s16 epp_powersave;
 251        s16 epp_policy;
 252        s16 epp_default;
 253        s16 epp_cached;
 254        u64 hwp_req_cached;
 255        u64 hwp_cap_cached;
 256        u64 last_io_update;
 257        unsigned int sched_flags;
 258        u32 hwp_boost_min;
 259        bool suspended;
 260};
 261
 262static struct cpudata **all_cpu_data;
 263
 264/**
 265 * struct pstate_funcs - Per CPU model specific callbacks
 266 * @get_max:            Callback to get maximum non turbo effective P state
 267 * @get_max_physical:   Callback to get maximum non turbo physical P state
 268 * @get_min:            Callback to get minimum P state
 269 * @get_turbo:          Callback to get turbo P state
 270 * @get_scaling:        Callback to get frequency scaling factor
 271 * @get_cpu_scaling:    Get frequency scaling factor for a given cpu
 272 * @get_aperf_mperf_shift: Callback to get the APERF vs MPERF frequency difference
 273 * @get_val:            Callback to convert P state to actual MSR write value
 274 * @get_vid:            Callback to get VID data for Atom platforms
 275 *
 276 * Core and Atom CPU models have different ways to get P State limits. This
 277 * structure is used to store those callbacks.
 278 */
 279struct pstate_funcs {
 280        int (*get_max)(void);
 281        int (*get_max_physical)(void);
 282        int (*get_min)(void);
 283        int (*get_turbo)(void);
 284        int (*get_scaling)(void);
 285        int (*get_cpu_scaling)(int cpu);
 286        int (*get_aperf_mperf_shift)(void);
 287        u64 (*get_val)(struct cpudata*, int pstate);
 288        void (*get_vid)(struct cpudata *);
 289};
 290
 291static struct pstate_funcs pstate_funcs __read_mostly;
 292
 293static int hwp_active __read_mostly;
 294static int hwp_mode_bdw __read_mostly;
 295static bool per_cpu_limits __read_mostly;
 296static bool hwp_boost __read_mostly;
 297
 298static struct cpufreq_driver *intel_pstate_driver __read_mostly;
 299
 300#ifdef CONFIG_ACPI
 301static bool acpi_ppc;
 302#endif
 303
 304static struct global_params global;
 305
 306static DEFINE_MUTEX(intel_pstate_driver_lock);
 307static DEFINE_MUTEX(intel_pstate_limits_lock);
 308
 309#ifdef CONFIG_ACPI
 310
 311static bool intel_pstate_acpi_pm_profile_server(void)
 312{
 313        if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER ||
 314            acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER)
 315                return true;
 316
 317        return false;
 318}
 319
 320static bool intel_pstate_get_ppc_enable_status(void)
 321{
 322        if (intel_pstate_acpi_pm_profile_server())
 323                return true;
 324
 325        return acpi_ppc;
 326}
 327
 328#ifdef CONFIG_ACPI_CPPC_LIB
 329
 330/* The work item is needed to avoid CPU hotplug locking issues */
 331static void intel_pstste_sched_itmt_work_fn(struct work_struct *work)
 332{
 333        sched_set_itmt_support();
 334}
 335
 336static DECLARE_WORK(sched_itmt_work, intel_pstste_sched_itmt_work_fn);
 337
 338static void intel_pstate_set_itmt_prio(int cpu)
 339{
 340        struct cppc_perf_caps cppc_perf;
 341        static u32 max_highest_perf = 0, min_highest_perf = U32_MAX;
 342        int ret;
 343
 344        ret = cppc_get_perf_caps(cpu, &cppc_perf);
 345        if (ret)
 346                return;
 347
 348        /*
 349         * The priorities can be set regardless of whether or not
 350         * sched_set_itmt_support(true) has been called and it is valid to
 351         * update them at any time after it has been called.
 352         */
 353        sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);
 354
 355        if (max_highest_perf <= min_highest_perf) {
 356                if (cppc_perf.highest_perf > max_highest_perf)
 357                        max_highest_perf = cppc_perf.highest_perf;
 358
 359                if (cppc_perf.highest_perf < min_highest_perf)
 360                        min_highest_perf = cppc_perf.highest_perf;
 361
 362                if (max_highest_perf > min_highest_perf) {
 363                        /*
 364                         * This code can be run during CPU online under the
 365                         * CPU hotplug locks, so sched_set_itmt_support()
 366                         * cannot be called from here.  Queue up a work item
 367                         * to invoke it.
 368                         */
 369                        schedule_work(&sched_itmt_work);
 370                }
 371        }
 372}
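    /*
     * Illustrative sketch of the logic above, with hypothetical values: if
     * every CPU probed so far reports highest_perf == 42, max_highest_perf
     * and min_highest_perf both settle at 42 and ITMT support is not
     * enabled.  As soon as one CPU reports a different value, say 45,
     * max_highest_perf becomes greater than min_highest_perf, the asymmetry
     * is detected and sched_itmt_work is queued to call
     * sched_set_itmt_support().
     */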
 373
 374static int intel_pstate_get_cppc_guaranteed(int cpu)
 375{
 376        struct cppc_perf_caps cppc_perf;
 377        int ret;
 378
 379        ret = cppc_get_perf_caps(cpu, &cppc_perf);
 380        if (ret)
 381                return ret;
 382
 383        if (cppc_perf.guaranteed_perf)
 384                return cppc_perf.guaranteed_perf;
 385
 386        return cppc_perf.nominal_perf;
 387}
 388
 389static u32 intel_pstate_cppc_nominal(int cpu)
 390{
 391        u64 nominal_perf;
 392
 393        if (cppc_get_nominal_perf(cpu, &nominal_perf))
 394                return 0;
 395
 396        return nominal_perf;
 397}
 398#else /* CONFIG_ACPI_CPPC_LIB */
 399static inline void intel_pstate_set_itmt_prio(int cpu)
 400{
 401}
 402#endif /* CONFIG_ACPI_CPPC_LIB */
 403
 404static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 405{
 406        struct cpudata *cpu;
 407        int ret;
 408        int i;
 409
 410        if (hwp_active) {
 411                intel_pstate_set_itmt_prio(policy->cpu);
 412                return;
 413        }
 414
 415        if (!intel_pstate_get_ppc_enable_status())
 416                return;
 417
 418        cpu = all_cpu_data[policy->cpu];
 419
 420        ret = acpi_processor_register_performance(&cpu->acpi_perf_data,
 421                                                  policy->cpu);
 422        if (ret)
 423                return;
 424
 425        /*
 426         * Check if the control value in _PSS is for PERF_CTL MSR, which should
 427         * guarantee that the states returned by it map to the states in our
 428         * list directly.
 429         */
 430        if (cpu->acpi_perf_data.control_register.space_id !=
 431                                                ACPI_ADR_SPACE_FIXED_HARDWARE)
 432                goto err;
 433
 434        /*
 435         * If there is only one entry in _PSS, simply ignore _PSS and
 436         * continue as usual without taking it into account.
 437         */
 438        if (cpu->acpi_perf_data.state_count < 2)
 439                goto err;
 440
 441        pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu);
 442        for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
 443                pr_debug("     %cP%d: %u MHz, %u mW, 0x%x\n",
 444                         (i == cpu->acpi_perf_data.state ? '*' : ' '), i,
 445                         (u32) cpu->acpi_perf_data.states[i].core_frequency,
 446                         (u32) cpu->acpi_perf_data.states[i].power,
 447                         (u32) cpu->acpi_perf_data.states[i].control);
 448        }
 449
 450        /*
 451         * The _PSS table doesn't contain the whole turbo frequency range.
 452         * It only contains +1 MHz above the max non-turbo frequency, with
 453         * the control value corresponding to the max turbo ratio. But when
 454         * cpufreq set_policy is called, it will be called with this max
 455         * frequency, which causes reduced performance, because this driver
 456         * uses the real max turbo frequency as the max frequency. So
 457         * correct this frequency in the _PSS table to the real max turbo
 458         * frequency based on the turbo state. Also convert it to MHz,
 459         * as the _PSS frequencies are in MHz.
 460         */
 461        if (!global.turbo_disabled)
 462                cpu->acpi_perf_data.states[0].core_frequency =
 463                                        policy->cpuinfo.max_freq / 1000;
 464        cpu->valid_pss_table = true;
 465        pr_debug("_PPC limits will be enforced\n");
 466
 467        return;
 468
 469 err:
 470        cpu->valid_pss_table = false;
 471        acpi_processor_unregister_performance(policy->cpu);
 472}
 473
 474static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 475{
 476        struct cpudata *cpu;
 477
 478        cpu = all_cpu_data[policy->cpu];
 479        if (!cpu->valid_pss_table)
 480                return;
 481
 482        acpi_processor_unregister_performance(policy->cpu);
 483}
 484#else /* CONFIG_ACPI */
 485static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 486{
 487}
 488
 489static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 490{
 491}
 492
 493static inline bool intel_pstate_acpi_pm_profile_server(void)
 494{
 495        return false;
 496}
 497#endif /* CONFIG_ACPI */
 498
 499#ifndef CONFIG_ACPI_CPPC_LIB
 500static inline int intel_pstate_get_cppc_guaranteed(int cpu)
 501{
 502        return -ENOTSUPP;
 503}
 504#endif /* CONFIG_ACPI_CPPC_LIB */
 505
 506/**
 507 * intel_pstate_hybrid_hwp_adjust - Calibrate HWP performance levels.
 508 * @cpu: Target CPU.
 509 *
 510 * On hybrid processors, HWP may expose more performance levels than there are
 511 * P-states accessible through the PERF_CTL interface.  If that happens, the
 512 * scaling factor between HWP performance levels and CPU frequency will be less
 513 * than the scaling factor between P-state values and CPU frequency.
 514 *
 515 * In that case, adjust the CPU parameters used in computations accordingly.
 516 */
 517static void intel_pstate_hybrid_hwp_adjust(struct cpudata *cpu)
 518{
 519        int perf_ctl_max_phys = cpu->pstate.max_pstate_physical;
 520        int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling;
 521        int perf_ctl_turbo = pstate_funcs.get_turbo();
 522        int turbo_freq = perf_ctl_turbo * perf_ctl_scaling;
 523        int scaling = cpu->pstate.scaling;
 524
 525        pr_debug("CPU%d: perf_ctl_max_phys = %d\n", cpu->cpu, perf_ctl_max_phys);
 526        pr_debug("CPU%d: perf_ctl_max = %d\n", cpu->cpu, pstate_funcs.get_max());
 527        pr_debug("CPU%d: perf_ctl_turbo = %d\n", cpu->cpu, perf_ctl_turbo);
 528        pr_debug("CPU%d: perf_ctl_scaling = %d\n", cpu->cpu, perf_ctl_scaling);
 529        pr_debug("CPU%d: HWP_CAP guaranteed = %d\n", cpu->cpu, cpu->pstate.max_pstate);
 530        pr_debug("CPU%d: HWP_CAP highest = %d\n", cpu->cpu, cpu->pstate.turbo_pstate);
 531        pr_debug("CPU%d: HWP-to-frequency scaling factor: %d\n", cpu->cpu, scaling);
 532
 533        /*
 534         * If the product of the HWP performance scaling factor and the HWP_CAP
 535         * highest performance is greater than the maximum turbo frequency
 536         * corresponding to the pstate_funcs.get_turbo() return value, the
 537         * scaling factor is too high, so recompute it to make the HWP_CAP
 538         * highest performance correspond to the maximum turbo frequency.
 539         */
 540        if (turbo_freq < cpu->pstate.turbo_pstate * scaling) {
 541                cpu->pstate.turbo_freq = turbo_freq;
 542                scaling = DIV_ROUND_UP(turbo_freq, cpu->pstate.turbo_pstate);
 543                cpu->pstate.scaling = scaling;
 544
 545                pr_debug("CPU%d: refined HWP-to-frequency scaling factor: %d\n",
 546                         cpu->cpu, scaling);
 547        }
 548
 549        cpu->pstate.max_freq = rounddown(cpu->pstate.max_pstate * scaling,
 550                                         perf_ctl_scaling);
 551
 552        cpu->pstate.max_pstate_physical =
 553                        DIV_ROUND_UP(perf_ctl_max_phys * perf_ctl_scaling,
 554                                     scaling);
 555
 556        cpu->pstate.min_freq = cpu->pstate.min_pstate * perf_ctl_scaling;
 557        /*
 558         * Cast the min P-state value retrieved via pstate_funcs.get_min() to
 559         * the effective range of HWP performance levels.
 560         */
 561        cpu->pstate.min_pstate = DIV_ROUND_UP(cpu->pstate.min_freq, scaling);
 562}
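    /*
     * Worked example with hypothetical values (not taken from real
     * hardware): assume perf_ctl_scaling == 100000 kHz and
     * pstate_funcs.get_turbo() returns 30, so turbo_freq == 3000000 kHz.
     * If HWP_CAP reports a highest performance level of 40 with an initial
     * HWP scaling factor of 100000, then 40 * 100000 > 3000000 and the
     * scaling factor is refined to DIV_ROUND_UP(3000000, 40) == 75000.
     * A PERF_CTL min P-state of 8 (800000 kHz) is then mapped to an HWP
     * min level of DIV_ROUND_UP(800000, 75000) == 11.
     */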
 563
 564static inline void update_turbo_state(void)
 565{
 566        u64 misc_en;
 567        struct cpudata *cpu;
 568
 569        cpu = all_cpu_data[0];
 570        rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
 571        global.turbo_disabled =
 572                (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
 573                 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
 574}
 575
 576static int min_perf_pct_min(void)
 577{
 578        struct cpudata *cpu = all_cpu_data[0];
 579        int turbo_pstate = cpu->pstate.turbo_pstate;
 580
 581        return turbo_pstate ?
 582                (cpu->pstate.min_pstate * 100 / turbo_pstate) : 0;
 583}
 584
 585static s16 intel_pstate_get_epb(struct cpudata *cpu_data)
 586{
 587        u64 epb;
 588        int ret;
 589
 590        if (!boot_cpu_has(X86_FEATURE_EPB))
 591                return -ENXIO;
 592
 593        ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
 594        if (ret)
 595                return (s16)ret;
 596
 597        return (s16)(epb & 0x0f);
 598}
 599
 600static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
 601{
 602        s16 epp;
 603
 604        if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
 605                /*
 606                 * When hwp_req_data is 0, the caller didn't read
 607                 * MSR_HWP_REQUEST, so read it here to get the EPP.
 608                 */
 609                if (!hwp_req_data) {
 610                        epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST,
 611                                            &hwp_req_data);
 612                        if (epp)
 613                                return epp;
 614                }
 615                epp = (hwp_req_data >> 24) & 0xff;
 616        } else {
 617                /* When there is no EPP present, HWP uses EPB settings */
 618                epp = intel_pstate_get_epb(cpu_data);
 619        }
 620
 621        return epp;
 622}
 623
 624static int intel_pstate_set_epb(int cpu, s16 pref)
 625{
 626        u64 epb;
 627        int ret;
 628
 629        if (!boot_cpu_has(X86_FEATURE_EPB))
 630                return -ENXIO;
 631
 632        ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
 633        if (ret)
 634                return ret;
 635
 636        epb = (epb & ~0x0f) | pref;
 637        wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb);
 638
 639        return 0;
 640}
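    /*
     * Illustrative example (hypothetical register value): writing
     * pref == 0x0f (maximum power saving) to a CPU whose
     * MSR_IA32_ENERGY_PERF_BIAS currently reads 0x06 results in
     * (0x06 & ~0x0f) | 0x0f == 0x0f; only the low 4 EPB bits are replaced
     * and any higher bits in the MSR are preserved.
     */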
 641
 642/*
 643 * EPP/EPB display strings corresponding to EPP index in the
 644 * energy_perf_strings[]
 645 *      index           String
 646 *-------------------------------------
 647 *      0               default
 648 *      1               performance
 649 *      2               balance_performance
 650 *      3               balance_power
 651 *      4               power
 652 */
 653static const char * const energy_perf_strings[] = {
 654        "default",
 655        "performance",
 656        "balance_performance",
 657        "balance_power",
 658        "power",
 659        NULL
 660};
 661static const unsigned int epp_values[] = {
 662        HWP_EPP_PERFORMANCE,
 663        HWP_EPP_BALANCE_PERFORMANCE,
 664        HWP_EPP_BALANCE_POWERSAVE,
 665        HWP_EPP_POWERSAVE
 666};
 667
 668static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw_epp)
 669{
 670        s16 epp;
 671        int index = -EINVAL;
 672
 673        *raw_epp = 0;
 674        epp = intel_pstate_get_epp(cpu_data, 0);
 675        if (epp < 0)
 676                return epp;
 677
 678        if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
 679                if (epp == HWP_EPP_PERFORMANCE)
 680                        return 1;
 681                if (epp == HWP_EPP_BALANCE_PERFORMANCE)
 682                        return 2;
 683                if (epp == HWP_EPP_BALANCE_POWERSAVE)
 684                        return 3;
 685                if (epp == HWP_EPP_POWERSAVE)
 686                        return 4;
 687                *raw_epp = epp;
 688                return 0;
 689        } else if (boot_cpu_has(X86_FEATURE_EPB)) {
 690                /*
 691                 * Range:
 692                 *      0x00-0x03       :       Performance
 693                 *      0x04-0x07       :       Balance performance
 694                 *      0x08-0x0B       :       Balance power
 695                 *      0x0C-0x0F       :       Power
 696                 * The EPB is a 4-bit value, but our ranges restrict the
 697                 * values which can be set, so effectively only the top
 698                 * two bits are used here.
 699                 */
 700                index = (epp >> 2) + 1;
 701        }
 702
 703        return index;
 704}
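    /*
     * Illustrative example for the EPB fallback above (hypothetical value):
     * an EPB of 0x06 falls in the "balance performance" range, and
     * (0x06 >> 2) + 1 == 2, which indexes "balance_performance" in
     * energy_perf_strings[].
     */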
 705
 706static int intel_pstate_set_epp(struct cpudata *cpu, u32 epp)
 707{
 708        int ret;
 709
 710        /*
 711         * Use the cached HWP Request MSR value, because in the active mode the
 712         * register itself may be updated by intel_pstate_hwp_boost_up() or
 713         * intel_pstate_hwp_boost_down() at any time.
 714         */
 715        u64 value = READ_ONCE(cpu->hwp_req_cached);
 716
 717        value &= ~GENMASK_ULL(31, 24);
 718        value |= (u64)epp << 24;
 719        /*
 720         * The only other updater of hwp_req_cached in the active mode,
 721         * intel_pstate_hwp_set(), is called under the same lock as this
 722         * function, so it cannot run in parallel with the update below.
 723         */
 724        WRITE_ONCE(cpu->hwp_req_cached, value);
 725        ret = wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
 726        if (!ret)
 727                cpu->epp_cached = epp;
 728
 729        return ret;
 730}
 731
 732static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data,
 733                                              int pref_index, bool use_raw,
 734                                              u32 raw_epp)
 735{
 736        int epp = -EINVAL;
 737        int ret;
 738
 739        if (!pref_index)
 740                epp = cpu_data->epp_default;
 741
 742        if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
 743                if (use_raw)
 744                        epp = raw_epp;
 745                else if (epp == -EINVAL)
 746                        epp = epp_values[pref_index - 1];
 747
 748                /*
 749                 * To avoid confusion, refuse to set EPP to any values different
 750                 * from 0 (performance) if the current policy is "performance",
 751                 * because those values would be overridden.
 752                 */
 753                if (epp > 0 && cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
 754                        return -EBUSY;
 755
 756                ret = intel_pstate_set_epp(cpu_data, epp);
 757        } else {
 758                if (epp == -EINVAL)
 759                        epp = (pref_index - 1) << 2;
 760                ret = intel_pstate_set_epb(cpu_data->cpu, epp);
 761        }
 762
 763        return ret;
 764}
 765
 766static ssize_t show_energy_performance_available_preferences(
 767                                struct cpufreq_policy *policy, char *buf)
 768{
 769        int i = 0;
 770        int ret = 0;
 771
 772        while (energy_perf_strings[i] != NULL)
 773                ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
 774
 775        ret += sprintf(&buf[ret], "\n");
 776
 777        return ret;
 778}
 779
 780cpufreq_freq_attr_ro(energy_performance_available_preferences);
 781
 782static struct cpufreq_driver intel_pstate;
 783
 784static ssize_t store_energy_performance_preference(
 785                struct cpufreq_policy *policy, const char *buf, size_t count)
 786{
 787        struct cpudata *cpu = all_cpu_data[policy->cpu];
 788        char str_preference[21];
 789        bool raw = false;
 790        ssize_t ret;
 791        u32 epp = 0;
 792
 793        ret = sscanf(buf, "%20s", str_preference);
 794        if (ret != 1)
 795                return -EINVAL;
 796
 797        ret = match_string(energy_perf_strings, -1, str_preference);
 798        if (ret < 0) {
 799                if (!boot_cpu_has(X86_FEATURE_HWP_EPP))
 800                        return ret;
 801
 802                ret = kstrtouint(buf, 10, &epp);
 803                if (ret)
 804                        return ret;
 805
 806                if (epp > 255)
 807                        return -EINVAL;
 808
 809                raw = true;
 810        }
 811
 812        /*
 813         * This function runs with the policy R/W semaphore held, which
 814         * guarantees that the driver pointer will not change while it is
 815         * running.
 816         */
 817        if (!intel_pstate_driver)
 818                return -EAGAIN;
 819
 820        mutex_lock(&intel_pstate_limits_lock);
 821
 822        if (intel_pstate_driver == &intel_pstate) {
 823                ret = intel_pstate_set_energy_pref_index(cpu, ret, raw, epp);
 824        } else {
 825                /*
 826                 * In the passive mode the governor needs to be stopped on the
 827                 * target CPU before the EPP update and restarted after it,
 828                 * which is super-heavy-weight, so make sure it is worth doing
 829                 * upfront.
 830                 */
 831                if (!raw)
 832                        epp = ret ? epp_values[ret - 1] : cpu->epp_default;
 833
 834                if (cpu->epp_cached != epp) {
 835                        int err;
 836
 837                        cpufreq_stop_governor(policy);
 838                        ret = intel_pstate_set_epp(cpu, epp);
 839                        err = cpufreq_start_governor(policy);
 840                        if (!ret)
 841                                ret = err;
 842                }
 843        }
 844
 845        mutex_unlock(&intel_pstate_limits_lock);
 846
 847        return ret ?: count;
 848}
 849
 850static ssize_t show_energy_performance_preference(
 851                                struct cpufreq_policy *policy, char *buf)
 852{
 853        struct cpudata *cpu_data = all_cpu_data[policy->cpu];
 854        int preference, raw_epp;
 855
 856        preference = intel_pstate_get_energy_pref_index(cpu_data, &raw_epp);
 857        if (preference < 0)
 858                return preference;
 859
 860        if (raw_epp)
 861                return  sprintf(buf, "%d\n", raw_epp);
 862        else
 863                return  sprintf(buf, "%s\n", energy_perf_strings[preference]);
 864}
 865
 866cpufreq_freq_attr_rw(energy_performance_preference);
 867
 868static ssize_t show_base_frequency(struct cpufreq_policy *policy, char *buf)
 869{
 870        struct cpudata *cpu = all_cpu_data[policy->cpu];
 871        int ratio, freq;
 872
 873        ratio = intel_pstate_get_cppc_guaranteed(policy->cpu);
 874        if (ratio <= 0) {
 875                u64 cap;
 876
 877                rdmsrl_on_cpu(policy->cpu, MSR_HWP_CAPABILITIES, &cap);
 878                ratio = HWP_GUARANTEED_PERF(cap);
 879        }
 880
 881        freq = ratio * cpu->pstate.scaling;
 882        if (cpu->pstate.scaling != cpu->pstate.perf_ctl_scaling)
 883                freq = rounddown(freq, cpu->pstate.perf_ctl_scaling);
 884
 885        return sprintf(buf, "%d\n", freq);
 886}
 887
 888cpufreq_freq_attr_ro(base_frequency);
 889
 890static struct freq_attr *hwp_cpufreq_attrs[] = {
 891        &energy_performance_preference,
 892        &energy_performance_available_preferences,
 893        &base_frequency,
 894        NULL,
 895};
 896
 897static void __intel_pstate_get_hwp_cap(struct cpudata *cpu)
 898{
 899        u64 cap;
 900
 901        rdmsrl_on_cpu(cpu->cpu, MSR_HWP_CAPABILITIES, &cap);
 902        WRITE_ONCE(cpu->hwp_cap_cached, cap);
 903        cpu->pstate.max_pstate = HWP_GUARANTEED_PERF(cap);
 904        cpu->pstate.turbo_pstate = HWP_HIGHEST_PERF(cap);
 905}
 906
 907static void intel_pstate_get_hwp_cap(struct cpudata *cpu)
 908{
 909        int scaling = cpu->pstate.scaling;
 910
 911        __intel_pstate_get_hwp_cap(cpu);
 912
 913        cpu->pstate.max_freq = cpu->pstate.max_pstate * scaling;
 914        cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * scaling;
 915        if (scaling != cpu->pstate.perf_ctl_scaling) {
 916                int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling;
 917
 918                cpu->pstate.max_freq = rounddown(cpu->pstate.max_freq,
 919                                                 perf_ctl_scaling);
 920                cpu->pstate.turbo_freq = rounddown(cpu->pstate.turbo_freq,
 921                                                   perf_ctl_scaling);
 922        }
 923}
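    /*
     * Illustrative example (hypothetical values): with an HWP scaling factor
     * of 75000 kHz and perf_ctl_scaling == 100000 kHz, a guaranteed level of
     * 36 yields 2700000 kHz, already a multiple of perf_ctl_scaling, while a
     * highest level of 41 yields 3075000 kHz, which rounddown() reduces to
     * 3000000 kHz.
     */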
 924
 925static void intel_pstate_hwp_set(unsigned int cpu)
 926{
 927        struct cpudata *cpu_data = all_cpu_data[cpu];
 928        int max, min;
 929        u64 value;
 930        s16 epp;
 931
 932        max = cpu_data->max_perf_ratio;
 933        min = cpu_data->min_perf_ratio;
 934
 935        if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
 936                min = max;
 937
 938        rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
 939
 940        value &= ~HWP_MIN_PERF(~0L);
 941        value |= HWP_MIN_PERF(min);
 942
 943        value &= ~HWP_MAX_PERF(~0L);
 944        value |= HWP_MAX_PERF(max);
 945
 946        if (cpu_data->epp_policy == cpu_data->policy)
 947                goto skip_epp;
 948
 949        cpu_data->epp_policy = cpu_data->policy;
 950
 951        if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
 952                epp = intel_pstate_get_epp(cpu_data, value);
 953                cpu_data->epp_powersave = epp;
 954                /* If the EPP read failed, don't try to write it */
 955                if (epp < 0)
 956                        goto skip_epp;
 957
 958                epp = 0;
 959        } else {
 960                /* Skip setting EPP when the saved value is invalid. */
 961                if (cpu_data->epp_powersave < 0)
 962                        goto skip_epp;
 963
 964                /*
 965                 * No need to restore EPP when it is not zero, which
 966                 * means one of:
 967                 *  - the policy has not changed,
 968                 *  - the user has changed it manually,
 969                 *  - there was an error reading the EPB.
 970                 */
 971                epp = intel_pstate_get_epp(cpu_data, value);
 972                if (epp)
 973                        goto skip_epp;
 974
 975                epp = cpu_data->epp_powersave;
 976        }
 977        if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
 978                value &= ~GENMASK_ULL(31, 24);
 979                value |= (u64)epp << 24;
 980        } else {
 981                intel_pstate_set_epb(cpu, epp);
 982        }
 983skip_epp:
 984        WRITE_ONCE(cpu_data->hwp_req_cached, value);
 985        wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
 986}
 987
 988static void intel_pstate_hwp_offline(struct cpudata *cpu)
 989{
 990        u64 value = READ_ONCE(cpu->hwp_req_cached);
 991        int min_perf;
 992
 993        if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
 994                /*
 995                 * In case the EPP has been set to "performance" by the
 996                 * active mode "performance" scaling algorithm, replace that
 997                 * temporary value with the cached EPP one.
 998                 */
 999                value &= ~GENMASK_ULL(31, 24);
1000                value |= HWP_ENERGY_PERF_PREFERENCE(cpu->epp_cached);
1001                WRITE_ONCE(cpu->hwp_req_cached, value);
1002        }
1003
1004        value &= ~GENMASK_ULL(31, 0);
1005        min_perf = HWP_LOWEST_PERF(READ_ONCE(cpu->hwp_cap_cached));
1006
1007        /* Set hwp_max = hwp_min */
1008        value |= HWP_MAX_PERF(min_perf);
1009        value |= HWP_MIN_PERF(min_perf);
1010
1011        /* Set EPP to min */
1012        if (boot_cpu_has(X86_FEATURE_HWP_EPP))
1013                value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE);
1014
1015        wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
1016}
1017
1018#define POWER_CTL_EE_ENABLE     1
1019#define POWER_CTL_EE_DISABLE    2
1020
1021static int power_ctl_ee_state;
1022
1023static void set_power_ctl_ee_state(bool input)
1024{
1025        u64 power_ctl;
1026
1027        mutex_lock(&intel_pstate_driver_lock);
1028        rdmsrl(MSR_IA32_POWER_CTL, power_ctl);
1029        if (input) {
1030                power_ctl &= ~BIT(MSR_IA32_POWER_CTL_BIT_EE);
1031                power_ctl_ee_state = POWER_CTL_EE_ENABLE;
1032        } else {
1033                power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE);
1034                power_ctl_ee_state = POWER_CTL_EE_DISABLE;
1035        }
1036        wrmsrl(MSR_IA32_POWER_CTL, power_ctl);
1037        mutex_unlock(&intel_pstate_driver_lock);
1038}
1039
1040static void intel_pstate_hwp_enable(struct cpudata *cpudata);
1041
1042static void intel_pstate_hwp_reenable(struct cpudata *cpu)
1043{
1044        intel_pstate_hwp_enable(cpu);
1045        wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, READ_ONCE(cpu->hwp_req_cached));
1046}
1047
1048static int intel_pstate_suspend(struct cpufreq_policy *policy)
1049{
1050        struct cpudata *cpu = all_cpu_data[policy->cpu];
1051
1052        pr_debug("CPU %d suspending\n", cpu->cpu);
1053
1054        cpu->suspended = true;
1055
1056        return 0;
1057}
1058
1059static int intel_pstate_resume(struct cpufreq_policy *policy)
1060{
1061        struct cpudata *cpu = all_cpu_data[policy->cpu];
1062
1063        pr_debug("CPU %d resuming\n", cpu->cpu);
1064
1065        /* Only restore if the system default is changed */
1066        if (power_ctl_ee_state == POWER_CTL_EE_ENABLE)
1067                set_power_ctl_ee_state(true);
1068        else if (power_ctl_ee_state == POWER_CTL_EE_DISABLE)
1069                set_power_ctl_ee_state(false);
1070
1071        if (cpu->suspended && hwp_active) {
1072                mutex_lock(&intel_pstate_limits_lock);
1073
1074                /* Re-enable HWP, because "online" has not done that. */
1075                intel_pstate_hwp_reenable(cpu);
1076
1077                mutex_unlock(&intel_pstate_limits_lock);
1078        }
1079
1080        cpu->suspended = false;
1081
1082        return 0;
1083}
1084
1085static void intel_pstate_update_policies(void)
1086{
1087        int cpu;
1088
1089        for_each_possible_cpu(cpu)
1090                cpufreq_update_policy(cpu);
1091}
1092
1093static void intel_pstate_update_max_freq(unsigned int cpu)
1094{
1095        struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
1096        struct cpudata *cpudata;
1097
1098        if (!policy)
1099                return;
1100
1101        cpudata = all_cpu_data[cpu];
1102        policy->cpuinfo.max_freq = global.turbo_disabled_mf ?
1103                        cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
1104
1105        refresh_frequency_limits(policy);
1106
1107        cpufreq_cpu_release(policy);
1108}
1109
1110static void intel_pstate_update_limits(unsigned int cpu)
1111{
1112        mutex_lock(&intel_pstate_driver_lock);
1113
1114        update_turbo_state();
1115        /*
1116         * If turbo has been turned on or off globally, policy limits for
1117         * all CPUs need to be updated to reflect that.
1118         */
1119        if (global.turbo_disabled_mf != global.turbo_disabled) {
1120                global.turbo_disabled_mf = global.turbo_disabled;
1121                arch_set_max_freq_ratio(global.turbo_disabled);
1122                for_each_possible_cpu(cpu)
1123                        intel_pstate_update_max_freq(cpu);
1124        } else {
1125                cpufreq_update_policy(cpu);
1126        }
1127
1128        mutex_unlock(&intel_pstate_driver_lock);
1129}
1130
1131/************************** sysfs begin ************************/
1132#define show_one(file_name, object)                                     \
1133        static ssize_t show_##file_name                                 \
1134        (struct kobject *kobj, struct kobj_attribute *attr, char *buf)  \
1135        {                                                               \
1136                return sprintf(buf, "%u\n", global.object);             \
1137        }
1138
1139static ssize_t intel_pstate_show_status(char *buf);
1140static int intel_pstate_update_status(const char *buf, size_t size);
1141
1142static ssize_t show_status(struct kobject *kobj,
1143                           struct kobj_attribute *attr, char *buf)
1144{
1145        ssize_t ret;
1146
1147        mutex_lock(&intel_pstate_driver_lock);
1148        ret = intel_pstate_show_status(buf);
1149        mutex_unlock(&intel_pstate_driver_lock);
1150
1151        return ret;
1152}
1153
1154static ssize_t store_status(struct kobject *a, struct kobj_attribute *b,
1155                            const char *buf, size_t count)
1156{
1157        char *p = memchr(buf, '\n', count);
1158        int ret;
1159
1160        mutex_lock(&intel_pstate_driver_lock);
1161        ret = intel_pstate_update_status(buf, p ? p - buf : count);
1162        mutex_unlock(&intel_pstate_driver_lock);
1163
1164        return ret < 0 ? ret : count;
1165}
1166
1167static ssize_t show_turbo_pct(struct kobject *kobj,
1168                                struct kobj_attribute *attr, char *buf)
1169{
1170        struct cpudata *cpu;
1171        int total, no_turbo, turbo_pct;
1172        uint32_t turbo_fp;
1173
1174        mutex_lock(&intel_pstate_driver_lock);
1175
1176        if (!intel_pstate_driver) {
1177                mutex_unlock(&intel_pstate_driver_lock);
1178                return -EAGAIN;
1179        }
1180
1181        cpu = all_cpu_data[0];
1182
1183        total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
1184        no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
1185        turbo_fp = div_fp(no_turbo, total);
1186        turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100)));
1187
1188        mutex_unlock(&intel_pstate_driver_lock);
1189
1190        return sprintf(buf, "%u\n", turbo_pct);
1191}
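    /*
     * Worked example with hypothetical P-state limits: min_pstate == 8,
     * max_pstate == 24 and turbo_pstate == 39 give total == 32 and
     * no_turbo == 17, so turbo_fp encodes 17/32 and turbo_pct is reported
     * as 100 - 53 == 47, i.e. roughly 47% of the available P-states are
     * turbo P-states.
     */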
1192
1193static ssize_t show_num_pstates(struct kobject *kobj,
1194                                struct kobj_attribute *attr, char *buf)
1195{
1196        struct cpudata *cpu;
1197        int total;
1198
1199        mutex_lock(&intel_pstate_driver_lock);
1200
1201        if (!intel_pstate_driver) {
1202                mutex_unlock(&intel_pstate_driver_lock);
1203                return -EAGAIN;
1204        }
1205
1206        cpu = all_cpu_data[0];
1207        total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
1208
1209        mutex_unlock(&intel_pstate_driver_lock);
1210
1211        return sprintf(buf, "%u\n", total);
1212}
1213
1214static ssize_t show_no_turbo(struct kobject *kobj,
1215                             struct kobj_attribute *attr, char *buf)
1216{
1217        ssize_t ret;
1218
1219        mutex_lock(&intel_pstate_driver_lock);
1220
1221        if (!intel_pstate_driver) {
1222                mutex_unlock(&intel_pstate_driver_lock);
1223                return -EAGAIN;
1224        }
1225
1226        update_turbo_state();
1227        if (global.turbo_disabled)
1228                ret = sprintf(buf, "%u\n", global.turbo_disabled);
1229        else
1230                ret = sprintf(buf, "%u\n", global.no_turbo);
1231
1232        mutex_unlock(&intel_pstate_driver_lock);
1233
1234        return ret;
1235}
1236
1237static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b,
1238                              const char *buf, size_t count)
1239{
1240        unsigned int input;
1241        int ret;
1242
1243        ret = sscanf(buf, "%u", &input);
1244        if (ret != 1)
1245                return -EINVAL;
1246
1247        mutex_lock(&intel_pstate_driver_lock);
1248
1249        if (!intel_pstate_driver) {
1250                mutex_unlock(&intel_pstate_driver_lock);
1251                return -EAGAIN;
1252        }
1253
1254        mutex_lock(&intel_pstate_limits_lock);
1255
1256        update_turbo_state();
1257        if (global.turbo_disabled) {
1258                pr_notice_once("Turbo disabled by BIOS or unavailable on processor\n");
1259                mutex_unlock(&intel_pstate_limits_lock);
1260                mutex_unlock(&intel_pstate_driver_lock);
1261                return -EPERM;
1262        }
1263
1264        global.no_turbo = clamp_t(int, input, 0, 1);
1265
1266        if (global.no_turbo) {
1267                struct cpudata *cpu = all_cpu_data[0];
1268                int pct = cpu->pstate.max_pstate * 100 / cpu->pstate.turbo_pstate;
1269
1270                /* Squash the global minimum into the permitted range. */
1271                if (global.min_perf_pct > pct)
1272                        global.min_perf_pct = pct;
1273        }
1274
1275        mutex_unlock(&intel_pstate_limits_lock);
1276
1277        intel_pstate_update_policies();
1278
1279        mutex_unlock(&intel_pstate_driver_lock);
1280
1281        return count;
1282}
1283
1284static void update_qos_request(enum freq_qos_req_type type)
1285{
1286        struct freq_qos_request *req;
1287        struct cpufreq_policy *policy;
1288        int i;
1289
1290        for_each_possible_cpu(i) {
1291                struct cpudata *cpu = all_cpu_data[i];
1292                unsigned int freq, perf_pct;
1293
1294                policy = cpufreq_cpu_get(i);
1295                if (!policy)
1296                        continue;
1297
1298                req = policy->driver_data;
1299                cpufreq_cpu_put(policy);
1300
1301                if (!req)
1302                        continue;
1303
1304                if (hwp_active)
1305                        intel_pstate_get_hwp_cap(cpu);
1306
1307                if (type == FREQ_QOS_MIN) {
1308                        perf_pct = global.min_perf_pct;
1309                } else {
1310                        req++;
1311                        perf_pct = global.max_perf_pct;
1312                }
1313
1314                freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * perf_pct, 100);
1315
1316                if (freq_qos_update_request(req, freq) < 0)
1317                        pr_warn("Failed to update freq constraint: CPU%d\n", i);
1318        }
1319}
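    /*
     * Illustrative example (hypothetical values): with a turbo frequency of
     * 3600000 kHz and global.max_perf_pct == 50, the FREQ_QOS_MAX request is
     * updated to DIV_ROUND_UP(3600000 * 50, 100) == 1800000 kHz.
     */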
1320
1321static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b,
1322                                  const char *buf, size_t count)
1323{
1324        unsigned int input;
1325        int ret;
1326
1327        ret = sscanf(buf, "%u", &input);
1328        if (ret != 1)
1329                return -EINVAL;
1330
1331        mutex_lock(&intel_pstate_driver_lock);
1332
1333        if (!intel_pstate_driver) {
1334                mutex_unlock(&intel_pstate_driver_lock);
1335                return -EAGAIN;
1336        }
1337
1338        mutex_lock(&intel_pstate_limits_lock);
1339
1340        global.max_perf_pct = clamp_t(int, input, global.min_perf_pct, 100);
1341
1342        mutex_unlock(&intel_pstate_limits_lock);
1343
1344        if (intel_pstate_driver == &intel_pstate)
1345                intel_pstate_update_policies();
1346        else
1347                update_qos_request(FREQ_QOS_MAX);
1348
1349        mutex_unlock(&intel_pstate_driver_lock);
1350
1351        return count;
1352}
1353
1354static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b,
1355                                  const char *buf, size_t count)
1356{
1357        unsigned int input;
1358        int ret;
1359
1360        ret = sscanf(buf, "%u", &input);
1361        if (ret != 1)
1362                return -EINVAL;
1363
1364        mutex_lock(&intel_pstate_driver_lock);
1365
1366        if (!intel_pstate_driver) {
1367                mutex_unlock(&intel_pstate_driver_lock);
1368                return -EAGAIN;
1369        }
1370
1371        mutex_lock(&intel_pstate_limits_lock);
1372
1373        global.min_perf_pct = clamp_t(int, input,
1374                                      min_perf_pct_min(), global.max_perf_pct);
1375
1376        mutex_unlock(&intel_pstate_limits_lock);
1377
1378        if (intel_pstate_driver == &intel_pstate)
1379                intel_pstate_update_policies();
1380        else
1381                update_qos_request(FREQ_QOS_MIN);
1382
1383        mutex_unlock(&intel_pstate_driver_lock);
1384
1385        return count;
1386}
1387
1388static ssize_t show_hwp_dynamic_boost(struct kobject *kobj,
1389                                struct kobj_attribute *attr, char *buf)
1390{
1391        return sprintf(buf, "%u\n", hwp_boost);
1392}
1393
1394static ssize_t store_hwp_dynamic_boost(struct kobject *a,
1395                                       struct kobj_attribute *b,
1396                                       const char *buf, size_t count)
1397{
1398        unsigned int input;
1399        int ret;
1400
1401        ret = kstrtouint(buf, 10, &input);
1402        if (ret)
1403                return ret;
1404
1405        mutex_lock(&intel_pstate_driver_lock);
1406        hwp_boost = !!input;
1407        intel_pstate_update_policies();
1408        mutex_unlock(&intel_pstate_driver_lock);
1409
1410        return count;
1411}
1412
1413static ssize_t show_energy_efficiency(struct kobject *kobj, struct kobj_attribute *attr,
1414                                      char *buf)
1415{
1416        u64 power_ctl;
1417        int enable;
1418
1419        rdmsrl(MSR_IA32_POWER_CTL, power_ctl);
1420        enable = !!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE));
1421        return sprintf(buf, "%d\n", !enable);
1422}
1423
1424static ssize_t store_energy_efficiency(struct kobject *a, struct kobj_attribute *b,
1425                                       const char *buf, size_t count)
1426{
1427        bool input;
1428        int ret;
1429
1430        ret = kstrtobool(buf, &input);
1431        if (ret)
1432                return ret;
1433
1434        set_power_ctl_ee_state(input);
1435
1436        return count;
1437}
1438
1439show_one(max_perf_pct, max_perf_pct);
1440show_one(min_perf_pct, min_perf_pct);
1441
1442define_one_global_rw(status);
1443define_one_global_rw(no_turbo);
1444define_one_global_rw(max_perf_pct);
1445define_one_global_rw(min_perf_pct);
1446define_one_global_ro(turbo_pct);
1447define_one_global_ro(num_pstates);
1448define_one_global_rw(hwp_dynamic_boost);
1449define_one_global_rw(energy_efficiency);
1450
1451static struct attribute *intel_pstate_attributes[] = {
1452        &status.attr,
1453        &no_turbo.attr,
1454        NULL
1455};
1456
1457static const struct attribute_group intel_pstate_attr_group = {
1458        .attrs = intel_pstate_attributes,
1459};
1460
1461static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[];
1462
1463static struct kobject *intel_pstate_kobject;
1464
1465static void __init intel_pstate_sysfs_expose_params(void)
1466{
1467        int rc;
1468
1469        intel_pstate_kobject = kobject_create_and_add("intel_pstate",
1470                                                &cpu_subsys.dev_root->kobj);
1471        if (WARN_ON(!intel_pstate_kobject))
1472                return;
1473
1474        rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group);
1475        if (WARN_ON(rc))
1476                return;
1477
1478        if (!boot_cpu_has(X86_FEATURE_HYBRID_CPU)) {
1479                rc = sysfs_create_file(intel_pstate_kobject, &turbo_pct.attr);
1480                WARN_ON(rc);
1481
1482                rc = sysfs_create_file(intel_pstate_kobject, &num_pstates.attr);
1483                WARN_ON(rc);
1484        }
1485
1486        /*
1487         * If per-CPU limits are enforced, there are no global limits, so
1488         * return without creating the max/min_perf_pct attributes.
1489         */
1490        if (per_cpu_limits)
1491                return;
1492
1493        rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr);
1494        WARN_ON(rc);
1495
1496        rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr);
1497        WARN_ON(rc);
1498
1499        if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) {
1500                rc = sysfs_create_file(intel_pstate_kobject, &energy_efficiency.attr);
1501                WARN_ON(rc);
1502        }
1503}
1504
1505static void __init intel_pstate_sysfs_remove(void)
1506{
1507        if (!intel_pstate_kobject)
1508                return;
1509
1510        sysfs_remove_group(intel_pstate_kobject, &intel_pstate_attr_group);
1511
1512        if (!boot_cpu_has(X86_FEATURE_HYBRID_CPU)) {
1513                sysfs_remove_file(intel_pstate_kobject, &num_pstates.attr);
1514                sysfs_remove_file(intel_pstate_kobject, &turbo_pct.attr);
1515        }
1516
1517        if (!per_cpu_limits) {
1518                sysfs_remove_file(intel_pstate_kobject, &max_perf_pct.attr);
1519                sysfs_remove_file(intel_pstate_kobject, &min_perf_pct.attr);
1520
1521                if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids))
1522                        sysfs_remove_file(intel_pstate_kobject, &energy_efficiency.attr);
1523        }
1524
1525        kobject_put(intel_pstate_kobject);
1526}
1527
1528static void intel_pstate_sysfs_expose_hwp_dynamic_boost(void)
1529{
1530        int rc;
1531
1532        if (!hwp_active)
1533                return;
1534
1535        rc = sysfs_create_file(intel_pstate_kobject, &hwp_dynamic_boost.attr);
1536        WARN_ON_ONCE(rc);
1537}
1538
1539static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void)
1540{
1541        if (!hwp_active)
1542                return;
1543
1544        sysfs_remove_file(intel_pstate_kobject, &hwp_dynamic_boost.attr);
1545}
1546
1547/************************** sysfs end ************************/
1548
1549static void intel_pstate_hwp_enable(struct cpudata *cpudata)
1550{
1551        /* First disable the HWP notification interrupt, as we don't process it */
1552        if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
1553                wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
1554
1555        wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
1556        if (cpudata->epp_default == -EINVAL)
1557                cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
1558}
1559
1560static int atom_get_min_pstate(void)
1561{
1562        u64 value;
1563
1564        rdmsrl(MSR_ATOM_CORE_RATIOS, value);
1565        return (value >> 8) & 0x7F;
1566}
1567
1568static int atom_get_max_pstate(void)
1569{
1570        u64 value;
1571
1572        rdmsrl(MSR_ATOM_CORE_RATIOS, value);
1573        return (value >> 16) & 0x7F;
1574}
1575
1576static int atom_get_turbo_pstate(void)
1577{
1578        u64 value;
1579
1580        rdmsrl(MSR_ATOM_CORE_TURBO_RATIOS, value);
1581        return value & 0x7F;
1582}
1583
1584static u64 atom_get_val(struct cpudata *cpudata, int pstate)
1585{
1586        u64 val;
1587        int32_t vid_fp;
1588        u32 vid;
1589
1590        val = (u64)pstate << 8;
1591        if (global.no_turbo && !global.turbo_disabled)
1592                val |= (u64)1 << 32;
1593
1594        vid_fp = cpudata->vid.min + mul_fp(
1595                int_tofp(pstate - cpudata->pstate.min_pstate),
1596                cpudata->vid.ratio);
1597
1598        vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max);
1599        vid = ceiling_fp(vid_fp);
1600
1601        if (pstate > cpudata->pstate.max_pstate)
1602                vid = cpudata->vid.turbo;
1603
1604        return val | vid;
1605}
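
/*
 * Illustrative example (not from the original source, assumed numbers):
 * atom_get_val() interpolates the voltage ID linearly between vid.min and
 * vid.max over the [min_pstate, max_pstate] range. With min_pstate = 8,
 * max_pstate = 16, vid.min = int_tofp(20) and vid.max = int_tofp(36),
 * vid.ratio works out to int_tofp(2), so pstate 12 yields
 * vid = 20 + (12 - 8) * 2 = 28; any pstate above max_pstate uses
 * vid.turbo instead.
 */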
1606
1607static int silvermont_get_scaling(void)
1608{
1609        u64 value;
1610        int i;
1611        /* Defined in Table 35-6 from SDM (Sept 2015) */
1612        static int silvermont_freq_table[] = {
1613                83300, 100000, 133300, 116700, 80000};
1614
1615        rdmsrl(MSR_FSB_FREQ, value);
1616        i = value & 0x7;
1617        WARN_ON(i > 4);
1618
1619        return silvermont_freq_table[i];
1620}
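
/*
 * For illustration (not part of the original source): the value returned
 * here is the bus clock in kHz, so a P-state ratio converts to a frequency
 * as ratio * scaling. On a Silvermont part where MSR_FSB_FREQ reports
 * index 1 (100000 kHz), a ratio of 20 corresponds to
 * 20 * 100000 kHz = 2.0 GHz.
 */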
1621
1622static int airmont_get_scaling(void)
1623{
1624        u64 value;
1625        int i;
1626        /* Defined in Table 35-10 from SDM (Sept 2015) */
1627        static int airmont_freq_table[] = {
1628                83300, 100000, 133300, 116700, 80000,
1629                93300, 90000, 88900, 87500};
1630
1631        rdmsrl(MSR_FSB_FREQ, value);
1632        i = value & 0xF;
1633        WARN_ON(i > 8);
1634
1635        return airmont_freq_table[i];
1636}
1637
1638static void atom_get_vid(struct cpudata *cpudata)
1639{
1640        u64 value;
1641
1642        rdmsrl(MSR_ATOM_CORE_VIDS, value);
1643        cpudata->vid.min = int_tofp((value >> 8) & 0x7f);
1644        cpudata->vid.max = int_tofp((value >> 16) & 0x7f);
1645        cpudata->vid.ratio = div_fp(
1646                cpudata->vid.max - cpudata->vid.min,
1647                int_tofp(cpudata->pstate.max_pstate -
1648                        cpudata->pstate.min_pstate));
1649
1650        rdmsrl(MSR_ATOM_CORE_TURBO_VIDS, value);
1651        cpudata->vid.turbo = value & 0x7f;
1652}
1653
1654static int core_get_min_pstate(void)
1655{
1656        u64 value;
1657
1658        rdmsrl(MSR_PLATFORM_INFO, value);
1659        return (value >> 40) & 0xFF;
1660}
1661
1662static int core_get_max_pstate_physical(void)
1663{
1664        u64 value;
1665
1666        rdmsrl(MSR_PLATFORM_INFO, value);
1667        return (value >> 8) & 0xFF;
1668}
1669
1670static int core_get_tdp_ratio(u64 plat_info)
1671{
1672        /* Check how many TDP levels are present */
1673        if (plat_info & 0x600000000) {
1674                u64 tdp_ctrl;
1675                u64 tdp_ratio;
1676                int tdp_msr;
1677                int err;
1678
1679                /* Get the TDP level (0, 1, 2) to get ratios */
1680                err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl);
1681                if (err)
1682                        return err;
1683
1684                /* TDP MSRs are contiguous, starting at 0x648 */
1685                tdp_msr = MSR_CONFIG_TDP_NOMINAL + (tdp_ctrl & 0x03);
1686                err = rdmsrl_safe(tdp_msr, &tdp_ratio);
1687                if (err)
1688                        return err;
1689
1690                /* For levels 1 and 2, bits[23:16] contain the ratio */
1691                if (tdp_ctrl & 0x03)
1692                        tdp_ratio >>= 16;
1693
1694                tdp_ratio &= 0xff; /* ratios are only 8 bits long */
1695                pr_debug("tdp_ratio %x\n", (int)tdp_ratio);
1696
1697                return (int)tdp_ratio;
1698        }
1699
1700        return -ENXIO;
1701}
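
/*
 * Worked example (illustrative only): if MSR_PLATFORM_INFO reports that
 * additional config-TDP levels exist and MSR_CONFIG_TDP_CONTROL selects
 * level 2, the ratio is read from MSR_CONFIG_TDP_NOMINAL + 2 and, because
 * the selected level is non-zero, taken from bits [23:16] of that MSR
 * before being masked down to 8 bits.
 */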
1702
1703static int core_get_max_pstate(void)
1704{
1705        u64 tar;
1706        u64 plat_info;
1707        int max_pstate;
1708        int tdp_ratio;
1709        int err;
1710
1711        rdmsrl(MSR_PLATFORM_INFO, plat_info);
1712        max_pstate = (plat_info >> 8) & 0xFF;
1713
1714        tdp_ratio = core_get_tdp_ratio(plat_info);
1715        if (tdp_ratio <= 0)
1716                return max_pstate;
1717
1718        if (hwp_active) {
1719                /* Turbo activation ratio is not used on HWP platforms */
1720                return tdp_ratio;
1721        }
1722
1723        err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar);
1724        if (!err) {
1725                int tar_levels;
1726
1727                /* Sanity-check TAR against the TDP ratio before using it */
1728                tar_levels = tar & 0xff;
1729                if (tdp_ratio - 1 == tar_levels) {
1730                        max_pstate = tar_levels;
1731                        pr_debug("max_pstate=TAC %x\n", max_pstate);
1732                }
1733        }
1734
1735        return max_pstate;
1736}
1737
1738static int core_get_turbo_pstate(void)
1739{
1740        u64 value;
1741        int nont, ret;
1742
1743        rdmsrl(MSR_TURBO_RATIO_LIMIT, value);
1744        nont = core_get_max_pstate();
1745        ret = (value) & 255;
1746        if (ret <= nont)
1747                ret = nont;
1748        return ret;
1749}
1750
1751static inline int core_get_scaling(void)
1752{
1753        return 100000;
1754}
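
/*
 * Illustrative note: core parts scale P-states by 100 MHz (100000 kHz) per
 * ratio unit, so, for example, a max_pstate of 24 corresponds to a
 * non-turbo maximum of 24 * 100000 kHz = 2.4 GHz.
 */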
1755
1756static u64 core_get_val(struct cpudata *cpudata, int pstate)
1757{
1758        u64 val;
1759
1760        val = (u64)pstate << 8;
1761        if (global.no_turbo && !global.turbo_disabled)
1762                val |= (u64)1 << 32;
1763
1764        return val;
1765}
1766
1767static int knl_get_aperf_mperf_shift(void)
1768{
1769        return 10;
1770}
1771
1772static int knl_get_turbo_pstate(void)
1773{
1774        u64 value;
1775        int nont, ret;
1776
1777        rdmsrl(MSR_TURBO_RATIO_LIMIT, value);
1778        nont = core_get_max_pstate();
1779        ret = (((value) >> 8) & 0xFF);
1780        if (ret <= nont)
1781                ret = nont;
1782        return ret;
1783}
1784
1785#ifdef CONFIG_ACPI_CPPC_LIB
1786static u32 hybrid_ref_perf;
1787
1788static int hybrid_get_cpu_scaling(int cpu)
1789{
1790        return DIV_ROUND_UP(core_get_scaling() * hybrid_ref_perf,
1791                            intel_pstate_cppc_nominal(cpu));
1792}
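
/*
 * Illustrative example (assumed numbers): with hybrid_ref_perf set to the
 * smallest CPPC nominal performance in the system, a CPU whose own nominal
 * performance is twice that value gets a scaling factor of
 * DIV_ROUND_UP(100000 * ref, 2 * ref) = 50000 kHz per P-state step, so
 * that HWP performance units on different core types map to comparable
 * frequencies.
 */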
1793
1794static void intel_pstate_cppc_set_cpu_scaling(void)
1795{
1796        u32 min_nominal_perf = U32_MAX;
1797        int cpu;
1798
1799        for_each_present_cpu(cpu) {
1800                u32 nominal_perf = intel_pstate_cppc_nominal(cpu);
1801
1802                if (nominal_perf && nominal_perf < min_nominal_perf)
1803                        min_nominal_perf = nominal_perf;
1804        }
1805
1806        if (min_nominal_perf < U32_MAX) {
1807                hybrid_ref_perf = min_nominal_perf;
1808                pstate_funcs.get_cpu_scaling = hybrid_get_cpu_scaling;
1809        }
1810}
1811#else
1812static inline void intel_pstate_cppc_set_cpu_scaling(void)
1813{
1814}
1815#endif /* CONFIG_ACPI_CPPC_LIB */
1816
1817static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
1818{
1819        trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
1820        cpu->pstate.current_pstate = pstate;
1821        /*
1822         * Generally, there is no guarantee that this code will always run on
1823         * the CPU being updated, so force the register update to run on the
1824         * right CPU.
1825         */
1826        wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
1827                      pstate_funcs.get_val(cpu, pstate));
1828}
1829
1830static void intel_pstate_set_min_pstate(struct cpudata *cpu)
1831{
1832        intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
1833}
1834
1835static void intel_pstate_max_within_limits(struct cpudata *cpu)
1836{
1837        int pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio);
1838
1839        update_turbo_state();
1840        intel_pstate_set_pstate(cpu, pstate);
1841}
1842
1843static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
1844{
1845        int perf_ctl_max_phys = pstate_funcs.get_max_physical();
1846        int perf_ctl_scaling = pstate_funcs.get_scaling();
1847
1848        cpu->pstate.min_pstate = pstate_funcs.get_min();
1849        cpu->pstate.max_pstate_physical = perf_ctl_max_phys;
1850        cpu->pstate.perf_ctl_scaling = perf_ctl_scaling;
1851
1852        if (hwp_active && !hwp_mode_bdw) {
1853                __intel_pstate_get_hwp_cap(cpu);
1854
1855                if (pstate_funcs.get_cpu_scaling) {
1856                        cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu);
1857                        if (cpu->pstate.scaling != perf_ctl_scaling)
1858                                intel_pstate_hybrid_hwp_adjust(cpu);
1859                } else {
1860                        cpu->pstate.scaling = perf_ctl_scaling;
1861                }
1862        } else {
1863                cpu->pstate.scaling = perf_ctl_scaling;
1864                cpu->pstate.max_pstate = pstate_funcs.get_max();
1865                cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
1866        }
1867
1868        if (cpu->pstate.scaling == perf_ctl_scaling) {
1869                cpu->pstate.min_freq = cpu->pstate.min_pstate * perf_ctl_scaling;
1870                cpu->pstate.max_freq = cpu->pstate.max_pstate * perf_ctl_scaling;
1871                cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * perf_ctl_scaling;
1872        }
1873
1874        if (pstate_funcs.get_aperf_mperf_shift)
1875                cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift();
1876
1877        if (pstate_funcs.get_vid)
1878                pstate_funcs.get_vid(cpu);
1879
1880        intel_pstate_set_min_pstate(cpu);
1881}
1882
1883/*
1884 * A long hold time keeps the high perf limits in place for a long
1885 * time, which negatively impacts perf/watt for some workloads, like
1886 * specpower. 3 ms is based on experiments with some
1887 * workloads.
1888 */
1889static int hwp_boost_hold_time_ns = 3 * NSEC_PER_MSEC;
1890
1891static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu)
1892{
1893        u64 hwp_req = READ_ONCE(cpu->hwp_req_cached);
1894        u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached);
1895        u32 max_limit = (hwp_req & 0xff00) >> 8;
1896        u32 min_limit = (hwp_req & 0xff);
1897        u32 boost_level1;
1898
1899        /*
1900         * Cases to consider (User changes via sysfs or boot time):
1901         * If, P0 (Turbo max) = P1 (Guaranteed max) = min:
1902         *      No boost, return.
1903         * If, P0 (Turbo max) > P1 (Guaranteed max) = min:
1904         *     Should result in one level boost only for P0.
1905         * If, P0 (Turbo max) = P1 (Guaranteed max) > min:
1906         *     Should result in two level boost:
1907         *         (min + p1)/2 and P1.
1908         * If, P0 (Turbo max) > P1 (Guaranteed max) > min:
1909         *     Should result in three level boost:
1910         *        (min + p1)/2, P1 and P0.
1911         */
1912
1913        /* If max and min are equal or already at max, nothing to boost */
1914        if (max_limit == min_limit || cpu->hwp_boost_min >= max_limit)
1915                return;
1916
1917        if (!cpu->hwp_boost_min)
1918                cpu->hwp_boost_min = min_limit;
1919
1920        /* Boost level at the halfway mark between min and guaranteed */
1921        boost_level1 = (HWP_GUARANTEED_PERF(hwp_cap) + min_limit) >> 1;
1922
1923        if (cpu->hwp_boost_min < boost_level1)
1924                cpu->hwp_boost_min = boost_level1;
1925        else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(hwp_cap))
1926                cpu->hwp_boost_min = HWP_GUARANTEED_PERF(hwp_cap);
1927        else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(hwp_cap) &&
1928                 max_limit != HWP_GUARANTEED_PERF(hwp_cap))
1929                cpu->hwp_boost_min = max_limit;
1930        else
1931                return;
1932
1933        hwp_req = (hwp_req & ~GENMASK_ULL(7, 0)) | cpu->hwp_boost_min;
1934        wrmsrl(MSR_HWP_REQUEST, hwp_req);
1935        cpu->last_update = cpu->sample.time;
1936}
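
/*
 * Worked example of the boost ladder above (illustrative numbers only):
 * with min_limit = 10, HWP_GUARANTEED_PERF = 30 and max_limit = 40,
 * successive boosts move hwp_boost_min from 10 to (30 + 10) / 2 = 20,
 * then to 30 (guaranteed) and finally to 40 (max); once max and min are
 * equal, or hwp_boost_min already sits at max_limit, no boost is applied.
 */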
1937
1938static inline void intel_pstate_hwp_boost_down(struct cpudata *cpu)
1939{
1940        if (cpu->hwp_boost_min) {
1941                bool expired;
1942
1943                /* Check if we have been idle for the hold time; if so, boost down */
1944                expired = time_after64(cpu->sample.time, cpu->last_update +
1945                                       hwp_boost_hold_time_ns);
1946                if (expired) {
1947                        wrmsrl(MSR_HWP_REQUEST, cpu->hwp_req_cached);
1948                        cpu->hwp_boost_min = 0;
1949                }
1950        }
1951        cpu->last_update = cpu->sample.time;
1952}
1953
1954static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu,
1955                                                      u64 time)
1956{
1957        cpu->sample.time = time;
1958
1959        if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) {
1960                bool do_io = false;
1961
1962                cpu->sched_flags = 0;
1963                /*
1964                 * Set the iowait_boost flag and update the time. Since the IO
1965                 * WAIT flag is set all the time, we cannot conclude from a single
1966                 * occurrence that IO-bound activity is scheduled on this CPU.
1967                 * Only if we receive at least two of them in two consecutive
1968                 * ticks do we treat the CPU as a boost candidate.
1969                 */
1970                if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC))
1971                        do_io = true;
1972
1973                cpu->last_io_update = time;
1974
1975                if (do_io)
1976                        intel_pstate_hwp_boost_up(cpu);
1977
1978        } else {
1979                intel_pstate_hwp_boost_down(cpu);
1980        }
1981}
1982
1983static inline void intel_pstate_update_util_hwp(struct update_util_data *data,
1984                                                u64 time, unsigned int flags)
1985{
1986        struct cpudata *cpu = container_of(data, struct cpudata, update_util);
1987
1988        cpu->sched_flags |= flags;
1989
1990        if (smp_processor_id() == cpu->cpu)
1991                intel_pstate_update_util_hwp_local(cpu, time);
1992}
1993
1994static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu)
1995{
1996        struct sample *sample = &cpu->sample;
1997
1998        sample->core_avg_perf = div_ext_fp(sample->aperf, sample->mperf);
1999}
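
/*
 * Illustrative numbers: core_avg_perf is APERF/MPERF in EXT_FRAC_BITS
 * (8 + 6 bit) fixed point, so if the CPU ran at 80% of its guaranteed
 * (maximum non-turbo) frequency while unhalted during the sample,
 * aperf/mperf = 0.8 and core_avg_perf holds roughly 0.8 * 2^14 = 13107.
 */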
2000
2001static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
2002{
2003        u64 aperf, mperf;
2004        unsigned long flags;
2005        u64 tsc;
2006
2007        local_irq_save(flags);
2008        rdmsrl(MSR_IA32_APERF, aperf);
2009        rdmsrl(MSR_IA32_MPERF, mperf);
2010        tsc = rdtsc();
2011        if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) {
2012                local_irq_restore(flags);
2013                return false;
2014        }
2015        local_irq_restore(flags);
2016
2017        cpu->last_sample_time = cpu->sample.time;
2018        cpu->sample.time = time;
2019        cpu->sample.aperf = aperf;
2020        cpu->sample.mperf = mperf;
2021        cpu->sample.tsc =  tsc;
2022        cpu->sample.aperf -= cpu->prev_aperf;
2023        cpu->sample.mperf -= cpu->prev_mperf;
2024        cpu->sample.tsc -= cpu->prev_tsc;
2025
2026        cpu->prev_aperf = aperf;
2027        cpu->prev_mperf = mperf;
2028        cpu->prev_tsc = tsc;
2029        /*
2030         * The first time this function is invoked in a given cycle, all of
2031         * the previous sample data fields are either zero or stale and must
2032         * be populated with meaningful numbers for things to work, so assume
2033         * that sample.time will always be reset before the utilization update
2034         * hook is set and make the caller skip the sample then.
2035         */
2036        if (cpu->last_sample_time) {
2037                intel_pstate_calc_avg_perf(cpu);
2038                return true;
2039        }
2040        return false;
2041}
2042
2043static inline int32_t get_avg_frequency(struct cpudata *cpu)
2044{
2045        return mul_ext_fp(cpu->sample.core_avg_perf, cpu_khz);
2046}
2047
2048static inline int32_t get_avg_pstate(struct cpudata *cpu)
2049{
2050        return mul_ext_fp(cpu->pstate.max_pstate_physical,
2051                          cpu->sample.core_avg_perf);
2052}
2053
2054static inline int32_t get_target_pstate(struct cpudata *cpu)
2055{
2056        struct sample *sample = &cpu->sample;
2057        int32_t busy_frac;
2058        int target, avg_pstate;
2059
2060        busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift,
2061                           sample->tsc);
2062
2063        if (busy_frac < cpu->iowait_boost)
2064                busy_frac = cpu->iowait_boost;
2065
2066        sample->busy_scaled = busy_frac * 100;
2067
2068        target = global.no_turbo || global.turbo_disabled ?
2069                        cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
2070        target += target >> 2;
2071        target = mul_fp(target, busy_frac);
2072        if (target < cpu->pstate.min_pstate)
2073                target = cpu->pstate.min_pstate;
2074
2075        /*
2076         * If the average P-state during the previous cycle was higher than the
2077         * current target, add 50% of the difference to the target to reduce
2078         * possible performance oscillations and offset possible performance
2079         * loss related to moving the workload from one CPU to another within
2080         * a package/module.
2081         */
2082        avg_pstate = get_avg_pstate(cpu);
2083        if (avg_pstate > target)
2084                target += (avg_pstate - target) >> 1;
2085
2086        return target;
2087}
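
/*
 * Worked example for the calculation above (illustrative numbers): with
 * turbo_pstate = 40 and busy_frac = 0.5, target = (40 + 40/4) * 0.5 = 25;
 * if the average P-state over the previous cycle was 31, half of the
 * difference is added, giving 25 + (31 - 25) / 2 = 28.
 */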
2088
2089static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
2090{
2091        int min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio);
2092        int max_pstate = max(min_pstate, cpu->max_perf_ratio);
2093
2094        return clamp_t(int, pstate, min_pstate, max_pstate);
2095}
2096
2097static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
2098{
2099        if (pstate == cpu->pstate.current_pstate)
2100                return;
2101
2102        cpu->pstate.current_pstate = pstate;
2103        wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate));
2104}
2105
2106static void intel_pstate_adjust_pstate(struct cpudata *cpu)
2107{
2108        int from = cpu->pstate.current_pstate;
2109        struct sample *sample;
2110        int target_pstate;
2111
2112        update_turbo_state();
2113
2114        target_pstate = get_target_pstate(cpu);
2115        target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
2116        trace_cpu_frequency(target_pstate * cpu->pstate.scaling, cpu->cpu);
2117        intel_pstate_update_pstate(cpu, target_pstate);
2118
2119        sample = &cpu->sample;
2120        trace_pstate_sample(mul_ext_fp(100, sample->core_avg_perf),
2121                fp_toint(sample->busy_scaled),
2122                from,
2123                cpu->pstate.current_pstate,
2124                sample->mperf,
2125                sample->aperf,
2126                sample->tsc,
2127                get_avg_frequency(cpu),
2128                fp_toint(cpu->iowait_boost * 100));
2129}
2130
2131static void intel_pstate_update_util(struct update_util_data *data, u64 time,
2132                                     unsigned int flags)
2133{
2134        struct cpudata *cpu = container_of(data, struct cpudata, update_util);
2135        u64 delta_ns;
2136
2137        /* Don't allow remote callbacks */
2138        if (smp_processor_id() != cpu->cpu)
2139                return;
2140
2141        delta_ns = time - cpu->last_update;
2142        if (flags & SCHED_CPUFREQ_IOWAIT) {
2143                /* Start over if the CPU may have been idle. */
2144                if (delta_ns > TICK_NSEC) {
2145                        cpu->iowait_boost = ONE_EIGHTH_FP;
2146                } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) {
2147                        cpu->iowait_boost <<= 1;
2148                        if (cpu->iowait_boost > int_tofp(1))
2149                                cpu->iowait_boost = int_tofp(1);
2150                } else {
2151                        cpu->iowait_boost = ONE_EIGHTH_FP;
2152                }
2153        } else if (cpu->iowait_boost) {
2154                /* Clear iowait_boost if the CPU may have been idle. */
2155                if (delta_ns > TICK_NSEC)
2156                        cpu->iowait_boost = 0;
2157                else
2158                        cpu->iowait_boost >>= 1;
2159        }
2160        cpu->last_update = time;
2161        delta_ns = time - cpu->sample.time;
2162        if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL)
2163                return;
2164
2165        if (intel_pstate_sample(cpu, time))
2166                intel_pstate_adjust_pstate(cpu);
2167}
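
/*
 * Illustrative trace of the iowait boost handling above: after an IOWAIT
 * event the boost starts at ONE_EIGHTH_FP (1/8 in fixed point) and doubles
 * on each subsequent IOWAIT event (1/8 -> 1/4 -> 1/2 -> 1), saturating at
 * 1 and restarting at 1/8 if the CPU looks to have been idle for more than
 * a tick; without further IOWAIT events it is halved on every update, or
 * cleared when the CPU was idle for over a tick.
 */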
2168
2169static struct pstate_funcs core_funcs = {
2170        .get_max = core_get_max_pstate,
2171        .get_max_physical = core_get_max_pstate_physical,
2172        .get_min = core_get_min_pstate,
2173        .get_turbo = core_get_turbo_pstate,
2174        .get_scaling = core_get_scaling,
2175        .get_val = core_get_val,
2176};
2177
2178static const struct pstate_funcs silvermont_funcs = {
2179        .get_max = atom_get_max_pstate,
2180        .get_max_physical = atom_get_max_pstate,
2181        .get_min = atom_get_min_pstate,
2182        .get_turbo = atom_get_turbo_pstate,
2183        .get_val = atom_get_val,
2184        .get_scaling = silvermont_get_scaling,
2185        .get_vid = atom_get_vid,
2186};
2187
2188static const struct pstate_funcs airmont_funcs = {
2189        .get_max = atom_get_max_pstate,
2190        .get_max_physical = atom_get_max_pstate,
2191        .get_min = atom_get_min_pstate,
2192        .get_turbo = atom_get_turbo_pstate,
2193        .get_val = atom_get_val,
2194        .get_scaling = airmont_get_scaling,
2195        .get_vid = atom_get_vid,
2196};
2197
2198static const struct pstate_funcs knl_funcs = {
2199        .get_max = core_get_max_pstate,
2200        .get_max_physical = core_get_max_pstate_physical,
2201        .get_min = core_get_min_pstate,
2202        .get_turbo = knl_get_turbo_pstate,
2203        .get_aperf_mperf_shift = knl_get_aperf_mperf_shift,
2204        .get_scaling = core_get_scaling,
2205        .get_val = core_get_val,
2206};
2207
2208#define X86_MATCH(model, policy)                                         \
2209        X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
2210                                           X86_FEATURE_APERFMPERF, &policy)
2211
2212static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
2213        X86_MATCH(SANDYBRIDGE,          core_funcs),
2214        X86_MATCH(SANDYBRIDGE_X,        core_funcs),
2215        X86_MATCH(ATOM_SILVERMONT,      silvermont_funcs),
2216        X86_MATCH(IVYBRIDGE,            core_funcs),
2217        X86_MATCH(HASWELL,              core_funcs),
2218        X86_MATCH(BROADWELL,            core_funcs),
2219        X86_MATCH(IVYBRIDGE_X,          core_funcs),
2220        X86_MATCH(HASWELL_X,            core_funcs),
2221        X86_MATCH(HASWELL_L,            core_funcs),
2222        X86_MATCH(HASWELL_G,            core_funcs),
2223        X86_MATCH(BROADWELL_G,          core_funcs),
2224        X86_MATCH(ATOM_AIRMONT,         airmont_funcs),
2225        X86_MATCH(SKYLAKE_L,            core_funcs),
2226        X86_MATCH(BROADWELL_X,          core_funcs),
2227        X86_MATCH(SKYLAKE,              core_funcs),
2228        X86_MATCH(BROADWELL_D,          core_funcs),
2229        X86_MATCH(XEON_PHI_KNL,         knl_funcs),
2230        X86_MATCH(XEON_PHI_KNM,         knl_funcs),
2231        X86_MATCH(ATOM_GOLDMONT,        core_funcs),
2232        X86_MATCH(ATOM_GOLDMONT_PLUS,   core_funcs),
2233        X86_MATCH(SKYLAKE_X,            core_funcs),
2234        X86_MATCH(COMETLAKE,            core_funcs),
2235        X86_MATCH(ICELAKE_X,            core_funcs),
2236        {}
2237};
2238MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
2239
2240static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = {
2241        X86_MATCH(BROADWELL_D,          core_funcs),
2242        X86_MATCH(BROADWELL_X,          core_funcs),
2243        X86_MATCH(SKYLAKE_X,            core_funcs),
2244        {}
2245};
2246
2247static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = {
2248        X86_MATCH(KABYLAKE,             core_funcs),
2249        {}
2250};
2251
2252static const struct x86_cpu_id intel_pstate_hwp_boost_ids[] = {
2253        X86_MATCH(SKYLAKE_X,            core_funcs),
2254        X86_MATCH(SKYLAKE,              core_funcs),
2255        {}
2256};
2257
2258static int intel_pstate_init_cpu(unsigned int cpunum)
2259{
2260        struct cpudata *cpu;
2261
2262        cpu = all_cpu_data[cpunum];
2263
2264        if (!cpu) {
2265                cpu = kzalloc(sizeof(*cpu), GFP_KERNEL);
2266                if (!cpu)
2267                        return -ENOMEM;
2268
2269                all_cpu_data[cpunum] = cpu;
2270
2271                cpu->cpu = cpunum;
2272
2273                cpu->epp_default = -EINVAL;
2274
2275                if (hwp_active) {
2276                        const struct x86_cpu_id *id;
2277
2278                        intel_pstate_hwp_enable(cpu);
2279
2280                        id = x86_match_cpu(intel_pstate_hwp_boost_ids);
2281                        if (id && intel_pstate_acpi_pm_profile_server())
2282                                hwp_boost = true;
2283                }
2284        } else if (hwp_active) {
2285                /*
2286                 * Re-enable HWP in case this happens after a resume from ACPI
2287                 * S3 if the CPU was offline during the whole suspend/resume
2288                 * cycle.
2289                 */
2290                intel_pstate_hwp_reenable(cpu);
2291        }
2292
2293        cpu->epp_powersave = -EINVAL;
2294        cpu->epp_policy = 0;
2295
2296        intel_pstate_get_cpu_pstates(cpu);
2297
2298        pr_debug("controlling: cpu %d\n", cpunum);
2299
2300        return 0;
2301}
2302
2303static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
2304{
2305        struct cpudata *cpu = all_cpu_data[cpu_num];
2306
2307        if (hwp_active && !hwp_boost)
2308                return;
2309
2310        if (cpu->update_util_set)
2311                return;
2312
2313        /* Prevent intel_pstate_update_util() from using stale data. */
2314        cpu->sample.time = 0;
2315        cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
2316                                     (hwp_active ?
2317                                      intel_pstate_update_util_hwp :
2318                                      intel_pstate_update_util));
2319        cpu->update_util_set = true;
2320}
2321
2322static void intel_pstate_clear_update_util_hook(unsigned int cpu)
2323{
2324        struct cpudata *cpu_data = all_cpu_data[cpu];
2325
2326        if (!cpu_data->update_util_set)
2327                return;
2328
2329        cpufreq_remove_update_util_hook(cpu);
2330        cpu_data->update_util_set = false;
2331        synchronize_rcu();
2332}
2333
2334static int intel_pstate_get_max_freq(struct cpudata *cpu)
2335{
2336        return global.turbo_disabled || global.no_turbo ?
2337                        cpu->pstate.max_freq : cpu->pstate.turbo_freq;
2338}
2339
2340static void intel_pstate_update_perf_limits(struct cpudata *cpu,
2341                                            unsigned int policy_min,
2342                                            unsigned int policy_max)
2343{
2344        int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling;
2345        int32_t max_policy_perf, min_policy_perf;
2346
2347        max_policy_perf = policy_max / perf_ctl_scaling;
2348        if (policy_max == policy_min) {
2349                min_policy_perf = max_policy_perf;
2350        } else {
2351                min_policy_perf = policy_min / perf_ctl_scaling;
2352                min_policy_perf = clamp_t(int32_t, min_policy_perf,
2353                                          0, max_policy_perf);
2354        }
2355
2356        /*
2357         * HWP needs some special consideration, because HWP_REQUEST uses
2358         * abstract values to represent performance rather than pure ratios.
2359         */
2360        if (hwp_active) {
2361                intel_pstate_get_hwp_cap(cpu);
2362
2363                if (cpu->pstate.scaling != perf_ctl_scaling) {
2364                        int scaling = cpu->pstate.scaling;
2365                        int freq;
2366
2367                        freq = max_policy_perf * perf_ctl_scaling;
2368                        max_policy_perf = DIV_ROUND_UP(freq, scaling);
2369                        freq = min_policy_perf * perf_ctl_scaling;
2370                        min_policy_perf = DIV_ROUND_UP(freq, scaling);
2371                }
2372        }
2373
2374        pr_debug("cpu:%d min_policy_perf:%d max_policy_perf:%d\n",
2375                 cpu->cpu, min_policy_perf, max_policy_perf);
2376
2377        /* Normalize user input to [min_perf, max_perf] */
2378        if (per_cpu_limits) {
2379                cpu->min_perf_ratio = min_policy_perf;
2380                cpu->max_perf_ratio = max_policy_perf;
2381        } else {
2382                int turbo_max = cpu->pstate.turbo_pstate;
2383                int32_t global_min, global_max;
2384
2385                /* Global limits are in percent of the maximum turbo P-state. */
2386                global_max = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100);
2387                global_min = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
2388                global_min = clamp_t(int32_t, global_min, 0, global_max);
2389
2390                pr_debug("cpu:%d global_min:%d global_max:%d\n", cpu->cpu,
2391                         global_min, global_max);
2392
2393                cpu->min_perf_ratio = max(min_policy_perf, global_min);
2394                cpu->min_perf_ratio = min(cpu->min_perf_ratio, max_policy_perf);
2395                cpu->max_perf_ratio = min(max_policy_perf, global_max);
2396                cpu->max_perf_ratio = max(min_policy_perf, cpu->max_perf_ratio);
2397
2398                /* Make sure min_perf <= max_perf */
2399                cpu->min_perf_ratio = min(cpu->min_perf_ratio,
2400                                          cpu->max_perf_ratio);
2401
2402        }
2403        pr_debug("cpu:%d max_perf_ratio:%d min_perf_ratio:%d\n", cpu->cpu,
2404                 cpu->max_perf_ratio,
2405                 cpu->min_perf_ratio);
2406}
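
/*
 * Worked example for the global (non per-CPU) limits path above, with
 * assumed numbers: turbo_pstate = 40, max_perf_pct = 75 and
 * min_perf_pct = 25 give global_max = DIV_ROUND_UP(40 * 75, 100) = 30 and
 * global_min = 10; a policy range that maps to P-states [8, 36] is then
 * clipped to min_perf_ratio = 10 and max_perf_ratio = 30.
 */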
2407
2408static int intel_pstate_set_policy(struct cpufreq_policy *policy)
2409{
2410        struct cpudata *cpu;
2411
2412        if (!policy->cpuinfo.max_freq)
2413                return -ENODEV;
2414
2415        pr_debug("set_policy cpuinfo.max %u policy->max %u\n",
2416                 policy->cpuinfo.max_freq, policy->max);
2417
2418        cpu = all_cpu_data[policy->cpu];
2419        cpu->policy = policy->policy;
2420
2421        mutex_lock(&intel_pstate_limits_lock);
2422
2423        intel_pstate_update_perf_limits(cpu, policy->min, policy->max);
2424
2425        if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) {
2426                /*
2427                 * NOHZ_FULL CPUs need this as the governor callback may not
2428                 * be invoked on them.
2429                 */
2430                intel_pstate_clear_update_util_hook(policy->cpu);
2431                intel_pstate_max_within_limits(cpu);
2432        } else {
2433                intel_pstate_set_update_util_hook(policy->cpu);
2434        }
2435
2436        if (hwp_active) {
2437                /*
2438                 * If hwp_boost was active before and has been turned off
2439                 * dynamically, the utilization update hook needs to be
2440                 * cleared.
2441                 */
2442                if (!hwp_boost)
2443                        intel_pstate_clear_update_util_hook(policy->cpu);
2444                intel_pstate_hwp_set(policy->cpu);
2445        }
2446
2447        mutex_unlock(&intel_pstate_limits_lock);
2448
2449        return 0;
2450}
2451
2452static void intel_pstate_adjust_policy_max(struct cpudata *cpu,
2453                                           struct cpufreq_policy_data *policy)
2454{
2455        if (!hwp_active &&
2456            cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
2457            policy->max < policy->cpuinfo.max_freq &&
2458            policy->max > cpu->pstate.max_freq) {
2459                pr_debug("policy->max > max non turbo frequency\n");
2460                policy->max = policy->cpuinfo.max_freq;
2461        }
2462}
2463
2464static void intel_pstate_verify_cpu_policy(struct cpudata *cpu,
2465                                           struct cpufreq_policy_data *policy)
2466{
2467        int max_freq;
2468
2469        update_turbo_state();
2470        if (hwp_active) {
2471                intel_pstate_get_hwp_cap(cpu);
2472                max_freq = global.no_turbo || global.turbo_disabled ?
2473                                cpu->pstate.max_freq : cpu->pstate.turbo_freq;
2474        } else {
2475                max_freq = intel_pstate_get_max_freq(cpu);
2476        }
2477        cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, max_freq);
2478
2479        intel_pstate_adjust_policy_max(cpu, policy);
2480}
2481
2482static int intel_pstate_verify_policy(struct cpufreq_policy_data *policy)
2483{
2484        intel_pstate_verify_cpu_policy(all_cpu_data[policy->cpu], policy);
2485
2486        return 0;
2487}
2488
2489static int intel_cpufreq_cpu_offline(struct cpufreq_policy *policy)
2490{
2491        struct cpudata *cpu = all_cpu_data[policy->cpu];
2492
2493        pr_debug("CPU %d going offline\n", cpu->cpu);
2494
2495        if (cpu->suspended)
2496                return 0;
2497
2498        /*
2499         * If the CPU is an SMT thread and it goes offline with the performance
2500         * settings different from the minimum, it will prevent its sibling
2501         * from getting to lower performance levels, so force the minimum
2502         * performance on CPU offline to prevent that from happening.
2503         */
2504        if (hwp_active)
2505                intel_pstate_hwp_offline(cpu);
2506        else
2507                intel_pstate_set_min_pstate(cpu);
2508
2509        intel_pstate_exit_perf_limits(policy);
2510
2511        return 0;
2512}
2513
2514static int intel_pstate_cpu_online(struct cpufreq_policy *policy)
2515{
2516        struct cpudata *cpu = all_cpu_data[policy->cpu];
2517
2518        pr_debug("CPU %d going online\n", cpu->cpu);
2519
2520        intel_pstate_init_acpi_perf_limits(policy);
2521
2522        if (hwp_active) {
2523                /*
2524                 * Re-enable HWP and clear the "suspended" flag to let "resume"
2525                 * know that it need not do that.
2526                 */
2527                intel_pstate_hwp_reenable(cpu);
2528                cpu->suspended = false;
2529        }
2530
2531        return 0;
2532}
2533
2534static int intel_pstate_cpu_offline(struct cpufreq_policy *policy)
2535{
2536        intel_pstate_clear_update_util_hook(policy->cpu);
2537
2538        return intel_cpufreq_cpu_offline(policy);
2539}
2540
2541static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
2542{
2543        pr_debug("CPU %d exiting\n", policy->cpu);
2544
2545        policy->fast_switch_possible = false;
2546
2547        return 0;
2548}
2549
2550static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
2551{
2552        struct cpudata *cpu;
2553        int rc;
2554
2555        rc = intel_pstate_init_cpu(policy->cpu);
2556        if (rc)
2557                return rc;
2558
2559        cpu = all_cpu_data[policy->cpu];
2560
2561        cpu->max_perf_ratio = 0xFF;
2562        cpu->min_perf_ratio = 0;
2563
2564        /* cpuinfo and default policy values */
2565        policy->cpuinfo.min_freq = cpu->pstate.min_freq;
2566        update_turbo_state();
2567        global.turbo_disabled_mf = global.turbo_disabled;
2568        policy->cpuinfo.max_freq = global.turbo_disabled ?
2569                        cpu->pstate.max_freq : cpu->pstate.turbo_freq;
2570
2571        policy->min = policy->cpuinfo.min_freq;
2572        policy->max = policy->cpuinfo.max_freq;
2573
2574        intel_pstate_init_acpi_perf_limits(policy);
2575
2576        policy->fast_switch_possible = true;
2577
2578        return 0;
2579}
2580
2581static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
2582{
2583        int ret = __intel_pstate_cpu_init(policy);
2584
2585        if (ret)
2586                return ret;
2587
2588        /*
2589         * Set the policy to powersave to provide a valid fallback value in case
2590         * the default cpufreq governor is neither powersave nor performance.
2591         */
2592        policy->policy = CPUFREQ_POLICY_POWERSAVE;
2593
2594        if (hwp_active) {
2595                struct cpudata *cpu = all_cpu_data[policy->cpu];
2596
2597                cpu->epp_cached = intel_pstate_get_epp(cpu, 0);
2598        }
2599
2600        return 0;
2601}
2602
2603static struct cpufreq_driver intel_pstate = {
2604        .flags          = CPUFREQ_CONST_LOOPS,
2605        .verify         = intel_pstate_verify_policy,
2606        .setpolicy      = intel_pstate_set_policy,
2607        .suspend        = intel_pstate_suspend,
2608        .resume         = intel_pstate_resume,
2609        .init           = intel_pstate_cpu_init,
2610        .exit           = intel_pstate_cpu_exit,
2611        .offline        = intel_pstate_cpu_offline,
2612        .online         = intel_pstate_cpu_online,
2613        .update_limits  = intel_pstate_update_limits,
2614        .name           = "intel_pstate",
2615};
2616
2617static int intel_cpufreq_verify_policy(struct cpufreq_policy_data *policy)
2618{
2619        struct cpudata *cpu = all_cpu_data[policy->cpu];
2620
2621        intel_pstate_verify_cpu_policy(cpu, policy);
2622        intel_pstate_update_perf_limits(cpu, policy->min, policy->max);
2623
2624        return 0;
2625}
2626
2627/* Use of trace in passive mode:
2628 *
2629 * In passive mode the trace core_busy field (also known as the
2630 * performance field, and labelled as such on the graphs; also known as
2631 * core_avg_perf) is not needed and so is re-assigned to indicate if the
2632 * driver call was via the normal or fast switch path. Various graphs
2633 * output from the intel_pstate_tracer.py utility that include core_busy
2634 * (or performance or core_avg_perf) have a fixed y-axis from 0 to 100%,
2635 * so we use 10 to indicate the normal path through the driver, and
2636 * 90 to indicate the fast switch path through the driver.
2637 * The scaled_busy field is not used, and is set to 0.
2638 */
2639
2640#define INTEL_PSTATE_TRACE_TARGET 10
2641#define INTEL_PSTATE_TRACE_FAST_SWITCH 90
2642
2643static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
2644{
2645        struct sample *sample;
2646
2647        if (!trace_pstate_sample_enabled())
2648                return;
2649
2650        if (!intel_pstate_sample(cpu, ktime_get()))
2651                return;
2652
2653        sample = &cpu->sample;
2654        trace_pstate_sample(trace_type,
2655                0,
2656                old_pstate,
2657                cpu->pstate.current_pstate,
2658                sample->mperf,
2659                sample->aperf,
2660                sample->tsc,
2661                get_avg_frequency(cpu),
2662                fp_toint(cpu->iowait_boost * 100));
2663}
2664
2665static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max,
2666                                     u32 desired, bool fast_switch)
2667{
2668        u64 prev = READ_ONCE(cpu->hwp_req_cached), value = prev;
2669
2670        value &= ~HWP_MIN_PERF(~0L);
2671        value |= HWP_MIN_PERF(min);
2672
2673        value &= ~HWP_MAX_PERF(~0L);
2674        value |= HWP_MAX_PERF(max);
2675
2676        value &= ~HWP_DESIRED_PERF(~0L);
2677        value |= HWP_DESIRED_PERF(desired);
2678
2679        if (value == prev)
2680                return;
2681
2682        WRITE_ONCE(cpu->hwp_req_cached, value);
2683        if (fast_switch)
2684                wrmsrl(MSR_HWP_REQUEST, value);
2685        else
2686                wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
2687}
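
/*
 * Illustrative layout note: HWP_MIN_PERF, HWP_MAX_PERF and
 * HWP_DESIRED_PERF occupy bits [7:0], [15:8] and [23:16] of
 * MSR_HWP_REQUEST respectively, so a call with min = 10, max = 40 and
 * desired = 0 rewrites only those three byte fields and leaves the rest
 * of the cached value (e.g. the EPP field) untouched.
 */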
2688
2689static void intel_cpufreq_perf_ctl_update(struct cpudata *cpu,
2690                                          u32 target_pstate, bool fast_switch)
2691{
2692        if (fast_switch)
2693                wrmsrl(MSR_IA32_PERF_CTL,
2694                       pstate_funcs.get_val(cpu, target_pstate));
2695        else
2696                wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
2697                              pstate_funcs.get_val(cpu, target_pstate));
2698}
2699
2700static int intel_cpufreq_update_pstate(struct cpufreq_policy *policy,
2701                                       int target_pstate, bool fast_switch)
2702{
2703        struct cpudata *cpu = all_cpu_data[policy->cpu];
2704        int old_pstate = cpu->pstate.current_pstate;
2705
2706        target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
2707        if (hwp_active) {
2708                int max_pstate = policy->strict_target ?
2709                                        target_pstate : cpu->max_perf_ratio;
2710
2711                intel_cpufreq_hwp_update(cpu, target_pstate, max_pstate, 0,
2712                                         fast_switch);
2713        } else if (target_pstate != old_pstate) {
2714                intel_cpufreq_perf_ctl_update(cpu, target_pstate, fast_switch);
2715        }
2716
2717        cpu->pstate.current_pstate = target_pstate;
2718
2719        intel_cpufreq_trace(cpu, fast_switch ? INTEL_PSTATE_TRACE_FAST_SWITCH :
2720                            INTEL_PSTATE_TRACE_TARGET, old_pstate);
2721
2722        return target_pstate;
2723}
2724
2725static int intel_cpufreq_target(struct cpufreq_policy *policy,
2726                                unsigned int target_freq,
2727                                unsigned int relation)
2728{
2729        struct cpudata *cpu = all_cpu_data[policy->cpu];
2730        struct cpufreq_freqs freqs;
2731        int target_pstate;
2732
2733        update_turbo_state();
2734
2735        freqs.old = policy->cur;
2736        freqs.new = target_freq;
2737
2738        cpufreq_freq_transition_begin(policy, &freqs);
2739
2740        switch (relation) {
2741        case CPUFREQ_RELATION_L:
2742                target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
2743                break;
2744        case CPUFREQ_RELATION_H:
2745                target_pstate = freqs.new / cpu->pstate.scaling;
2746                break;
2747        default:
2748                target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
2749                break;
2750        }
2751
2752        target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, false);
2753
2754        freqs.new = target_pstate * cpu->pstate.scaling;
2755
2756        cpufreq_freq_transition_end(policy, &freqs, false);
2757
2758        return 0;
2759}
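
/*
 * Illustrative rounding example (assumed values): with a scaling of
 * 100000 kHz and a target of 2240000 kHz, CPUFREQ_RELATION_L rounds up to
 * P-state 23, CPUFREQ_RELATION_H rounds down to 22, and the default case
 * uses DIV_ROUND_CLOSEST, which also picks 22 here; the frequency reported
 * back is then target_pstate * scaling.
 */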
2760
2761static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
2762                                              unsigned int target_freq)
2763{
2764        struct cpudata *cpu = all_cpu_data[policy->cpu];
2765        int target_pstate;
2766
2767        update_turbo_state();
2768
2769        target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
2770
2771        target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, true);
2772
2773        return target_pstate * cpu->pstate.scaling;
2774}
2775
2776static void intel_cpufreq_adjust_perf(unsigned int cpunum,
2777                                      unsigned long min_perf,
2778                                      unsigned long target_perf,
2779                                      unsigned long capacity)
2780{
2781        struct cpudata *cpu = all_cpu_data[cpunum];
2782        u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached);
2783        int old_pstate = cpu->pstate.current_pstate;
2784        int cap_pstate, min_pstate, max_pstate, target_pstate;
2785
2786        update_turbo_state();
2787        cap_pstate = global.turbo_disabled ? HWP_GUARANTEED_PERF(hwp_cap) :
2788                                             HWP_HIGHEST_PERF(hwp_cap);
2789
2790        /* Optimization: Avoid unnecessary divisions. */
2791
2792        target_pstate = cap_pstate;
2793        if (target_perf < capacity)
2794                target_pstate = DIV_ROUND_UP(cap_pstate * target_perf, capacity);
2795
2796        min_pstate = cap_pstate;
2797        if (min_perf < capacity)
2798                min_pstate = DIV_ROUND_UP(cap_pstate * min_perf, capacity);
2799
2800        if (min_pstate < cpu->pstate.min_pstate)
2801                min_pstate = cpu->pstate.min_pstate;
2802
2803        if (min_pstate < cpu->min_perf_ratio)
2804                min_pstate = cpu->min_perf_ratio;
2805
2806        max_pstate = min(cap_pstate, cpu->max_perf_ratio);
2807        if (max_pstate < min_pstate)
2808                max_pstate = min_pstate;
2809
2810        target_pstate = clamp_t(int, target_pstate, min_pstate, max_pstate);
2811
2812        intel_cpufreq_hwp_update(cpu, min_pstate, max_pstate, target_pstate, true);
2813
2814        cpu->pstate.current_pstate = target_pstate;
2815        intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);
2816}
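
/*
 * Worked example for the capacity mapping above (assumed numbers, turbo
 * enabled): with hwp_cap reporting a highest performance of 40,
 * target_perf = 512 and capacity = 1024, the desired level becomes
 * DIV_ROUND_UP(40 * 512, 1024) = 20, and min_perf = 256 maps to a floor of
 * 10; both are then clamped against the min/max perf ratios before being
 * written through HWP_REQUEST with the desired field set to the target.
 */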
2817
2818static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
2819{
2820        struct freq_qos_request *req;
2821        struct cpudata *cpu;
2822        struct device *dev;
2823        int ret, freq;
2824
2825        dev = get_cpu_device(policy->cpu);
2826        if (!dev)
2827                return -ENODEV;
2828
2829        ret = __intel_pstate_cpu_init(policy);
2830        if (ret)
2831                return ret;
2832
2833        policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
2834        /* This reflects the intel_pstate_get_cpu_pstates() setting. */
2835        policy->cur = policy->cpuinfo.min_freq;
2836
2837        req = kcalloc(2, sizeof(*req), GFP_KERNEL);
2838        if (!req) {
2839                ret = -ENOMEM;
2840                goto pstate_exit;
2841        }
2842
2843        cpu = all_cpu_data[policy->cpu];
2844
2845        if (hwp_active) {
2846                u64 value;
2847
2848                policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP;
2849
2850                intel_pstate_get_hwp_cap(cpu);
2851
2852                rdmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &value);
2853                WRITE_ONCE(cpu->hwp_req_cached, value);
2854
2855                cpu->epp_cached = intel_pstate_get_epp(cpu, value);
2856        } else {
2857                policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
2858        }
2859
2860        freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.min_perf_pct, 100);
2861
2862        ret = freq_qos_add_request(&policy->constraints, req, FREQ_QOS_MIN,
2863                                   freq);
2864        if (ret < 0) {
2865                dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
2866                goto free_req;
2867        }
2868
2869        freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.max_perf_pct, 100);
2870
2871        ret = freq_qos_add_request(&policy->constraints, req + 1, FREQ_QOS_MAX,
2872                                   freq);
2873        if (ret < 0) {
2874                dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret);
2875                goto remove_min_req;
2876        }
2877
2878        policy->driver_data = req;
2879
2880        return 0;
2881
2882remove_min_req:
2883        freq_qos_remove_request(req);
2884free_req:
2885        kfree(req);
2886pstate_exit:
2887        intel_pstate_exit_perf_limits(policy);
2888
2889        return ret;
2890}
2891
2892static int intel_cpufreq_cpu_exit(struct cpufreq_policy *policy)
2893{
2894        struct freq_qos_request *req;
2895
2896        req = policy->driver_data;
2897
2898        freq_qos_remove_request(req + 1);
2899        freq_qos_remove_request(req);
2900        kfree(req);
2901
2902        return intel_pstate_cpu_exit(policy);
2903}
2904
2905static struct cpufreq_driver intel_cpufreq = {
2906        .flags          = CPUFREQ_CONST_LOOPS,
2907        .verify         = intel_cpufreq_verify_policy,
2908        .target         = intel_cpufreq_target,
2909        .fast_switch    = intel_cpufreq_fast_switch,
2910        .init           = intel_cpufreq_cpu_init,
2911        .exit           = intel_cpufreq_cpu_exit,
2912        .offline        = intel_cpufreq_cpu_offline,
2913        .online         = intel_pstate_cpu_online,
2914        .suspend        = intel_pstate_suspend,
2915        .resume         = intel_pstate_resume,
2916        .update_limits  = intel_pstate_update_limits,
2917        .name           = "intel_cpufreq",
2918};
2919
2920static struct cpufreq_driver *default_driver;
2921
2922static void intel_pstate_driver_cleanup(void)
2923{
2924        unsigned int cpu;
2925
2926        cpus_read_lock();
2927        for_each_online_cpu(cpu) {
2928                if (all_cpu_data[cpu]) {
2929                        if (intel_pstate_driver == &intel_pstate)
2930                                intel_pstate_clear_update_util_hook(cpu);
2931
2932                        kfree(all_cpu_data[cpu]);
2933                        all_cpu_data[cpu] = NULL;
2934                }
2935        }
2936        cpus_read_unlock();
2937
2938        intel_pstate_driver = NULL;
2939}
2940
2941static int intel_pstate_register_driver(struct cpufreq_driver *driver)
2942{
2943        int ret;
2944
2945        if (driver == &intel_pstate)
2946                intel_pstate_sysfs_expose_hwp_dynamic_boost();
2947
2948        memset(&global, 0, sizeof(global));
2949        global.max_perf_pct = 100;
2950
2951        intel_pstate_driver = driver;
2952        ret = cpufreq_register_driver(intel_pstate_driver);
2953        if (ret) {
2954                intel_pstate_driver_cleanup();
2955                return ret;
2956        }
2957
2958        global.min_perf_pct = min_perf_pct_min();
2959
2960        return 0;
2961}
2962
2963static ssize_t intel_pstate_show_status(char *buf)
2964{
2965        if (!intel_pstate_driver)
2966                return sprintf(buf, "off\n");
2967
2968        return sprintf(buf, "%s\n", intel_pstate_driver == &intel_pstate ?
2969                                        "active" : "passive");
2970}
2971
2972static int intel_pstate_update_status(const char *buf, size_t size)
2973{
2974        if (size == 3 && !strncmp(buf, "off", size)) {
2975                if (!intel_pstate_driver)
2976                        return -EINVAL;
2977
2978                if (hwp_active)
2979                        return -EBUSY;
2980
2981                cpufreq_unregister_driver(intel_pstate_driver);
2982                intel_pstate_driver_cleanup();
2983                return 0;
2984        }
2985
2986        if (size == 6 && !strncmp(buf, "active", size)) {
2987                if (intel_pstate_driver) {
2988                        if (intel_pstate_driver == &intel_pstate)
2989                                return 0;
2990
2991                        cpufreq_unregister_driver(intel_pstate_driver);
2992                }
2993
2994                return intel_pstate_register_driver(&intel_pstate);
2995        }
2996
2997        if (size == 7 && !strncmp(buf, "passive", size)) {
2998                if (intel_pstate_driver) {
2999                        if (intel_pstate_driver == &intel_cpufreq)
3000                                return 0;
3001
3002                        cpufreq_unregister_driver(intel_pstate_driver);
3003                        intel_pstate_sysfs_hide_hwp_dynamic_boost();
3004                }
3005
3006                return intel_pstate_register_driver(&intel_cpufreq);
3007        }
3008
3009        return -EINVAL;
3010}
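
/*
 * Usage note (illustrative, assuming the standard sysfs location): the
 * strings handled above correspond to writes such as
 *   # echo passive > /sys/devices/system/cpu/intel_pstate/status
 * which switch the driver between the "active" (intel_pstate) and
 * "passive" (intel_cpufreq) modes; "off" unregisters the driver but is
 * rejected with -EBUSY while HWP is active.
 */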
3011
3012static int no_load __initdata;
3013static int no_hwp __initdata;
3014static int hwp_only __initdata;
3015static unsigned int force_load __initdata;
3016
3017static int __init intel_pstate_msrs_not_valid(void)
3018{
3019        if (!pstate_funcs.get_max() ||
3020            !pstate_funcs.get_min() ||
3021            !pstate_funcs.get_turbo())
3022                return -ENODEV;
3023
3024        return 0;
3025}
3026
3027static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
3028{
3029        pstate_funcs.get_max   = funcs->get_max;
3030        pstate_funcs.get_max_physical = funcs->get_max_physical;
3031        pstate_funcs.get_min   = funcs->get_min;
3032        pstate_funcs.get_turbo = funcs->get_turbo;
3033        pstate_funcs.get_scaling = funcs->get_scaling;
3034        pstate_funcs.get_val   = funcs->get_val;
3035        pstate_funcs.get_vid   = funcs->get_vid;
3036        pstate_funcs.get_aperf_mperf_shift = funcs->get_aperf_mperf_shift;
3037}
3038
3039#ifdef CONFIG_ACPI
3040
3041static bool __init intel_pstate_no_acpi_pss(void)
3042{
3043        int i;
3044
3045        for_each_possible_cpu(i) {
3046                acpi_status status;
3047                union acpi_object *pss;
3048                struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
3049                struct acpi_processor *pr = per_cpu(processors, i);
3050
3051                if (!pr)
3052                        continue;
3053
3054                status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer);
3055                if (ACPI_FAILURE(status))
3056                        continue;
3057
3058                pss = buffer.pointer;
3059                if (pss && pss->type == ACPI_TYPE_PACKAGE) {
3060                        kfree(pss);
3061                        return false;
3062                }
3063
3064                kfree(pss);
3065        }
3066
3067        pr_debug("ACPI _PSS not found\n");
3068        return true;
3069}
3070
3071static bool __init intel_pstate_no_acpi_pcch(void)
3072{
3073        acpi_status status;
3074        acpi_handle handle;
3075
3076        status = acpi_get_handle(NULL, "\\_SB", &handle);
3077        if (ACPI_FAILURE(status))
3078                goto not_found;
3079
3080        if (acpi_has_method(handle, "PCCH"))
3081                return false;
3082
3083not_found:
3084        pr_debug("ACPI PCCH not found\n");
3085        return true;
3086}
3087
3088static bool __init intel_pstate_has_acpi_ppc(void)
3089{
3090        int i;
3091
3092        for_each_possible_cpu(i) {
3093                struct acpi_processor *pr = per_cpu(processors, i);
3094
3095                if (!pr)
3096                        continue;
3097                if (acpi_has_method(pr->handle, "_PPC"))
3098                        return true;
3099        }
3100        pr_debug("ACPI _PPC not found\n");
3101        return false;
3102}
3103
3104enum {
3105        PSS,
3106        PPC,
3107};
3108
3109/* Hardware vendor-specific info that has its own power management modes */
3110static struct acpi_platform_list plat_info[] __initdata = {
3111        {"HP    ", "ProLiant", 0, ACPI_SIG_FADT, all_versions, NULL, PSS},
3112        {"ORACLE", "X4-2    ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3113        {"ORACLE", "X4-2L   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3114        {"ORACLE", "X4-2B   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3115        {"ORACLE", "X3-2    ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3116        {"ORACLE", "X3-2L   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3117        {"ORACLE", "X3-2B   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3118        {"ORACLE", "X4470M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3119        {"ORACLE", "X4270M3 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3120        {"ORACLE", "X4270M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3121        {"ORACLE", "X4170M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3122        {"ORACLE", "X4170 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3123        {"ORACLE", "X4275 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3124        {"ORACLE", "X6-2    ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3125        {"ORACLE", "Sudbury ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
3126        { } /* End */
3127};
3128
3129#define BITMASK_OOB     (BIT(8) | BIT(18))
3130
3131static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
3132{
3133        const struct x86_cpu_id *id;
3134        u64 misc_pwr;
3135        int idx;
3136
3137        id = x86_match_cpu(intel_pstate_cpu_oob_ids);
3138        if (id) {
3139                rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr);
3140                if (misc_pwr & BITMASK_OOB) {
3141                        pr_debug("Bit 8 or 18 in the MISC_PWR_MGMT MSR set\n");
3142                        pr_debug("P states are controlled in Out of Band mode by the firmware/hardware\n");
3143                        return true;
3144                }
3145        }
3146
3147        idx = acpi_match_platform_list(plat_info);
3148        if (idx < 0)
3149                return false;
3150
3151        switch (plat_info[idx].data) {
3152        case PSS:
3153                if (!intel_pstate_no_acpi_pss())
3154                        return false;
3155
3156                return intel_pstate_no_acpi_pcch();
3157        case PPC:
3158                return intel_pstate_has_acpi_ppc() && !force_load;
3159        }
3160
3161        return false;
3162}
3163
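    /*
     * With "support_acpi_ppc" on the command line, ask the platform
     * firmware (SMM) to hand P-state control over to the OS.
     */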
3164static void intel_pstate_request_control_from_smm(void)
3165{
3166        /*
3167         * It may be unsafe to request P-state control from SMM if _PPC support
3168         * has not been enabled.
3169         */
3170        if (acpi_ppc)
3171                acpi_processor_pstate_control();
3172}
3173#else /* CONFIG_ACPI not enabled */
3174static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
3175static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
3176static inline void intel_pstate_request_control_from_smm(void) {}
3177#endif /* CONFIG_ACPI */
3178
3179#define INTEL_PSTATE_HWP_BROADWELL      0x01
3180
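    /*
     * Match CPUs advertising X86_FEATURE_HWP; hwp_mode is carried in
     * driver_data and later stored in hwp_mode_bdw.
     */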
3181#define X86_MATCH_HWP(model, hwp_mode)                                  \
3182        X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
3183                                           X86_FEATURE_HWP, hwp_mode)
3184
3185static const struct x86_cpu_id hwp_support_ids[] __initconst = {
3186        X86_MATCH_HWP(BROADWELL_X,      INTEL_PSTATE_HWP_BROADWELL),
3187        X86_MATCH_HWP(BROADWELL_D,      INTEL_PSTATE_HWP_BROADWELL),
3188        X86_MATCH_HWP(ANY,              0),
3189        {}
3190};
3191
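    /* Bit 0 of MSR_PM_ENABLE is set once HWP has been enabled. */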
3192static bool intel_pstate_hwp_is_enabled(void)
3193{
3194        u64 value;
3195
3196        rdmsrl(MSR_PM_ENABLE, value);
3197        return !!(value & 0x1);
3198}
3199
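    /*
     * Driver entry point: select the active (intel_pstate) or passive
     * (intel_cpufreq) driver based on HWP support and command line options,
     * bail out if the platform firmware controls P-states itself, and
     * register the selected cpufreq driver.
     */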
3200static int __init intel_pstate_init(void)
3201{
3202        const struct x86_cpu_id *id;
3203        int rc;
3204
3205        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
3206                return -ENODEV;
3207
3208        id = x86_match_cpu(hwp_support_ids);
3209        if (id) {
3210                bool hwp_forced = intel_pstate_hwp_is_enabled();
3211
3212                if (hwp_forced)
3213                        pr_info("HWP enabled by BIOS\n");
3214                else if (no_load)
3215                        return -ENODEV;
3216
3217                copy_cpu_funcs(&core_funcs);
3218                /*
3219                 * Avoid enabling HWP for processors without EPP support,
3220                 * because that means an incomplete HWP implementation, which
3221                 * is a corner case and generally problematic to support.
3222                 *
3223                 * If HWP is enabled already, though, there is no choice but to
3224                 * deal with it.
3225                 */
3226                if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) || hwp_forced) {
3227                        hwp_active++;
3228                        hwp_mode_bdw = id->driver_data;
3229                        intel_pstate.attr = hwp_cpufreq_attrs;
3230                        intel_cpufreq.attr = hwp_cpufreq_attrs;
3231                        intel_cpufreq.flags |= CPUFREQ_NEED_UPDATE_LIMITS;
3232                        intel_cpufreq.adjust_perf = intel_cpufreq_adjust_perf;
3233                        if (!default_driver)
3234                                default_driver = &intel_pstate;
3235
3236                        if (boot_cpu_has(X86_FEATURE_HYBRID_CPU))
3237                                intel_pstate_cppc_set_cpu_scaling();
3238
3239                        goto hwp_cpu_matched;
3240                }
3241                pr_info("HWP not enabled\n");
3242        } else {
3243                if (no_load)
3244                        return -ENODEV;
3245
3246                id = x86_match_cpu(intel_pstate_cpu_ids);
3247                if (!id) {
3248                        pr_info("CPU model not supported\n");
3249                        return -ENODEV;
3250                }
3251
3252                copy_cpu_funcs((struct pstate_funcs *)id->driver_data);
3253        }
3254
3255        if (intel_pstate_msrs_not_valid()) {
3256                pr_info("Invalid MSRs\n");
3257                return -ENODEV;
3258        }
3259        /* Without HWP, start in passive mode. */
3260        if (!default_driver)
3261                default_driver = &intel_cpufreq;
3262
3263hwp_cpu_matched:
3264        /*
3265         * The intel_pstate driver will be ignored if the platform
3266         * firmware has its own power management modes.
3267         */
3268        if (intel_pstate_platform_pwr_mgmt_exists()) {
3269                pr_info("P-states controlled by the platform\n");
3270                return -ENODEV;
3271        }
3272
3273        if (!hwp_active && hwp_only)
3274                return -ENOTSUPP;
3275
3276        pr_info("Intel P-state driver initializing\n");
3277
3278        all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
3279        if (!all_cpu_data)
3280                return -ENOMEM;
3281
3282        intel_pstate_request_control_from_smm();
3283
3284        intel_pstate_sysfs_expose_params();
3285
3286        mutex_lock(&intel_pstate_driver_lock);
3287        rc = intel_pstate_register_driver(default_driver);
3288        mutex_unlock(&intel_pstate_driver_lock);
3289        if (rc) {
3290                intel_pstate_sysfs_remove();
3291                return rc;
3292        }
3293
3294        if (hwp_active) {
3295                const struct x86_cpu_id *id;
3296
3297                id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids);
3298                if (id) {
3299                        set_power_ctl_ee_state(false);
3300                        pr_info("Disabling energy efficiency optimization\n");
3301                }
3302
3303                pr_info("HWP enabled\n");
3304        } else if (boot_cpu_has(X86_FEATURE_HYBRID_CPU)) {
3305                pr_warn("Problematic setup: Hybrid processor with disabled HWP\n");
3306        }
3307
3308        return 0;
3309}
3310device_initcall(intel_pstate_init);
3311
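    /*
     * Parse the "intel_pstate=" kernel command line argument, e.g.
     * "intel_pstate=passive".  Recognized values: "disable", "active",
     * "passive", "no_hwp", "force", "hwp_only", "per_cpu_perf_limits"
     * and (with CONFIG_ACPI) "support_acpi_ppc".
     */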
3312static int __init intel_pstate_setup(char *str)
3313{
3314        if (!str)
3315                return -EINVAL;
3316
3317        if (!strcmp(str, "disable"))
3318                no_load = 1;
3319        else if (!strcmp(str, "active"))
3320                default_driver = &intel_pstate;
3321        else if (!strcmp(str, "passive"))
3322                default_driver = &intel_cpufreq;
3323
3324        if (!strcmp(str, "no_hwp"))
3325                no_hwp = 1;
3326
3327        if (!strcmp(str, "force"))
3328                force_load = 1;
3329        if (!strcmp(str, "hwp_only"))
3330                hwp_only = 1;
3331        if (!strcmp(str, "per_cpu_perf_limits"))
3332                per_cpu_limits = true;
3333
3334#ifdef CONFIG_ACPI
3335        if (!strcmp(str, "support_acpi_ppc"))
3336                acpi_ppc = true;
3337#endif
3338
3339        return 0;
3340}
3341early_param("intel_pstate", intel_pstate_setup);
3342
3343MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>");
3344MODULE_DESCRIPTION("'intel_pstate' - P-state driver for Intel Core processors");
3345MODULE_LICENSE("GPL");
3346