linux/drivers/cpufreq/intel_pstate.c
   1/*
   2 * intel_pstate.c: Native P state management for Intel processors
   3 *
   4 * (C) Copyright 2012 Intel Corporation
   5 * Author: Dirk Brandewie <dirk.j.brandewie@intel.com>
   6 *
   7 * This program is free software; you can redistribute it and/or
   8 * modify it under the terms of the GNU General Public License
   9 * as published by the Free Software Foundation; version 2
  10 * of the License.
  11 */
  12
  13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  14
  15#include <linux/kernel.h>
  16#include <linux/kernel_stat.h>
  17#include <linux/module.h>
  18#include <linux/ktime.h>
  19#include <linux/hrtimer.h>
  20#include <linux/tick.h>
  21#include <linux/slab.h>
  22#include <linux/sched.h>
  23#include <linux/list.h>
  24#include <linux/cpu.h>
  25#include <linux/cpufreq.h>
  26#include <linux/sysfs.h>
  27#include <linux/types.h>
  28#include <linux/fs.h>
  29#include <linux/debugfs.h>
  30#include <linux/acpi.h>
  31#include <linux/vmalloc.h>
  32#include <trace/events/power.h>
  33
  34#include <asm/div64.h>
  35#include <asm/msr.h>
  36#include <asm/cpu_device_id.h>
  37#include <asm/cpufeature.h>
  38#include <asm/intel-family.h>
  39
  40#define ATOM_RATIOS             0x66a
  41#define ATOM_VIDS               0x66b
  42#define ATOM_TURBO_RATIOS       0x66c
  43#define ATOM_TURBO_VIDS         0x66d
  44
  45#ifdef CONFIG_ACPI
  46#include <acpi/processor.h>
  47#endif
  48
  49#define FRAC_BITS 8
  50#define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
  51#define fp_toint(X) ((X) >> FRAC_BITS)
  52
  53#define EXT_BITS 6
  54#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
  55
  56static inline int32_t mul_fp(int32_t x, int32_t y)
  57{
  58        return ((int64_t)x * (int64_t)y) >> FRAC_BITS;
  59}
  60
  61static inline int32_t div_fp(s64 x, s64 y)
  62{
  63        return div64_s64((int64_t)x << FRAC_BITS, y);
  64}
  65
  66static inline int ceiling_fp(int32_t x)
  67{
  68        int mask, ret;
  69
  70        ret = fp_toint(x);
  71        mask = (1 << FRAC_BITS) - 1;
  72        if (x & mask)
  73                ret += 1;
  74        return ret;
  75}
  76
  77static inline u64 mul_ext_fp(u64 x, u64 y)
  78{
  79        return (x * y) >> EXT_FRAC_BITS;
  80}
  81
  82static inline u64 div_ext_fp(u64 x, u64 y)
  83{
  84        return div64_u64(x << EXT_FRAC_BITS, y);
  85}
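
/*
 * A worked example of the fixed point helpers above, with illustrative
 * values only: FRAC_BITS is 8, so int_tofp(1) == 256 and
 *
 *   div_fp(75, 100)          == (75 << 8) / 100  == 192   (i.e. 0.75)
 *   mul_fp(192, int_tofp(2)) == (192 * 512) >> 8 == 384   (i.e. 1.50)
 *   fp_toint(384)            == 1
 *
 * The *_ext_fp() helpers use EXT_FRAC_BITS == 14 for extra precision,
 * e.g. div_ext_fp(x, x) == 1 << 14 == 16384 for any non-zero x.
 */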
  86
  87/**
  88 * struct sample -      Store performance sample
  89 * @core_avg_perf:      Ratio of APERF/MPERF which is the actual average
  90 *                      performance during last sample period
   91 * @busy_scaled:        Scaled busy value which is used to calculate the next
   92 *                      P state. This can be different from core_avg_perf
   93 *                      to account for CPU idle periods
  94 * @aperf:              Difference of actual performance frequency clock count
  95 *                      read from APERF MSR between last and current sample
  96 * @mperf:              Difference of maximum performance frequency clock count
  97 *                      read from MPERF MSR between last and current sample
  98 * @tsc:                Difference of time stamp counter between last and
  99 *                      current sample
 100 * @time:               Current time from scheduler
 101 *
 102 * This structure is used in the cpudata structure to store performance sample
 103 * data for choosing next P State.
 104 */
 105struct sample {
 106        int32_t core_avg_perf;
 107        int32_t busy_scaled;
 108        u64 aperf;
 109        u64 mperf;
 110        u64 tsc;
 111        u64 time;
 112};
 113
 114/**
 115 * struct pstate_data - Store P state data
 116 * @current_pstate:     Current requested P state
 117 * @min_pstate:         Min P state possible for this platform
 118 * @max_pstate:         Max P state possible for this platform
  119 * @max_pstate_physical: Physical max P state for a processor; this can be
  120 *                      higher than max_pstate, which can be limited by
  121 *                      platform thermal design power limits
  122 * @scaling:            Scaling factor to convert a P state ratio to
  123 *                      cpufreq frequency units (kHz)
 124 * @turbo_pstate:       Max Turbo P state possible for this platform
 125 *
 126 * Stores the per cpu model P state limits and current P state.
 127 */
 128struct pstate_data {
 129        int     current_pstate;
 130        int     min_pstate;
 131        int     max_pstate;
 132        int     max_pstate_physical;
 133        int     scaling;
 134        int     turbo_pstate;
 135};
 136
 137/**
 138 * struct vid_data -    Stores voltage information data
 139 * @min:                VID data for this platform corresponding to
 140 *                      the lowest P state
 141 * @max:                VID data corresponding to the highest P State.
 142 * @turbo:              VID data for turbo P state
 143 * @ratio:              Ratio of (vid max - vid min) /
 144 *                      (max P state - Min P State)
 145 *
 146 * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling)
  147 * This data is used on Atom platforms, where, in addition to the target P state,
  148 * the voltage data needs to be specified to select the next P state.
 149 */
 150struct vid_data {
 151        int min;
 152        int max;
 153        int turbo;
 154        int32_t ratio;
 155};
 156
 157/**
 158 * struct _pid -        Stores PID data
 159 * @setpoint:           Target set point for busyness or performance
 160 * @integral:           Storage for accumulated error values
 161 * @p_gain:             PID proportional gain
 162 * @i_gain:             PID integral gain
 163 * @d_gain:             PID derivative gain
 164 * @deadband:           PID deadband
 165 * @last_err:           Last error storage for integral part of PID calculation
 166 *
 167 * Stores PID coefficients and last error for PID controller.
 168 */
 169struct _pid {
 170        int setpoint;
 171        int32_t integral;
 172        int32_t p_gain;
 173        int32_t i_gain;
 174        int32_t d_gain;
 175        int deadband;
 176        int32_t last_err;
 177};
 178
 179/**
 180 * struct cpudata -     Per CPU instance data storage
 181 * @cpu:                CPU number for this instance data
 182 * @update_util:        CPUFreq utility callback information
 183 * @update_util_set:    CPUFreq utility callback is set
 184 * @pstate:             Stores P state limits for this CPU
 185 * @vid:                Stores VID limits for this CPU
 186 * @pid:                Stores PID parameters for this CPU
 187 * @last_sample_time:   Last Sample time
 188 * @prev_aperf:         Last APERF value read from APERF MSR
 189 * @prev_mperf:         Last MPERF value read from MPERF MSR
 190 * @prev_tsc:           Last timestamp counter (TSC) value
  191 * @prev_cummulative_iowait: Cumulative IO wait time at the last sample, used
  192 *                      to compute the IO wait delta for the current sample
 193 * @sample:             Storage for storing last Sample data
 194 * @acpi_perf_data:     Stores ACPI perf information read from _PSS
 195 * @valid_pss_table:    Set to true for valid ACPI _PSS entries found
 196 *
 197 * This structure stores per CPU instance data for all CPUs.
 198 */
 199struct cpudata {
 200        int cpu;
 201
 202        struct update_util_data update_util;
 203        bool   update_util_set;
 204
 205        struct pstate_data pstate;
 206        struct vid_data vid;
 207        struct _pid pid;
 208
 209        u64     last_sample_time;
 210        u64     prev_aperf;
 211        u64     prev_mperf;
 212        u64     prev_tsc;
 213        u64     prev_cummulative_iowait;
 214        struct sample sample;
 215#ifdef CONFIG_ACPI
 216        struct acpi_processor_performance acpi_perf_data;
 217        bool valid_pss_table;
 218#endif
 219};
 220
 221static struct cpudata **all_cpu_data;
 222
 223/**
  224 * struct pstate_adjust_policy - Stores static PID configuration data
 225 * @sample_rate_ms:     PID calculation sample rate in ms
  226 * @sample_rate_ns:     Sample rate in ns, derived from @sample_rate_ms
 227 * @deadband:           PID deadband
 228 * @setpoint:           PID Setpoint
 229 * @p_gain_pct:         PID proportional gain
 230 * @i_gain_pct:         PID integral gain
 231 * @d_gain_pct:         PID derivative gain
 232 *
 233 * Stores per CPU model static PID configuration data.
 234 */
 235struct pstate_adjust_policy {
 236        int sample_rate_ms;
 237        s64 sample_rate_ns;
 238        int deadband;
 239        int setpoint;
 240        int p_gain_pct;
 241        int d_gain_pct;
 242        int i_gain_pct;
 243};
 244
 245/**
 246 * struct pstate_funcs - Per CPU model specific callbacks
 247 * @get_max:            Callback to get maximum non turbo effective P state
 248 * @get_max_physical:   Callback to get maximum non turbo physical P state
 249 * @get_min:            Callback to get minimum P state
 250 * @get_turbo:          Callback to get turbo P state
 251 * @get_scaling:        Callback to get frequency scaling factor
 252 * @get_val:            Callback to convert P state to actual MSR write value
 253 * @get_vid:            Callback to get VID data for Atom platforms
 254 * @get_target_pstate:  Callback to a function to calculate next P state to use
 255 *
  256 * Core and Atom CPU models have different ways to get P state limits. This
 257 * structure is used to store those callbacks.
 258 */
 259struct pstate_funcs {
 260        int (*get_max)(void);
 261        int (*get_max_physical)(void);
 262        int (*get_min)(void);
 263        int (*get_turbo)(void);
 264        int (*get_scaling)(void);
 265        u64 (*get_val)(struct cpudata*, int pstate);
 266        void (*get_vid)(struct cpudata *);
 267        int32_t (*get_target_pstate)(struct cpudata *);
 268};
 269
 270/**
  271 * struct cpu_defaults - Per CPU model default config data
  272 * @pid_policy:         PID config data
 273 * @funcs:              Callback function data
 274 */
 275struct cpu_defaults {
 276        struct pstate_adjust_policy pid_policy;
 277        struct pstate_funcs funcs;
 278};
 279
 280static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu);
 281static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu);
 282
 283static struct pstate_adjust_policy pid_params __read_mostly;
 284static struct pstate_funcs pstate_funcs __read_mostly;
 285static int hwp_active __read_mostly;
 286
 287#ifdef CONFIG_ACPI
 288static bool acpi_ppc;
 289#endif
 290
 291/**
 292 * struct perf_limits - Store user and policy limits
 293 * @no_turbo:           User requested turbo state from intel_pstate sysfs
  294 * @turbo_disabled:     Platform turbo status, either from the
  295 *                      MSR_IA32_MISC_ENABLE MSR or because the maximum
  296 *                      available pstate matches the maximum turbo pstate
  297 * @max_perf_pct:       Effective maximum performance limit in percentage;
  298 *                      the minimum of the limit enforced by the cpufreq
  299 *                      policy and the user limit set via intel_pstate sysfs
  300 * @min_perf_pct:       Effective minimum performance limit in percentage;
  301 *                      the maximum of the limit enforced by the cpufreq
  302 *                      policy and the user limit set via intel_pstate sysfs
  303 * @max_perf:           Fixed point (FRAC_BITS) representation of
  304 *                      max_perf_pct / 100, used to limit the max pstate
  305 * @min_perf:           Fixed point (FRAC_BITS) representation of
  306 *                      min_perf_pct / 100, used to limit the min pstate
 307 * @max_policy_pct:     The maximum performance in percentage enforced by
 308 *                      cpufreq setpolicy interface
 309 * @max_sysfs_pct:      The maximum performance in percentage enforced by
 310 *                      intel pstate sysfs interface
 311 * @min_policy_pct:     The minimum performance in percentage enforced by
 312 *                      cpufreq setpolicy interface
 313 * @min_sysfs_pct:      The minimum performance in percentage enforced by
 314 *                      intel pstate sysfs interface
 315 *
 316 * Storage for user and policy defined limits.
 317 */
 318struct perf_limits {
 319        int no_turbo;
 320        int turbo_disabled;
 321        int max_perf_pct;
 322        int min_perf_pct;
 323        int32_t max_perf;
 324        int32_t min_perf;
 325        int max_policy_pct;
 326        int max_sysfs_pct;
 327        int min_policy_pct;
 328        int min_sysfs_pct;
 329};
 330
 331static struct perf_limits performance_limits = {
 332        .no_turbo = 0,
 333        .turbo_disabled = 0,
 334        .max_perf_pct = 100,
 335        .max_perf = int_tofp(1),
 336        .min_perf_pct = 100,
 337        .min_perf = int_tofp(1),
 338        .max_policy_pct = 100,
 339        .max_sysfs_pct = 100,
 340        .min_policy_pct = 0,
 341        .min_sysfs_pct = 0,
 342};
 343
 344static struct perf_limits powersave_limits = {
 345        .no_turbo = 0,
 346        .turbo_disabled = 0,
 347        .max_perf_pct = 100,
 348        .max_perf = int_tofp(1),
 349        .min_perf_pct = 0,
 350        .min_perf = 0,
 351        .max_policy_pct = 100,
 352        .max_sysfs_pct = 100,
 353        .min_policy_pct = 0,
 354        .min_sysfs_pct = 0,
 355};
 356
 357#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
 358static struct perf_limits *limits = &performance_limits;
 359#else
 360static struct perf_limits *limits = &powersave_limits;
 361#endif
 362
 363#ifdef CONFIG_ACPI
 364
 365static bool intel_pstate_get_ppc_enable_status(void)
 366{
 367        if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER ||
 368            acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER)
 369                return true;
 370
 371        return acpi_ppc;
 372}
 373
 374static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 375{
 376        struct cpudata *cpu;
 377        int ret;
 378        int i;
 379
 380        if (hwp_active)
 381                return;
 382
 383        if (!intel_pstate_get_ppc_enable_status())
 384                return;
 385
 386        cpu = all_cpu_data[policy->cpu];
 387
 388        ret = acpi_processor_register_performance(&cpu->acpi_perf_data,
 389                                                  policy->cpu);
 390        if (ret)
 391                return;
 392
 393        /*
 394         * Check if the control value in _PSS is for PERF_CTL MSR, which should
 395         * guarantee that the states returned by it map to the states in our
 396         * list directly.
 397         */
 398        if (cpu->acpi_perf_data.control_register.space_id !=
 399                                                ACPI_ADR_SPACE_FIXED_HARDWARE)
 400                goto err;
 401
 402        /*
 403         * If there is only one entry _PSS, simply ignore _PSS and continue as
 404         * usual without taking _PSS into account
 405         */
 406        if (cpu->acpi_perf_data.state_count < 2)
 407                goto err;
 408
 409        pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu);
 410        for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
 411                pr_debug("     %cP%d: %u MHz, %u mW, 0x%x\n",
 412                         (i == cpu->acpi_perf_data.state ? '*' : ' '), i,
 413                         (u32) cpu->acpi_perf_data.states[i].core_frequency,
 414                         (u32) cpu->acpi_perf_data.states[i].power,
 415                         (u32) cpu->acpi_perf_data.states[i].control);
 416        }
 417
  418        /*
  419         * The _PSS table doesn't contain the whole turbo frequency range.
  420         * It only lists +1 MHz above the max non-turbo frequency, with a
  421         * control value corresponding to the max turbo ratio. But when
  422         * cpufreq set_policy is called, it is called with this max
  423         * frequency, which reduces performance because this driver uses
  424         * the real max turbo frequency as the maximum frequency. So
  425         * correct this frequency in the _PSS table to the real max turbo
  426         * frequency based on the turbo state.
  427         * Also convert to MHz, as _PSS frequencies are in MHz.
  428         */
 429        if (!limits->turbo_disabled)
 430                cpu->acpi_perf_data.states[0].core_frequency =
 431                                        policy->cpuinfo.max_freq / 1000;
 432        cpu->valid_pss_table = true;
 433        pr_debug("_PPC limits will be enforced\n");
 434
 435        return;
 436
 437 err:
 438        cpu->valid_pss_table = false;
 439        acpi_processor_unregister_performance(policy->cpu);
 440}
 441
 442static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 443{
 444        struct cpudata *cpu;
 445
 446        cpu = all_cpu_data[policy->cpu];
 447        if (!cpu->valid_pss_table)
 448                return;
 449
 450        acpi_processor_unregister_performance(policy->cpu);
 451}
 452
 453#else
 454static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 455{
 456}
 457
 458static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 459{
 460}
 461#endif
 462
 463static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
 464                             int deadband, int integral) {
 465        pid->setpoint = int_tofp(setpoint);
 466        pid->deadband  = int_tofp(deadband);
 467        pid->integral  = int_tofp(integral);
 468        pid->last_err  = int_tofp(setpoint) - int_tofp(busy);
 469}
 470
 471static inline void pid_p_gain_set(struct _pid *pid, int percent)
 472{
 473        pid->p_gain = div_fp(percent, 100);
 474}
 475
 476static inline void pid_i_gain_set(struct _pid *pid, int percent)
 477{
 478        pid->i_gain = div_fp(percent, 100);
 479}
 480
 481static inline void pid_d_gain_set(struct _pid *pid, int percent)
 482{
 483        pid->d_gain = div_fp(percent, 100);
 484}
 485
 486static signed int pid_calc(struct _pid *pid, int32_t busy)
 487{
 488        signed int result;
 489        int32_t pterm, dterm, fp_error;
 490        int32_t integral_limit;
 491
 492        fp_error = pid->setpoint - busy;
 493
 494        if (abs(fp_error) <= pid->deadband)
 495                return 0;
 496
 497        pterm = mul_fp(pid->p_gain, fp_error);
 498
 499        pid->integral += fp_error;
 500
 501        /*
 502         * We limit the integral here so that it will never
 503         * get higher than 30.  This prevents it from becoming
 504         * too large an input over long periods of time and allows
 505         * it to get factored out sooner.
 506         *
 507         * The value of 30 was chosen through experimentation.
 508         */
 509        integral_limit = int_tofp(30);
 510        if (pid->integral > integral_limit)
 511                pid->integral = integral_limit;
 512        if (pid->integral < -integral_limit)
 513                pid->integral = -integral_limit;
 514
 515        dterm = mul_fp(pid->d_gain, fp_error - pid->last_err);
 516        pid->last_err = fp_error;
 517
 518        result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm;
 519        result = result + (1 << (FRAC_BITS-1));
 520        return (signed int)fp_toint(result);
 521}
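
/*
 * A worked example of the PID step above, using the "core" tuning defined
 * later in this file (setpoint = 97, p_gain_pct = 20, i_gain_pct = 0,
 * d_gain_pct = 0, deadband = 0) and a hypothetical busy value of
 * int_tofp(90):
 *
 *   fp_error = int_tofp(97) - int_tofp(90)   = 1792
 *   pterm    = mul_fp(div_fp(20, 100), 1792) = (51 * 1792) >> 8 = 357
 *   i and d terms are 0, so result = 357 + 128 (rounding) = 485
 *   fp_toint(485) = 1
 *
 * i.e. with the busy value 7% below the setpoint, pid_calc() tells the
 * caller to step the requested P state down by one.
 */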
 522
 523static inline void intel_pstate_busy_pid_reset(struct cpudata *cpu)
 524{
 525        pid_p_gain_set(&cpu->pid, pid_params.p_gain_pct);
 526        pid_d_gain_set(&cpu->pid, pid_params.d_gain_pct);
 527        pid_i_gain_set(&cpu->pid, pid_params.i_gain_pct);
 528
 529        pid_reset(&cpu->pid, pid_params.setpoint, 100, pid_params.deadband, 0);
 530}
 531
 532static inline void intel_pstate_reset_all_pid(void)
 533{
 534        unsigned int cpu;
 535
 536        for_each_online_cpu(cpu) {
 537                if (all_cpu_data[cpu])
 538                        intel_pstate_busy_pid_reset(all_cpu_data[cpu]);
 539        }
 540}
 541
 542static inline void update_turbo_state(void)
 543{
 544        u64 misc_en;
 545        struct cpudata *cpu;
 546
 547        cpu = all_cpu_data[0];
 548        rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
 549        limits->turbo_disabled =
 550                (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
 551                 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
 552}
 553
 554static void intel_pstate_hwp_set(const struct cpumask *cpumask)
 555{
 556        int min, hw_min, max, hw_max, cpu, range, adj_range;
 557        u64 value, cap;
 558
 559        rdmsrl(MSR_HWP_CAPABILITIES, cap);
 560        hw_min = HWP_LOWEST_PERF(cap);
 561        hw_max = HWP_HIGHEST_PERF(cap);
 562        range = hw_max - hw_min;
 563
 564        for_each_cpu(cpu, cpumask) {
 565                rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
 566                adj_range = limits->min_perf_pct * range / 100;
 567                min = hw_min + adj_range;
 568                value &= ~HWP_MIN_PERF(~0L);
 569                value |= HWP_MIN_PERF(min);
 570
 571                adj_range = limits->max_perf_pct * range / 100;
 572                max = hw_min + adj_range;
 573                if (limits->no_turbo) {
 574                        hw_max = HWP_GUARANTEED_PERF(cap);
 575                        if (hw_max < max)
 576                                max = hw_max;
 577                }
 578
 579                value &= ~HWP_MAX_PERF(~0L);
 580                value |= HWP_MAX_PERF(max);
 581                wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
 582        }
 583}
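
/*
 * Example of the percent-to-HWP mapping above, with hypothetical
 * HWP_CAPABILITIES values: hw_min = 8 and hw_max = 36, so range = 28.
 * With limits->min_perf_pct = 25 and limits->max_perf_pct = 100:
 *
 *   min = 8 + (25 * 28 / 100)  = 8 + 7  = 15
 *   max = 8 + (100 * 28 / 100) = 8 + 28 = 36
 *
 * and HWP_REQUEST is rewritten with those MIN_PERF/MAX_PERF fields.  If
 * no_turbo is set, max is additionally capped at HWP_GUARANTEED_PERF(cap).
 */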
 584
 585static int intel_pstate_hwp_set_policy(struct cpufreq_policy *policy)
 586{
 587        if (hwp_active)
 588                intel_pstate_hwp_set(policy->cpus);
 589
 590        return 0;
 591}
 592
 593static void intel_pstate_hwp_set_online_cpus(void)
 594{
 595        get_online_cpus();
 596        intel_pstate_hwp_set(cpu_online_mask);
 597        put_online_cpus();
 598}
 599
 600/************************** debugfs begin ************************/
 601static int pid_param_set(void *data, u64 val)
 602{
 603        *(u32 *)data = val;
 604        intel_pstate_reset_all_pid();
 605        return 0;
 606}
 607
 608static int pid_param_get(void *data, u64 *val)
 609{
 610        *val = *(u32 *)data;
 611        return 0;
 612}
 613DEFINE_SIMPLE_ATTRIBUTE(fops_pid_param, pid_param_get, pid_param_set, "%llu\n");
 614
 615struct pid_param {
 616        char *name;
 617        void *value;
 618};
 619
 620static struct pid_param pid_files[] = {
 621        {"sample_rate_ms", &pid_params.sample_rate_ms},
 622        {"d_gain_pct", &pid_params.d_gain_pct},
 623        {"i_gain_pct", &pid_params.i_gain_pct},
 624        {"deadband", &pid_params.deadband},
 625        {"setpoint", &pid_params.setpoint},
 626        {"p_gain_pct", &pid_params.p_gain_pct},
 627        {NULL, NULL}
 628};
 629
 630static void __init intel_pstate_debug_expose_params(void)
 631{
 632        struct dentry *debugfs_parent;
 633        int i = 0;
 634
 635        if (hwp_active)
 636                return;
 637        debugfs_parent = debugfs_create_dir("pstate_snb", NULL);
 638        if (IS_ERR_OR_NULL(debugfs_parent))
 639                return;
 640        while (pid_files[i].name) {
 641                debugfs_create_file(pid_files[i].name, 0660,
 642                                    debugfs_parent, pid_files[i].value,
 643                                    &fops_pid_param);
 644                i++;
 645        }
 646}
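
/*
 * When HWP is not active, the PID tunables above are exposed read/write
 * through debugfs (typically mounted at /sys/kernel/debug), e.g.:
 *
 *   /sys/kernel/debug/pstate_snb/sample_rate_ms
 *   /sys/kernel/debug/pstate_snb/setpoint
 *   /sys/kernel/debug/pstate_snb/p_gain_pct
 *
 * Writing any of these calls pid_param_set(), which also resets the PID
 * state on all online CPUs via intel_pstate_reset_all_pid().
 */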
 647
 648/************************** debugfs end ************************/
 649
 650/************************** sysfs begin ************************/
 651#define show_one(file_name, object)                                     \
 652        static ssize_t show_##file_name                                 \
 653        (struct kobject *kobj, struct attribute *attr, char *buf)       \
 654        {                                                               \
 655                return sprintf(buf, "%u\n", limits->object);            \
 656        }
 657
 658static ssize_t show_turbo_pct(struct kobject *kobj,
 659                                struct attribute *attr, char *buf)
 660{
 661        struct cpudata *cpu;
 662        int total, no_turbo, turbo_pct;
 663        uint32_t turbo_fp;
 664
 665        cpu = all_cpu_data[0];
 666
 667        total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
 668        no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
 669        turbo_fp = div_fp(no_turbo, total);
 670        turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100)));
 671        return sprintf(buf, "%u\n", turbo_pct);
 672}
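
/*
 * A worked example for show_turbo_pct() with hypothetical P state limits
 * min_pstate = 8, max_pstate = 19 and turbo_pstate = 23:
 *
 *   total     = 23 - 8 + 1 = 16
 *   no_turbo  = 19 - 8 + 1 = 12
 *   turbo_fp  = div_fp(12, 16) = 192                       (i.e. 0.75)
 *   turbo_pct = 100 - fp_toint(mul_fp(192, int_tofp(100))) = 100 - 75 = 25
 *
 * i.e. 25% of the supported P states are turbo P states.
 */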
 673
 674static ssize_t show_num_pstates(struct kobject *kobj,
 675                                struct attribute *attr, char *buf)
 676{
 677        struct cpudata *cpu;
 678        int total;
 679
 680        cpu = all_cpu_data[0];
 681        total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
 682        return sprintf(buf, "%u\n", total);
 683}
 684
 685static ssize_t show_no_turbo(struct kobject *kobj,
 686                             struct attribute *attr, char *buf)
 687{
 688        ssize_t ret;
 689
 690        update_turbo_state();
 691        if (limits->turbo_disabled)
 692                ret = sprintf(buf, "%u\n", limits->turbo_disabled);
 693        else
 694                ret = sprintf(buf, "%u\n", limits->no_turbo);
 695
 696        return ret;
 697}
 698
 699static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
 700                              const char *buf, size_t count)
 701{
 702        unsigned int input;
 703        int ret;
 704
 705        ret = sscanf(buf, "%u", &input);
 706        if (ret != 1)
 707                return -EINVAL;
 708
 709        update_turbo_state();
 710        if (limits->turbo_disabled) {
 711                pr_warn("Turbo disabled by BIOS or unavailable on processor\n");
 712                return -EPERM;
 713        }
 714
 715        limits->no_turbo = clamp_t(int, input, 0, 1);
 716
 717        if (hwp_active)
 718                intel_pstate_hwp_set_online_cpus();
 719
 720        return count;
 721}
 722
 723static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
 724                                  const char *buf, size_t count)
 725{
 726        unsigned int input;
 727        int ret;
 728
 729        ret = sscanf(buf, "%u", &input);
 730        if (ret != 1)
 731                return -EINVAL;
 732
  733        limits->max_sysfs_pct = clamp_t(int, input, 0, 100);
 734        limits->max_perf_pct = min(limits->max_policy_pct,
 735                                   limits->max_sysfs_pct);
 736        limits->max_perf_pct = max(limits->min_policy_pct,
 737                                   limits->max_perf_pct);
 738        limits->max_perf_pct = max(limits->min_perf_pct,
 739                                   limits->max_perf_pct);
 740        limits->max_perf = div_fp(limits->max_perf_pct, 100);
 741
 742        if (hwp_active)
 743                intel_pstate_hwp_set_online_cpus();
 744        return count;
 745}
 746
 747static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
 748                                  const char *buf, size_t count)
 749{
 750        unsigned int input;
 751        int ret;
 752
 753        ret = sscanf(buf, "%u", &input);
 754        if (ret != 1)
 755                return -EINVAL;
 756
  757        limits->min_sysfs_pct = clamp_t(int, input, 0, 100);
 758        limits->min_perf_pct = max(limits->min_policy_pct,
 759                                   limits->min_sysfs_pct);
 760        limits->min_perf_pct = min(limits->max_policy_pct,
 761                                   limits->min_perf_pct);
 762        limits->min_perf_pct = min(limits->max_perf_pct,
 763                                   limits->min_perf_pct);
 764        limits->min_perf = div_fp(limits->min_perf_pct, 100);
 765
 766        if (hwp_active)
 767                intel_pstate_hwp_set_online_cpus();
 768        return count;
 769}
 770
 771show_one(max_perf_pct, max_perf_pct);
 772show_one(min_perf_pct, min_perf_pct);
 773
 774define_one_global_rw(no_turbo);
 775define_one_global_rw(max_perf_pct);
 776define_one_global_rw(min_perf_pct);
 777define_one_global_ro(turbo_pct);
 778define_one_global_ro(num_pstates);
 779
 780static struct attribute *intel_pstate_attributes[] = {
 781        &no_turbo.attr,
 782        &max_perf_pct.attr,
 783        &min_perf_pct.attr,
 784        &turbo_pct.attr,
 785        &num_pstates.attr,
 786        NULL
 787};
 788
 789static struct attribute_group intel_pstate_attr_group = {
 790        .attrs = intel_pstate_attributes,
 791};
 792
 793static void __init intel_pstate_sysfs_expose_params(void)
 794{
 795        struct kobject *intel_pstate_kobject;
 796        int rc;
 797
 798        intel_pstate_kobject = kobject_create_and_add("intel_pstate",
 799                                                &cpu_subsys.dev_root->kobj);
 800        BUG_ON(!intel_pstate_kobject);
 801        rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group);
 802        BUG_ON(rc);
 803}
 804/************************** sysfs end ************************/
 805
 806static void intel_pstate_hwp_enable(struct cpudata *cpudata)
 807{
  808        /* First disable HWP notification interrupts as we don't process them */
 809        if (static_cpu_has(X86_FEATURE_HWP_NOTIFY))
 810                wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
 811
 812        wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
 813}
 814
 815static int atom_get_min_pstate(void)
 816{
 817        u64 value;
 818
 819        rdmsrl(ATOM_RATIOS, value);
 820        return (value >> 8) & 0x7F;
 821}
 822
 823static int atom_get_max_pstate(void)
 824{
 825        u64 value;
 826
 827        rdmsrl(ATOM_RATIOS, value);
 828        return (value >> 16) & 0x7F;
 829}
 830
 831static int atom_get_turbo_pstate(void)
 832{
 833        u64 value;
 834
 835        rdmsrl(ATOM_TURBO_RATIOS, value);
 836        return value & 0x7F;
 837}
 838
 839static u64 atom_get_val(struct cpudata *cpudata, int pstate)
 840{
 841        u64 val;
 842        int32_t vid_fp;
 843        u32 vid;
 844
 845        val = (u64)pstate << 8;
 846        if (limits->no_turbo && !limits->turbo_disabled)
 847                val |= (u64)1 << 32;
 848
 849        vid_fp = cpudata->vid.min + mul_fp(
 850                int_tofp(pstate - cpudata->pstate.min_pstate),
 851                cpudata->vid.ratio);
 852
 853        vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max);
 854        vid = ceiling_fp(vid_fp);
 855
 856        if (pstate > cpudata->pstate.max_pstate)
 857                vid = cpudata->vid.turbo;
 858
 859        return val | vid;
 860}
 861
 862static int silvermont_get_scaling(void)
 863{
 864        u64 value;
 865        int i;
 866        /* Defined in Table 35-6 from SDM (Sept 2015) */
 867        static int silvermont_freq_table[] = {
 868                83300, 100000, 133300, 116700, 80000};
 869
 870        rdmsrl(MSR_FSB_FREQ, value);
 871        i = value & 0x7;
 872        WARN_ON(i > 4);
 873
 874        return silvermont_freq_table[i];
 875}
 876
 877static int airmont_get_scaling(void)
 878{
 879        u64 value;
 880        int i;
 881        /* Defined in Table 35-10 from SDM (Sept 2015) */
 882        static int airmont_freq_table[] = {
 883                83300, 100000, 133300, 116700, 80000,
 884                93300, 90000, 88900, 87500};
 885
 886        rdmsrl(MSR_FSB_FREQ, value);
 887        i = value & 0xF;
 888        WARN_ON(i > 8);
 889
 890        return airmont_freq_table[i];
 891}
 892
 893static void atom_get_vid(struct cpudata *cpudata)
 894{
 895        u64 value;
 896
 897        rdmsrl(ATOM_VIDS, value);
 898        cpudata->vid.min = int_tofp((value >> 8) & 0x7f);
 899        cpudata->vid.max = int_tofp((value >> 16) & 0x7f);
 900        cpudata->vid.ratio = div_fp(
 901                cpudata->vid.max - cpudata->vid.min,
 902                int_tofp(cpudata->pstate.max_pstate -
 903                        cpudata->pstate.min_pstate));
 904
 905        rdmsrl(ATOM_TURBO_VIDS, value);
 906        cpudata->vid.turbo = value & 0x7f;
 907}
 908
 909static int core_get_min_pstate(void)
 910{
 911        u64 value;
 912
 913        rdmsrl(MSR_PLATFORM_INFO, value);
 914        return (value >> 40) & 0xFF;
 915}
 916
 917static int core_get_max_pstate_physical(void)
 918{
 919        u64 value;
 920
 921        rdmsrl(MSR_PLATFORM_INFO, value);
 922        return (value >> 8) & 0xFF;
 923}
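
/*
 * Both helpers above decode MSR_PLATFORM_INFO: bits 15:8 hold the maximum
 * non-turbo ratio and bits 47:40 the maximum efficiency (lowest) ratio.
 * For example, with a hypothetical register value where bits 15:8 read
 * 0x18 and bits 47:40 read 0x08, max_pstate_physical = 24 and
 * min_pstate = 8, which with core_get_scaling() = 100000 correspond to
 * 2.4 GHz and 800 MHz respectively.
 */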
 924
 925static int core_get_max_pstate(void)
 926{
 927        u64 tar;
 928        u64 plat_info;
 929        int max_pstate;
 930        int err;
 931
 932        rdmsrl(MSR_PLATFORM_INFO, plat_info);
 933        max_pstate = (plat_info >> 8) & 0xFF;
 934
 935        err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar);
 936        if (!err) {
 937                /* Do some sanity checking for safety */
 938                if (plat_info & 0x600000000) {
 939                        u64 tdp_ctrl;
 940                        u64 tdp_ratio;
 941                        int tdp_msr;
 942
 943                        err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl);
 944                        if (err)
 945                                goto skip_tar;
 946
 947                        tdp_msr = MSR_CONFIG_TDP_NOMINAL + (tdp_ctrl & 0x3);
 948                        err = rdmsrl_safe(tdp_msr, &tdp_ratio);
 949                        if (err)
 950                                goto skip_tar;
 951
 952                        /* For level 1 and 2, bits[23:16] contain the ratio */
 953                        if (tdp_ctrl)
 954                                tdp_ratio >>= 16;
 955
 956                        tdp_ratio &= 0xff; /* ratios are only 8 bits long */
 957                        if (tdp_ratio - 1 == tar) {
 958                                max_pstate = tar;
 959                                pr_debug("max_pstate=TAC %x\n", max_pstate);
 960                        } else {
 961                                goto skip_tar;
 962                        }
 963                }
 964        }
 965
 966skip_tar:
 967        return max_pstate;
 968}
 969
 970static int core_get_turbo_pstate(void)
 971{
 972        u64 value;
 973        int nont, ret;
 974
 975        rdmsrl(MSR_TURBO_RATIO_LIMIT, value);
 976        nont = core_get_max_pstate();
 977        ret = (value) & 255;
 978        if (ret <= nont)
 979                ret = nont;
 980        return ret;
 981}
 982
 983static inline int core_get_scaling(void)
 984{
 985        return 100000;
 986}
 987
 988static u64 core_get_val(struct cpudata *cpudata, int pstate)
 989{
 990        u64 val;
 991
 992        val = (u64)pstate << 8;
 993        if (limits->no_turbo && !limits->turbo_disabled)
 994                val |= (u64)1 << 32;
 995
 996        return val;
 997}
 998
 999static int knl_get_turbo_pstate(void)
1000{
1001        u64 value;
1002        int nont, ret;
1003
1004        rdmsrl(MSR_TURBO_RATIO_LIMIT, value);
1005        nont = core_get_max_pstate();
1006        ret = (((value) >> 8) & 0xFF);
1007        if (ret <= nont)
1008                ret = nont;
1009        return ret;
1010}
1011
1012static struct cpu_defaults core_params = {
1013        .pid_policy = {
1014                .sample_rate_ms = 10,
1015                .deadband = 0,
1016                .setpoint = 97,
1017                .p_gain_pct = 20,
1018                .d_gain_pct = 0,
1019                .i_gain_pct = 0,
1020        },
1021        .funcs = {
1022                .get_max = core_get_max_pstate,
1023                .get_max_physical = core_get_max_pstate_physical,
1024                .get_min = core_get_min_pstate,
1025                .get_turbo = core_get_turbo_pstate,
1026                .get_scaling = core_get_scaling,
1027                .get_val = core_get_val,
1028                .get_target_pstate = get_target_pstate_use_performance,
1029        },
1030};
1031
1032static struct cpu_defaults silvermont_params = {
1033        .pid_policy = {
1034                .sample_rate_ms = 10,
1035                .deadband = 0,
1036                .setpoint = 60,
1037                .p_gain_pct = 14,
1038                .d_gain_pct = 0,
1039                .i_gain_pct = 4,
1040        },
1041        .funcs = {
1042                .get_max = atom_get_max_pstate,
1043                .get_max_physical = atom_get_max_pstate,
1044                .get_min = atom_get_min_pstate,
1045                .get_turbo = atom_get_turbo_pstate,
1046                .get_val = atom_get_val,
1047                .get_scaling = silvermont_get_scaling,
1048                .get_vid = atom_get_vid,
1049                .get_target_pstate = get_target_pstate_use_cpu_load,
1050        },
1051};
1052
1053static struct cpu_defaults airmont_params = {
1054        .pid_policy = {
1055                .sample_rate_ms = 10,
1056                .deadband = 0,
1057                .setpoint = 60,
1058                .p_gain_pct = 14,
1059                .d_gain_pct = 0,
1060                .i_gain_pct = 4,
1061        },
1062        .funcs = {
1063                .get_max = atom_get_max_pstate,
1064                .get_max_physical = atom_get_max_pstate,
1065                .get_min = atom_get_min_pstate,
1066                .get_turbo = atom_get_turbo_pstate,
1067                .get_val = atom_get_val,
1068                .get_scaling = airmont_get_scaling,
1069                .get_vid = atom_get_vid,
1070                .get_target_pstate = get_target_pstate_use_cpu_load,
1071        },
1072};
1073
1074static struct cpu_defaults knl_params = {
1075        .pid_policy = {
1076                .sample_rate_ms = 10,
1077                .deadband = 0,
1078                .setpoint = 97,
1079                .p_gain_pct = 20,
1080                .d_gain_pct = 0,
1081                .i_gain_pct = 0,
1082        },
1083        .funcs = {
1084                .get_max = core_get_max_pstate,
1085                .get_max_physical = core_get_max_pstate_physical,
1086                .get_min = core_get_min_pstate,
1087                .get_turbo = knl_get_turbo_pstate,
1088                .get_scaling = core_get_scaling,
1089                .get_val = core_get_val,
1090                .get_target_pstate = get_target_pstate_use_performance,
1091        },
1092};
1093
1094static struct cpu_defaults bxt_params = {
1095        .pid_policy = {
1096                .sample_rate_ms = 10,
1097                .deadband = 0,
1098                .setpoint = 60,
1099                .p_gain_pct = 14,
1100                .d_gain_pct = 0,
1101                .i_gain_pct = 4,
1102        },
1103        .funcs = {
1104                .get_max = core_get_max_pstate,
1105                .get_max_physical = core_get_max_pstate_physical,
1106                .get_min = core_get_min_pstate,
1107                .get_turbo = core_get_turbo_pstate,
1108                .get_scaling = core_get_scaling,
1109                .get_val = core_get_val,
1110                .get_target_pstate = get_target_pstate_use_cpu_load,
1111        },
1112};
1113
1114static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
1115{
1116        int max_perf = cpu->pstate.turbo_pstate;
1117        int max_perf_adj;
1118        int min_perf;
1119
1120        if (limits->no_turbo || limits->turbo_disabled)
1121                max_perf = cpu->pstate.max_pstate;
1122
1123        /*
1124         * performance can be limited by user through sysfs, by cpufreq
1125         * policy, or by cpu specific default values determined through
1126         * experimentation.
1127         */
1128        max_perf_adj = fp_toint(max_perf * limits->max_perf);
1129        *max = clamp_t(int, max_perf_adj,
1130                        cpu->pstate.min_pstate, cpu->pstate.turbo_pstate);
1131
1132        min_perf = fp_toint(max_perf * limits->min_perf);
1133        *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
1134}
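
/*
 * A worked example of the limit computation above, with hypothetical
 * values min_pstate = 8, turbo_pstate = 32, max_perf_pct = 75 and
 * min_perf_pct = 25 (so limits->max_perf = div_fp(75, 100) = 192 and
 * limits->min_perf = div_fp(25, 100) = 64):
 *
 *   max_perf_adj = fp_toint(32 * 192) = fp_toint(6144) = 24
 *   *max         = clamp(24, 8, 32) = 24
 *   min_perf     = fp_toint(32 * 64) = fp_toint(2048) = 8
 *   *min         = clamp(8, 8, 32) = 8
 */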
1135
1136static void intel_pstate_set_min_pstate(struct cpudata *cpu)
1137{
1138        int pstate = cpu->pstate.min_pstate;
1139
1140        trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
1141        cpu->pstate.current_pstate = pstate;
1142        /*
1143         * Generally, there is no guarantee that this code will always run on
1144         * the CPU being updated, so force the register update to run on the
1145         * right CPU.
1146         */
1147        wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
1148                      pstate_funcs.get_val(cpu, pstate));
1149}
1150
1151static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
1152{
1153        cpu->pstate.min_pstate = pstate_funcs.get_min();
1154        cpu->pstate.max_pstate = pstate_funcs.get_max();
1155        cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical();
1156        cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
1157        cpu->pstate.scaling = pstate_funcs.get_scaling();
1158
1159        if (pstate_funcs.get_vid)
1160                pstate_funcs.get_vid(cpu);
1161
1162        intel_pstate_set_min_pstate(cpu);
1163}
1164
1165static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu)
1166{
1167        struct sample *sample = &cpu->sample;
1168
1169        sample->core_avg_perf = div_ext_fp(sample->aperf, sample->mperf);
1170}
1171
1172static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
1173{
1174        u64 aperf, mperf;
1175        unsigned long flags;
1176        u64 tsc;
1177
1178        local_irq_save(flags);
1179        rdmsrl(MSR_IA32_APERF, aperf);
1180        rdmsrl(MSR_IA32_MPERF, mperf);
1181        tsc = rdtsc();
1182        if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) {
1183                local_irq_restore(flags);
1184                return false;
1185        }
1186        local_irq_restore(flags);
1187
1188        cpu->last_sample_time = cpu->sample.time;
1189        cpu->sample.time = time;
1190        cpu->sample.aperf = aperf;
1191        cpu->sample.mperf = mperf;
1192        cpu->sample.tsc =  tsc;
1193        cpu->sample.aperf -= cpu->prev_aperf;
1194        cpu->sample.mperf -= cpu->prev_mperf;
1195        cpu->sample.tsc -= cpu->prev_tsc;
1196
1197        cpu->prev_aperf = aperf;
1198        cpu->prev_mperf = mperf;
1199        cpu->prev_tsc = tsc;
1200        /*
1201         * First time this function is invoked in a given cycle, all of the
1202         * previous sample data fields are equal to zero or stale and they must
1203         * be populated with meaningful numbers for things to work, so assume
1204         * that sample.time will always be reset before setting the utilization
1205         * update hook and make the caller skip the sample then.
1206         */
1207        return !!cpu->last_sample_time;
1208}
1209
1210static inline int32_t get_avg_frequency(struct cpudata *cpu)
1211{
1212        return mul_ext_fp(cpu->sample.core_avg_perf,
1213                          cpu->pstate.max_pstate_physical * cpu->pstate.scaling);
1214}
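
/*
 * Example of the average performance/frequency derivation above, with
 * hypothetical counter deltas: if APERF advanced by 1,200,000 and MPERF
 * by 1,000,000 over the last sample period, then
 *
 *   core_avg_perf = div_ext_fp(1200000, 1000000)   (about 1.2)
 *
 * and with max_pstate_physical = 24 and scaling = 100000 (a 2.4 GHz
 * maximum non-turbo frequency), get_avg_frequency() returns about
 * 2,880,000 kHz, i.e. roughly 2.88 GHz of average delivered frequency.
 */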
1215
1216static inline int32_t get_avg_pstate(struct cpudata *cpu)
1217{
1218        return mul_ext_fp(cpu->pstate.max_pstate_physical,
1219                          cpu->sample.core_avg_perf);
1220}
1221
1222static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
1223{
1224        struct sample *sample = &cpu->sample;
1225        u64 cummulative_iowait, delta_iowait_us;
1226        u64 delta_iowait_mperf;
1227        u64 mperf, now;
1228        int32_t cpu_load;
1229
1230        cummulative_iowait = get_cpu_iowait_time_us(cpu->cpu, &now);
1231
1232        /*
1233         * Convert iowait time into number of IO cycles spent at max_freq.
 1234         * IO is considered busy only for the cpu_load algorithm. For
1235         * performance this is not needed since we always try to reach the
1236         * maximum P-State, so we are already boosting the IOs.
1237         */
1238        delta_iowait_us = cummulative_iowait - cpu->prev_cummulative_iowait;
1239        delta_iowait_mperf = div64_u64(delta_iowait_us * cpu->pstate.scaling *
1240                cpu->pstate.max_pstate, MSEC_PER_SEC);
1241
1242        mperf = cpu->sample.mperf + delta_iowait_mperf;
1243        cpu->prev_cummulative_iowait = cummulative_iowait;
1244
1245        /*
 1246         * The load can be estimated as the ratio of the mperf counter,
 1247         * which counts at a constant frequency only during active periods
 1248         * (C0), to the time stamp counter, which counts at the same
 1249         * frequency but also during C-states.
 1250         */
1251        cpu_load = div64_u64(int_tofp(100) * mperf, sample->tsc);
1252        cpu->sample.busy_scaled = cpu_load;
1253
1254        return get_avg_pstate(cpu) - pid_calc(&cpu->pid, cpu_load);
1255}
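
/*
 * A worked example of the load estimate above, with hypothetical deltas
 * and no iowait: if mperf advanced by 2,000,000 while the TSC advanced by
 * 10,000,000 over the sample period, then
 *
 *   cpu_load = int_tofp(100) * 2000000 / 10000000 = int_tofp(20)
 *
 * i.e. the CPU was in C0 about 20% of the time.  With the Atom tunings
 * (setpoint = 60) this is well below the setpoint, so pid_calc() returns
 * a positive value and the target drops below the recent average P state
 * returned by get_avg_pstate().
 */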
1256
1257static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
1258{
1259        int32_t perf_scaled, max_pstate, current_pstate, sample_ratio;
1260        u64 duration_ns;
1261
1262        /*
1263         * perf_scaled is the average performance during the last sampling
1264         * period scaled by the ratio of the maximum P-state to the P-state
1265         * requested last time (in percent).  That measures the system's
1266         * response to the previous P-state selection.
1267         */
1268        max_pstate = cpu->pstate.max_pstate_physical;
1269        current_pstate = cpu->pstate.current_pstate;
1270        perf_scaled = mul_ext_fp(cpu->sample.core_avg_perf,
1271                               div_fp(100 * max_pstate, current_pstate));
1272
1273        /*
1274         * Since our utilization update callback will not run unless we are
1275         * in C0, check if the actual elapsed time is significantly greater (3x)
1276         * than our sample interval.  If it is, then we were idle for a long
1277         * enough period of time to adjust our performance metric.
1278         */
1279        duration_ns = cpu->sample.time - cpu->last_sample_time;
1280        if ((s64)duration_ns > pid_params.sample_rate_ns * 3) {
1281                sample_ratio = div_fp(pid_params.sample_rate_ns, duration_ns);
1282                perf_scaled = mul_fp(perf_scaled, sample_ratio);
1283        } else {
1284                sample_ratio = div_fp(100 * cpu->sample.mperf, cpu->sample.tsc);
1285                if (sample_ratio < int_tofp(1))
1286                        perf_scaled = 0;
1287        }
1288
1289        cpu->sample.busy_scaled = perf_scaled;
1290        return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled);
1291}
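
/*
 * A worked example for perf_scaled above, with hypothetical values:
 * assume max_pstate_physical = 24, the last requested current_pstate = 16
 * and the CPU ran flat out at that P state, so core_avg_perf is about
 * 16/24 = 0.667.  Then
 *
 *   perf_scaled ~= 0.667 * (100 * 24 / 16) ~= 100
 *
 * which is above the 97 setpoint of the core tuning, so the PID nudges
 * the requested P state up by one.  If instead the elapsed time was more
 * than 3x the sample interval (a long idle stretch), perf_scaled is
 * scaled down by sample_ratio, pulling the next request toward lower
 * P states.
 */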
1292
1293static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
1294{
1295        int max_perf, min_perf;
1296
1297        update_turbo_state();
1298
1299        intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
1300        pstate = clamp_t(int, pstate, min_perf, max_perf);
1301        trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
1302        if (pstate == cpu->pstate.current_pstate)
1303                return;
1304
1305        cpu->pstate.current_pstate = pstate;
1306        wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate));
1307}
1308
1309static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
1310{
1311        int from, target_pstate;
1312        struct sample *sample;
1313
1314        from = cpu->pstate.current_pstate;
1315
1316        target_pstate = pstate_funcs.get_target_pstate(cpu);
1317
1318        intel_pstate_update_pstate(cpu, target_pstate);
1319
1320        sample = &cpu->sample;
1321        trace_pstate_sample(mul_ext_fp(100, sample->core_avg_perf),
1322                fp_toint(sample->busy_scaled),
1323                from,
1324                cpu->pstate.current_pstate,
1325                sample->mperf,
1326                sample->aperf,
1327                sample->tsc,
1328                get_avg_frequency(cpu));
1329}
1330
1331static void intel_pstate_update_util(struct update_util_data *data, u64 time,
1332                                     unsigned long util, unsigned long max)
1333{
1334        struct cpudata *cpu = container_of(data, struct cpudata, update_util);
1335        u64 delta_ns = time - cpu->sample.time;
1336
1337        if ((s64)delta_ns >= pid_params.sample_rate_ns) {
1338                bool sample_taken = intel_pstate_sample(cpu, time);
1339
1340                if (sample_taken) {
1341                        intel_pstate_calc_avg_perf(cpu);
1342                        if (!hwp_active)
1343                                intel_pstate_adjust_busy_pstate(cpu);
1344                }
1345        }
1346}
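
/*
 * The callback above runs on every scheduler utilization update, but a
 * new sample is only taken (and, when HWP is not active, the P state
 * re-evaluated) once at least sample_rate_ns has elapsed since the last
 * sample: 10 ms with the tunings defined above, or 50 ms when HWP is
 * active (see intel_pstate_init_cpu()).
 */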
1347
1348#define ICPU(model, policy) \
1349        { X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\
1350                        (unsigned long)&policy }
1351
1352static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
1353        ICPU(INTEL_FAM6_SANDYBRIDGE,            core_params),
1354        ICPU(INTEL_FAM6_SANDYBRIDGE_X,          core_params),
1355        ICPU(INTEL_FAM6_ATOM_SILVERMONT1,       silvermont_params),
1356        ICPU(INTEL_FAM6_IVYBRIDGE,              core_params),
1357        ICPU(INTEL_FAM6_HASWELL_CORE,           core_params),
1358        ICPU(INTEL_FAM6_BROADWELL_CORE,         core_params),
1359        ICPU(INTEL_FAM6_IVYBRIDGE_X,            core_params),
1360        ICPU(INTEL_FAM6_HASWELL_X,              core_params),
1361        ICPU(INTEL_FAM6_HASWELL_ULT,            core_params),
1362        ICPU(INTEL_FAM6_HASWELL_GT3E,           core_params),
1363        ICPU(INTEL_FAM6_BROADWELL_GT3E,         core_params),
1364        ICPU(INTEL_FAM6_ATOM_AIRMONT,           airmont_params),
1365        ICPU(INTEL_FAM6_SKYLAKE_MOBILE,         core_params),
1366        ICPU(INTEL_FAM6_BROADWELL_X,            core_params),
1367        ICPU(INTEL_FAM6_SKYLAKE_DESKTOP,        core_params),
1368        ICPU(INTEL_FAM6_BROADWELL_XEON_D,       core_params),
1369        ICPU(INTEL_FAM6_XEON_PHI_KNL,           knl_params),
1370        ICPU(INTEL_FAM6_ATOM_GOLDMONT,          bxt_params),
1371        {}
1372};
1373MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
1374
1375static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = {
1376        ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_params),
1377        ICPU(INTEL_FAM6_BROADWELL_X, core_params),
1378        ICPU(INTEL_FAM6_SKYLAKE_X, core_params),
1379        {}
1380};
1381
1382static int intel_pstate_init_cpu(unsigned int cpunum)
1383{
1384        struct cpudata *cpu;
1385
1386        if (!all_cpu_data[cpunum])
1387                all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata),
1388                                               GFP_KERNEL);
1389        if (!all_cpu_data[cpunum])
1390                return -ENOMEM;
1391
1392        cpu = all_cpu_data[cpunum];
1393
1394        cpu->cpu = cpunum;
1395
1396        if (hwp_active) {
1397                intel_pstate_hwp_enable(cpu);
1398                pid_params.sample_rate_ms = 50;
1399                pid_params.sample_rate_ns = 50 * NSEC_PER_MSEC;
1400        }
1401
1402        intel_pstate_get_cpu_pstates(cpu);
1403
1404        intel_pstate_busy_pid_reset(cpu);
1405
1406        pr_debug("controlling: cpu %d\n", cpunum);
1407
1408        return 0;
1409}
1410
1411static unsigned int intel_pstate_get(unsigned int cpu_num)
1412{
1413        struct cpudata *cpu = all_cpu_data[cpu_num];
1414
1415        return cpu ? get_avg_frequency(cpu) : 0;
1416}
1417
1418static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
1419{
1420        struct cpudata *cpu = all_cpu_data[cpu_num];
1421
1422        if (cpu->update_util_set)
1423                return;
1424
1425        /* Prevent intel_pstate_update_util() from using stale data. */
1426        cpu->sample.time = 0;
1427        cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
1428                                     intel_pstate_update_util);
1429        cpu->update_util_set = true;
1430}
1431
1432static void intel_pstate_clear_update_util_hook(unsigned int cpu)
1433{
1434        struct cpudata *cpu_data = all_cpu_data[cpu];
1435
1436        if (!cpu_data->update_util_set)
1437                return;
1438
1439        cpufreq_remove_update_util_hook(cpu);
1440        cpu_data->update_util_set = false;
1441        synchronize_sched();
1442}
1443
1444static void intel_pstate_set_performance_limits(struct perf_limits *limits)
1445{
1446        limits->no_turbo = 0;
1447        limits->turbo_disabled = 0;
1448        limits->max_perf_pct = 100;
1449        limits->max_perf = int_tofp(1);
1450        limits->min_perf_pct = 100;
1451        limits->min_perf = int_tofp(1);
1452        limits->max_policy_pct = 100;
1453        limits->max_sysfs_pct = 100;
1454        limits->min_policy_pct = 0;
1455        limits->min_sysfs_pct = 0;
1456}
1457
1458static int intel_pstate_set_policy(struct cpufreq_policy *policy)
1459{
1460        struct cpudata *cpu;
1461
1462        if (!policy->cpuinfo.max_freq)
1463                return -ENODEV;
1464
1465        pr_debug("set_policy cpuinfo.max %u policy->max %u\n",
1466                 policy->cpuinfo.max_freq, policy->max);
1467
1468        cpu = all_cpu_data[0];
1469        if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
1470            policy->max < policy->cpuinfo.max_freq &&
1471            policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
1472                pr_debug("policy->max > max non turbo frequency\n");
1473                policy->max = policy->cpuinfo.max_freq;
1474        }
1475
1476        if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
1477                limits = &performance_limits;
1478                if (policy->max >= policy->cpuinfo.max_freq) {
1479                        pr_debug("set performance\n");
1480                        intel_pstate_set_performance_limits(limits);
1481                        goto out;
1482                }
1483        } else {
1484                pr_debug("set powersave\n");
1485                limits = &powersave_limits;
1486        }
1487
1488        limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq;
 1489        limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0, 100);
 1490        limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100,
 1491                                              policy->cpuinfo.max_freq);
 1492        limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0, 100);
1493
1494        /* Normalize user input to [min_policy_pct, max_policy_pct] */
1495        limits->min_perf_pct = max(limits->min_policy_pct,
1496                                   limits->min_sysfs_pct);
1497        limits->min_perf_pct = min(limits->max_policy_pct,
1498                                   limits->min_perf_pct);
1499        limits->max_perf_pct = min(limits->max_policy_pct,
1500                                   limits->max_sysfs_pct);
1501        limits->max_perf_pct = max(limits->min_policy_pct,
1502                                   limits->max_perf_pct);
1503
1504        /* Make sure min_perf_pct <= max_perf_pct */
1505        limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
1506
1507        limits->min_perf = div_fp(limits->min_perf_pct, 100);
1508        limits->max_perf = div_fp(limits->max_perf_pct, 100);
1509        limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
1510
1511 out:
1512        intel_pstate_set_update_util_hook(policy->cpu);
1513
1514        intel_pstate_hwp_set_policy(policy);
1515
1516        return 0;
1517}
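
/*
 * Example of the policy-to-percent mapping above, with hypothetical
 * frequencies: if cpuinfo.max_freq is 3,200,000 kHz and the governor
 * requests policy->min = 800,000 kHz and policy->max = 3,200,000 kHz,
 * then min_policy_pct = 800000 * 100 / 3200000 = 25 and
 * max_policy_pct = 100.  These are then combined with the sysfs limits
 * (min_sysfs_pct / max_sysfs_pct) to produce the effective min_perf_pct /
 * max_perf_pct and their fixed point counterparts min_perf / max_perf.
 */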
1518
1519static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
1520{
1521        cpufreq_verify_within_cpu_limits(policy);
1522
1523        if (policy->policy != CPUFREQ_POLICY_POWERSAVE &&
1524            policy->policy != CPUFREQ_POLICY_PERFORMANCE)
1525                return -EINVAL;
1526
1527        return 0;
1528}
1529
1530static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
1531{
1532        int cpu_num = policy->cpu;
1533        struct cpudata *cpu = all_cpu_data[cpu_num];
1534
1535        pr_debug("CPU %d exiting\n", cpu_num);
1536
1537        intel_pstate_clear_update_util_hook(cpu_num);
1538
1539        if (hwp_active)
1540                return;
1541
1542        intel_pstate_set_min_pstate(cpu);
1543}
1544
1545static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
1546{
1547        struct cpudata *cpu;
1548        int rc;
1549
1550        rc = intel_pstate_init_cpu(policy->cpu);
1551        if (rc)
1552                return rc;
1553
1554        cpu = all_cpu_data[policy->cpu];
1555
1556        if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100)
1557                policy->policy = CPUFREQ_POLICY_PERFORMANCE;
1558        else
1559                policy->policy = CPUFREQ_POLICY_POWERSAVE;
1560
1561        policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
1562        policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
1563
1564        /* cpuinfo and default policy values */
1565        policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
1566        update_turbo_state();
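            /* With turbo disabled, advertise the max non-turbo frequency instead. */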
1567        policy->cpuinfo.max_freq = limits->turbo_disabled ?
1568                        cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
1569        policy->cpuinfo.max_freq *= cpu->pstate.scaling;
1570
1571        intel_pstate_init_acpi_perf_limits(policy);
1572        policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
1573        cpumask_set_cpu(policy->cpu, policy->cpus);
1574
1575        return 0;
1576}
1577
1578static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
1579{
1580        intel_pstate_exit_perf_limits(policy);
1581
1582        return 0;
1583}
1584
1585static struct cpufreq_driver intel_pstate_driver = {
1586        .flags          = CPUFREQ_CONST_LOOPS,
1587        .verify         = intel_pstate_verify_policy,
1588        .setpolicy      = intel_pstate_set_policy,
1589        .resume         = intel_pstate_hwp_set_policy,
1590        .get            = intel_pstate_get,
1591        .init           = intel_pstate_cpu_init,
1592        .exit           = intel_pstate_cpu_exit,
1593        .stop_cpu       = intel_pstate_stop_cpu,
1594        .name           = "intel_pstate",
1595};
1596
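    /* Boot-time switches, set from the "intel_pstate=" early parameter below. */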
1597static int no_load __initdata;
1598static int no_hwp __initdata;
1599static int hwp_only __initdata;
1600static unsigned int force_load __initdata;
1601
1602static int __init intel_pstate_msrs_not_valid(void)
1603{
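            /* A zero max, min or turbo ratio means the P-state MSRs are unusable. */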
1604        if (!pstate_funcs.get_max() ||
1605            !pstate_funcs.get_min() ||
1606            !pstate_funcs.get_turbo())
1607                return -ENODEV;
1608
1609        return 0;
1610}
1611
1612static void __init copy_pid_params(struct pstate_adjust_policy *policy)
1613{
1614        pid_params.sample_rate_ms = policy->sample_rate_ms;
1615        pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC;
1616        pid_params.p_gain_pct = policy->p_gain_pct;
1617        pid_params.i_gain_pct = policy->i_gain_pct;
1618        pid_params.d_gain_pct = policy->d_gain_pct;
1619        pid_params.deadband = policy->deadband;
1620        pid_params.setpoint = policy->setpoint;
1621}
1622
1623static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
1624{
1625        pstate_funcs.get_max   = funcs->get_max;
1626        pstate_funcs.get_max_physical = funcs->get_max_physical;
1627        pstate_funcs.get_min   = funcs->get_min;
1628        pstate_funcs.get_turbo = funcs->get_turbo;
1629        pstate_funcs.get_scaling = funcs->get_scaling;
1630        pstate_funcs.get_val   = funcs->get_val;
1631        pstate_funcs.get_vid   = funcs->get_vid;
1632        pstate_funcs.get_target_pstate = funcs->get_target_pstate;
1633
1634}
1635
1636#ifdef CONFIG_ACPI
1637
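    /* True only if no possible CPU exposes a valid ACPI _PSS package. */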
1638static bool __init intel_pstate_no_acpi_pss(void)
1639{
1640        int i;
1641
1642        for_each_possible_cpu(i) {
1643                acpi_status status;
1644                union acpi_object *pss;
1645                struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
1646                struct acpi_processor *pr = per_cpu(processors, i);
1647
1648                if (!pr)
1649                        continue;
1650
1651                status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer);
1652                if (ACPI_FAILURE(status))
1653                        continue;
1654
1655                pss = buffer.pointer;
1656                if (pss && pss->type == ACPI_TYPE_PACKAGE) {
1657                        kfree(pss);
1658                        return false;
1659                }
1660
1661                kfree(pss);
1662        }
1663
1664        return true;
1665}
1666
1667static bool __init intel_pstate_has_acpi_ppc(void)
1668{
1669        int i;
1670
1671        for_each_possible_cpu(i) {
1672                struct acpi_processor *pr = per_cpu(processors, i);
1673
1674                if (!pr)
1675                        continue;
1676                if (acpi_has_method(pr->handle, "_PPC"))
1677                        return true;
1678        }
1679        return false;
1680}
1681
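    /* Which ACPI object is consulted to decide if firmware owns P-state control. */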
1682enum {
1683        PSS,
1684        PPC,
1685};
1686
1687struct hw_vendor_info {
1688        u16  valid;
1689        char oem_id[ACPI_OEM_ID_SIZE];
1690        char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
1691        int  oem_pwr_table;
1692};
1693
1694/* Hardware vendors whose platforms provide their own power management modes */
1695static struct hw_vendor_info vendor_info[] __initdata = {
1696        {1, "HP    ", "ProLiant", PSS},
1697        {1, "ORACLE", "X4-2    ", PPC},
1698        {1, "ORACLE", "X4-2L   ", PPC},
1699        {1, "ORACLE", "X4-2B   ", PPC},
1700        {1, "ORACLE", "X3-2    ", PPC},
1701        {1, "ORACLE", "X3-2L   ", PPC},
1702        {1, "ORACLE", "X3-2B   ", PPC},
1703        {1, "ORACLE", "X4470M2 ", PPC},
1704        {1, "ORACLE", "X4270M3 ", PPC},
1705        {1, "ORACLE", "X4270M2 ", PPC},
1706        {1, "ORACLE", "X4170M2 ", PPC},
1707        {1, "ORACLE", "X4170 M3", PPC},
1708        {1, "ORACLE", "X4275 M3", PPC},
1709        {1, "ORACLE", "X6-2    ", PPC},
1710        {1, "ORACLE", "Sudbury ", PPC},
1711        {0, "", ""},
1712};
1713
1714static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
1715{
1716        struct acpi_table_header hdr;
1717        struct hw_vendor_info *v_info;
1718        const struct x86_cpu_id *id;
1719        u64 misc_pwr;
1720
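            /*
             * On the OOB-capable models, bit 8 of MSR_MISC_PWR_MGMT is
             * understood to indicate that platform firmware manages P-states
             * out of band, in which case this driver should stay out of the way.
             */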
1721        id = x86_match_cpu(intel_pstate_cpu_oob_ids);
1722        if (id) {
1723                rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr);
1724                if (misc_pwr & (1 << 8))
1725                        return true;
1726        }
1727
1728        if (acpi_disabled ||
1729            ACPI_FAILURE(acpi_get_table_header(ACPI_SIG_FADT, 0, &hdr)))
1730                return false;
1731
1732        for (v_info = vendor_info; v_info->valid; v_info++) {
1733                if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
1734                        !strncmp(hdr.oem_table_id, v_info->oem_table_id,
1735                                                ACPI_OEM_TABLE_ID_SIZE))
1736                        switch (v_info->oem_pwr_table) {
1737                        case PSS:
1738                                return intel_pstate_no_acpi_pss();
1739                        case PPC:
1740                                return intel_pstate_has_acpi_ppc() &&
1741                                        (!force_load);
1742                        }
1743        }
1744
1745        return false;
1746}
1747#else /* CONFIG_ACPI not enabled */
1748static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
1749static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
1750#endif /* CONFIG_ACPI */
1751
1752static const struct x86_cpu_id hwp_support_ids[] __initconst = {
1753        { X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP },
1754        {}
1755};
1756
1757static int __init intel_pstate_init(void)
1758{
1759        int cpu, rc = 0;
1760        const struct x86_cpu_id *id;
1761        struct cpu_defaults *cpu_def;
1762
1763        if (no_load)
1764                return -ENODEV;
1765
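            /*
             * HWP-capable CPUs need only the function pointers; the PID tuning
             * parameters are skipped because hardware picks P-states itself.
             */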
1766        if (x86_match_cpu(hwp_support_ids) && !no_hwp) {
1767                copy_cpu_funcs(&core_params.funcs);
1768                hwp_active++;
1769                goto hwp_cpu_matched;
1770        }
1771
1772        id = x86_match_cpu(intel_pstate_cpu_ids);
1773        if (!id)
1774                return -ENODEV;
1775
1776        cpu_def = (struct cpu_defaults *)id->driver_data;
1777
1778        copy_pid_params(&cpu_def->pid_policy);
1779        copy_cpu_funcs(&cpu_def->funcs);
1780
1781        if (intel_pstate_msrs_not_valid())
1782                return -ENODEV;
1783
1784hwp_cpu_matched:
1785        /*
1786         * The Intel pstate driver will be ignored if the platform
1787         * firmware has its own power management modes.
1788         */
1789        if (intel_pstate_platform_pwr_mgmt_exists())
1790                return -ENODEV;
1791
1792        pr_info("Intel P-state driver initializing\n");
1793
1794        all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
1795        if (!all_cpu_data)
1796                return -ENOMEM;
1797
1798        if (!hwp_active && hwp_only)
1799                goto out;
1800
1801        rc = cpufreq_register_driver(&intel_pstate_driver);
1802        if (rc)
1803                goto out;
1804
1805        intel_pstate_debug_expose_params();
1806        intel_pstate_sysfs_expose_params();
1807
1808        if (hwp_active)
1809                pr_info("HWP enabled\n");
1810
1811        return rc;
1812out:
1813        get_online_cpus();
1814        for_each_online_cpu(cpu) {
1815                if (all_cpu_data[cpu]) {
1816                        intel_pstate_clear_update_util_hook(cpu);
1817                        kfree(all_cpu_data[cpu]);
1818                }
1819        }
1820
1821        put_online_cpus();
1822        vfree(all_cpu_data);
1823        return -ENODEV;
1824}
1825device_initcall(intel_pstate_init);
1826
1827static int __init intel_pstate_setup(char *str)
1828{
1829        if (!str)
1830                return -EINVAL;
1831
1832        if (!strcmp(str, "disable"))
1833                no_load = 1;
1834        if (!strcmp(str, "no_hwp")) {
1835                pr_info("HWP disabled\n");
1836                no_hwp = 1;
1837        }
1838        if (!strcmp(str, "force"))
1839                force_load = 1;
1840        if (!strcmp(str, "hwp_only"))
1841                hwp_only = 1;
1842
1843#ifdef CONFIG_ACPI
1844        if (!strcmp(str, "support_acpi_ppc"))
1845                acpi_ppc = true;
1846#endif
1847
1848        return 0;
1849}
1850early_param("intel_pstate", intel_pstate_setup);
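    /*
     * Example command-line usage, one keyword per "intel_pstate=" option:
     *   intel_pstate=disable    keep the driver from loading at all
     *   intel_pstate=no_hwp     use the PID algorithm even on HWP-capable CPUs
     *   intel_pstate=force      load even if firmware exposes _PPC on the
     *                           platforms listed in vendor_info[]
     *   intel_pstate=hwp_only   register the driver only when HWP is active
     */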
1851
1852MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>");
1853MODULE_DESCRIPTION("'intel_pstate' - P state driver for Intel Core processors");
1854MODULE_LICENSE("GPL");
1855