linux/drivers/thermal/cpu_cooling.c
<<
>>
Prefs
   1/*
   2 *  linux/drivers/thermal/cpu_cooling.c
   3 *
   4 *  Copyright (C) 2012  Samsung Electronics Co., Ltd(http://www.samsung.com)
   5 *  Copyright (C) 2012  Amit Daniel <amit.kachhap@linaro.org>
   6 *
   7 *  Copyright (C) 2014  Viresh Kumar <viresh.kumar@linaro.org>
   8 *
   9 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  10 *  This program is free software; you can redistribute it and/or modify
  11 *  it under the terms of the GNU General Public License as published by
  12 *  the Free Software Foundation; version 2 of the License.
  13 *
  14 *  This program is distributed in the hope that it will be useful, but
  15 *  WITHOUT ANY WARRANTY; without even the implied warranty of
  16 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17 *  General Public License for more details.
  18 *
  19 *  You should have received a copy of the GNU General Public License along
  20 *  with this program; if not, write to the Free Software Foundation, Inc.,
  21 *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
  22 *
  23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  24 */
  25#include <linux/module.h>
  26#include <linux/thermal.h>
  27#include <linux/cpufreq.h>
  28#include <linux/err.h>
  29#include <linux/pm_opp.h>
  30#include <linux/slab.h>
  31#include <linux/cpu.h>
  32#include <linux/cpu_cooling.h>
  33
  34#include <trace/events/thermal.h>
  35
  36/*
  37 * Cooling state <-> CPUFreq frequency
  38 *
  39 * Cooling states are translated to frequencies throughout this driver and this
  40 * is the relation between them.
  41 *
  42 * Highest cooling state corresponds to lowest possible frequency.
  43 *
  44 * i.e.
  45 *      level 0 --> 1st Max Freq
  46 *      level 1 --> 2nd Max Freq
  47 *      ...
  48 */
  49
  50/**
  51 * struct power_table - frequency to power conversion
  52 * @frequency:  frequency in KHz
  53 * @power:      power in mW
  54 *
  55 * This structure is built when the cooling device registers and helps
  56 * in translating frequency to power and viceversa.
  57 */
  58struct power_table {
  59        u32 frequency;
  60        u32 power;
  61};
  62
  63/**
  64 * struct cpufreq_cooling_device - data for cooling device with cpufreq
  65 * @id: unique integer value corresponding to each cpufreq_cooling_device
  66 *      registered.
  67 * @cool_dev: thermal_cooling_device pointer to keep track of the
  68 *      registered cooling device.
  69 * @cpufreq_state: integer value representing the current state of cpufreq
  70 *      cooling devices.
  71 * @clipped_freq: integer value representing the absolute value of the clipped
  72 *      frequency.
  73 * @max_level: maximum cooling level. One less than total number of valid
  74 *      cpufreq frequencies.
  75 * @allowed_cpus: all the cpus involved for this cpufreq_cooling_device.
  76 * @node: list_head to link all cpufreq_cooling_device together.
  77 * @last_load: load measured by the latest call to cpufreq_get_actual_power()
  78 * @time_in_idle: previous reading of the absolute time that this cpu was idle
  79 * @time_in_idle_timestamp: wall time of the last invocation of
  80 *      get_cpu_idle_time_us()
  81 * @dyn_power_table: array of struct power_table for frequency to power
  82 *      conversion, sorted in ascending order.
  83 * @dyn_power_table_entries: number of entries in the @dyn_power_table array
  84 * @cpu_dev: the first cpu_device from @allowed_cpus that has OPPs registered
  85 * @plat_get_static_power: callback to calculate the static power
  86 *
  87 * This structure is required for keeping information of each registered
  88 * cpufreq_cooling_device.
  89 */
  90struct cpufreq_cooling_device {
  91        int id;
  92        struct thermal_cooling_device *cool_dev;
  93        unsigned int cpufreq_state;
  94        unsigned int clipped_freq;
  95        unsigned int max_level;
  96        unsigned int *freq_table;       /* In descending order */
  97        struct cpumask allowed_cpus;
  98        struct list_head node;
  99        u32 last_load;
 100        u64 *time_in_idle;
 101        u64 *time_in_idle_timestamp;
 102        struct power_table *dyn_power_table;
 103        int dyn_power_table_entries;
 104        struct device *cpu_dev;
 105        get_static_t plat_get_static_power;
 106};
 107static DEFINE_IDR(cpufreq_idr);
 108static DEFINE_MUTEX(cooling_cpufreq_lock);
 109
 110static unsigned int cpufreq_dev_count;
 111
 112static DEFINE_MUTEX(cooling_list_lock);
 113static LIST_HEAD(cpufreq_dev_list);
 114
 115/**
 116 * get_idr - function to get a unique id.
 117 * @idr: struct idr * handle used to create a id.
 118 * @id: int * value generated by this function.
 119 *
 120 * This function will populate @id with an unique
 121 * id, using the idr API.
 122 *
 123 * Return: 0 on success, an error code on failure.
 124 */
 125static int get_idr(struct idr *idr, int *id)
 126{
 127        int ret;
 128
 129        mutex_lock(&cooling_cpufreq_lock);
 130        ret = idr_alloc(idr, NULL, 0, 0, GFP_KERNEL);
 131        mutex_unlock(&cooling_cpufreq_lock);
 132        if (unlikely(ret < 0))
 133                return ret;
 134        *id = ret;
 135
 136        return 0;
 137}
 138
 139/**
 140 * release_idr - function to free the unique id.
 141 * @idr: struct idr * handle used for creating the id.
 142 * @id: int value representing the unique id.
 143 */
 144static void release_idr(struct idr *idr, int id)
 145{
 146        mutex_lock(&cooling_cpufreq_lock);
 147        idr_remove(idr, id);
 148        mutex_unlock(&cooling_cpufreq_lock);
 149}
 150
 151/* Below code defines functions to be used for cpufreq as cooling device */
 152
 153/**
 154 * get_level: Find the level for a particular frequency
 155 * @cpufreq_dev: cpufreq_dev for which the property is required
 156 * @freq: Frequency
 157 *
 158 * Return: level on success, THERMAL_CSTATE_INVALID on error.
 159 */
 160static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_dev,
 161                               unsigned int freq)
 162{
 163        unsigned long level;
 164
 165        for (level = 0; level <= cpufreq_dev->max_level; level++) {
 166                if (freq == cpufreq_dev->freq_table[level])
 167                        return level;
 168
 169                if (freq > cpufreq_dev->freq_table[level])
 170                        break;
 171        }
 172
 173        return THERMAL_CSTATE_INVALID;
 174}
 175
 176/**
 177 * cpufreq_cooling_get_level - for a given cpu, return the cooling level.
 178 * @cpu: cpu for which the level is required
 179 * @freq: the frequency of interest
 180 *
 181 * This function will match the cooling level corresponding to the
 182 * requested @freq and return it.
 183 *
 184 * Return: The matched cooling level on success or THERMAL_CSTATE_INVALID
 185 * otherwise.
 186 */
 187unsigned long cpufreq_cooling_get_level(unsigned int cpu, unsigned int freq)
 188{
 189        struct cpufreq_cooling_device *cpufreq_dev;
 190
 191        mutex_lock(&cooling_list_lock);
 192        list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
 193                if (cpumask_test_cpu(cpu, &cpufreq_dev->allowed_cpus)) {
 194                        mutex_unlock(&cooling_list_lock);
 195                        return get_level(cpufreq_dev, freq);
 196                }
 197        }
 198        mutex_unlock(&cooling_list_lock);
 199
 200        pr_err("%s: cpu:%d not part of any cooling device\n", __func__, cpu);
 201        return THERMAL_CSTATE_INVALID;
 202}
 203EXPORT_SYMBOL_GPL(cpufreq_cooling_get_level);
 204
 205/**
 206 * cpufreq_thermal_notifier - notifier callback for cpufreq policy change.
 207 * @nb: struct notifier_block * with callback info.
 208 * @event: value showing cpufreq event for which this function invoked.
 209 * @data: callback-specific data
 210 *
 211 * Callback to hijack the notification on cpufreq policy transition.
 212 * Every time there is a change in policy, we will intercept and
 213 * update the cpufreq policy with thermal constraints.
 214 *
 215 * Return: 0 (success)
 216 */
 217static int cpufreq_thermal_notifier(struct notifier_block *nb,
 218                                    unsigned long event, void *data)
 219{
 220        struct cpufreq_policy *policy = data;
 221        unsigned long clipped_freq;
 222        struct cpufreq_cooling_device *cpufreq_dev;
 223
 224        if (event != CPUFREQ_ADJUST)
 225                return NOTIFY_DONE;
 226
 227        mutex_lock(&cooling_list_lock);
 228        list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
 229                if (!cpumask_test_cpu(policy->cpu, &cpufreq_dev->allowed_cpus))
 230                        continue;
 231
 232                /*
 233                 * policy->max is the maximum allowed frequency defined by user
 234                 * and clipped_freq is the maximum that thermal constraints
 235                 * allow.
 236                 *
 237                 * If clipped_freq is lower than policy->max, then we need to
 238                 * readjust policy->max.
 239                 *
 240                 * But, if clipped_freq is greater than policy->max, we don't
 241                 * need to do anything.
 242                 */
 243                clipped_freq = cpufreq_dev->clipped_freq;
 244
 245                if (policy->max > clipped_freq)
 246                        cpufreq_verify_within_limits(policy, 0, clipped_freq);
 247                break;
 248        }
 249        mutex_unlock(&cooling_list_lock);
 250
 251        return NOTIFY_OK;
 252}
 253
 254/**
 255 * build_dyn_power_table() - create a dynamic power to frequency table
 256 * @cpufreq_device:     the cpufreq cooling device in which to store the table
 257 * @capacitance: dynamic power coefficient for these cpus
 258 *
 259 * Build a dynamic power to frequency table for this cpu and store it
 260 * in @cpufreq_device.  This table will be used in cpu_power_to_freq() and
 261 * cpu_freq_to_power() to convert between power and frequency
 262 * efficiently.  Power is stored in mW, frequency in KHz.  The
 263 * resulting table is in ascending order.
 264 *
 265 * Return: 0 on success, -EINVAL if there are no OPPs for any CPUs,
 266 * -ENOMEM if we run out of memory or -EAGAIN if an OPP was
 267 * added/enabled while the function was executing.
 268 */
 269static int build_dyn_power_table(struct cpufreq_cooling_device *cpufreq_device,
 270                                 u32 capacitance)
 271{
 272        struct power_table *power_table;
 273        struct dev_pm_opp *opp;
 274        struct device *dev = NULL;
 275        int num_opps = 0, cpu, i, ret = 0;
 276        unsigned long freq;
 277
 278        for_each_cpu(cpu, &cpufreq_device->allowed_cpus) {
 279                dev = get_cpu_device(cpu);
 280                if (!dev) {
 281                        dev_warn(&cpufreq_device->cool_dev->device,
 282                                 "No cpu device for cpu %d\n", cpu);
 283                        continue;
 284                }
 285
 286                num_opps = dev_pm_opp_get_opp_count(dev);
 287                if (num_opps > 0)
 288                        break;
 289                else if (num_opps < 0)
 290                        return num_opps;
 291        }
 292
 293        if (num_opps == 0)
 294                return -EINVAL;
 295
 296        power_table = kcalloc(num_opps, sizeof(*power_table), GFP_KERNEL);
 297        if (!power_table)
 298                return -ENOMEM;
 299
 300        rcu_read_lock();
 301
 302        for (freq = 0, i = 0;
 303             opp = dev_pm_opp_find_freq_ceil(dev, &freq), !IS_ERR(opp);
 304             freq++, i++) {
 305                u32 freq_mhz, voltage_mv;
 306                u64 power;
 307
 308                if (i >= num_opps) {
 309                        rcu_read_unlock();
 310                        ret = -EAGAIN;
 311                        goto free_power_table;
 312                }
 313
 314                freq_mhz = freq / 1000000;
 315                voltage_mv = dev_pm_opp_get_voltage(opp) / 1000;
 316
 317                /*
 318                 * Do the multiplication with MHz and millivolt so as
 319                 * to not overflow.
 320                 */
 321                power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv;
 322                do_div(power, 1000000000);
 323
 324                /* frequency is stored in power_table in KHz */
 325                power_table[i].frequency = freq / 1000;
 326
 327                /* power is stored in mW */
 328                power_table[i].power = power;
 329        }
 330
 331        rcu_read_unlock();
 332
 333        if (i != num_opps) {
 334                ret = PTR_ERR(opp);
 335                goto free_power_table;
 336        }
 337
 338        cpufreq_device->cpu_dev = dev;
 339        cpufreq_device->dyn_power_table = power_table;
 340        cpufreq_device->dyn_power_table_entries = i;
 341
 342        return 0;
 343
 344free_power_table:
 345        kfree(power_table);
 346
 347        return ret;
 348}
 349
 350static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_device,
 351                             u32 freq)
 352{
 353        int i;
 354        struct power_table *pt = cpufreq_device->dyn_power_table;
 355
 356        for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++)
 357                if (freq < pt[i].frequency)
 358                        break;
 359
 360        return pt[i - 1].power;
 361}
 362
 363static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_device,
 364                             u32 power)
 365{
 366        int i;
 367        struct power_table *pt = cpufreq_device->dyn_power_table;
 368
 369        for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++)
 370                if (power < pt[i].power)
 371                        break;
 372
 373        return pt[i - 1].frequency;
 374}
 375
 376/**
 377 * get_load() - get load for a cpu since last updated
 378 * @cpufreq_device:     &struct cpufreq_cooling_device for this cpu
 379 * @cpu:        cpu number
 380 * @cpu_idx:    index of the cpu in cpufreq_device->allowed_cpus
 381 *
 382 * Return: The average load of cpu @cpu in percentage since this
 383 * function was last called.
 384 */
 385static u32 get_load(struct cpufreq_cooling_device *cpufreq_device, int cpu,
 386                    int cpu_idx)
 387{
 388        u32 load;
 389        u64 now, now_idle, delta_time, delta_idle;
 390
 391        now_idle = get_cpu_idle_time(cpu, &now, 0);
 392        delta_idle = now_idle - cpufreq_device->time_in_idle[cpu_idx];
 393        delta_time = now - cpufreq_device->time_in_idle_timestamp[cpu_idx];
 394
 395        if (delta_time <= delta_idle)
 396                load = 0;
 397        else
 398                load = div64_u64(100 * (delta_time - delta_idle), delta_time);
 399
 400        cpufreq_device->time_in_idle[cpu_idx] = now_idle;
 401        cpufreq_device->time_in_idle_timestamp[cpu_idx] = now;
 402
 403        return load;
 404}
 405
 406/**
 407 * get_static_power() - calculate the static power consumed by the cpus
 408 * @cpufreq_device:     struct &cpufreq_cooling_device for this cpu cdev
 409 * @tz:         thermal zone device in which we're operating
 410 * @freq:       frequency in KHz
 411 * @power:      pointer in which to store the calculated static power
 412 *
 413 * Calculate the static power consumed by the cpus described by
 414 * @cpu_actor running at frequency @freq.  This function relies on a
 415 * platform specific function that should have been provided when the
 416 * actor was registered.  If it wasn't, the static power is assumed to
 417 * be negligible.  The calculated static power is stored in @power.
 418 *
 419 * Return: 0 on success, -E* on failure.
 420 */
 421static int get_static_power(struct cpufreq_cooling_device *cpufreq_device,
 422                            struct thermal_zone_device *tz, unsigned long freq,
 423                            u32 *power)
 424{
 425        struct dev_pm_opp *opp;
 426        unsigned long voltage;
 427        struct cpumask *cpumask = &cpufreq_device->allowed_cpus;
 428        unsigned long freq_hz = freq * 1000;
 429
 430        if (!cpufreq_device->plat_get_static_power ||
 431            !cpufreq_device->cpu_dev) {
 432                *power = 0;
 433                return 0;
 434        }
 435
 436        rcu_read_lock();
 437
 438        opp = dev_pm_opp_find_freq_exact(cpufreq_device->cpu_dev, freq_hz,
 439                                         true);
 440        voltage = dev_pm_opp_get_voltage(opp);
 441
 442        rcu_read_unlock();
 443
 444        if (voltage == 0) {
 445                dev_warn_ratelimited(cpufreq_device->cpu_dev,
 446                                     "Failed to get voltage for frequency %lu: %ld\n",
 447                                     freq_hz, IS_ERR(opp) ? PTR_ERR(opp) : 0);
 448                return -EINVAL;
 449        }
 450
 451        return cpufreq_device->plat_get_static_power(cpumask, tz->passive_delay,
 452                                                     voltage, power);
 453}
 454
 455/**
 456 * get_dynamic_power() - calculate the dynamic power
 457 * @cpufreq_device:     &cpufreq_cooling_device for this cdev
 458 * @freq:       current frequency
 459 *
 460 * Return: the dynamic power consumed by the cpus described by
 461 * @cpufreq_device.
 462 */
 463static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_device,
 464                             unsigned long freq)
 465{
 466        u32 raw_cpu_power;
 467
 468        raw_cpu_power = cpu_freq_to_power(cpufreq_device, freq);
 469        return (raw_cpu_power * cpufreq_device->last_load) / 100;
 470}
 471
 472/* cpufreq cooling device callback functions are defined below */
 473
 474/**
 475 * cpufreq_get_max_state - callback function to get the max cooling state.
 476 * @cdev: thermal cooling device pointer.
 477 * @state: fill this variable with the max cooling state.
 478 *
 479 * Callback for the thermal cooling device to return the cpufreq
 480 * max cooling state.
 481 *
 482 * Return: 0 on success, an error code otherwise.
 483 */
 484static int cpufreq_get_max_state(struct thermal_cooling_device *cdev,
 485                                 unsigned long *state)
 486{
 487        struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 488
 489        *state = cpufreq_device->max_level;
 490        return 0;
 491}
 492
 493/**
 494 * cpufreq_get_cur_state - callback function to get the current cooling state.
 495 * @cdev: thermal cooling device pointer.
 496 * @state: fill this variable with the current cooling state.
 497 *
 498 * Callback for the thermal cooling device to return the cpufreq
 499 * current cooling state.
 500 *
 501 * Return: 0 on success, an error code otherwise.
 502 */
 503static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
 504                                 unsigned long *state)
 505{
 506        struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 507
 508        *state = cpufreq_device->cpufreq_state;
 509
 510        return 0;
 511}
 512
 513/**
 514 * cpufreq_set_cur_state - callback function to set the current cooling state.
 515 * @cdev: thermal cooling device pointer.
 516 * @state: set this variable to the current cooling state.
 517 *
 518 * Callback for the thermal cooling device to change the cpufreq
 519 * current cooling state.
 520 *
 521 * Return: 0 on success, an error code otherwise.
 522 */
 523static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
 524                                 unsigned long state)
 525{
 526        struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 527        unsigned int cpu = cpumask_any(&cpufreq_device->allowed_cpus);
 528        unsigned int clip_freq;
 529
 530        /* Request state should be less than max_level */
 531        if (WARN_ON(state > cpufreq_device->max_level))
 532                return -EINVAL;
 533
 534        /* Check if the old cooling action is same as new cooling action */
 535        if (cpufreq_device->cpufreq_state == state)
 536                return 0;
 537
 538        clip_freq = cpufreq_device->freq_table[state];
 539        cpufreq_device->cpufreq_state = state;
 540        cpufreq_device->clipped_freq = clip_freq;
 541
 542        cpufreq_update_policy(cpu);
 543
 544        return 0;
 545}
 546
 547/**
 548 * cpufreq_get_requested_power() - get the current power
 549 * @cdev:       &thermal_cooling_device pointer
 550 * @tz:         a valid thermal zone device pointer
 551 * @power:      pointer in which to store the resulting power
 552 *
 553 * Calculate the current power consumption of the cpus in milliwatts
 554 * and store it in @power.  This function should actually calculate
 555 * the requested power, but it's hard to get the frequency that
 556 * cpufreq would have assigned if there were no thermal limits.
 557 * Instead, we calculate the current power on the assumption that the
 558 * immediate future will look like the immediate past.
 559 *
 560 * We use the current frequency and the average load since this
 561 * function was last called.  In reality, there could have been
 562 * multiple opps since this function was last called and that affects
 563 * the load calculation.  While it's not perfectly accurate, this
 564 * simplification is good enough and works.  REVISIT this, as more
 565 * complex code may be needed if experiments show that it's not
 566 * accurate enough.
 567 *
 568 * Return: 0 on success, -E* if getting the static power failed.
 569 */
 570static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
 571                                       struct thermal_zone_device *tz,
 572                                       u32 *power)
 573{
 574        unsigned long freq;
 575        int i = 0, cpu, ret;
 576        u32 static_power, dynamic_power, total_load = 0;
 577        struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 578        u32 *load_cpu = NULL;
 579
 580        cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask);
 581
 582        /*
 583         * All the CPUs are offline, thus the requested power by
 584         * the cdev is 0
 585         */
 586        if (cpu >= nr_cpu_ids) {
 587                *power = 0;
 588                return 0;
 589        }
 590
 591        freq = cpufreq_quick_get(cpu);
 592
 593        if (trace_thermal_power_cpu_get_power_enabled()) {
 594                u32 ncpus = cpumask_weight(&cpufreq_device->allowed_cpus);
 595
 596                load_cpu = kcalloc(ncpus, sizeof(*load_cpu), GFP_KERNEL);
 597        }
 598
 599        for_each_cpu(cpu, &cpufreq_device->allowed_cpus) {
 600                u32 load;
 601
 602                if (cpu_online(cpu))
 603                        load = get_load(cpufreq_device, cpu, i);
 604                else
 605                        load = 0;
 606
 607                total_load += load;
 608                if (trace_thermal_power_cpu_limit_enabled() && load_cpu)
 609                        load_cpu[i] = load;
 610
 611                i++;
 612        }
 613
 614        cpufreq_device->last_load = total_load;
 615
 616        dynamic_power = get_dynamic_power(cpufreq_device, freq);
 617        ret = get_static_power(cpufreq_device, tz, freq, &static_power);
 618        if (ret) {
 619                kfree(load_cpu);
 620                return ret;
 621        }
 622
 623        if (load_cpu) {
 624                trace_thermal_power_cpu_get_power(
 625                        &cpufreq_device->allowed_cpus,
 626                        freq, load_cpu, i, dynamic_power, static_power);
 627
 628                kfree(load_cpu);
 629        }
 630
 631        *power = static_power + dynamic_power;
 632        return 0;
 633}
 634
 635/**
 636 * cpufreq_state2power() - convert a cpu cdev state to power consumed
 637 * @cdev:       &thermal_cooling_device pointer
 638 * @tz:         a valid thermal zone device pointer
 639 * @state:      cooling device state to be converted
 640 * @power:      pointer in which to store the resulting power
 641 *
 642 * Convert cooling device state @state into power consumption in
 643 * milliwatts assuming 100% load.  Store the calculated power in
 644 * @power.
 645 *
 646 * Return: 0 on success, -EINVAL if the cooling device state could not
 647 * be converted into a frequency or other -E* if there was an error
 648 * when calculating the static power.
 649 */
 650static int cpufreq_state2power(struct thermal_cooling_device *cdev,
 651                               struct thermal_zone_device *tz,
 652                               unsigned long state, u32 *power)
 653{
 654        unsigned int freq, num_cpus;
 655        cpumask_t cpumask;
 656        u32 static_power, dynamic_power;
 657        int ret;
 658        struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 659
 660        cpumask_and(&cpumask, &cpufreq_device->allowed_cpus, cpu_online_mask);
 661        num_cpus = cpumask_weight(&cpumask);
 662
 663        /* None of our cpus are online, so no power */
 664        if (num_cpus == 0) {
 665                *power = 0;
 666                return 0;
 667        }
 668
 669        freq = cpufreq_device->freq_table[state];
 670        if (!freq)
 671                return -EINVAL;
 672
 673        dynamic_power = cpu_freq_to_power(cpufreq_device, freq) * num_cpus;
 674        ret = get_static_power(cpufreq_device, tz, freq, &static_power);
 675        if (ret)
 676                return ret;
 677
 678        *power = static_power + dynamic_power;
 679        return 0;
 680}
 681
 682/**
 683 * cpufreq_power2state() - convert power to a cooling device state
 684 * @cdev:       &thermal_cooling_device pointer
 685 * @tz:         a valid thermal zone device pointer
 686 * @power:      power in milliwatts to be converted
 687 * @state:      pointer in which to store the resulting state
 688 *
 689 * Calculate a cooling device state for the cpus described by @cdev
 690 * that would allow them to consume at most @power mW and store it in
 691 * @state.  Note that this calculation depends on external factors
 692 * such as the cpu load or the current static power.  Calling this
 693 * function with the same power as input can yield different cooling
 694 * device states depending on those external factors.
 695 *
 696 * Return: 0 on success, -ENODEV if no cpus are online or -EINVAL if
 697 * the calculated frequency could not be converted to a valid state.
 698 * The latter should not happen unless the frequencies available to
 699 * cpufreq have changed since the initialization of the cpu cooling
 700 * device.
 701 */
 702static int cpufreq_power2state(struct thermal_cooling_device *cdev,
 703                               struct thermal_zone_device *tz, u32 power,
 704                               unsigned long *state)
 705{
 706        unsigned int cpu, cur_freq, target_freq;
 707        int ret;
 708        s32 dyn_power;
 709        u32 last_load, normalised_power, static_power;
 710        struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 711
 712        cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask);
 713
 714        /* None of our cpus are online */
 715        if (cpu >= nr_cpu_ids)
 716                return -ENODEV;
 717
 718        cur_freq = cpufreq_quick_get(cpu);
 719        ret = get_static_power(cpufreq_device, tz, cur_freq, &static_power);
 720        if (ret)
 721                return ret;
 722
 723        dyn_power = power - static_power;
 724        dyn_power = dyn_power > 0 ? dyn_power : 0;
 725        last_load = cpufreq_device->last_load ?: 1;
 726        normalised_power = (dyn_power * 100) / last_load;
 727        target_freq = cpu_power_to_freq(cpufreq_device, normalised_power);
 728
 729        *state = cpufreq_cooling_get_level(cpu, target_freq);
 730        if (*state == THERMAL_CSTATE_INVALID) {
 731                dev_warn_ratelimited(&cdev->device,
 732                                     "Failed to convert %dKHz for cpu %d into a cdev state\n",
 733                                     target_freq, cpu);
 734                return -EINVAL;
 735        }
 736
 737        trace_thermal_power_cpu_limit(&cpufreq_device->allowed_cpus,
 738                                      target_freq, *state, power);
 739        return 0;
 740}
 741
 742/* Bind cpufreq callbacks to thermal cooling device ops */
 743static struct thermal_cooling_device_ops cpufreq_cooling_ops = {
 744        .get_max_state = cpufreq_get_max_state,
 745        .get_cur_state = cpufreq_get_cur_state,
 746        .set_cur_state = cpufreq_set_cur_state,
 747};
 748
 749/* Notifier for cpufreq policy change */
 750static struct notifier_block thermal_cpufreq_notifier_block = {
 751        .notifier_call = cpufreq_thermal_notifier,
 752};
 753
 754static unsigned int find_next_max(struct cpufreq_frequency_table *table,
 755                                  unsigned int prev_max)
 756{
 757        struct cpufreq_frequency_table *pos;
 758        unsigned int max = 0;
 759
 760        cpufreq_for_each_valid_entry(pos, table) {
 761                if (pos->frequency > max && pos->frequency < prev_max)
 762                        max = pos->frequency;
 763        }
 764
 765        return max;
 766}
 767
 768/**
 769 * __cpufreq_cooling_register - helper function to create cpufreq cooling device
 770 * @np: a valid struct device_node to the cooling device device tree node
 771 * @clip_cpus: cpumask of cpus where the frequency constraints will happen.
 772 * Normally this should be same as cpufreq policy->related_cpus.
 773 * @capacitance: dynamic power coefficient for these cpus
 774 * @plat_static_func: function to calculate the static power consumed by these
 775 *                    cpus (optional)
 776 *
 777 * This interface function registers the cpufreq cooling device with the name
 778 * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
 779 * cooling devices. It also gives the opportunity to link the cooling device
 780 * with a device tree node, in order to bind it via the thermal DT code.
 781 *
 782 * Return: a valid struct thermal_cooling_device pointer on success,
 783 * on failure, it returns a corresponding ERR_PTR().
 784 */
 785static struct thermal_cooling_device *
 786__cpufreq_cooling_register(struct device_node *np,
 787                        const struct cpumask *clip_cpus, u32 capacitance,
 788                        get_static_t plat_static_func)
 789{
 790        struct thermal_cooling_device *cool_dev;
 791        struct cpufreq_cooling_device *cpufreq_dev;
 792        char dev_name[THERMAL_NAME_LENGTH];
 793        struct cpufreq_frequency_table *pos, *table;
 794        unsigned int freq, i, num_cpus;
 795        int ret;
 796
 797        table = cpufreq_frequency_get_table(cpumask_first(clip_cpus));
 798        if (!table) {
 799                pr_debug("%s: CPUFreq table not found\n", __func__);
 800                return ERR_PTR(-EPROBE_DEFER);
 801        }
 802
 803        cpufreq_dev = kzalloc(sizeof(*cpufreq_dev), GFP_KERNEL);
 804        if (!cpufreq_dev)
 805                return ERR_PTR(-ENOMEM);
 806
 807        num_cpus = cpumask_weight(clip_cpus);
 808        cpufreq_dev->time_in_idle = kcalloc(num_cpus,
 809                                            sizeof(*cpufreq_dev->time_in_idle),
 810                                            GFP_KERNEL);
 811        if (!cpufreq_dev->time_in_idle) {
 812                cool_dev = ERR_PTR(-ENOMEM);
 813                goto free_cdev;
 814        }
 815
 816        cpufreq_dev->time_in_idle_timestamp =
 817                kcalloc(num_cpus, sizeof(*cpufreq_dev->time_in_idle_timestamp),
 818                        GFP_KERNEL);
 819        if (!cpufreq_dev->time_in_idle_timestamp) {
 820                cool_dev = ERR_PTR(-ENOMEM);
 821                goto free_time_in_idle;
 822        }
 823
 824        /* Find max levels */
 825        cpufreq_for_each_valid_entry(pos, table)
 826                cpufreq_dev->max_level++;
 827
 828        cpufreq_dev->freq_table = kmalloc(sizeof(*cpufreq_dev->freq_table) *
 829                                          cpufreq_dev->max_level, GFP_KERNEL);
 830        if (!cpufreq_dev->freq_table) {
 831                cool_dev = ERR_PTR(-ENOMEM);
 832                goto free_time_in_idle_timestamp;
 833        }
 834
 835        /* max_level is an index, not a counter */
 836        cpufreq_dev->max_level--;
 837
 838        cpumask_copy(&cpufreq_dev->allowed_cpus, clip_cpus);
 839
 840        if (capacitance) {
 841                cpufreq_cooling_ops.get_requested_power =
 842                        cpufreq_get_requested_power;
 843                cpufreq_cooling_ops.state2power = cpufreq_state2power;
 844                cpufreq_cooling_ops.power2state = cpufreq_power2state;
 845                cpufreq_dev->plat_get_static_power = plat_static_func;
 846
 847                ret = build_dyn_power_table(cpufreq_dev, capacitance);
 848                if (ret) {
 849                        cool_dev = ERR_PTR(ret);
 850                        goto free_table;
 851                }
 852        }
 853
 854        ret = get_idr(&cpufreq_idr, &cpufreq_dev->id);
 855        if (ret) {
 856                cool_dev = ERR_PTR(ret);
 857                goto free_power_table;
 858        }
 859
 860        snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d",
 861                 cpufreq_dev->id);
 862
 863        cool_dev = thermal_of_cooling_device_register(np, dev_name, cpufreq_dev,
 864                                                      &cpufreq_cooling_ops);
 865        if (IS_ERR(cool_dev))
 866                goto remove_idr;
 867
 868        /* Fill freq-table in descending order of frequencies */
 869        for (i = 0, freq = -1; i <= cpufreq_dev->max_level; i++) {
 870                freq = find_next_max(table, freq);
 871                cpufreq_dev->freq_table[i] = freq;
 872
 873                /* Warn for duplicate entries */
 874                if (!freq)
 875                        pr_warn("%s: table has duplicate entries\n", __func__);
 876                else
 877                        pr_debug("%s: freq:%u KHz\n", __func__, freq);
 878        }
 879
 880        cpufreq_dev->clipped_freq = cpufreq_dev->freq_table[0];
 881        cpufreq_dev->cool_dev = cool_dev;
 882
 883        mutex_lock(&cooling_cpufreq_lock);
 884
 885        mutex_lock(&cooling_list_lock);
 886        list_add(&cpufreq_dev->node, &cpufreq_dev_list);
 887        mutex_unlock(&cooling_list_lock);
 888
 889        /* Register the notifier for first cpufreq cooling device */
 890        if (!cpufreq_dev_count++)
 891                cpufreq_register_notifier(&thermal_cpufreq_notifier_block,
 892                                          CPUFREQ_POLICY_NOTIFIER);
 893        mutex_unlock(&cooling_cpufreq_lock);
 894
 895        return cool_dev;
 896
 897remove_idr:
 898        release_idr(&cpufreq_idr, cpufreq_dev->id);
 899free_power_table:
 900        kfree(cpufreq_dev->dyn_power_table);
 901free_table:
 902        kfree(cpufreq_dev->freq_table);
 903free_time_in_idle_timestamp:
 904        kfree(cpufreq_dev->time_in_idle_timestamp);
 905free_time_in_idle:
 906        kfree(cpufreq_dev->time_in_idle);
 907free_cdev:
 908        kfree(cpufreq_dev);
 909
 910        return cool_dev;
 911}
 912
 913/**
 914 * cpufreq_cooling_register - function to create cpufreq cooling device.
 915 * @clip_cpus: cpumask of cpus where the frequency constraints will happen.
 916 *
 917 * This interface function registers the cpufreq cooling device with the name
 918 * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
 919 * cooling devices.
 920 *
 921 * Return: a valid struct thermal_cooling_device pointer on success,
 922 * on failure, it returns a corresponding ERR_PTR().
 923 */
 924struct thermal_cooling_device *
 925cpufreq_cooling_register(const struct cpumask *clip_cpus)
 926{
 927        return __cpufreq_cooling_register(NULL, clip_cpus, 0, NULL);
 928}
 929EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
 930
 931/**
 932 * of_cpufreq_cooling_register - function to create cpufreq cooling device.
 933 * @np: a valid struct device_node to the cooling device device tree node
 934 * @clip_cpus: cpumask of cpus where the frequency constraints will happen.
 935 *
 936 * This interface function registers the cpufreq cooling device with the name
 937 * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
 938 * cooling devices. Using this API, the cpufreq cooling device will be
 939 * linked to the device tree node provided.
 940 *
 941 * Return: a valid struct thermal_cooling_device pointer on success,
 942 * on failure, it returns a corresponding ERR_PTR().
 943 */
 944struct thermal_cooling_device *
 945of_cpufreq_cooling_register(struct device_node *np,
 946                            const struct cpumask *clip_cpus)
 947{
 948        if (!np)
 949                return ERR_PTR(-EINVAL);
 950
 951        return __cpufreq_cooling_register(np, clip_cpus, 0, NULL);
 952}
 953EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register);
 954
 955/**
 956 * cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
 957 * @clip_cpus:  cpumask of cpus where the frequency constraints will happen
 958 * @capacitance:        dynamic power coefficient for these cpus
 959 * @plat_static_func:   function to calculate the static power consumed by these
 960 *                      cpus (optional)
 961 *
 962 * This interface function registers the cpufreq cooling device with
 963 * the name "thermal-cpufreq-%x".  This api can support multiple
 964 * instances of cpufreq cooling devices.  Using this function, the
 965 * cooling device will implement the power extensions by using a
 966 * simple cpu power model.  The cpus must have registered their OPPs
 967 * using the OPP library.
 968 *
 969 * An optional @plat_static_func may be provided to calculate the
 970 * static power consumed by these cpus.  If the platform's static
 971 * power consumption is unknown or negligible, make it NULL.
 972 *
 973 * Return: a valid struct thermal_cooling_device pointer on success,
 974 * on failure, it returns a corresponding ERR_PTR().
 975 */
 976struct thermal_cooling_device *
 977cpufreq_power_cooling_register(const struct cpumask *clip_cpus, u32 capacitance,
 978                               get_static_t plat_static_func)
 979{
 980        return __cpufreq_cooling_register(NULL, clip_cpus, capacitance,
 981                                plat_static_func);
 982}
 983EXPORT_SYMBOL(cpufreq_power_cooling_register);
 984
 985/**
 986 * of_cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
 987 * @np: a valid struct device_node to the cooling device device tree node
 988 * @clip_cpus:  cpumask of cpus where the frequency constraints will happen
 989 * @capacitance:        dynamic power coefficient for these cpus
 990 * @plat_static_func:   function to calculate the static power consumed by these
 991 *                      cpus (optional)
 992 *
 993 * This interface function registers the cpufreq cooling device with
 994 * the name "thermal-cpufreq-%x".  This api can support multiple
 995 * instances of cpufreq cooling devices.  Using this API, the cpufreq
 996 * cooling device will be linked to the device tree node provided.
 997 * Using this function, the cooling device will implement the power
 998 * extensions by using a simple cpu power model.  The cpus must have
 999 * registered their OPPs using the OPP library.
1000 *
1001 * An optional @plat_static_func may be provided to calculate the
1002 * static power consumed by these cpus.  If the platform's static
1003 * power consumption is unknown or negligible, make it NULL.
1004 *
1005 * Return: a valid struct thermal_cooling_device pointer on success,
1006 * on failure, it returns a corresponding ERR_PTR().
1007 */
1008struct thermal_cooling_device *
1009of_cpufreq_power_cooling_register(struct device_node *np,
1010                                  const struct cpumask *clip_cpus,
1011                                  u32 capacitance,
1012                                  get_static_t plat_static_func)
1013{
1014        if (!np)
1015                return ERR_PTR(-EINVAL);
1016
1017        return __cpufreq_cooling_register(np, clip_cpus, capacitance,
1018                                plat_static_func);
1019}
1020EXPORT_SYMBOL(of_cpufreq_power_cooling_register);
1021
1022/**
1023 * cpufreq_cooling_unregister - function to remove cpufreq cooling device.
1024 * @cdev: thermal cooling device pointer.
1025 *
1026 * This interface function unregisters the "thermal-cpufreq-%x" cooling device.
1027 */
1028void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
1029{
1030        struct cpufreq_cooling_device *cpufreq_dev;
1031
1032        if (!cdev)
1033                return;
1034
1035        cpufreq_dev = cdev->devdata;
1036
1037        /* Unregister the notifier for the last cpufreq cooling device */
1038        mutex_lock(&cooling_cpufreq_lock);
1039        if (!--cpufreq_dev_count)
1040                cpufreq_unregister_notifier(&thermal_cpufreq_notifier_block,
1041                                            CPUFREQ_POLICY_NOTIFIER);
1042
1043        mutex_lock(&cooling_list_lock);
1044        list_del(&cpufreq_dev->node);
1045        mutex_unlock(&cooling_list_lock);
1046
1047        mutex_unlock(&cooling_cpufreq_lock);
1048
1049        thermal_cooling_device_unregister(cpufreq_dev->cool_dev);
1050        release_idr(&cpufreq_idr, cpufreq_dev->id);
1051        kfree(cpufreq_dev->dyn_power_table);
1052        kfree(cpufreq_dev->time_in_idle_timestamp);
1053        kfree(cpufreq_dev->time_in_idle);
1054        kfree(cpufreq_dev->freq_table);
1055        kfree(cpufreq_dev);
1056}
1057EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
1058