linux/drivers/thermal/intel_powerclamp.c
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 *      TODO:
 *           1. better handle wakeup from external interrupts. Currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. The
 *              reason is that for external interrupts that need no ack,
 *              clamping down a cpu in non-irq context does not reduce irqs.
 *              For the majority of cases clamping down a cpu does help reduce
 *              irqs as well; we should be able to differentiate the two cases
 *              and give a quantitative solution for the irqs that we can
 *              control, perhaps based on get_cpu_iowait_time_us().
 *
 *           2. synchronization with other hw blocks
 *
 *
 */

#define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts sleep time to meet the
 * target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
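
/*
 * Illustrative numbers (assuming HZ = 1000): DEFAULT_DURATION_JIFFIES of 6
 * corresponds to a 6 ms idle pulse per injection attempt. With a 30% target
 * ratio the balancing logic below spaces those pulses roughly every
 * 6 * 100 / 30 = 20 jiffies (20 ms), analogous to adjusting the spacing of a
 * fixed-width pulse train.
 */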

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
                                  * update control parameters. Defaults to
                                  * the BSP, but the BSP can be offlined.
                                  */
static bool clamping;

static const struct sched_param sparam = {
        .sched_priority = MAX_USER_RT_PRIO / 2,
};
struct powerclamp_worker_data {
        struct kthread_worker *worker;
        struct kthread_work balancing_work;
        struct kthread_delayed_work idle_injection_work;
        unsigned int cpu;
        unsigned int count;
        unsigned int guard;
        unsigned int window_size_now;
        unsigned int target_ratio;
        unsigned int duration_jiffies;
        bool clamping;
};

static struct powerclamp_worker_data * __percpu worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
                                           * clamping kthread worker
                                           */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_duration;

        ret = kstrtoul(arg, 10, &new_duration);
        if (ret)
                goto exit;
        if (new_duration > 25 || new_duration < 6) {
                pr_err("Out of recommended range %lu, between 6-25ms\n",
                        new_duration);
                ret = -EINVAL;
        }

        duration = clamp(new_duration, 6ul, 25ul);
        smp_mb();

exit:

        return ret;
}

static const struct kernel_param_ops duration_ops = {
        .set = duration_set,
        .get = param_get_int,
};

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
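
/*
 * Usage sketch (illustrative, standard module parameter path):
 *   # echo 10 > /sys/module/intel_powerclamp/parameters/duration
 * Per duration_set() above, an out-of-range write returns -EINVAL to the
 * caller, although the stored value is still clamped into 6..25 ms.
 */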

struct powerclamp_calibration_data {
        unsigned long confidence;  /* used for calibration; basically a counter
                                    * that gets incremented each time a clamping
                                    * period is completed without extra wakeups.
                                    * Once the counter reaches a given level,
                                    * compensation is deemed usable.
                                    */
        unsigned long steady_comp; /* steady state compensation used when
                                    * no extra wakeups occurred.
                                    */
        unsigned long dynamic_comp; /* compensates for excessive wakeups from
                                     * idle, mostly from external interrupts.
                                     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_window_size;

        ret = kstrtoul(arg, 10, &new_window_size);
        if (ret)
                goto exit_win;
        if (new_window_size > 10 || new_window_size < 2) {
                pr_err("Out of recommended window size %lu, between 2-10\n",
                        new_window_size);
                ret = -EINVAL;
        }

        window_size = clamp(new_window_size, 2ul, 10ul);
        smp_mb();

exit_win:

        return ret;
}

static const struct kernel_param_ops window_size_ops = {
        .set = window_size_set,
        .get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
        "\tpowerclamp controls the idle ratio within this window. A larger\n"
        "\twindow size results in slower response time but smoother\n"
        "\tclamping results. Defaults to 2.");
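
/*
 * Example (illustrative): with window_size = 2, the controlling cpu
 * re-evaluates the measured idle ratio every 2 clamping cycles, so a
 * changed target takes effect quickly; window_size = 10 averages over
 * five times as many cycles for smoother but slower convergence.
 */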

static void find_target_mwait(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int highest_cstate = 0;
        unsigned int highest_subcstate = 0;
        int i;

        if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
                return;

        cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
            !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
                return;

        edx >>= MWAIT_SUBSTATE_SIZE;
        for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
                if (edx & MWAIT_SUBSTATE_MASK) {
                        highest_cstate = i;
                        highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
                }
        }
        target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
                (highest_subcstate - 1);
}
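
/*
 * Encoding example (illustrative): if the scan above ends with
 * highest_cstate = 6 and highest_subcstate = 2, then with
 * MWAIT_SUBSTATE_SIZE = 4 the computed hint is
 * (6 << 4) | (2 - 1) = 0x61.
 */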

struct pkg_cstate_info {
        bool skip;
        int msr_index;
        int cstate_id;
};

#define PKG_CSTATE_INIT(id) {                           \
                .msr_index = MSR_PKG_C##id##_RESIDENCY, \
                .cstate_id = id                         \
                        }

static struct pkg_cstate_info pkg_cstates[] = {
        PKG_CSTATE_INIT(2),
        PKG_CSTATE_INIT(3),
        PKG_CSTATE_INIT(6),
        PKG_CSTATE_INIT(7),
        PKG_CSTATE_INIT(8),
        PKG_CSTATE_INIT(9),
        PKG_CSTATE_INIT(10),
        {NULL},
};

static bool has_pkg_state_counter(void)
{
        u64 val;
        struct pkg_cstate_info *info = pkg_cstates;

        /* check if any one of the counter msrs exists */
        while (info->msr_index) {
                if (!rdmsrl_safe(info->msr_index, &val))
                        return true;
                info++;
        }

        return false;
}

static u64 pkg_state_counter(void)
{
        u64 val;
        u64 count = 0;
        struct pkg_cstate_info *info = pkg_cstates;

        while (info->msr_index) {
                if (!info->skip) {
                        if (!rdmsrl_safe(info->msr_index, &val))
                                count += val;
                        else
                                info->skip = true;
                }
                info++;
        }

        return count;
}
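
/*
 * Worked example (illustrative values): if rdmsrl_safe() reads residency
 * counts of 4e9 for PC2 and 1e9 for PC6 while the other MSRs fault, the
 * function returns 5e9 and marks the faulting entries with skip so they
 * are not retried. The sum acts as one monotonic "any package C-state"
 * counter whose deltas are later compared against TSC deltas.
 */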

static unsigned int get_compensation(int ratio)
{
        unsigned int comp = 0;

        /* we only use compensation if all adjacent ones are good */
        if (ratio == 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio + 1].steady_comp +
                        cal_data[ratio + 2].steady_comp) / 3;
        } else if (ratio == MAX_TARGET_RATIO - 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio - 2].steady_comp) / 3;
        } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio + 1].steady_comp) / 3;
        }

        /* REVISIT: simple penalty of double idle injection */
        if (reduce_irq)
                comp = ratio;
        /* do not exceed limit */
        if (comp + ratio >= MAX_TARGET_RATIO)
                comp = MAX_TARGET_RATIO - ratio - 1;

        return comp;
}
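
/*
 * Worked example (illustrative values): for ratio = 20 with confident
 * neighbours and steady_comp[19] = 2, steady_comp[20] = 3,
 * steady_comp[21] = 4, the middle branch yields
 * comp = (3 + 2 + 4) / 3 = 3, so the caller injects 20 + 3 = 23% idle
 * to hit a measured 20%.
 */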

static void adjust_compensation(int target_ratio, unsigned int win)
{
        int delta;
        struct powerclamp_calibration_data *d = &cal_data[target_ratio];

        /*
         * Skip adjustment if the confidence level has already been reached,
         * or if there were too many wakeups during the last idle injection
         * period; in that case we cannot trust the data for compensation.
         */
        if (d->confidence >= CONFIDENCE_OK ||
                atomic_read(&idle_wakeup_counter) >
                win * num_online_cpus())
                return;

        delta = set_target_ratio - current_ratio;
        /* filter out bad data */
        if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
                if (d->steady_comp)
                        d->steady_comp =
                                roundup(delta + d->steady_comp, 2) / 2;
                else
                        d->steady_comp = delta;
                d->confidence++;
        }
}
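
/*
 * Worked example (illustrative values): with target_ratio = 30 the filter
 * accepts 0 <= delta <= 1 + 30/10 = 4. If steady_comp = 3 and the new
 * delta = 2, the running average becomes roundup(2 + 3, 2) / 2 = 3, and
 * confidence is incremented toward CONFIDENCE_OK.
 */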

static bool powerclamp_adjust_controls(unsigned int target_ratio,
                                unsigned int guard, unsigned int win)
{
        static u64 msr_last, tsc_last;
        u64 msr_now, tsc_now;
        u64 val64;

        /* check result for the last window */
        msr_now = pkg_state_counter();
        tsc_now = rdtsc();

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                current_ratio = 1;
        else if (tsc_now - tsc_last) {
                val64 = 100 * (msr_now - msr_last);
                do_div(val64, (tsc_now - tsc_last));
                current_ratio = val64;
        }

        /* update record */
        msr_last = msr_now;
        tsc_last = tsc_now;

        adjust_compensation(target_ratio, win);
        /*
         * too many external interrupts; set the flag so that we can
         * take measures later.
         */
        reduce_irq = atomic_read(&idle_wakeup_counter) >=
                2 * win * num_online_cpus();

        atomic_set(&idle_wakeup_counter, 0);
        /* if we are above target+guard, skip */
        return set_target_ratio + guard <= current_ratio;
}
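
/*
 * Worked example (illustrative values): over a window in which the TSC
 * advanced by 2e9 cycles and the summed package C-state residency
 * advanced by 6e8, current_ratio = 100 * 6e8 / 2e9 = 30 (percent). With
 * set_target_ratio = 25 and guard = 2, 25 + 2 <= 30 holds, so the next
 * idle injection is skipped.
 */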

static void clamp_balancing_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;
        int sleeptime;
        unsigned long target_jiffies;
        unsigned int compensated_ratio;
        int interval; /* jiffies to sleep for each attempt */

        w_data = container_of(work, struct powerclamp_worker_data,
                              balancing_work);

        /*
         * make sure the user selected ratio does not take effect until
         * the next round. Adjust target_ratio if the user has changed
         * the target, so that we can converge quickly.
         */
        w_data->target_ratio = READ_ONCE(set_target_ratio);
        w_data->guard = 1 + w_data->target_ratio / 20;
        w_data->window_size_now = window_size;
        w_data->duration_jiffies = msecs_to_jiffies(duration);
        w_data->count++;

        /*
         * systems may have different abilities to enter package level
         * c-states, thus we need to compensate the injected idle ratio
         * to achieve the actual target reported by the HW.
         */
        compensated_ratio = w_data->target_ratio +
                get_compensation(w_data->target_ratio);
        if (compensated_ratio <= 0)
                compensated_ratio = 1;
        interval = w_data->duration_jiffies * 100 / compensated_ratio;

        /* align idle time */
        target_jiffies = roundup(jiffies, interval);
        sleeptime = target_jiffies - jiffies;
        if (sleeptime <= 0)
                sleeptime = 1;

        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_delayed_work(w_data->worker,
                                           &w_data->idle_injection_work,
                                           sleeptime);
}
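
/*
 * Worked example (illustrative, assuming HZ = 1000): duration = 6 ms gives
 * duration_jiffies = 6; a compensated ratio of 33 yields
 * interval = 6 * 100 / 33 = 18 jiffies. If jiffies = 1000, the work is
 * aligned to target_jiffies = roundup(1000, 18) = 1008, so the idle
 * injection fires after sleeptime = 8 jiffies.
 */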

static void clamp_idle_injection_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;

        w_data = container_of(work, struct powerclamp_worker_data,
                              idle_injection_work.work);

        /*
         * only the elected controlling cpu can collect stats and update
         * control parameters.
         */
        if (w_data->cpu == control_cpu &&
            !(w_data->count % w_data->window_size_now)) {
                should_skip =
                        powerclamp_adjust_controls(w_data->target_ratio,
                                                   w_data->guard,
                                                   w_data->window_size_now);
                smp_mb();
        }

        if (should_skip)
                goto balance;

        play_idle(jiffies_to_msecs(w_data->duration_jiffies));

balance:
        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * Poll once per second (every HZ jiffies) while clamping is active;
 * useful for userspace to monitor the actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
        static u64 msr_last;
        static u64 tsc_last;

        u64 msr_now;
        u64 tsc_now;
        u64 val64;

        msr_now = pkg_state_counter();
        tsc_now = rdtsc();

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                pkg_cstate_ratio_cur = 1;
        else {
                if (tsc_now - tsc_last) {
                        val64 = 100 * (msr_now - msr_last);
                        do_div(val64, (tsc_now - tsc_last));
                        pkg_cstate_ratio_cur = val64;
                }
        }

        /* update record */
        msr_last = msr_now;
        tsc_last = tsc_now;

        if (clamping)
                schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static void start_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
        struct kthread_worker *worker;

        worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu);
        if (IS_ERR(worker))
                return;

        w_data->worker = worker;
        w_data->count = 0;
        w_data->cpu = cpu;
        w_data->clamping = true;
        set_bit(cpu, cpu_clamping_mask);
        sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
        kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
        kthread_init_delayed_work(&w_data->idle_injection_work,
                                  clamp_idle_injection_func);
        kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

        if (!w_data->worker)
                return;

        w_data->clamping = false;
        /*
         * Make sure that all work queued after this point sees clamping
         * disabled. The counterpart barrier is not needed because there
         * is an implicit memory barrier when the queued work is processed.
         */
        smp_wmb();
        kthread_cancel_work_sync(&w_data->balancing_work);
        kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
        /*
         * The balancing work might still be queued here because the
         * handling of the "clamping" variable, cancel, and queue
         * operations are not synchronized via a lock. But it is not
         * a big deal. The balancing work is fast and
         * kthread_destroy_worker() will wait for it.
         */
        clear_bit(w_data->cpu, cpu_clamping_mask);
        kthread_destroy_worker(w_data->worker);

        w_data->worker = NULL;
}

static int start_power_clamp(void)
{
        unsigned long cpu;

        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
        /* prevent cpu hotplug */
        get_online_cpus();

        /* prefer BSP */
        control_cpu = 0;
        if (!cpu_online(control_cpu))
                control_cpu = smp_processor_id();

        clamping = true;
        schedule_delayed_work(&poll_pkg_cstate_work, 0);

        /* start one kthread worker per online cpu */
        for_each_online_cpu(cpu) {
                start_power_clamp_worker(cpu);
        }
        put_online_cpus();

        return 0;
}

static void end_power_clamp(void)
{
        int i;

        /*
         * Block requeuing in all the kthread workers. They will flush and
         * stop faster.
         */
        clamping = false;
        if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
                for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
                        pr_debug("clamping worker for cpu %d alive, destroy\n",
                                 i);
                        stop_power_clamp_worker(i);
                }
        }
}

static int powerclamp_cpu_online(unsigned int cpu)
{
        if (!clamping)
                return 0;
        start_power_clamp_worker(cpu);
        /* prefer BSP as controlling CPU */
        if (cpu == 0) {
                control_cpu = 0;
                smp_mb();
        }
        return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
        if (!clamping)
                return 0;

        stop_power_clamp_worker(cpu);
        if (cpu != control_cpu)
                return 0;

        control_cpu = cpumask_first(cpu_online_mask);
        if (control_cpu == cpu)
                control_cpu = cpumask_next(cpu, cpu_online_mask);
        smp_mb();
        return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        *state = MAX_TARGET_RATIO;

        return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        if (clamping)
                *state = pkg_cstate_ratio_cur;
        else
                /* to save power, do not poll idle ratio while not clamping */
                *state = -1; /* indicates invalid state */

        return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long new_target_ratio)
{
        int ret = 0;

        new_target_ratio = clamp(new_target_ratio, 0UL,
                                (unsigned long) (MAX_TARGET_RATIO - 1));
        if (set_target_ratio == 0 && new_target_ratio > 0) {
                pr_info("Start idle injection to reduce power\n");
                set_target_ratio = new_target_ratio;
                ret = start_power_clamp();
                goto exit_set;
        } else if (set_target_ratio > 0 && new_target_ratio == 0) {
                pr_info("Stop forced idle injection\n");
                end_power_clamp();
                set_target_ratio = 0;
        } else /* adjust currently running */ {
                set_target_ratio = new_target_ratio;
                /* make new set_target_ratio visible to other cpus */
                smp_mb();
        }

exit_set:
        return ret;
}
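
/*
 * Usage sketch (illustrative; the cooling device index X varies per system):
 *   # echo 30 > /sys/class/thermal/cooling_deviceX/cur_state
 * starts idle injection with a 30% target ratio, and writing 0 stops it,
 * via the thermal core calling powerclamp_set_cur_state() above.
 */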

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
        .get_max_state = powerclamp_get_max_state,
        .get_cur_state = powerclamp_get_cur_state,
        .set_cur_state = powerclamp_set_cur_state,
};

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
        {}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
        if (!x86_match_cpu(intel_powerclamp_ids)) {
                pr_err("CPU does not support MWAIT\n");
                return -ENODEV;
        }

        /* The goal for idle time alignment is to achieve package cstate. */
        if (!has_pkg_state_counter()) {
                pr_info("No package C-state available\n");
                return -ENODEV;
        }

        /* find the deepest mwait value */
        find_target_mwait();

        return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
        int i = 0;

        seq_printf(m, "controlling cpu: %d\n", control_cpu);
        seq_printf(m, "pct confidence steady dynamic (compensation)\n");
        for (i = 0; i < MAX_TARGET_RATIO; i++) {
                seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
                        i,
                        cal_data[i].confidence,
                        cal_data[i].steady_comp,
                        cal_data[i].dynamic_comp);
        }

        return 0;
}

static int powerclamp_debug_open(struct inode *inode,
                        struct file *file)
{
        return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
        .open           = powerclamp_debug_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
        .owner          = THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
        debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
        if (!debug_dir)
                return;

        if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
                                        cal_data, &powerclamp_debug_fops))
                goto file_error;

        return;

file_error:
        debugfs_remove_recursive(debug_dir);
}
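
/*
 * Usage sketch (illustrative, requires debugfs to be mounted):
 *   # cat /sys/kernel/debug/intel_powerclamp/powerclamp_calib
 * dumps the per-ratio confidence and compensation table built from
 * cal_data, one line per target percentage.
 */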

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
        int retval;
        int bitmap_size;

        bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
        cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
        if (!cpu_clamping_mask)
                return -ENOMEM;

        /* probe cpu features and ids here */
        retval = powerclamp_probe();
        if (retval)
                goto exit_free;

        /* set default limit, maybe adjusted during runtime based on feedback */
        window_size = 2;
        retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                           "thermal/intel_powerclamp:online",
                                           powerclamp_cpu_online,
                                           powerclamp_cpu_predown);
        if (retval < 0)
                goto exit_free;

        hp_state = retval;

        worker_data = alloc_percpu(struct powerclamp_worker_data);
        if (!worker_data) {
                retval = -ENOMEM;
                goto exit_unregister;
        }

        cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
                                                &powerclamp_cooling_ops);
        if (IS_ERR(cooling_dev)) {
                retval = -ENODEV;
                goto exit_free_thread;
        }

        if (!duration)
                duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

        powerclamp_create_debug_files();

        return 0;

exit_free_thread:
        free_percpu(worker_data);
exit_unregister:
        cpuhp_remove_state_nocalls(hp_state);
exit_free:
        kfree(cpu_clamping_mask);
        return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
        end_power_clamp();
        cpuhp_remove_state_nocalls(hp_state);
        free_percpu(worker_data);
        thermal_cooling_device_unregister(cooling_dev);
        kfree(cpu_clamping_mask);

        cancel_delayed_work_sync(&poll_pkg_cstate_work);
        debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");