linux/drivers/thermal/intel_powerclamp.c
<<
>>
Prefs
   1/*
   2 * intel_powerclamp.c - package c-state idle injection
   3 *
   4 * Copyright (c) 2012, Intel Corporation.
   5 *
   6 * Authors:
   7 *     Arjan van de Ven <arjan@linux.intel.com>
   8 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
   9 *
  10 * This program is free software; you can redistribute it and/or modify it
  11 * under the terms and conditions of the GNU General Public License,
  12 * version 2, as published by the Free Software Foundation.
  13 *
  14 * This program is distributed in the hope it will be useful, but WITHOUT
  15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  17 * more details.
  18 *
  19 * You should have received a copy of the GNU General Public License along with
  20 * this program; if not, write to the Free Software Foundation, Inc.,
  21 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  22 *
  23 *
  24 *      TODO:
  25 *           1. better handle wakeup from external interrupts, currently a fixed
  26 *              compensation is added to clamping duration when excessive amount
  27 *              of wakeups are observed during idle time. the reason is that in
  28 *              case of external interrupts without need for ack, clamping down
  29 *              cpu in non-irq context does not reduce irq. for majority of the
  30 *              cases, clamping down cpu does help reduce irq as well, we should
  31 *              be able to differentiate the two cases and give a quantitative
  32 *              solution for the irqs that we can control. perhaps based on
  33 *              get_cpu_iowait_time_us()
  34 *
  35 *           2. synchronization with other hw blocks
  36 *
  37 *
  38 */
  39
  40#define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt
  41
  42#include <linux/module.h>
  43#include <linux/kernel.h>
  44#include <linux/delay.h>
  45#include <linux/kthread.h>
  46#include <linux/freezer.h>
  47#include <linux/cpu.h>
  48#include <linux/thermal.h>
  49#include <linux/slab.h>
  50#include <linux/tick.h>
  51#include <linux/debugfs.h>
  52#include <linux/seq_file.h>
  53#include <linux/sched/rt.h>
  54
  55#include <asm/nmi.h>
  56#include <asm/msr.h>
  57#include <asm/mwait.h>
  58#include <asm/cpu_device_id.h>
  59#include <asm/idle.h>
  60#include <asm/hardirq.h>
  61
  62#define MAX_TARGET_RATIO (50U)
  63/* For each undisturbed clamping period (no extra wake ups during idle time),
  64 * we increment the confidence counter for the given target ratio.
  65 * CONFIDENCE_OK defines the level where runtime calibration results are
  66 * valid.
  67 */
  68#define CONFIDENCE_OK (3)
  69/* Default idle injection duration, driver adjust sleep time to meet target
  70 * idle ratio. Similar to frequency modulation.
  71 */
  72#define DEFAULT_DURATION_JIFFIES (6)
  73
  74static unsigned int target_mwait;
  75static struct dentry *debug_dir;
  76
  77/* user selected target */
  78static unsigned int set_target_ratio;
  79static unsigned int current_ratio;
  80static bool should_skip;
  81static bool reduce_irq;
  82static atomic_t idle_wakeup_counter;
  83static unsigned int control_cpu; /* The cpu assigned to collect stat and update
  84                                  * control parameters. default to BSP but BSP
  85                                  * can be offlined.
  86                                  */
  87static bool clamping;
  88
  89
  90static struct task_struct * __percpu *powerclamp_thread;
  91static struct thermal_cooling_device *cooling_dev;
  92static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
  93                                           * clamping thread
  94                                           */
  95
  96static unsigned int duration;
  97static unsigned int pkg_cstate_ratio_cur;
  98static unsigned int window_size;
  99
 100static int duration_set(const char *arg, const struct kernel_param *kp)
 101{
 102        int ret = 0;
 103        unsigned long new_duration;
 104
 105        ret = kstrtoul(arg, 10, &new_duration);
 106        if (ret)
 107                goto exit;
 108        if (new_duration > 25 || new_duration < 6) {
 109                pr_err("Out of recommended range %lu, between 6-25ms\n",
 110                        new_duration);
 111                ret = -EINVAL;
 112        }
 113
 114        duration = clamp(new_duration, 6ul, 25ul);
 115        smp_mb();
 116
 117exit:
 118
 119        return ret;
 120}
 121
 122static const struct kernel_param_ops duration_ops = {
 123        .set = duration_set,
 124        .get = param_get_int,
 125};
 126
 127
 128module_param_cb(duration, &duration_ops, &duration, 0644);
 129MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
 130
 131struct powerclamp_calibration_data {
 132        unsigned long confidence;  /* used for calibration, basically a counter
 133                                    * gets incremented each time a clamping
 134                                    * period is completed without extra wakeups
 135                                    * once that counter is reached given level,
 136                                    * compensation is deemed usable.
 137                                    */
 138        unsigned long steady_comp; /* steady state compensation used when
 139                                    * no extra wakeups occurred.
 140                                    */
 141        unsigned long dynamic_comp; /* compensate excessive wakeup from idle
 142                                     * mostly from external interrupts.
 143                                     */
 144};
 145
 146static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
 147
 148static int window_size_set(const char *arg, const struct kernel_param *kp)
 149{
 150        int ret = 0;
 151        unsigned long new_window_size;
 152
 153        ret = kstrtoul(arg, 10, &new_window_size);
 154        if (ret)
 155                goto exit_win;
 156        if (new_window_size > 10 || new_window_size < 2) {
 157                pr_err("Out of recommended window size %lu, between 2-10\n",
 158                        new_window_size);
 159                ret = -EINVAL;
 160        }
 161
 162        window_size = clamp(new_window_size, 2ul, 10ul);
 163        smp_mb();
 164
 165exit_win:
 166
 167        return ret;
 168}
 169
 170static const struct kernel_param_ops window_size_ops = {
 171        .set = window_size_set,
 172        .get = param_get_int,
 173};
 174
 175module_param_cb(window_size, &window_size_ops, &window_size, 0644);
 176MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
 177        "\tpowerclamp controls idle ratio within this window. larger\n"
 178        "\twindow size results in slower response time but more smooth\n"
 179        "\tclamping results. default to 2.");
 180
 181static void find_target_mwait(void)
 182{
 183        unsigned int eax, ebx, ecx, edx;
 184        unsigned int highest_cstate = 0;
 185        unsigned int highest_subcstate = 0;
 186        int i;
 187
 188        if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
 189                return;
 190
 191        cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
 192
 193        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
 194            !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
 195                return;
 196
 197        edx >>= MWAIT_SUBSTATE_SIZE;
 198        for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
 199                if (edx & MWAIT_SUBSTATE_MASK) {
 200                        highest_cstate = i;
 201                        highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
 202                }
 203        }
 204        target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
 205                (highest_subcstate - 1);
 206
 207}
 208
 209struct pkg_cstate_info {
 210        bool skip;
 211        int msr_index;
 212        int cstate_id;
 213};
 214
 215#define PKG_CSTATE_INIT(id) {                           \
 216                .msr_index = MSR_PKG_C##id##_RESIDENCY, \
 217                .cstate_id = id                         \
 218                        }
 219
 220static struct pkg_cstate_info pkg_cstates[] = {
 221        PKG_CSTATE_INIT(2),
 222        PKG_CSTATE_INIT(3),
 223        PKG_CSTATE_INIT(6),
 224        PKG_CSTATE_INIT(7),
 225        PKG_CSTATE_INIT(8),
 226        PKG_CSTATE_INIT(9),
 227        PKG_CSTATE_INIT(10),
 228        {NULL},
 229};
 230
 231static bool has_pkg_state_counter(void)
 232{
 233        u64 val;
 234        struct pkg_cstate_info *info = pkg_cstates;
 235
 236        /* check if any one of the counter msrs exists */
 237        while (info->msr_index) {
 238                if (!rdmsrl_safe(info->msr_index, &val))
 239                        return true;
 240                info++;
 241        }
 242
 243        return false;
 244}
 245
 246static u64 pkg_state_counter(void)
 247{
 248        u64 val;
 249        u64 count = 0;
 250        struct pkg_cstate_info *info = pkg_cstates;
 251
 252        while (info->msr_index) {
 253                if (!info->skip) {
 254                        if (!rdmsrl_safe(info->msr_index, &val))
 255                                count += val;
 256                        else
 257                                info->skip = true;
 258                }
 259                info++;
 260        }
 261
 262        return count;
 263}
 264
 265static void noop_timer(unsigned long foo)
 266{
 267        /* empty... just the fact that we get the interrupt wakes us up */
 268}
 269
 270static unsigned int get_compensation(int ratio)
 271{
 272        unsigned int comp = 0;
 273
 274        /* we only use compensation if all adjacent ones are good */
 275        if (ratio == 1 &&
 276                cal_data[ratio].confidence >= CONFIDENCE_OK &&
 277                cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
 278                cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
 279                comp = (cal_data[ratio].steady_comp +
 280                        cal_data[ratio + 1].steady_comp +
 281                        cal_data[ratio + 2].steady_comp) / 3;
 282        } else if (ratio == MAX_TARGET_RATIO - 1 &&
 283                cal_data[ratio].confidence >= CONFIDENCE_OK &&
 284                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
 285                cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
 286                comp = (cal_data[ratio].steady_comp +
 287                        cal_data[ratio - 1].steady_comp +
 288                        cal_data[ratio - 2].steady_comp) / 3;
 289        } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
 290                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
 291                cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
 292                comp = (cal_data[ratio].steady_comp +
 293                        cal_data[ratio - 1].steady_comp +
 294                        cal_data[ratio + 1].steady_comp) / 3;
 295        }
 296
 297        /* REVISIT: simple penalty of double idle injection */
 298        if (reduce_irq)
 299                comp = ratio;
 300        /* do not exceed limit */
 301        if (comp + ratio >= MAX_TARGET_RATIO)
 302                comp = MAX_TARGET_RATIO - ratio - 1;
 303
 304        return comp;
 305}
 306
 307static void adjust_compensation(int target_ratio, unsigned int win)
 308{
 309        int delta;
 310        struct powerclamp_calibration_data *d = &cal_data[target_ratio];
 311
 312        /*
 313         * adjust compensations if confidence level has not been reached or
 314         * there are too many wakeups during the last idle injection period, we
 315         * cannot trust the data for compensation.
 316         */
 317        if (d->confidence >= CONFIDENCE_OK ||
 318                atomic_read(&idle_wakeup_counter) >
 319                win * num_online_cpus())
 320                return;
 321
 322        delta = set_target_ratio - current_ratio;
 323        /* filter out bad data */
 324        if (delta >= 0 && delta <= (1+target_ratio/10)) {
 325                if (d->steady_comp)
 326                        d->steady_comp =
 327                                roundup(delta+d->steady_comp, 2)/2;
 328                else
 329                        d->steady_comp = delta;
 330                d->confidence++;
 331        }
 332}
 333
 334static bool powerclamp_adjust_controls(unsigned int target_ratio,
 335                                unsigned int guard, unsigned int win)
 336{
 337        static u64 msr_last, tsc_last;
 338        u64 msr_now, tsc_now;
 339        u64 val64;
 340
 341        /* check result for the last window */
 342        msr_now = pkg_state_counter();
 343        tsc_now = rdtsc();
 344
 345        /* calculate pkg cstate vs tsc ratio */
 346        if (!msr_last || !tsc_last)
 347                current_ratio = 1;
 348        else if (tsc_now-tsc_last) {
 349                val64 = 100*(msr_now-msr_last);
 350                do_div(val64, (tsc_now-tsc_last));
 351                current_ratio = val64;
 352        }
 353
 354        /* update record */
 355        msr_last = msr_now;
 356        tsc_last = tsc_now;
 357
 358        adjust_compensation(target_ratio, win);
 359        /*
 360         * too many external interrupts, set flag such
 361         * that we can take measure later.
 362         */
 363        reduce_irq = atomic_read(&idle_wakeup_counter) >=
 364                2 * win * num_online_cpus();
 365
 366        atomic_set(&idle_wakeup_counter, 0);
 367        /* if we are above target+guard, skip */
 368        return set_target_ratio + guard <= current_ratio;
 369}
 370
 371static int clamp_thread(void *arg)
 372{
 373        int cpunr = (unsigned long)arg;
 374        DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
 375        static const struct sched_param param = {
 376                .sched_priority = MAX_USER_RT_PRIO/2,
 377        };
 378        unsigned int count = 0;
 379        unsigned int target_ratio;
 380
 381        set_bit(cpunr, cpu_clamping_mask);
 382        set_freezable();
 383        init_timer_on_stack(&wakeup_timer);
 384        sched_setscheduler(current, SCHED_FIFO, &param);
 385
 386        while (true == clamping && !kthread_should_stop() &&
 387                cpu_online(cpunr)) {
 388                int sleeptime;
 389                unsigned long target_jiffies;
 390                unsigned int guard;
 391                unsigned int compensated_ratio;
 392                int interval; /* jiffies to sleep for each attempt */
 393                unsigned int duration_jiffies = msecs_to_jiffies(duration);
 394                unsigned int window_size_now;
 395
 396                try_to_freeze();
 397                /*
 398                 * make sure user selected ratio does not take effect until
 399                 * the next round. adjust target_ratio if user has changed
 400                 * target such that we can converge quickly.
 401                 */
 402                target_ratio = set_target_ratio;
 403                guard = 1 + target_ratio/20;
 404                window_size_now = window_size;
 405                count++;
 406
 407                /*
 408                 * systems may have different ability to enter package level
 409                 * c-states, thus we need to compensate the injected idle ratio
 410                 * to achieve the actual target reported by the HW.
 411                 */
 412                compensated_ratio = target_ratio +
 413                        get_compensation(target_ratio);
 414                if (compensated_ratio <= 0)
 415                        compensated_ratio = 1;
 416                interval = duration_jiffies * 100 / compensated_ratio;
 417
 418                /* align idle time */
 419                target_jiffies = roundup(jiffies, interval);
 420                sleeptime = target_jiffies - jiffies;
 421                if (sleeptime <= 0)
 422                        sleeptime = 1;
 423                schedule_timeout_interruptible(sleeptime);
 424                /*
 425                 * only elected controlling cpu can collect stats and update
 426                 * control parameters.
 427                 */
 428                if (cpunr == control_cpu && !(count%window_size_now)) {
 429                        should_skip =
 430                                powerclamp_adjust_controls(target_ratio,
 431                                                        guard, window_size_now);
 432                        smp_mb();
 433                }
 434
 435                if (should_skip)
 436                        continue;
 437
 438                target_jiffies = jiffies + duration_jiffies;
 439                mod_timer(&wakeup_timer, target_jiffies);
 440                if (unlikely(local_softirq_pending()))
 441                        continue;
 442                /*
 443                 * stop tick sched during idle time, interrupts are still
 444                 * allowed. thus jiffies are updated properly.
 445                 */
 446                preempt_disable();
 447                /* mwait until target jiffies is reached */
 448                while (time_before(jiffies, target_jiffies)) {
 449                        unsigned long ecx = 1;
 450                        unsigned long eax = target_mwait;
 451
 452                        /*
 453                         * REVISIT: may call enter_idle() to notify drivers who
 454                         * can save power during cpu idle. same for exit_idle()
 455                         */
 456                        local_touch_nmi();
 457                        stop_critical_timings();
 458                        mwait_idle_with_hints(eax, ecx);
 459                        start_critical_timings();
 460                        atomic_inc(&idle_wakeup_counter);
 461                }
 462                preempt_enable();
 463        }
 464        del_timer_sync(&wakeup_timer);
 465        clear_bit(cpunr, cpu_clamping_mask);
 466
 467        return 0;
 468}
 469
 470/*
 471 * 1 HZ polling while clamping is active, useful for userspace
 472 * to monitor actual idle ratio.
 473 */
 474static void poll_pkg_cstate(struct work_struct *dummy);
 475static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
 476static void poll_pkg_cstate(struct work_struct *dummy)
 477{
 478        static u64 msr_last;
 479        static u64 tsc_last;
 480        static unsigned long jiffies_last;
 481
 482        u64 msr_now;
 483        unsigned long jiffies_now;
 484        u64 tsc_now;
 485        u64 val64;
 486
 487        msr_now = pkg_state_counter();
 488        tsc_now = rdtsc();
 489        jiffies_now = jiffies;
 490
 491        /* calculate pkg cstate vs tsc ratio */
 492        if (!msr_last || !tsc_last)
 493                pkg_cstate_ratio_cur = 1;
 494        else {
 495                if (tsc_now - tsc_last) {
 496                        val64 = 100 * (msr_now - msr_last);
 497                        do_div(val64, (tsc_now - tsc_last));
 498                        pkg_cstate_ratio_cur = val64;
 499                }
 500        }
 501
 502        /* update record */
 503        msr_last = msr_now;
 504        jiffies_last = jiffies_now;
 505        tsc_last = tsc_now;
 506
 507        if (true == clamping)
 508                schedule_delayed_work(&poll_pkg_cstate_work, HZ);
 509}
 510
 511static int start_power_clamp(void)
 512{
 513        unsigned long cpu;
 514        struct task_struct *thread;
 515
 516        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
 517        /* prevent cpu hotplug */
 518        get_online_cpus();
 519
 520        /* prefer BSP */
 521        control_cpu = 0;
 522        if (!cpu_online(control_cpu))
 523                control_cpu = smp_processor_id();
 524
 525        clamping = true;
 526        schedule_delayed_work(&poll_pkg_cstate_work, 0);
 527
 528        /* start one thread per online cpu */
 529        for_each_online_cpu(cpu) {
 530                struct task_struct **p =
 531                        per_cpu_ptr(powerclamp_thread, cpu);
 532
 533                thread = kthread_create_on_node(clamp_thread,
 534                                                (void *) cpu,
 535                                                cpu_to_node(cpu),
 536                                                "kidle_inject/%ld", cpu);
 537                /* bind to cpu here */
 538                if (likely(!IS_ERR(thread))) {
 539                        kthread_bind(thread, cpu);
 540                        wake_up_process(thread);
 541                        *p = thread;
 542                }
 543
 544        }
 545        put_online_cpus();
 546
 547        return 0;
 548}
 549
 550static void end_power_clamp(void)
 551{
 552        int i;
 553        struct task_struct *thread;
 554
 555        clamping = false;
 556        /*
 557         * make clamping visible to other cpus and give per cpu clamping threads
 558         * sometime to exit, or gets killed later.
 559         */
 560        smp_mb();
 561        msleep(20);
 562        if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
 563                for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
 564                        pr_debug("clamping thread for cpu %d alive, kill\n", i);
 565                        thread = *per_cpu_ptr(powerclamp_thread, i);
 566                        kthread_stop(thread);
 567                }
 568        }
 569}
 570
 571static int powerclamp_cpu_callback(struct notifier_block *nfb,
 572                                unsigned long action, void *hcpu)
 573{
 574        unsigned long cpu = (unsigned long)hcpu;
 575        struct task_struct *thread;
 576        struct task_struct **percpu_thread =
 577                per_cpu_ptr(powerclamp_thread, cpu);
 578
 579        if (false == clamping)
 580                goto exit_ok;
 581
 582        switch (action) {
 583        case CPU_ONLINE:
 584                thread = kthread_create_on_node(clamp_thread,
 585                                                (void *) cpu,
 586                                                cpu_to_node(cpu),
 587                                                "kidle_inject/%lu", cpu);
 588                if (likely(!IS_ERR(thread))) {
 589                        kthread_bind(thread, cpu);
 590                        wake_up_process(thread);
 591                        *percpu_thread = thread;
 592                }
 593                /* prefer BSP as controlling CPU */
 594                if (cpu == 0) {
 595                        control_cpu = 0;
 596                        smp_mb();
 597                }
 598                break;
 599        case CPU_DEAD:
 600                if (test_bit(cpu, cpu_clamping_mask)) {
 601                        pr_err("cpu %lu dead but powerclamping thread is not\n",
 602                                cpu);
 603                        kthread_stop(*percpu_thread);
 604                }
 605                if (cpu == control_cpu) {
 606                        control_cpu = smp_processor_id();
 607                        smp_mb();
 608                }
 609        }
 610
 611exit_ok:
 612        return NOTIFY_OK;
 613}
 614
 615static struct notifier_block powerclamp_cpu_notifier = {
 616        .notifier_call = powerclamp_cpu_callback,
 617};
 618
 619static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
 620                                 unsigned long *state)
 621{
 622        *state = MAX_TARGET_RATIO;
 623
 624        return 0;
 625}
 626
 627static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
 628                                 unsigned long *state)
 629{
 630        if (true == clamping)
 631                *state = pkg_cstate_ratio_cur;
 632        else
 633                /* to save power, do not poll idle ratio while not clamping */
 634                *state = -1; /* indicates invalid state */
 635
 636        return 0;
 637}
 638
 639static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
 640                                 unsigned long new_target_ratio)
 641{
 642        int ret = 0;
 643
 644        new_target_ratio = clamp(new_target_ratio, 0UL,
 645                                (unsigned long) (MAX_TARGET_RATIO-1));
 646        if (set_target_ratio == 0 && new_target_ratio > 0) {
 647                pr_info("Start idle injection to reduce power\n");
 648                set_target_ratio = new_target_ratio;
 649                ret = start_power_clamp();
 650                goto exit_set;
 651        } else  if (set_target_ratio > 0 && new_target_ratio == 0) {
 652                pr_info("Stop forced idle injection\n");
 653                end_power_clamp();
 654                set_target_ratio = 0;
 655        } else  /* adjust currently running */ {
 656                set_target_ratio = new_target_ratio;
 657                /* make new set_target_ratio visible to other cpus */
 658                smp_mb();
 659        }
 660
 661exit_set:
 662        return ret;
 663}
 664
 665/* bind to generic thermal layer as cooling device*/
 666static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
 667        .get_max_state = powerclamp_get_max_state,
 668        .get_cur_state = powerclamp_get_cur_state,
 669        .set_cur_state = powerclamp_set_cur_state,
 670};
 671
 672static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
 673        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
 674        {}
 675};
 676MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
 677
 678static int __init powerclamp_probe(void)
 679{
 680
 681        if (!x86_match_cpu(intel_powerclamp_ids)) {
 682                pr_err("CPU does not support MWAIT");
 683                return -ENODEV;
 684        }
 685
 686        /* The goal for idle time alignment is to achieve package cstate. */
 687        if (!has_pkg_state_counter()) {
 688                pr_info("No package C-state available");
 689                return -ENODEV;
 690        }
 691
 692        /* find the deepest mwait value */
 693        find_target_mwait();
 694
 695        return 0;
 696}
 697
 698static int powerclamp_debug_show(struct seq_file *m, void *unused)
 699{
 700        int i = 0;
 701
 702        seq_printf(m, "controlling cpu: %d\n", control_cpu);
 703        seq_printf(m, "pct confidence steady dynamic (compensation)\n");
 704        for (i = 0; i < MAX_TARGET_RATIO; i++) {
 705                seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
 706                        i,
 707                        cal_data[i].confidence,
 708                        cal_data[i].steady_comp,
 709                        cal_data[i].dynamic_comp);
 710        }
 711
 712        return 0;
 713}
 714
 715static int powerclamp_debug_open(struct inode *inode,
 716                        struct file *file)
 717{
 718        return single_open(file, powerclamp_debug_show, inode->i_private);
 719}
 720
 721static const struct file_operations powerclamp_debug_fops = {
 722        .open           = powerclamp_debug_open,
 723        .read           = seq_read,
 724        .llseek         = seq_lseek,
 725        .release        = single_release,
 726        .owner          = THIS_MODULE,
 727};
 728
 729static inline void powerclamp_create_debug_files(void)
 730{
 731        debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
 732        if (!debug_dir)
 733                return;
 734
 735        if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
 736                                        cal_data, &powerclamp_debug_fops))
 737                goto file_error;
 738
 739        return;
 740
 741file_error:
 742        debugfs_remove_recursive(debug_dir);
 743}
 744
 745static int __init powerclamp_init(void)
 746{
 747        int retval;
 748        int bitmap_size;
 749
 750        bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
 751        cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
 752        if (!cpu_clamping_mask)
 753                return -ENOMEM;
 754
 755        /* probe cpu features and ids here */
 756        retval = powerclamp_probe();
 757        if (retval)
 758                goto exit_free;
 759
 760        /* set default limit, maybe adjusted during runtime based on feedback */
 761        window_size = 2;
 762        register_hotcpu_notifier(&powerclamp_cpu_notifier);
 763
 764        powerclamp_thread = alloc_percpu(struct task_struct *);
 765        if (!powerclamp_thread) {
 766                retval = -ENOMEM;
 767                goto exit_unregister;
 768        }
 769
 770        cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
 771                                                &powerclamp_cooling_ops);
 772        if (IS_ERR(cooling_dev)) {
 773                retval = -ENODEV;
 774                goto exit_free_thread;
 775        }
 776
 777        if (!duration)
 778                duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);
 779
 780        powerclamp_create_debug_files();
 781
 782        return 0;
 783
 784exit_free_thread:
 785        free_percpu(powerclamp_thread);
 786exit_unregister:
 787        unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
 788exit_free:
 789        kfree(cpu_clamping_mask);
 790        return retval;
 791}
 792module_init(powerclamp_init);
 793
 794static void __exit powerclamp_exit(void)
 795{
 796        unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
 797        end_power_clamp();
 798        free_percpu(powerclamp_thread);
 799        thermal_cooling_device_unregister(cooling_dev);
 800        kfree(cpu_clamping_mask);
 801
 802        cancel_delayed_work_sync(&poll_pkg_cstate_work);
 803        debugfs_remove_recursive(debug_dir);
 804}
 805module_exit(powerclamp_exit);
 806
 807MODULE_LICENSE("GPL");
 808MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
 809MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
 810MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
 811