linux/drivers/thermal/intel_powerclamp.c
<<
>>
Prefs
   1/*
   2 * intel_powerclamp.c - package c-state idle injection
   3 *
   4 * Copyright (c) 2012, Intel Corporation.
   5 *
   6 * Authors:
   7 *     Arjan van de Ven <arjan@linux.intel.com>
   8 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
   9 *
  10 * This program is free software; you can redistribute it and/or modify it
  11 * under the terms and conditions of the GNU General Public License,
  12 * version 2, as published by the Free Software Foundation.
  13 *
  14 * This program is distributed in the hope it will be useful, but WITHOUT
  15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  17 * more details.
  18 *
  19 * You should have received a copy of the GNU General Public License along with
  20 * this program; if not, write to the Free Software Foundation, Inc.,
  21 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  22 *
  23 *
  24 *      TODO:
  25 *           1. better handle wakeup from external interrupts, currently a fixed
  26 *              compensation is added to clamping duration when excessive amount
  27 *              of wakeups are observed during idle time. the reason is that in
  28 *              case of external interrupts without need for ack, clamping down
  29 *              cpu in non-irq context does not reduce irq. for majority of the
  30 *              cases, clamping down cpu does help reduce irq as well, we should
  31 *              be able to differentiate the two cases and give a quantitative
  32 *              solution for the irqs that we can control. perhaps based on
  33 *              get_cpu_iowait_time_us()
  34 *
  35 *           2. synchronization with other hw blocks
  36 *
  37 *
  38 */
  39
  40#define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt
  41
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>
  61
  62#define MAX_TARGET_RATIO (50U)
  63/* For each undisturbed clamping period (no extra wake ups during idle time),
  64 * we increment the confidence counter for the given target ratio.
  65 * CONFIDENCE_OK defines the level where runtime calibration results are
  66 * valid.
  67 */
  68#define CONFIDENCE_OK (3)
  69/* Default idle injection duration, driver adjust sleep time to meet target
  70 * idle ratio. Similar to frequency modulation.
  71 */
  72#define DEFAULT_DURATION_JIFFIES (6)
  73
  74static unsigned int target_mwait;
  75static struct dentry *debug_dir;
  76
  77/* user selected target */
  78static unsigned int set_target_ratio;
  79static unsigned int current_ratio;
  80static bool should_skip;
  81static bool reduce_irq;
  82static atomic_t idle_wakeup_counter;
  83static unsigned int control_cpu; /* The cpu assigned to collect stat and update
  84                                  * control parameters. default to BSP but BSP
  85                                  * can be offlined.
  86                                  */
  87static bool clamping;
  88
  89
  90static struct task_struct * __percpu *powerclamp_thread;
  91static struct thermal_cooling_device *cooling_dev;
  92static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
  93                                           * clamping thread
  94                                           */
  95
  96static unsigned int duration;
  97static unsigned int pkg_cstate_ratio_cur;
  98static unsigned int window_size;
  99
 100static int duration_set(const char *arg, const struct kernel_param *kp)
 101{
 102        int ret = 0;
 103        unsigned long new_duration;
 104
 105        ret = kstrtoul(arg, 10, &new_duration);
 106        if (ret)
 107                goto exit;
 108        if (new_duration > 25 || new_duration < 6) {
 109                pr_err("Out of recommended range %lu, between 6-25ms\n",
 110                        new_duration);
 111                ret = -EINVAL;
 112        }
 113
 114        duration = clamp(new_duration, 6ul, 25ul);
 115        smp_mb();
 116
 117exit:
 118
 119        return ret;
 120}
 121
 122static const struct kernel_param_ops duration_ops = {
 123        .set = duration_set,
 124        .get = param_get_int,
 125};
 126
 127
 128module_param_cb(duration, &duration_ops, &duration, 0644);
 129MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
 130
 131struct powerclamp_calibration_data {
 132        unsigned long confidence;  /* used for calibration, basically a counter
 133                                    * gets incremented each time a clamping
 134                                    * period is completed without extra wakeups
 135                                    * once that counter is reached given level,
 136                                    * compensation is deemed usable.
 137                                    */
 138        unsigned long steady_comp; /* steady state compensation used when
 139                                    * no extra wakeups occurred.
 140                                    */
 141        unsigned long dynamic_comp; /* compensate excessive wakeup from idle
 142                                     * mostly from external interrupts.
 143                                     */
 144};
 145
 146static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
 147
 148static int window_size_set(const char *arg, const struct kernel_param *kp)
 149{
 150        int ret = 0;
 151        unsigned long new_window_size;
 152
 153        ret = kstrtoul(arg, 10, &new_window_size);
 154        if (ret)
 155                goto exit_win;
 156        if (new_window_size > 10 || new_window_size < 2) {
 157                pr_err("Out of recommended window size %lu, between 2-10\n",
 158                        new_window_size);
 159                ret = -EINVAL;
 160        }
 161
 162        window_size = clamp(new_window_size, 2ul, 10ul);
 163        smp_mb();
 164
 165exit_win:
 166
 167        return ret;
 168}
 169
 170static const struct kernel_param_ops window_size_ops = {
 171        .set = window_size_set,
 172        .get = param_get_int,
 173};
 174
 175module_param_cb(window_size, &window_size_ops, &window_size, 0644);
 176MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
 177        "\tpowerclamp controls idle ratio within this window. larger\n"
 178        "\twindow size results in slower response time but more smooth\n"
 179        "\tclamping results. default to 2.");
 180
 181static void find_target_mwait(void)
 182{
 183        unsigned int eax, ebx, ecx, edx;
 184        unsigned int highest_cstate = 0;
 185        unsigned int highest_subcstate = 0;
 186        int i;
 187
 188        if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
 189                return;
 190
 191        cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
 192
 193        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
 194            !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
 195                return;
 196
 197        edx >>= MWAIT_SUBSTATE_SIZE;
 198        for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
 199                if (edx & MWAIT_SUBSTATE_MASK) {
 200                        highest_cstate = i;
 201                        highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
 202                }
 203        }
 204        target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
 205                (highest_subcstate - 1);
 206
 207}
 208
 209struct pkg_cstate_info {
 210        bool skip;
 211        int msr_index;
 212        int cstate_id;
 213};
 214
 215#define PKG_CSTATE_INIT(id) {                           \
 216                .msr_index = MSR_PKG_C##id##_RESIDENCY, \
 217                .cstate_id = id                         \
 218                        }
 219
 220static struct pkg_cstate_info pkg_cstates[] = {
 221        PKG_CSTATE_INIT(2),
 222        PKG_CSTATE_INIT(3),
 223        PKG_CSTATE_INIT(6),
 224        PKG_CSTATE_INIT(7),
 225        PKG_CSTATE_INIT(8),
 226        PKG_CSTATE_INIT(9),
 227        PKG_CSTATE_INIT(10),
 228        {NULL},
 229};
 230
 231static bool has_pkg_state_counter(void)
 232{
 233        u64 val;
 234        struct pkg_cstate_info *info = pkg_cstates;
 235
 236        /* check if any one of the counter msrs exists */
 237        while (info->msr_index) {
 238                if (!rdmsrl_safe(info->msr_index, &val))
 239                        return true;
 240                info++;
 241        }
 242
 243        return false;
 244}
 245
 246static u64 pkg_state_counter(void)
 247{
 248        u64 val;
 249        u64 count = 0;
 250        struct pkg_cstate_info *info = pkg_cstates;
 251
 252        while (info->msr_index) {
 253                if (!info->skip) {
 254                        if (!rdmsrl_safe(info->msr_index, &val))
 255                                count += val;
 256                        else
 257                                info->skip = true;
 258                }
 259                info++;
 260        }
 261
 262        return count;
 263}
 264
 265static void noop_timer(unsigned long foo)
 266{
 267        /* empty... just the fact that we get the interrupt wakes us up */
 268}
 269
 270static unsigned int get_compensation(int ratio)
 271{
 272        unsigned int comp = 0;
 273
 274        /* we only use compensation if all adjacent ones are good */
 275        if (ratio == 1 &&
 276                cal_data[ratio].confidence >= CONFIDENCE_OK &&
 277                cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
 278                cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
 279                comp = (cal_data[ratio].steady_comp +
 280                        cal_data[ratio + 1].steady_comp +
 281                        cal_data[ratio + 2].steady_comp) / 3;
 282        } else if (ratio == MAX_TARGET_RATIO - 1 &&
 283                cal_data[ratio].confidence >= CONFIDENCE_OK &&
 284                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
 285                cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
 286                comp = (cal_data[ratio].steady_comp +
 287                        cal_data[ratio - 1].steady_comp +
 288                        cal_data[ratio - 2].steady_comp) / 3;
 289        } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
 290                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
 291                cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
 292                comp = (cal_data[ratio].steady_comp +
 293                        cal_data[ratio - 1].steady_comp +
 294                        cal_data[ratio + 1].steady_comp) / 3;
 295        }
 296
 297        /* REVISIT: simple penalty of double idle injection */
 298        if (reduce_irq)
 299                comp = ratio;
 300        /* do not exceed limit */
 301        if (comp + ratio >= MAX_TARGET_RATIO)
 302                comp = MAX_TARGET_RATIO - ratio - 1;
 303
 304        return comp;
 305}
 306
 307static void adjust_compensation(int target_ratio, unsigned int win)
 308{
 309        int delta;
 310        struct powerclamp_calibration_data *d = &cal_data[target_ratio];
 311
 312        /*
 313         * adjust compensations if confidence level has not been reached or
 314         * there are too many wakeups during the last idle injection period, we
 315         * cannot trust the data for compensation.
 316         */
 317        if (d->confidence >= CONFIDENCE_OK ||
 318                atomic_read(&idle_wakeup_counter) >
 319                win * num_online_cpus())
 320                return;
 321
 322        delta = set_target_ratio - current_ratio;
 323        /* filter out bad data */
 324        if (delta >= 0 && delta <= (1+target_ratio/10)) {
 325                if (d->steady_comp)
 326                        d->steady_comp =
 327                                roundup(delta+d->steady_comp, 2)/2;
 328                else
 329                        d->steady_comp = delta;
 330                d->confidence++;
 331        }
 332}
 333
 334static bool powerclamp_adjust_controls(unsigned int target_ratio,
 335                                unsigned int guard, unsigned int win)
 336{
 337        static u64 msr_last, tsc_last;
 338        u64 msr_now, tsc_now;
 339        u64 val64;
 340
 341        /* check result for the last window */
 342        msr_now = pkg_state_counter();
 343        tsc_now = rdtsc();
 344
 345        /* calculate pkg cstate vs tsc ratio */
 346        if (!msr_last || !tsc_last)
 347                current_ratio = 1;
 348        else if (tsc_now-tsc_last) {
 349                val64 = 100*(msr_now-msr_last);
 350                do_div(val64, (tsc_now-tsc_last));
 351                current_ratio = val64;
 352        }
 353
 354        /* update record */
 355        msr_last = msr_now;
 356        tsc_last = tsc_now;
 357
 358        adjust_compensation(target_ratio, win);
 359        /*
 360         * too many external interrupts, set flag such
 361         * that we can take measure later.
 362         */
 363        reduce_irq = atomic_read(&idle_wakeup_counter) >=
 364                2 * win * num_online_cpus();
 365
 366        atomic_set(&idle_wakeup_counter, 0);
 367        /* if we are above target+guard, skip */
 368        return set_target_ratio + guard <= current_ratio;
 369}
 370
 371static int clamp_thread(void *arg)
 372{
 373        int cpunr = (unsigned long)arg;
 374        DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
 375        static const struct sched_param param = {
 376                .sched_priority = MAX_USER_RT_PRIO/2,
 377        };
 378        unsigned int count = 0;
 379        unsigned int target_ratio;
 380
 381        set_bit(cpunr, cpu_clamping_mask);
 382        set_freezable();
 383        init_timer_on_stack(&wakeup_timer);
 384        sched_setscheduler(current, SCHED_FIFO, &param);
 385
 386        while (true == clamping && !kthread_should_stop() &&
 387                cpu_online(cpunr)) {
 388                int sleeptime;
 389                unsigned long target_jiffies;
 390                unsigned int guard;
 391                unsigned int compensation = 0;
 392                int interval; /* jiffies to sleep for each attempt */
 393                unsigned int duration_jiffies = msecs_to_jiffies(duration);
 394                unsigned int window_size_now;
 395
 396                try_to_freeze();
 397                /*
 398                 * make sure user selected ratio does not take effect until
 399                 * the next round. adjust target_ratio if user has changed
 400                 * target such that we can converge quickly.
 401                 */
 402                target_ratio = set_target_ratio;
 403                guard = 1 + target_ratio/20;
 404                window_size_now = window_size;
 405                count++;
 406
 407                /*
 408                 * systems may have different ability to enter package level
 409                 * c-states, thus we need to compensate the injected idle ratio
 410                 * to achieve the actual target reported by the HW.
 411                 */
 412                compensation = get_compensation(target_ratio);
 413                interval = duration_jiffies*100/(target_ratio+compensation);
 414
 415                /* align idle time */
 416                target_jiffies = roundup(jiffies, interval);
 417                sleeptime = target_jiffies - jiffies;
 418                if (sleeptime <= 0)
 419                        sleeptime = 1;
 420                schedule_timeout_interruptible(sleeptime);
 421                /*
 422                 * only elected controlling cpu can collect stats and update
 423                 * control parameters.
 424                 */
 425                if (cpunr == control_cpu && !(count%window_size_now)) {
 426                        should_skip =
 427                                powerclamp_adjust_controls(target_ratio,
 428                                                        guard, window_size_now);
 429                        smp_mb();
 430                }
 431
 432                if (should_skip)
 433                        continue;
 434
 435                target_jiffies = jiffies + duration_jiffies;
 436                mod_timer(&wakeup_timer, target_jiffies);
 437                if (unlikely(local_softirq_pending()))
 438                        continue;
 439                /*
 440                 * stop tick sched during idle time, interrupts are still
 441                 * allowed. thus jiffies are updated properly.
 442                 */
 443                preempt_disable();
 444                /* mwait until target jiffies is reached */
 445                while (time_before(jiffies, target_jiffies)) {
 446                        unsigned long ecx = 1;
 447                        unsigned long eax = target_mwait;
 448
 449                        /*
 450                         * REVISIT: may call enter_idle() to notify drivers who
 451                         * can save power during cpu idle. same for exit_idle()
 452                         */
 453                        local_touch_nmi();
 454                        stop_critical_timings();
 455                        mwait_idle_with_hints(eax, ecx);
 456                        start_critical_timings();
 457                        atomic_inc(&idle_wakeup_counter);
 458                }
 459                preempt_enable();
 460        }
 461        del_timer_sync(&wakeup_timer);
 462        clear_bit(cpunr, cpu_clamping_mask);
 463
 464        return 0;
 465}
 466
 467/*
 468 * 1 HZ polling while clamping is active, useful for userspace
 469 * to monitor actual idle ratio.
 470 */
 471static void poll_pkg_cstate(struct work_struct *dummy);
 472static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
 473static void poll_pkg_cstate(struct work_struct *dummy)
 474{
 475        static u64 msr_last;
 476        static u64 tsc_last;
 477        static unsigned long jiffies_last;
 478
 479        u64 msr_now;
 480        unsigned long jiffies_now;
 481        u64 tsc_now;
 482        u64 val64;
 483
 484        msr_now = pkg_state_counter();
 485        tsc_now = rdtsc();
 486        jiffies_now = jiffies;
 487
 488        /* calculate pkg cstate vs tsc ratio */
 489        if (!msr_last || !tsc_last)
 490                pkg_cstate_ratio_cur = 1;
 491        else {
 492                if (tsc_now - tsc_last) {
 493                        val64 = 100 * (msr_now - msr_last);
 494                        do_div(val64, (tsc_now - tsc_last));
 495                        pkg_cstate_ratio_cur = val64;
 496                }
 497        }
 498
 499        /* update record */
 500        msr_last = msr_now;
 501        jiffies_last = jiffies_now;
 502        tsc_last = tsc_now;
 503
 504        if (true == clamping)
 505                schedule_delayed_work(&poll_pkg_cstate_work, HZ);
 506}
 507
 508static int start_power_clamp(void)
 509{
 510        unsigned long cpu;
 511        struct task_struct *thread;
 512
 513        /* check if pkg cstate counter is completely 0, abort in this case */
 514        if (!has_pkg_state_counter()) {
 515                pr_err("pkg cstate counter not functional, abort\n");
 516                return -EINVAL;
 517        }
 518
 519        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
 520        /* prevent cpu hotplug */
 521        get_online_cpus();
 522
 523        /* prefer BSP */
 524        control_cpu = 0;
 525        if (!cpu_online(control_cpu))
 526                control_cpu = smp_processor_id();
 527
 528        clamping = true;
 529        schedule_delayed_work(&poll_pkg_cstate_work, 0);
 530
 531        /* start one thread per online cpu */
 532        for_each_online_cpu(cpu) {
 533                struct task_struct **p =
 534                        per_cpu_ptr(powerclamp_thread, cpu);
 535
 536                thread = kthread_create_on_node(clamp_thread,
 537                                                (void *) cpu,
 538                                                cpu_to_node(cpu),
 539                                                "kidle_inject/%ld", cpu);
 540                /* bind to cpu here */
 541                if (likely(!IS_ERR(thread))) {
 542                        kthread_bind(thread, cpu);
 543                        wake_up_process(thread);
 544                        *p = thread;
 545                }
 546
 547        }
 548        put_online_cpus();
 549
 550        return 0;
 551}
 552
 553static void end_power_clamp(void)
 554{
 555        int i;
 556        struct task_struct *thread;
 557
 558        clamping = false;
 559        /*
 560         * make clamping visible to other cpus and give per cpu clamping threads
 561         * sometime to exit, or gets killed later.
 562         */
 563        smp_mb();
 564        msleep(20);
 565        if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
 566                for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
 567                        pr_debug("clamping thread for cpu %d alive, kill\n", i);
 568                        thread = *per_cpu_ptr(powerclamp_thread, i);
 569                        kthread_stop(thread);
 570                }
 571        }
 572}
 573
 574static int powerclamp_cpu_callback(struct notifier_block *nfb,
 575                                unsigned long action, void *hcpu)
 576{
 577        unsigned long cpu = (unsigned long)hcpu;
 578        struct task_struct *thread;
 579        struct task_struct **percpu_thread =
 580                per_cpu_ptr(powerclamp_thread, cpu);
 581
 582        if (false == clamping)
 583                goto exit_ok;
 584
 585        switch (action) {
 586        case CPU_ONLINE:
 587                thread = kthread_create_on_node(clamp_thread,
 588                                                (void *) cpu,
 589                                                cpu_to_node(cpu),
 590                                                "kidle_inject/%lu", cpu);
 591                if (likely(!IS_ERR(thread))) {
 592                        kthread_bind(thread, cpu);
 593                        wake_up_process(thread);
 594                        *percpu_thread = thread;
 595                }
 596                /* prefer BSP as controlling CPU */
 597                if (cpu == 0) {
 598                        control_cpu = 0;
 599                        smp_mb();
 600                }
 601                break;
 602        case CPU_DEAD:
 603                if (test_bit(cpu, cpu_clamping_mask)) {
 604                        pr_err("cpu %lu dead but powerclamping thread is not\n",
 605                                cpu);
 606                        kthread_stop(*percpu_thread);
 607                }
 608                if (cpu == control_cpu) {
 609                        control_cpu = smp_processor_id();
 610                        smp_mb();
 611                }
 612        }
 613
 614exit_ok:
 615        return NOTIFY_OK;
 616}
 617
 618static struct notifier_block powerclamp_cpu_notifier = {
 619        .notifier_call = powerclamp_cpu_callback,
 620};
 621
 622static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
 623                                 unsigned long *state)
 624{
 625        *state = MAX_TARGET_RATIO;
 626
 627        return 0;
 628}
 629
 630static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
 631                                 unsigned long *state)
 632{
 633        if (true == clamping)
 634                *state = pkg_cstate_ratio_cur;
 635        else
 636                /* to save power, do not poll idle ratio while not clamping */
 637                *state = -1; /* indicates invalid state */
 638
 639        return 0;
 640}
 641
 642static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
 643                                 unsigned long new_target_ratio)
 644{
 645        int ret = 0;
 646
 647        new_target_ratio = clamp(new_target_ratio, 0UL,
 648                                (unsigned long) (MAX_TARGET_RATIO-1));
 649        if (set_target_ratio == 0 && new_target_ratio > 0) {
 650                pr_info("Start idle injection to reduce power\n");
 651                set_target_ratio = new_target_ratio;
 652                ret = start_power_clamp();
 653                goto exit_set;
 654        } else  if (set_target_ratio > 0 && new_target_ratio == 0) {
 655                pr_info("Stop forced idle injection\n");
 656                set_target_ratio = 0;
 657                end_power_clamp();
 658        } else  /* adjust currently running */ {
 659                set_target_ratio = new_target_ratio;
 660                /* make new set_target_ratio visible to other cpus */
 661                smp_mb();
 662        }
 663
 664exit_set:
 665        return ret;
 666}
 667
 668/* bind to generic thermal layer as cooling device*/
 669static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
 670        .get_max_state = powerclamp_get_max_state,
 671        .get_cur_state = powerclamp_get_cur_state,
 672        .set_cur_state = powerclamp_set_cur_state,
 673};
 674
 675/* runs on Nehalem and later */
 676static const struct x86_cpu_id intel_powerclamp_ids[] __initconst = {
 677        { X86_VENDOR_INTEL, 6, 0x1a},
 678        { X86_VENDOR_INTEL, 6, 0x1c},
 679        { X86_VENDOR_INTEL, 6, 0x1e},
 680        { X86_VENDOR_INTEL, 6, 0x1f},
 681        { X86_VENDOR_INTEL, 6, 0x25},
 682        { X86_VENDOR_INTEL, 6, 0x26},
 683        { X86_VENDOR_INTEL, 6, 0x2a},
 684        { X86_VENDOR_INTEL, 6, 0x2c},
 685        { X86_VENDOR_INTEL, 6, 0x2d},
 686        { X86_VENDOR_INTEL, 6, 0x2e},
 687        { X86_VENDOR_INTEL, 6, 0x2f},
 688        { X86_VENDOR_INTEL, 6, 0x37},
 689        { X86_VENDOR_INTEL, 6, 0x3a},
 690        { X86_VENDOR_INTEL, 6, 0x3c},
 691        { X86_VENDOR_INTEL, 6, 0x3d},
 692        { X86_VENDOR_INTEL, 6, 0x3e},
 693        { X86_VENDOR_INTEL, 6, 0x3f},
 694        { X86_VENDOR_INTEL, 6, 0x45},
 695        { X86_VENDOR_INTEL, 6, 0x46},
 696        { X86_VENDOR_INTEL, 6, 0x47},
 697        { X86_VENDOR_INTEL, 6, 0x4c},
 698        { X86_VENDOR_INTEL, 6, 0x4d},
 699        { X86_VENDOR_INTEL, 6, 0x4e},
 700        { X86_VENDOR_INTEL, 6, 0x4f},
 701        { X86_VENDOR_INTEL, 6, 0x56},
 702        { X86_VENDOR_INTEL, 6, 0x57},
 703        { X86_VENDOR_INTEL, 6, 0x5e},
 704        {}
 705};
 706MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
 707
 708static int __init powerclamp_probe(void)
 709{
 710        if (!x86_match_cpu(intel_powerclamp_ids)) {
 711                pr_err("Intel powerclamp does not run on family %d model %d\n",
 712                                boot_cpu_data.x86, boot_cpu_data.x86_model);
 713                return -ENODEV;
 714        }
 715        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
 716                !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
 717                !boot_cpu_has(X86_FEATURE_MWAIT) ||
 718                !boot_cpu_has(X86_FEATURE_ARAT))
 719                return -ENODEV;
 720
 721        /* find the deepest mwait value */
 722        find_target_mwait();
 723
 724        return 0;
 725}
 726
 727static int powerclamp_debug_show(struct seq_file *m, void *unused)
 728{
 729        int i = 0;
 730
 731        seq_printf(m, "controlling cpu: %d\n", control_cpu);
 732        seq_printf(m, "pct confidence steady dynamic (compensation)\n");
 733        for (i = 0; i < MAX_TARGET_RATIO; i++) {
 734                seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
 735                        i,
 736                        cal_data[i].confidence,
 737                        cal_data[i].steady_comp,
 738                        cal_data[i].dynamic_comp);
 739        }
 740
 741        return 0;
 742}
 743
 744static int powerclamp_debug_open(struct inode *inode,
 745                        struct file *file)
 746{
 747        return single_open(file, powerclamp_debug_show, inode->i_private);
 748}
 749
 750static const struct file_operations powerclamp_debug_fops = {
 751        .open           = powerclamp_debug_open,
 752        .read           = seq_read,
 753        .llseek         = seq_lseek,
 754        .release        = single_release,
 755        .owner          = THIS_MODULE,
 756};
 757
 758static inline void powerclamp_create_debug_files(void)
 759{
 760        debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
 761        if (!debug_dir)
 762                return;
 763
 764        if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
 765                                        cal_data, &powerclamp_debug_fops))
 766                goto file_error;
 767
 768        return;
 769
 770file_error:
 771        debugfs_remove_recursive(debug_dir);
 772}
 773
 774static int __init powerclamp_init(void)
 775{
 776        int retval;
 777        int bitmap_size;
 778
 779        bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
 780        cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
 781        if (!cpu_clamping_mask)
 782                return -ENOMEM;
 783
 784        /* probe cpu features and ids here */
 785        retval = powerclamp_probe();
 786        if (retval)
 787                goto exit_free;
 788
 789        /* set default limit, maybe adjusted during runtime based on feedback */
 790        window_size = 2;
 791        register_hotcpu_notifier(&powerclamp_cpu_notifier);
 792
 793        powerclamp_thread = alloc_percpu(struct task_struct *);
 794        if (!powerclamp_thread) {
 795                retval = -ENOMEM;
 796                goto exit_unregister;
 797        }
 798
 799        cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
 800                                                &powerclamp_cooling_ops);
 801        if (IS_ERR(cooling_dev)) {
 802                retval = -ENODEV;
 803                goto exit_free_thread;
 804        }
 805
 806        if (!duration)
 807                duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);
 808
 809        powerclamp_create_debug_files();
 810
 811        return 0;
 812
 813exit_free_thread:
 814        free_percpu(powerclamp_thread);
 815exit_unregister:
 816        unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
 817exit_free:
 818        kfree(cpu_clamping_mask);
 819        return retval;
 820}
 821module_init(powerclamp_init);
 822
 823static void __exit powerclamp_exit(void)
 824{
 825        unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
 826        end_power_clamp();
 827        free_percpu(powerclamp_thread);
 828        thermal_cooling_device_unregister(cooling_dev);
 829        kfree(cpu_clamping_mask);
 830
 831        cancel_delayed_work_sync(&poll_pkg_cstate_work);
 832        debugfs_remove_recursive(debug_dir);
 833}
 834module_exit(powerclamp_exit);
 835
 836MODULE_LICENSE("GPL");
 837MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
 838MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
 839MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
 840