linux/drivers/thermal/intel_powerclamp.c
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 *	TODO:
 *	     1. Better handle wakeups from external interrupts. Currently a
 *	        fixed compensation is added to the clamping duration when an
 *	        excessive number of wakeups is observed during idle time. The
 *	        reason is that for external interrupts which need no ack,
 *	        clamping down a cpu in non-irq context does not reduce the irq
 *	        rate. For the majority of cases clamping down a cpu does help
 *	        reduce irqs as well; we should be able to differentiate the
 *	        two cases and give a quantitative solution for the irqs that
 *	        we can control, perhaps based on get_cpu_iowait_time_us().
 *
 *	     2. Synchronization with other hw blocks.
 *
 *
 */
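
/*
 * Editor's note: the driver registers a generic thermal cooling device named
 * "intel_powerclamp", so (assuming the standard thermal sysfs layout) the
 * target idle percentage is normally driven by writing to cur_state under
 * /sys/class/thermal/cooling_deviceX, where X is the instance whose type
 * attribute reads "intel_powerclamp".
 */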

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/*
 * For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/*
 * Default idle injection duration; the driver adjusts the sleep time between
 * injections to meet the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;	/* set when we are already past target + guard */
static bool reduce_irq;		/* set when wakeups exceed the window budget */
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
				  * update control parameters. Defaults to
				  * the BSP, but the BSP can be offlined.
				  */
static bool clamping;


static struct task_struct * __percpu *powerclamp_thread;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping threads
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;
static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		/* reject the value instead of silently clamping it */
		ret = -EINVAL;
		goto exit;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	/* make the new duration visible to the clamping threads */
	smp_mb();

exit:
	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration: a counter that
				    * gets incremented each time a clamping
				    * period completes without extra wakeups.
				    * Once the counter reaches the given
				    * level, the compensation is deemed
				    * usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensates for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		/* reject the value instead of silently clamping it */
		ret = -EINVAL;
		goto exit_win;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	/* make the new window size visible to the clamping threads */
	smp_mb();

exit_win:
	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. defaults to 2.");

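/*
 * Find the deepest MWAIT hint advertised by CPUID leaf 5 (the MONITOR/MWAIT
 * leaf). EDX of that leaf holds the number of sub C-states supported per
 * C-state, four bits each starting with C0; the MWAIT hint then encodes the
 * target C-state index in EAX[7:4] (0 meaning C1) and the sub C-state in
 * EAX[3:0].
 */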
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	/* skip the C0 nibble, then scan C1 and deeper */
	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
}

static bool has_pkg_state_counter(void)
{
	u64 tmp;
	return !rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &tmp);
}

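/*
 * Sum all package C-state residency counters implemented on this CPU; an MSR
 * that faults on its first read is skipped from then on. On the supported
 * parts these residency counters increment at TSC rate, which is why deltas
 * of this sum can be compared directly against TSC deltas (see
 * powerclamp_adjust_controls() below).
 */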
static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;

	static bool skip_c2;
	static bool skip_c3;
	static bool skip_c6;
	static bool skip_c7;

	if (!skip_c2) {
		if (!rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &val))
			count += val;
		else
			skip_c2 = true;
	}

	if (!skip_c3) {
		if (!rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &val))
			count += val;
		else
			skip_c3 = true;
	}

	if (!skip_c6) {
		if (!rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &val))
			count += val;
		else
			skip_c6 = true;
	}

	if (!skip_c7) {
		if (!rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &val))
			count += val;
		else
			skip_c7 = true;
	}

	return count;
}

static void noop_timer(unsigned long foo)
{
	/* empty... just the fact that we get the interrupt wakes us up */
}

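/*
 * Look up the calibrated compensation for a given target ratio. To smooth
 * out noise, a value is only used once the target ratio and both of its
 * neighbors have reached CONFIDENCE_OK, in which case the three steady_comp
 * values are averaged; at the edges of the table the two nearest in-range
 * neighbors are used instead.
 */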
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of doubling the idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

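/*
 * Feed one measurement back into the calibration table: when the observed
 * ratio fell short of the target by a plausible amount, fold the shortfall
 * into steady_comp as a rounded running average and bump the confidence
 * counter.
 */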
static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * Do not adjust the compensation if the confidence level has already
	 * been reached, or if there were too many wakeups during the last
	 * idle injection period: in that case we cannot trust the data.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta + d->steady_comp, 2) / 2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

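/*
 * Compare the last window's package C-state residency against the elapsed
 * TSC cycles to get the achieved idle ratio in percent, then update the
 * calibration data and decide whether the next injection should be skipped
 * because we are already past target + guard.
 */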
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	rdtscll(tsc_now);

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts: set a flag so that we can take
	 * countermeasures in the next round.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip the next injection */
	return set_target_ratio + guard <= current_ratio;
}

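/*
 * Per-cpu injection thread. Each cycle works on a duty-cycle basis: sleep
 * for (interval - duration_jiffies), then force idle for duration_jiffies,
 * so that roughly (target_ratio + compensation) percent of each interval is
 * spent idle. For example, with the default duration of 6 jiffies, a 25%
 * target and zero compensation, interval = 6 * 100 / 25 = 24 jiffies, i.e.
 * 6 idle jiffies out of every 24. All threads round their wakeup up to the
 * same interval boundary so the idle periods overlap across cpus, which is
 * what lets the whole package enter a package C-state.
 */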
static int clamp_thread(void *arg)
{
	int cpunr = (unsigned long)arg;
	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO/2,
	};
	unsigned int count = 0;
	unsigned int target_ratio;

	set_bit(cpunr, cpu_clamping_mask);
	set_freezable();
	init_timer_on_stack(&wakeup_timer);
	sched_setscheduler(current, SCHED_FIFO, &param);

	while (clamping && !kthread_should_stop() &&
		cpu_online(cpunr)) {
		int sleeptime;
		unsigned long target_jiffies;
		unsigned int guard;
		unsigned int compensation = 0;
		int interval; /* jiffies to sleep for each attempt */
		unsigned int duration_jiffies = msecs_to_jiffies(duration);
		unsigned int window_size_now;

		try_to_freeze();
		/*
		 * make sure a user-selected ratio does not take effect until
		 * the next round. adjust target_ratio if the user has changed
		 * the target, so that we can converge quickly.
		 */
		target_ratio = set_target_ratio;
		guard = 1 + target_ratio / 20;
		window_size_now = window_size;
		count++;

		/*
		 * systems differ in their ability to enter package level
		 * c-states, thus we need to compensate the injected idle
		 * ratio to achieve the actual target reported by the HW.
		 */
		compensation = get_compensation(target_ratio);
		interval = duration_jiffies * 100 / (target_ratio + compensation);

		/* align idle time to the next interval boundary */
		target_jiffies = roundup(jiffies, interval);
		sleeptime = target_jiffies - jiffies;
		if (sleeptime <= 0)
			sleeptime = 1;
		schedule_timeout_interruptible(sleeptime);
		/*
		 * only the elected controlling cpu can collect stats and
		 * update control parameters.
		 */
		if (cpunr == control_cpu && !(count % window_size_now)) {
			should_skip =
				powerclamp_adjust_controls(target_ratio,
							guard, window_size_now);
			smp_mb();
		}

		if (should_skip)
			continue;

		target_jiffies = jiffies + duration_jiffies;
		mod_timer(&wakeup_timer, target_jiffies);
		if (unlikely(local_softirq_pending()))
			continue;
		/*
		 * interrupts remain enabled during the forced idle time
		 * below, so jiffies are updated properly and the wakeup
		 * timer can fire.
		 */
		preempt_disable();
		/* mwait until target jiffies is reached */
		while (time_before(jiffies, target_jiffies)) {
			unsigned long ecx = 1; /* break on interrupt flag */
			unsigned long eax = target_mwait;

			/*
			 * REVISIT: may call enter_idle() to notify drivers who
			 * can save power during cpu idle. same for exit_idle()
			 */
			local_touch_nmi();
			stop_critical_timings();
			mwait_idle_with_hints(eax, ecx);
			start_critical_timings();
			atomic_inc(&idle_wakeup_counter);
		}
		preempt_enable();
	}
	del_timer_sync(&wakeup_timer);
	clear_bit(cpunr, cpu_clamping_mask);

	return 0;
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor the actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;
	static unsigned long jiffies_last;

	u64 msr_now;
	unsigned long jiffies_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	rdtscll(tsc_now);
	jiffies_now = jiffies;

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	jiffies_last = jiffies_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

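/*
 * Start one clamping kthread per online cpu, each bound to its cpu and run
 * as SCHED_FIFO so the forced idle period cannot be preempted by normal
 * tasks. cpu 0 (the BSP) is preferred as the controlling cpu that evaluates
 * each window.
 */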
static int start_power_clamp(void)
{
	unsigned long cpu;
	struct task_struct *thread;

	/* abort if no package c-state residency counter is available */
	if (!has_pkg_state_counter()) {
		pr_err("pkg cstate counter not functional, abort\n");
		return -EINVAL;
	}

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one thread per online cpu */
	for_each_online_cpu(cpu) {
		struct task_struct **p =
			per_cpu_ptr(powerclamp_thread, cpu);

		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%ld", cpu);
		/* bind to cpu here */
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*p = thread;
		}
	}
	put_online_cpus();

	return 0;
}

static void end_power_clamp(void)
{
	int i;
	struct task_struct *thread;

	clamping = false;
	/*
	 * Make the clamping = false change visible to the other cpus and
	 * give the per cpu clamping threads some time to exit on their own;
	 * any that remain are stopped explicitly below.
	 */
	smp_mb();
	msleep(20);
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping thread for cpu %d alive, kill\n", i);
			thread = *per_cpu_ptr(powerclamp_thread, i);
			kthread_stop(thread);
		}
	}
}

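/*
 * CPU hotplug: a newly onlined cpu gets its own clamping thread while
 * clamping is active, and the controlling-cpu role migrates away from a
 * cpu that goes down (and back to the BSP whenever it returns).
 */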
static int powerclamp_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned long cpu = (unsigned long)hcpu;
	struct task_struct *thread;
	struct task_struct **percpu_thread =
		per_cpu_ptr(powerclamp_thread, cpu);

	if (!clamping)
		goto exit_ok;

	switch (action) {
	case CPU_ONLINE:
		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%lu", cpu);
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*percpu_thread = thread;
		}
		/* prefer BSP as controlling CPU */
		if (cpu == 0) {
			control_cpu = 0;
			smp_mb();
		}
		break;
	case CPU_DEAD:
		if (test_bit(cpu, cpu_clamping_mask)) {
			pr_err("cpu %lu dead but powerclamping thread is not\n",
				cpu);
			kthread_stop(*percpu_thread);
		}
		if (cpu == control_cpu) {
			control_cpu = smp_processor_id();
			smp_mb();
		}
	}

exit_ok:
	return NOTIFY_OK;
}

static struct notifier_block powerclamp_cpu_notifier = {
	.notifier_call = powerclamp_cpu_callback,
};

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

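/*
 * cur_state is the target idle percentage: writing a nonzero value starts
 * (or retunes) the clamping threads, and writing 0 stops them.
 */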
static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO - 1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		set_target_ratio = 0;
		end_power_clamp();
	} else {
		/* adjust the currently running clamping */
		set_target_ratio = new_target_ratio;
		/* make the new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to the generic thermal layer as a cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

/* runs on Nehalem and later */
static const struct x86_cpu_id intel_powerclamp_ids[] = {
	{ X86_VENDOR_INTEL, 6, 0x1a},
	{ X86_VENDOR_INTEL, 6, 0x1c},
	{ X86_VENDOR_INTEL, 6, 0x1e},
	{ X86_VENDOR_INTEL, 6, 0x1f},
	{ X86_VENDOR_INTEL, 6, 0x25},
	{ X86_VENDOR_INTEL, 6, 0x26},
	{ X86_VENDOR_INTEL, 6, 0x2a},
	{ X86_VENDOR_INTEL, 6, 0x2c},
	{ X86_VENDOR_INTEL, 6, 0x2d},
	{ X86_VENDOR_INTEL, 6, 0x2e},
	{ X86_VENDOR_INTEL, 6, 0x2f},
	{ X86_VENDOR_INTEL, 6, 0x37},
	{ X86_VENDOR_INTEL, 6, 0x3a},
	{ X86_VENDOR_INTEL, 6, 0x3c},
	{ X86_VENDOR_INTEL, 6, 0x3d},
	{ X86_VENDOR_INTEL, 6, 0x3e},
	{ X86_VENDOR_INTEL, 6, 0x3f},
	{ X86_VENDOR_INTEL, 6, 0x45},
	{ X86_VENDOR_INTEL, 6, 0x46},
	{ X86_VENDOR_INTEL, 6, 0x4c},
	{ X86_VENDOR_INTEL, 6, 0x4d},
	{ X86_VENDOR_INTEL, 6, 0x56},
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("Intel powerclamp does not run on family %d model %d\n",
				boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}
	/* need an invariant TSC, MWAIT and an always-running APIC timer */
	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
		!boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
		!boot_cpu_has(X86_FEATURE_MWAIT) ||
		!boot_cpu_has(X86_FEATURE_ARAT))
		return -ENODEV;

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %u\n", control_cpu);
	seq_puts(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

static int powerclamp_debug_open(struct inode *inode,
			struct file *file)
{
	return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
	.open		= powerclamp_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
	.owner		= THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
	if (!debug_dir)
		return;

	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
					cal_data, &powerclamp_debug_fops))
		goto file_error;

	return;

file_error:
	debugfs_remove_recursive(debug_dir);
}

static int powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set the default window size; may be adjusted at runtime */
	window_size = 2;
	register_hotcpu_notifier(&powerclamp_cpu_notifier);

	powerclamp_thread = alloc_percpu(struct task_struct *);
	if (!powerclamp_thread) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(powerclamp_thread);
exit_unregister:
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void powerclamp_exit(void)
{
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
	end_power_clamp();
	free_percpu(powerclamp_thread);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");