linux/drivers/thermal/intel_powerclamp.c
<<
>>
Prefs
   1/*
   2 * intel_powerclamp.c - package c-state idle injection
   3 *
   4 * Copyright (c) 2012, Intel Corporation.
   5 *
   6 * Authors:
   7 *     Arjan van de Ven <arjan@linux.intel.com>
   8 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
   9 *
  10 * This program is free software; you can redistribute it and/or modify it
  11 * under the terms and conditions of the GNU General Public License,
  12 * version 2, as published by the Free Software Foundation.
  13 *
  14 * This program is distributed in the hope it will be useful, but WITHOUT
  15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  17 * more details.
  18 *
  19 * You should have received a copy of the GNU General Public License along with
  20 * this program; if not, write to the Free Software Foundation, Inc.,
  21 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  22 *
  23 *
  24 *      TODO:
  25 *           1. better handle wakeup from external interrupts, currently a fixed
  26 *              compensation is added to clamping duration when excessive amount
  27 *              of wakeups are observed during idle time. the reason is that in
  28 *              case of external interrupts without need for ack, clamping down
  29 *              cpu in non-irq context does not reduce irq. for majority of the
  30 *              cases, clamping down cpu does help reduce irq as well, we should
  31 *              be able to differentiate the two cases and give a quantitative
  32 *              solution for the irqs that we can control. perhaps based on
  33 *              get_cpu_iowait_time_us()
  34 *
  35 *           2. synchronization with other hw blocks
  36 *
  37 *
  38 */
  39
  40#define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt
  41
  42#include <linux/module.h>
  43#include <linux/kernel.h>
  44#include <linux/delay.h>
  45#include <linux/kthread.h>
  46#include <linux/freezer.h>
  47#include <linux/cpu.h>
  48#include <linux/thermal.h>
  49#include <linux/slab.h>
  50#include <linux/tick.h>
  51#include <linux/debugfs.h>
  52#include <linux/seq_file.h>
  53#include <linux/sched/rt.h>
  54
  55#include <asm/nmi.h>
  56#include <asm/msr.h>
  57#include <asm/mwait.h>
  58#include <asm/cpu_device_id.h>
  59#include <asm/idle.h>
  60#include <asm/hardirq.h>
  61
  62#define MAX_TARGET_RATIO (50U)
  63/* For each undisturbed clamping period (no extra wake ups during idle time),
  64 * we increment the confidence counter for the given target ratio.
  65 * CONFIDENCE_OK defines the level where runtime calibration results are
  66 * valid.
  67 */
  68#define CONFIDENCE_OK (3)
  69/* Default idle injection duration, driver adjust sleep time to meet target
  70 * idle ratio. Similar to frequency modulation.
  71 */
  72#define DEFAULT_DURATION_JIFFIES (6)
  73
  74static unsigned int target_mwait;
  75static struct dentry *debug_dir;
  76
  77/* user selected target */
  78static unsigned int set_target_ratio;
  79static unsigned int current_ratio;
  80static bool should_skip;
  81static bool reduce_irq;
  82static atomic_t idle_wakeup_counter;
  83static unsigned int control_cpu; /* The cpu assigned to collect stat and update
  84                                  * control parameters. default to BSP but BSP
  85                                  * can be offlined.
  86                                  */
  87static bool clamping;
  88
  89
  90static struct task_struct * __percpu *powerclamp_thread;
  91static struct thermal_cooling_device *cooling_dev;
  92static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
  93                                           * clamping thread
  94                                           */
  95
  96static unsigned int duration;
  97static unsigned int pkg_cstate_ratio_cur;
  98static unsigned int window_size;
  99
 100static int duration_set(const char *arg, const struct kernel_param *kp)
 101{
 102        int ret = 0;
 103        unsigned long new_duration;
 104
 105        ret = kstrtoul(arg, 10, &new_duration);
 106        if (ret)
 107                goto exit;
 108        if (new_duration > 25 || new_duration < 6) {
 109                pr_err("Out of recommended range %lu, between 6-25ms\n",
 110                        new_duration);
 111                ret = -EINVAL;
 112        }
 113
 114        duration = clamp(new_duration, 6ul, 25ul);
 115        smp_mb();
 116
 117exit:
 118
 119        return ret;
 120}
 121
 122static struct kernel_param_ops duration_ops = {
 123        .set = duration_set,
 124        .get = param_get_int,
 125};
 126
 127
 128module_param_cb(duration, &duration_ops, &duration, 0644);
 129MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
 130
 131struct powerclamp_calibration_data {
 132        unsigned long confidence;  /* used for calibration, basically a counter
 133                                    * gets incremented each time a clamping
 134                                    * period is completed without extra wakeups
 135                                    * once that counter is reached given level,
 136                                    * compensation is deemed usable.
 137                                    */
 138        unsigned long steady_comp; /* steady state compensation used when
 139                                    * no extra wakeups occurred.
 140                                    */
 141        unsigned long dynamic_comp; /* compensate excessive wakeup from idle
 142                                     * mostly from external interrupts.
 143                                     */
 144};
 145
 146static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
 147
 148static int window_size_set(const char *arg, const struct kernel_param *kp)
 149{
 150        int ret = 0;
 151        unsigned long new_window_size;
 152
 153        ret = kstrtoul(arg, 10, &new_window_size);
 154        if (ret)
 155                goto exit_win;
 156        if (new_window_size > 10 || new_window_size < 2) {
 157                pr_err("Out of recommended window size %lu, between 2-10\n",
 158                        new_window_size);
 159                ret = -EINVAL;
 160        }
 161
 162        window_size = clamp(new_window_size, 2ul, 10ul);
 163        smp_mb();
 164
 165exit_win:
 166
 167        return ret;
 168}
 169
 170static struct kernel_param_ops window_size_ops = {
 171        .set = window_size_set,
 172        .get = param_get_int,
 173};
 174
 175module_param_cb(window_size, &window_size_ops, &window_size, 0644);
 176MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
 177        "\tpowerclamp controls idle ratio within this window. larger\n"
 178        "\twindow size results in slower response time but more smooth\n"
 179        "\tclamping results. default to 2.");
 180
 181static void find_target_mwait(void)
 182{
 183        unsigned int eax, ebx, ecx, edx;
 184        unsigned int highest_cstate = 0;
 185        unsigned int highest_subcstate = 0;
 186        int i;
 187
 188        if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
 189                return;
 190
 191        cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
 192
 193        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
 194            !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
 195                return;
 196
 197        edx >>= MWAIT_SUBSTATE_SIZE;
 198        for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
 199                if (edx & MWAIT_SUBSTATE_MASK) {
 200                        highest_cstate = i;
 201                        highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
 202                }
 203        }
 204        target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
 205                (highest_subcstate - 1);
 206
 207}
 208
 209static u64 pkg_state_counter(void)
 210{
 211        u64 val;
 212        u64 count = 0;
 213
 214        static bool skip_c2;
 215        static bool skip_c3;
 216        static bool skip_c6;
 217        static bool skip_c7;
 218
 219        if (!skip_c2) {
 220                if (!rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &val))
 221                        count += val;
 222                else
 223                        skip_c2 = true;
 224        }
 225
 226        if (!skip_c3) {
 227                if (!rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &val))
 228                        count += val;
 229                else
 230                        skip_c3 = true;
 231        }
 232
 233        if (!skip_c6) {
 234                if (!rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &val))
 235                        count += val;
 236                else
 237                        skip_c6 = true;
 238        }
 239
 240        if (!skip_c7) {
 241                if (!rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &val))
 242                        count += val;
 243                else
 244                        skip_c7 = true;
 245        }
 246
 247        return count;
 248}
 249
 250static void noop_timer(unsigned long foo)
 251{
 252        /* empty... just the fact that we get the interrupt wakes us up */
 253}
 254
 255static unsigned int get_compensation(int ratio)
 256{
 257        unsigned int comp = 0;
 258
 259        /* we only use compensation if all adjacent ones are good */
 260        if (ratio == 1 &&
 261                cal_data[ratio].confidence >= CONFIDENCE_OK &&
 262                cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
 263                cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
 264                comp = (cal_data[ratio].steady_comp +
 265                        cal_data[ratio + 1].steady_comp +
 266                        cal_data[ratio + 2].steady_comp) / 3;
 267        } else if (ratio == MAX_TARGET_RATIO - 1 &&
 268                cal_data[ratio].confidence >= CONFIDENCE_OK &&
 269                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
 270                cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
 271                comp = (cal_data[ratio].steady_comp +
 272                        cal_data[ratio - 1].steady_comp +
 273                        cal_data[ratio - 2].steady_comp) / 3;
 274        } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
 275                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
 276                cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
 277                comp = (cal_data[ratio].steady_comp +
 278                        cal_data[ratio - 1].steady_comp +
 279                        cal_data[ratio + 1].steady_comp) / 3;
 280        }
 281
 282        /* REVISIT: simple penalty of double idle injection */
 283        if (reduce_irq)
 284                comp = ratio;
 285        /* do not exceed limit */
 286        if (comp + ratio >= MAX_TARGET_RATIO)
 287                comp = MAX_TARGET_RATIO - ratio - 1;
 288
 289        return comp;
 290}
 291
 292static void adjust_compensation(int target_ratio, unsigned int win)
 293{
 294        int delta;
 295        struct powerclamp_calibration_data *d = &cal_data[target_ratio];
 296
 297        /*
 298         * adjust compensations if confidence level has not been reached or
 299         * there are too many wakeups during the last idle injection period, we
 300         * cannot trust the data for compensation.
 301         */
 302        if (d->confidence >= CONFIDENCE_OK ||
 303                atomic_read(&idle_wakeup_counter) >
 304                win * num_online_cpus())
 305                return;
 306
 307        delta = set_target_ratio - current_ratio;
 308        /* filter out bad data */
 309        if (delta >= 0 && delta <= (1+target_ratio/10)) {
 310                if (d->steady_comp)
 311                        d->steady_comp =
 312                                roundup(delta+d->steady_comp, 2)/2;
 313                else
 314                        d->steady_comp = delta;
 315                d->confidence++;
 316        }
 317}
 318
 319static bool powerclamp_adjust_controls(unsigned int target_ratio,
 320                                unsigned int guard, unsigned int win)
 321{
 322        static u64 msr_last, tsc_last;
 323        u64 msr_now, tsc_now;
 324        u64 val64;
 325
 326        /* check result for the last window */
 327        msr_now = pkg_state_counter();
 328        rdtscll(tsc_now);
 329
 330        /* calculate pkg cstate vs tsc ratio */
 331        if (!msr_last || !tsc_last)
 332                current_ratio = 1;
 333        else if (tsc_now-tsc_last) {
 334                val64 = 100*(msr_now-msr_last);
 335                do_div(val64, (tsc_now-tsc_last));
 336                current_ratio = val64;
 337        }
 338
 339        /* update record */
 340        msr_last = msr_now;
 341        tsc_last = tsc_now;
 342
 343        adjust_compensation(target_ratio, win);
 344        /*
 345         * too many external interrupts, set flag such
 346         * that we can take measure later.
 347         */
 348        reduce_irq = atomic_read(&idle_wakeup_counter) >=
 349                2 * win * num_online_cpus();
 350
 351        atomic_set(&idle_wakeup_counter, 0);
 352        /* if we are above target+guard, skip */
 353        return set_target_ratio + guard <= current_ratio;
 354}
 355
 356static int clamp_thread(void *arg)
 357{
 358        int cpunr = (unsigned long)arg;
 359        DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
 360        static const struct sched_param param = {
 361                .sched_priority = MAX_USER_RT_PRIO/2,
 362        };
 363        unsigned int count = 0;
 364        unsigned int target_ratio;
 365
 366        set_bit(cpunr, cpu_clamping_mask);
 367        set_freezable();
 368        init_timer_on_stack(&wakeup_timer);
 369        sched_setscheduler(current, SCHED_FIFO, &param);
 370
 371        while (true == clamping && !kthread_should_stop() &&
 372                cpu_online(cpunr)) {
 373                int sleeptime;
 374                unsigned long target_jiffies;
 375                unsigned int guard;
 376                unsigned int compensation = 0;
 377                int interval; /* jiffies to sleep for each attempt */
 378                unsigned int duration_jiffies = msecs_to_jiffies(duration);
 379                unsigned int window_size_now;
 380
 381                try_to_freeze();
 382                /*
 383                 * make sure user selected ratio does not take effect until
 384                 * the next round. adjust target_ratio if user has changed
 385                 * target such that we can converge quickly.
 386                 */
 387                target_ratio = set_target_ratio;
 388                guard = 1 + target_ratio/20;
 389                window_size_now = window_size;
 390                count++;
 391
 392                /*
 393                 * systems may have different ability to enter package level
 394                 * c-states, thus we need to compensate the injected idle ratio
 395                 * to achieve the actual target reported by the HW.
 396                 */
 397                compensation = get_compensation(target_ratio);
 398                interval = duration_jiffies*100/(target_ratio+compensation);
 399
 400                /* align idle time */
 401                target_jiffies = roundup(jiffies, interval);
 402                sleeptime = target_jiffies - jiffies;
 403                if (sleeptime <= 0)
 404                        sleeptime = 1;
 405                schedule_timeout_interruptible(sleeptime);
 406                /*
 407                 * only elected controlling cpu can collect stats and update
 408                 * control parameters.
 409                 */
 410                if (cpunr == control_cpu && !(count%window_size_now)) {
 411                        should_skip =
 412                                powerclamp_adjust_controls(target_ratio,
 413                                                        guard, window_size_now);
 414                        smp_mb();
 415                }
 416
 417                if (should_skip)
 418                        continue;
 419
 420                target_jiffies = jiffies + duration_jiffies;
 421                mod_timer(&wakeup_timer, target_jiffies);
 422                if (unlikely(local_softirq_pending()))
 423                        continue;
 424                /*
 425                 * stop tick sched during idle time, interrupts are still
 426                 * allowed. thus jiffies are updated properly.
 427                 */
 428                preempt_disable();
 429                tick_nohz_idle_enter();
 430                /* mwait until target jiffies is reached */
 431                while (time_before(jiffies, target_jiffies)) {
 432                        unsigned long ecx = 1;
 433                        unsigned long eax = target_mwait;
 434
 435                        /*
 436                         * REVISIT: may call enter_idle() to notify drivers who
 437                         * can save power during cpu idle. same for exit_idle()
 438                         */
 439                        local_touch_nmi();
 440                        stop_critical_timings();
 441                        __monitor((void *)&current_thread_info()->flags, 0, 0);
 442                        cpu_relax(); /* allow HT sibling to run */
 443                        __mwait(eax, ecx);
 444                        start_critical_timings();
 445                        atomic_inc(&idle_wakeup_counter);
 446                }
 447                tick_nohz_idle_exit();
 448                preempt_enable_no_resched();
 449        }
 450        del_timer_sync(&wakeup_timer);
 451        clear_bit(cpunr, cpu_clamping_mask);
 452
 453        return 0;
 454}
 455
 456/*
 457 * 1 HZ polling while clamping is active, useful for userspace
 458 * to monitor actual idle ratio.
 459 */
 460static void poll_pkg_cstate(struct work_struct *dummy);
 461static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
 462static void poll_pkg_cstate(struct work_struct *dummy)
 463{
 464        static u64 msr_last;
 465        static u64 tsc_last;
 466        static unsigned long jiffies_last;
 467
 468        u64 msr_now;
 469        unsigned long jiffies_now;
 470        u64 tsc_now;
 471        u64 val64;
 472
 473        msr_now = pkg_state_counter();
 474        rdtscll(tsc_now);
 475        jiffies_now = jiffies;
 476
 477        /* calculate pkg cstate vs tsc ratio */
 478        if (!msr_last || !tsc_last)
 479                pkg_cstate_ratio_cur = 1;
 480        else {
 481                if (tsc_now - tsc_last) {
 482                        val64 = 100 * (msr_now - msr_last);
 483                        do_div(val64, (tsc_now - tsc_last));
 484                        pkg_cstate_ratio_cur = val64;
 485                }
 486        }
 487
 488        /* update record */
 489        msr_last = msr_now;
 490        jiffies_last = jiffies_now;
 491        tsc_last = tsc_now;
 492
 493        if (true == clamping)
 494                schedule_delayed_work(&poll_pkg_cstate_work, HZ);
 495}
 496
 497static int start_power_clamp(void)
 498{
 499        unsigned long cpu;
 500        struct task_struct *thread;
 501
 502        /* check if pkg cstate counter is completely 0, abort in this case */
 503        if (!pkg_state_counter()) {
 504                pr_err("pkg cstate counter not functional, abort\n");
 505                return -EINVAL;
 506        }
 507
 508        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
 509        /* prevent cpu hotplug */
 510        get_online_cpus();
 511
 512        /* prefer BSP */
 513        control_cpu = 0;
 514        if (!cpu_online(control_cpu))
 515                control_cpu = smp_processor_id();
 516
 517        clamping = true;
 518        schedule_delayed_work(&poll_pkg_cstate_work, 0);
 519
 520        /* start one thread per online cpu */
 521        for_each_online_cpu(cpu) {
 522                struct task_struct **p =
 523                        per_cpu_ptr(powerclamp_thread, cpu);
 524
 525                thread = kthread_create_on_node(clamp_thread,
 526                                                (void *) cpu,
 527                                                cpu_to_node(cpu),
 528                                                "kidle_inject/%ld", cpu);
 529                /* bind to cpu here */
 530                if (likely(!IS_ERR(thread))) {
 531                        kthread_bind(thread, cpu);
 532                        wake_up_process(thread);
 533                        *p = thread;
 534                }
 535
 536        }
 537        put_online_cpus();
 538
 539        return 0;
 540}
 541
 542static void end_power_clamp(void)
 543{
 544        int i;
 545        struct task_struct *thread;
 546
 547        clamping = false;
 548        /*
 549         * make clamping visible to other cpus and give per cpu clamping threads
 550         * sometime to exit, or gets killed later.
 551         */
 552        smp_mb();
 553        msleep(20);
 554        if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
 555                for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
 556                        pr_debug("clamping thread for cpu %d alive, kill\n", i);
 557                        thread = *per_cpu_ptr(powerclamp_thread, i);
 558                        kthread_stop(thread);
 559                }
 560        }
 561}
 562
 563static int powerclamp_cpu_callback(struct notifier_block *nfb,
 564                                unsigned long action, void *hcpu)
 565{
 566        unsigned long cpu = (unsigned long)hcpu;
 567        struct task_struct *thread;
 568        struct task_struct **percpu_thread =
 569                per_cpu_ptr(powerclamp_thread, cpu);
 570
 571        if (false == clamping)
 572                goto exit_ok;
 573
 574        switch (action) {
 575        case CPU_ONLINE:
 576                thread = kthread_create_on_node(clamp_thread,
 577                                                (void *) cpu,
 578                                                cpu_to_node(cpu),
 579                                                "kidle_inject/%lu", cpu);
 580                if (likely(!IS_ERR(thread))) {
 581                        kthread_bind(thread, cpu);
 582                        wake_up_process(thread);
 583                        *percpu_thread = thread;
 584                }
 585                /* prefer BSP as controlling CPU */
 586                if (cpu == 0) {
 587                        control_cpu = 0;
 588                        smp_mb();
 589                }
 590                break;
 591        case CPU_DEAD:
 592                if (test_bit(cpu, cpu_clamping_mask)) {
 593                        pr_err("cpu %lu dead but powerclamping thread is not\n",
 594                                cpu);
 595                        kthread_stop(*percpu_thread);
 596                }
 597                if (cpu == control_cpu) {
 598                        control_cpu = smp_processor_id();
 599                        smp_mb();
 600                }
 601        }
 602
 603exit_ok:
 604        return NOTIFY_OK;
 605}
 606
 607static struct notifier_block powerclamp_cpu_notifier = {
 608        .notifier_call = powerclamp_cpu_callback,
 609};
 610
 611static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
 612                                 unsigned long *state)
 613{
 614        *state = MAX_TARGET_RATIO;
 615
 616        return 0;
 617}
 618
 619static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
 620                                 unsigned long *state)
 621{
 622        if (true == clamping)
 623                *state = pkg_cstate_ratio_cur;
 624        else
 625                /* to save power, do not poll idle ratio while not clamping */
 626                *state = -1; /* indicates invalid state */
 627
 628        return 0;
 629}
 630
 631static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
 632                                 unsigned long new_target_ratio)
 633{
 634        int ret = 0;
 635
 636        new_target_ratio = clamp(new_target_ratio, 0UL,
 637                                (unsigned long) (MAX_TARGET_RATIO-1));
 638        if (set_target_ratio == 0 && new_target_ratio > 0) {
 639                pr_info("Start idle injection to reduce power\n");
 640                set_target_ratio = new_target_ratio;
 641                ret = start_power_clamp();
 642                goto exit_set;
 643        } else  if (set_target_ratio > 0 && new_target_ratio == 0) {
 644                pr_info("Stop forced idle injection\n");
 645                set_target_ratio = 0;
 646                end_power_clamp();
 647        } else  /* adjust currently running */ {
 648                set_target_ratio = new_target_ratio;
 649                /* make new set_target_ratio visible to other cpus */
 650                smp_mb();
 651        }
 652
 653exit_set:
 654        return ret;
 655}
 656
 657/* bind to generic thermal layer as cooling device*/
 658static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
 659        .get_max_state = powerclamp_get_max_state,
 660        .get_cur_state = powerclamp_get_cur_state,
 661        .set_cur_state = powerclamp_set_cur_state,
 662};
 663
 664/* runs on Nehalem and later */
 665static const struct x86_cpu_id intel_powerclamp_ids[] = {
 666        { X86_VENDOR_INTEL, 6, 0x1a},
 667        { X86_VENDOR_INTEL, 6, 0x1c},
 668        { X86_VENDOR_INTEL, 6, 0x1e},
 669        { X86_VENDOR_INTEL, 6, 0x1f},
 670        { X86_VENDOR_INTEL, 6, 0x25},
 671        { X86_VENDOR_INTEL, 6, 0x26},
 672        { X86_VENDOR_INTEL, 6, 0x2a},
 673        { X86_VENDOR_INTEL, 6, 0x2c},
 674        { X86_VENDOR_INTEL, 6, 0x2d},
 675        { X86_VENDOR_INTEL, 6, 0x2e},
 676        { X86_VENDOR_INTEL, 6, 0x2f},
 677        { X86_VENDOR_INTEL, 6, 0x3a},
 678        {}
 679};
 680MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
 681
 682static int powerclamp_probe(void)
 683{
 684        if (!x86_match_cpu(intel_powerclamp_ids)) {
 685                pr_err("Intel powerclamp does not run on family %d model %d\n",
 686                                boot_cpu_data.x86, boot_cpu_data.x86_model);
 687                return -ENODEV;
 688        }
 689        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
 690                !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
 691                !boot_cpu_has(X86_FEATURE_MWAIT) ||
 692                !boot_cpu_has(X86_FEATURE_ARAT))
 693                return -ENODEV;
 694
 695        /* find the deepest mwait value */
 696        find_target_mwait();
 697
 698        return 0;
 699}
 700
 701static int powerclamp_debug_show(struct seq_file *m, void *unused)
 702{
 703        int i = 0;
 704
 705        seq_printf(m, "controlling cpu: %d\n", control_cpu);
 706        seq_printf(m, "pct confidence steady dynamic (compensation)\n");
 707        for (i = 0; i < MAX_TARGET_RATIO; i++) {
 708                seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
 709                        i,
 710                        cal_data[i].confidence,
 711                        cal_data[i].steady_comp,
 712                        cal_data[i].dynamic_comp);
 713        }
 714
 715        return 0;
 716}
 717
 718static int powerclamp_debug_open(struct inode *inode,
 719                        struct file *file)
 720{
 721        return single_open(file, powerclamp_debug_show, inode->i_private);
 722}
 723
 724static const struct file_operations powerclamp_debug_fops = {
 725        .open           = powerclamp_debug_open,
 726        .read           = seq_read,
 727        .llseek         = seq_lseek,
 728        .release        = single_release,
 729        .owner          = THIS_MODULE,
 730};
 731
 732static inline void powerclamp_create_debug_files(void)
 733{
 734        debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
 735        if (!debug_dir)
 736                return;
 737
 738        if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
 739                                        cal_data, &powerclamp_debug_fops))
 740                goto file_error;
 741
 742        return;
 743
 744file_error:
 745        debugfs_remove_recursive(debug_dir);
 746}
 747
 748static int powerclamp_init(void)
 749{
 750        int retval;
 751        int bitmap_size;
 752
 753        bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
 754        cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
 755        if (!cpu_clamping_mask)
 756                return -ENOMEM;
 757
 758        /* probe cpu features and ids here */
 759        retval = powerclamp_probe();
 760        if (retval)
 761                return retval;
 762        /* set default limit, maybe adjusted during runtime based on feedback */
 763        window_size = 2;
 764        register_hotcpu_notifier(&powerclamp_cpu_notifier);
 765        powerclamp_thread = alloc_percpu(struct task_struct *);
 766        cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
 767                                                &powerclamp_cooling_ops);
 768        if (IS_ERR(cooling_dev))
 769                return -ENODEV;
 770
 771        if (!duration)
 772                duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);
 773        powerclamp_create_debug_files();
 774
 775        return 0;
 776}
 777module_init(powerclamp_init);
 778
 779static void powerclamp_exit(void)
 780{
 781        unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
 782        end_power_clamp();
 783        free_percpu(powerclamp_thread);
 784        thermal_cooling_device_unregister(cooling_dev);
 785        kfree(cpu_clamping_mask);
 786
 787        cancel_delayed_work_sync(&poll_pkg_cstate_work);
 788        debugfs_remove_recursive(debug_dir);
 789}
 790module_exit(powerclamp_exit);
 791
 792MODULE_LICENSE("GPL");
 793MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
 794MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
 795MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
 796