// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 *      TODO:
 *           1. better handling of wakeups from external interrupts. currently
 *              a fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. the
 *              reason is that, for external interrupts that need no ack,
 *              clamping down the cpu in non-irq context does not reduce the
 *              irq rate. for the majority of cases clamping down the cpu does
 *              help reduce irqs as well; we should be able to differentiate
 *              the two cases and give a quantitative solution for the irqs
 *              that we can control, perhaps based on get_cpu_iowait_time_us().
 *
 *           2. synchronization with other hw blocks
 */

#define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to meet
 * the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
                                  * update control parameters. Defaults to
                                  * the BSP, but the BSP can be offlined.
                                  */
static bool clamping;

static const struct sched_param sparam = {
        .sched_priority = MAX_USER_RT_PRIO / 2,
};

struct powerclamp_worker_data {
        struct kthread_worker *worker;
        struct kthread_work balancing_work;
        struct kthread_delayed_work idle_injection_work;
        unsigned int cpu;
        unsigned int count;
        unsigned int guard;
        unsigned int window_size_now;
        unsigned int target_ratio;
        unsigned int duration_jiffies;
        bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
                                           * clamping kthread worker
                                           */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_duration;

        ret = kstrtoul(arg, 10, &new_duration);
        if (ret)
                goto exit;
        if (new_duration > 25 || new_duration < 6) {
                pr_err("Out of recommended range %lu, between 6-25ms\n",
                        new_duration);
                /* reject out-of-range values instead of silently clamping */
                ret = -EINVAL;
                goto exit;
        }

        duration = clamp(new_duration, 6ul, 25ul);
        smp_mb();

exit:
        return ret;
}

static const struct kernel_param_ops duration_ops = {
        .set = duration_set,
        .get = param_get_int,
};

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

struct powerclamp_calibration_data {
        unsigned long confidence;  /* used for calibration, basically a counter
                                    * that gets incremented each time a clamping
                                    * period is completed without extra wakeups.
                                    * once the counter reaches a given level,
                                    * compensation is deemed usable.
                                    */
        unsigned long steady_comp; /* steady state compensation used when
                                    * no extra wakeups occurred.
                                    */
        unsigned long dynamic_comp; /* compensates for excessive wakeups from
                                     * idle, mostly from external interrupts.
                                     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_window_size;

        ret = kstrtoul(arg, 10, &new_window_size);
        if (ret)
                goto exit_win;
        if (new_window_size > 10 || new_window_size < 2) {
                pr_err("Out of recommended window size %lu, between 2-10\n",
                        new_window_size);
                /* reject out-of-range values instead of silently clamping */
                ret = -EINVAL;
                goto exit_win;
        }

        window_size = clamp(new_window_size, 2ul, 10ul);
        smp_mb();

exit_win:
        return ret;
}

static const struct kernel_param_ops window_size_ops = {
        .set = window_size_set,
        .get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
        "\tpowerclamp controls idle ratio within this window. larger\n"
        "\twindow size results in slower response time but smoother\n"
        "\tclamping results. defaults to 2.");

static void find_target_mwait(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int highest_cstate = 0;
        unsigned int highest_subcstate = 0;
        int i;

        if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
                return;

        cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
            !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
                return;

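        /*
         * CPUID.05H:EDX enumerates MWAIT sub-states in 4-bit groups, with C0
         * in bits 3:0. Skip C0 and pick the deepest C-state that advertises
         * at least one sub-state; the MWAIT hint is then encoded as
         * (cstate index << MWAIT_SUBSTATE_SIZE) | (sub-state - 1).
         */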
        edx >>= MWAIT_SUBSTATE_SIZE;
        for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
                if (edx & MWAIT_SUBSTATE_MASK) {
                        highest_cstate = i;
                        highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
                }
        }
        target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
                (highest_subcstate - 1);
}

struct pkg_cstate_info {
        bool skip;
        int msr_index;
        int cstate_id;
};

#define PKG_CSTATE_INIT(id) {                           \
                .msr_index = MSR_PKG_C##id##_RESIDENCY, \
                .cstate_id = id                         \
                        }

static struct pkg_cstate_info pkg_cstates[] = {
        PKG_CSTATE_INIT(2),
        PKG_CSTATE_INIT(3),
        PKG_CSTATE_INIT(6),
        PKG_CSTATE_INIT(7),
        PKG_CSTATE_INIT(8),
        PKG_CSTATE_INIT(9),
        PKG_CSTATE_INIT(10),
        {},
};

static bool has_pkg_state_counter(void)
{
        u64 val;
        struct pkg_cstate_info *info = pkg_cstates;

        /* check if any one of the counter msrs exists */
        while (info->msr_index) {
                if (!rdmsrl_safe(info->msr_index, &val))
                        return true;
                info++;
        }

        return false;
}

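/*
 * Sum the residency counters of all package C-states that are still
 * readable; an MSR that faults on read is marked and skipped on
 * subsequent polls.
 */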
static u64 pkg_state_counter(void)
{
        u64 val;
        u64 count = 0;
        struct pkg_cstate_info *info = pkg_cstates;

        while (info->msr_index) {
                if (!info->skip) {
                        if (!rdmsrl_safe(info->msr_index, &val))
                                count += val;
                        else
                                info->skip = true;
                }
                info++;
        }

        return count;
}

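/*
 * Return the extra idle percentage to inject on top of @ratio so that the
 * measured package C-state residency actually reaches the target. The
 * steady-state compensation is averaged over adjacent calibrated ratios
 * and is only used once their confidence counters reach CONFIDENCE_OK.
 */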
static unsigned int get_compensation(int ratio)
{
        unsigned int comp = 0;

        /* we only use compensation if all adjacent ones are good */
        if (ratio == 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio + 1].steady_comp +
                        cal_data[ratio + 2].steady_comp) / 3;
        } else if (ratio == MAX_TARGET_RATIO - 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio - 2].steady_comp) / 3;
        } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio + 1].steady_comp) / 3;
        }

        /* REVISIT: simple penalty of double idle injection */
        if (reduce_irq)
                comp = ratio;
        /* do not exceed limit */
        if (comp + ratio >= MAX_TARGET_RATIO)
                comp = MAX_TARGET_RATIO - ratio - 1;

        return comp;
}

static void adjust_compensation(int target_ratio, unsigned int win)
{
        int delta;
        struct powerclamp_calibration_data *d = &cal_data[target_ratio];

        /*
         * Stop adjusting once the confidence level has been reached, and
         * skip this window if there were too many wakeups during the last
         * idle injection period, since the data cannot be trusted.
         */
        if (d->confidence >= CONFIDENCE_OK ||
                atomic_read(&idle_wakeup_counter) >
                win * num_online_cpus())
                return;

        delta = set_target_ratio - current_ratio;
        /* filter out bad data */
        if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
                if (d->steady_comp)
                        d->steady_comp =
                                roundup(delta + d->steady_comp, 2) / 2;
                else
                        d->steady_comp = delta;
                d->confidence++;
        }
}

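/*
 * Compare the actual package C-state residency of the last window with the
 * target: current_ratio = 100 * (residency delta) / (tsc delta). do_div()
 * is used because this 64-bit division must also build on 32-bit kernels.
 * Returns true when we are already at or above target + guard, in which
 * case the next idle injection can be skipped.
 */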
static bool powerclamp_adjust_controls(unsigned int target_ratio,
                                unsigned int guard, unsigned int win)
{
        static u64 msr_last, tsc_last;
        u64 msr_now, tsc_now;
        u64 val64;

        /* check result for the last window */
        msr_now = pkg_state_counter();
        tsc_now = rdtsc();

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                current_ratio = 1;
        else if (tsc_now - tsc_last) {
                val64 = 100 * (msr_now - msr_last);
                do_div(val64, (tsc_now - tsc_last));
                current_ratio = val64;
        }

        /* update record */
        msr_last = msr_now;
        tsc_last = tsc_now;

        adjust_compensation(target_ratio, win);
        /*
         * too many external interrupts, set a flag so that we can take
         * measures later.
         */
        reduce_irq = atomic_read(&idle_wakeup_counter) >=
                2 * win * num_online_cpus();

        atomic_set(&idle_wakeup_counter, 0);
        /* if we are above target + guard, skip the next injection */
        return set_target_ratio + guard <= current_ratio;
}

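/*
 * The balancing work computes the control parameters for the current window
 * and sleeps until the next aligned injection slot; it then queues the idle
 * injection work, which in turn re-queues the balancing work, so the two
 * works ping-pong for as long as clamping is enabled.
 */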
static void clamp_balancing_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;
        int sleeptime;
        unsigned long target_jiffies;
        unsigned int compensated_ratio;
        int interval; /* jiffies to sleep for each attempt */

        w_data = container_of(work, struct powerclamp_worker_data,
                              balancing_work);

        /*
         * make sure the user-selected ratio does not take effect until
         * the next round. adjust target_ratio if the user has changed
         * the target, so that we can converge quickly.
         */
        w_data->target_ratio = READ_ONCE(set_target_ratio);
        w_data->guard = 1 + w_data->target_ratio / 20;
        w_data->window_size_now = window_size;
        w_data->duration_jiffies = msecs_to_jiffies(duration);
        w_data->count++;

        /*
         * systems may have different ability to enter package level
         * c-states, thus we need to compensate the injected idle ratio
         * to achieve the actual target reported by the HW.
         */
        compensated_ratio = w_data->target_ratio +
                get_compensation(w_data->target_ratio);
        if (!compensated_ratio)
                compensated_ratio = 1;
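        /*
         * Size the period so that one injection of duration_jiffies per
         * period yields the compensated idle percentage:
         *   interval * ratio / 100 == duration_jiffies.
         */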
        interval = w_data->duration_jiffies * 100 / compensated_ratio;

        /* align idle time */
        target_jiffies = roundup(jiffies, interval);
        sleeptime = target_jiffies - jiffies;
        if (sleeptime <= 0)
                sleeptime = 1;

        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_delayed_work(w_data->worker,
                                           &w_data->idle_injection_work,
                                           sleeptime);
}

static void clamp_idle_injection_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;

        w_data = container_of(work, struct powerclamp_worker_data,
                              idle_injection_work.work);

        /*
         * only the elected controlling cpu can collect stats and update
         * control parameters.
         */
        if (w_data->cpu == control_cpu &&
            !(w_data->count % w_data->window_size_now)) {
                should_skip =
                        powerclamp_adjust_controls(w_data->target_ratio,
                                                   w_data->guard,
                                                   w_data->window_size_now);
                smp_mb();
        }

        if (should_skip)
                goto balance;

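        /* inject the idle time: force this CPU idle for the given usecs */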
        play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
        static u64 msr_last;
        static u64 tsc_last;

        u64 msr_now;
        u64 tsc_now;
        u64 val64;

        msr_now = pkg_state_counter();
        tsc_now = rdtsc();

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                pkg_cstate_ratio_cur = 1;
        else {
                if (tsc_now - tsc_last) {
                        val64 = 100 * (msr_now - msr_last);
                        do_div(val64, (tsc_now - tsc_last));
                        pkg_cstate_ratio_cur = val64;
                }
        }

        /* update record */
        msr_last = msr_now;
        tsc_last = tsc_now;

        if (clamping)
                schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static void start_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
        struct kthread_worker *worker;

        worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
        if (IS_ERR(worker))
                return;

        w_data->worker = worker;
        w_data->count = 0;
        w_data->cpu = cpu;
        w_data->clamping = true;
        set_bit(cpu, cpu_clamping_mask);
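        /*
         * Run the worker with mid RT priority (SCHED_FIFO) so that idle
         * injection is not starved by normal-priority tasks.
         */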
        sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
        kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
        kthread_init_delayed_work(&w_data->idle_injection_work,
                                  clamp_idle_injection_func);
        kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

        if (!w_data->worker)
                return;

        w_data->clamping = false;
        /*
         * Make sure that all works that get queued after this point see
         * the clamping disabled. The counterpart is not needed because
         * there is an implicit memory barrier when the queued work
         * is processed.
         */
        smp_wmb();
        kthread_cancel_work_sync(&w_data->balancing_work);
        kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
        /*
         * The balancing work still might be queued here because
         * the handling of the "clamping" variable, cancel, and queue
         * operations are not synchronized via a lock. But it is not
         * a big deal. The balancing work is fast and
         * kthread_destroy_worker() will wait for it.
         */
        clear_bit(w_data->cpu, cpu_clamping_mask);
        kthread_destroy_worker(w_data->worker);

        w_data->worker = NULL;
}

static int start_power_clamp(void)
{
        unsigned long cpu;

        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
        /* prevent cpu hotplug */
        get_online_cpus();

        /* prefer BSP */
        control_cpu = 0;
        if (!cpu_online(control_cpu))
                control_cpu = smp_processor_id();

        clamping = true;
        schedule_delayed_work(&poll_pkg_cstate_work, 0);

        /* start one kthread worker per online cpu */
        for_each_online_cpu(cpu) {
                start_power_clamp_worker(cpu);
        }
        put_online_cpus();

        return 0;
}

static void end_power_clamp(void)
{
        int i;

        /*
         * Block requeuing in all the kthread workers. They will flush and
         * stop faster.
         */
        clamping = false;
        if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
                for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
                        pr_debug("clamping worker for cpu %d alive, destroy\n",
                                 i);
                        stop_power_clamp_worker(i);
                }
        }
}

static int powerclamp_cpu_online(unsigned int cpu)
{
        if (!clamping)
                return 0;
        start_power_clamp_worker(cpu);
        /* prefer BSP as controlling CPU */
        if (cpu == 0) {
                control_cpu = 0;
                smp_mb();
        }
        return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
        if (!clamping)
                return 0;

        stop_power_clamp_worker(cpu);
        if (cpu != control_cpu)
                return 0;

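        /*
         * Hand the controlling role to another online CPU. The dying CPU
         * may still be present in cpu_online_mask at this point, so skip
         * over it if cpumask_first() returns it.
         */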
        control_cpu = cpumask_first(cpu_online_mask);
        if (control_cpu == cpu)
                control_cpu = cpumask_next(cpu, cpu_online_mask);
        smp_mb();
        return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        *state = MAX_TARGET_RATIO;

        return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        if (clamping)
                *state = pkg_cstate_ratio_cur;
        else
                /* to save power, do not poll idle ratio while not clamping */
                *state = -1; /* indicates invalid state */

        return 0;
}

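/*
 * The cooling device state is the requested idle injection ratio in percent.
 * Moving from zero to nonzero starts clamping, moving back to zero stops it,
 * and any other change simply retargets the running control loop.
 */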
static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long new_target_ratio)
{
        int ret = 0;

        new_target_ratio = clamp(new_target_ratio, 0UL,
                                (unsigned long) (MAX_TARGET_RATIO - 1));
        if (set_target_ratio == 0 && new_target_ratio > 0) {
                pr_info("Start idle injection to reduce power\n");
                set_target_ratio = new_target_ratio;
                ret = start_power_clamp();
                goto exit_set;
        } else if (set_target_ratio > 0 && new_target_ratio == 0) {
                pr_info("Stop forced idle injection\n");
                end_power_clamp();
                set_target_ratio = 0;
        } else {        /* adjust currently running */
                set_target_ratio = new_target_ratio;
                /* make new set_target_ratio visible to other cpus */
                smp_mb();
        }

exit_set:
        return ret;
}

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
        .get_max_state = powerclamp_get_max_state,
        .get_cur_state = powerclamp_get_cur_state,
        .set_cur_state = powerclamp_set_cur_state,
};

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
        {}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
        if (!x86_match_cpu(intel_powerclamp_ids)) {
                pr_err("CPU does not support MWAIT\n");
                return -ENODEV;
        }

        /*
         * The goal of aligned idle injection is to enter package C-states;
         * that requires the package residency counters to exist.
         */
        if (!has_pkg_state_counter()) {
                pr_info("No package C-state available\n");
                return -ENODEV;
        }

        /* find the deepest mwait value */
        find_target_mwait();

        return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
        int i = 0;

        seq_printf(m, "controlling cpu: %u\n", control_cpu);
        seq_puts(m, "pct confidence steady dynamic (compensation)\n");
        for (i = 0; i < MAX_TARGET_RATIO; i++) {
                seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
                        i,
                        cal_data[i].confidence,
                        cal_data[i].steady_comp,
                        cal_data[i].dynamic_comp);
        }

        return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
        debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

        debugfs_create_file("powerclamp_calib", 0444, debug_dir, cal_data,
                            &powerclamp_debug_fops);
}

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
        int retval;
        int bitmap_size;

        bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
        cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
        if (!cpu_clamping_mask)
                return -ENOMEM;

        /* probe cpu features and ids here */
        retval = powerclamp_probe();
        if (retval)
                goto exit_free;

        /* set default limit, maybe adjusted during runtime based on feedback */
        window_size = 2;
        retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                           "thermal/intel_powerclamp:online",
                                           powerclamp_cpu_online,
                                           powerclamp_cpu_predown);
        if (retval < 0)
                goto exit_free;

        hp_state = retval;

        worker_data = alloc_percpu(struct powerclamp_worker_data);
        if (!worker_data) {
                retval = -ENOMEM;
                goto exit_unregister;
        }

        cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
                                                &powerclamp_cooling_ops);
        if (IS_ERR(cooling_dev)) {
                retval = -ENODEV;
                goto exit_free_thread;
        }

        if (!duration)
                duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

        powerclamp_create_debug_files();

        return 0;

exit_free_thread:
        free_percpu(worker_data);
exit_unregister:
        cpuhp_remove_state_nocalls(hp_state);
exit_free:
        kfree(cpu_clamping_mask);
        return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
        end_power_clamp();
        cpuhp_remove_state_nocalls(hp_state);
        free_percpu(worker_data);
        thermal_cooling_device_unregister(cooling_dev);
        kfree(cpu_clamping_mask);

        cancel_delayed_work_sync(&poll_pkg_cstate_work);
        debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");