1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26
27#include <linux/module.h>
28#include <linux/kernel.h>
29#include <linux/delay.h>
30#include <linux/kthread.h>
31#include <linux/cpu.h>
32#include <linux/thermal.h>
33#include <linux/slab.h>
34#include <linux/tick.h>
35#include <linux/debugfs.h>
36#include <linux/seq_file.h>
37#include <linux/sched/rt.h>
38#include <uapi/linux/sched/types.h>
39
40#include <asm/nmi.h>
41#include <asm/msr.h>
42#include <asm/mwait.h>
43#include <asm/cpu_device_id.h>
44#include <asm/hardirq.h>
45
/*
 * Size of the calibration table and exclusive upper bound for the idle
 * ratio (percent): requested ratios are clamped to MAX_TARGET_RATIO - 1.
 */
#define MAX_TARGET_RATIO (50U)

/*
 * Number of accepted calibration samples a ratio needs before its
 * steady-state compensation is used by get_compensation() and before
 * adjust_compensation() stops updating it.
 */
#define CONFIDENCE_OK (3)

/*
 * Fallback idle injection duration, in jiffies. It is converted to msec
 * and used when the "duration" module parameter is still zero at init.
 */
#define DEFAULT_DURATION_JIFFIES (6)
57
58static unsigned int target_mwait;
59static struct dentry *debug_dir;
60
61
62static unsigned int set_target_ratio;
63static unsigned int current_ratio;
64static bool should_skip;
65static bool reduce_irq;
66static atomic_t idle_wakeup_counter;
67static unsigned int control_cpu;
68
69
70
71static bool clamping;
72
73struct powerclamp_worker_data {
74 struct kthread_worker *worker;
75 struct kthread_work balancing_work;
76 struct kthread_delayed_work idle_injection_work;
77 unsigned int cpu;
78 unsigned int count;
79 unsigned int guard;
80 unsigned int window_size_now;
81 unsigned int target_ratio;
82 unsigned int duration_jiffies;
83 bool clamping;
84};
85
86static struct powerclamp_worker_data __percpu *worker_data;
87static struct thermal_cooling_device *cooling_dev;
88static unsigned long *cpu_clamping_mask;
89
90
91
92static unsigned int duration;
93static unsigned int pkg_cstate_ratio_cur;
94static unsigned int window_size;
95
96static int duration_set(const char *arg, const struct kernel_param *kp)
97{
98 int ret = 0;
99 unsigned long new_duration;
100
101 ret = kstrtoul(arg, 10, &new_duration);
102 if (ret)
103 goto exit;
104 if (new_duration > 25 || new_duration < 6) {
105 pr_err("Out of recommended range %lu, between 6-25ms\n",
106 new_duration);
107 ret = -EINVAL;
108 }
109
110 duration = clamp(new_duration, 6ul, 25ul);
111 smp_mb();
112
113exit:
114
115 return ret;
116}
117
118static const struct kernel_param_ops duration_ops = {
119 .set = duration_set,
120 .get = param_get_int,
121};
122
123
124module_param_cb(duration, &duration_ops, &duration, 0644);
125MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
126
127struct powerclamp_calibration_data {
128 unsigned long confidence;
129
130
131
132
133
134 unsigned long steady_comp;
135
136
137 unsigned long dynamic_comp;
138
139
140};
141
142static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
143
144static int window_size_set(const char *arg, const struct kernel_param *kp)
145{
146 int ret = 0;
147 unsigned long new_window_size;
148
149 ret = kstrtoul(arg, 10, &new_window_size);
150 if (ret)
151 goto exit_win;
152 if (new_window_size > 10 || new_window_size < 2) {
153 pr_err("Out of recommended window size %lu, between 2-10\n",
154 new_window_size);
155 ret = -EINVAL;
156 }
157
158 window_size = clamp(new_window_size, 2ul, 10ul);
159 smp_mb();
160
161exit_win:
162
163 return ret;
164}
165
166static const struct kernel_param_ops window_size_ops = {
167 .set = window_size_set,
168 .get = param_get_int,
169};
170
171module_param_cb(window_size, &window_size_ops, &window_size, 0644);
172MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
173 "\tpowerclamp controls idle ratio within this window. larger\n"
174 "\twindow size results in slower response time but more smooth\n"
175 "\tclamping results. default to 2.");
176
177static void find_target_mwait(void)
178{
179 unsigned int eax, ebx, ecx, edx;
180 unsigned int highest_cstate = 0;
181 unsigned int highest_subcstate = 0;
182 int i;
183
184 if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
185 return;
186
187 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
188
189 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
190 !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
191 return;
192
193 edx >>= MWAIT_SUBSTATE_SIZE;
194 for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
195 if (edx & MWAIT_SUBSTATE_MASK) {
196 highest_cstate = i;
197 highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
198 }
199 }
200 target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
201 (highest_subcstate - 1);
202
203}
204
205struct pkg_cstate_info {
206 bool skip;
207 int msr_index;
208 int cstate_id;
209};
210
211#define PKG_CSTATE_INIT(id) { \
212 .msr_index = MSR_PKG_C##id##_RESIDENCY, \
213 .cstate_id = id \
214 }
215
216static struct pkg_cstate_info pkg_cstates[] = {
217 PKG_CSTATE_INIT(2),
218 PKG_CSTATE_INIT(3),
219 PKG_CSTATE_INIT(6),
220 PKG_CSTATE_INIT(7),
221 PKG_CSTATE_INIT(8),
222 PKG_CSTATE_INIT(9),
223 PKG_CSTATE_INIT(10),
224 {NULL},
225};
226
227static bool has_pkg_state_counter(void)
228{
229 u64 val;
230 struct pkg_cstate_info *info = pkg_cstates;
231
232
233 while (info->msr_index) {
234 if (!rdmsrl_safe(info->msr_index, &val))
235 return true;
236 info++;
237 }
238
239 return false;
240}
241
242static u64 pkg_state_counter(void)
243{
244 u64 val;
245 u64 count = 0;
246 struct pkg_cstate_info *info = pkg_cstates;
247
248 while (info->msr_index) {
249 if (!info->skip) {
250 if (!rdmsrl_safe(info->msr_index, &val))
251 count += val;
252 else
253 info->skip = true;
254 }
255 info++;
256 }
257
258 return count;
259}
260
261static unsigned int get_compensation(int ratio)
262{
263 unsigned int comp = 0;
264
265
266 if (ratio == 1 &&
267 cal_data[ratio].confidence >= CONFIDENCE_OK &&
268 cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
269 cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
270 comp = (cal_data[ratio].steady_comp +
271 cal_data[ratio + 1].steady_comp +
272 cal_data[ratio + 2].steady_comp) / 3;
273 } else if (ratio == MAX_TARGET_RATIO - 1 &&
274 cal_data[ratio].confidence >= CONFIDENCE_OK &&
275 cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
276 cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
277 comp = (cal_data[ratio].steady_comp +
278 cal_data[ratio - 1].steady_comp +
279 cal_data[ratio - 2].steady_comp) / 3;
280 } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
281 cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
282 cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
283 comp = (cal_data[ratio].steady_comp +
284 cal_data[ratio - 1].steady_comp +
285 cal_data[ratio + 1].steady_comp) / 3;
286 }
287
288
289 if (reduce_irq)
290 comp = ratio;
291
292 if (comp + ratio >= MAX_TARGET_RATIO)
293 comp = MAX_TARGET_RATIO - ratio - 1;
294
295 return comp;
296}
297
298static void adjust_compensation(int target_ratio, unsigned int win)
299{
300 int delta;
301 struct powerclamp_calibration_data *d = &cal_data[target_ratio];
302
303
304
305
306
307
308 if (d->confidence >= CONFIDENCE_OK ||
309 atomic_read(&idle_wakeup_counter) >
310 win * num_online_cpus())
311 return;
312
313 delta = set_target_ratio - current_ratio;
314
315 if (delta >= 0 && delta <= (1+target_ratio/10)) {
316 if (d->steady_comp)
317 d->steady_comp =
318 roundup(delta+d->steady_comp, 2)/2;
319 else
320 d->steady_comp = delta;
321 d->confidence++;
322 }
323}
324
325static bool powerclamp_adjust_controls(unsigned int target_ratio,
326 unsigned int guard, unsigned int win)
327{
328 static u64 msr_last, tsc_last;
329 u64 msr_now, tsc_now;
330 u64 val64;
331
332
333 msr_now = pkg_state_counter();
334 tsc_now = rdtsc();
335
336
337 if (!msr_last || !tsc_last)
338 current_ratio = 1;
339 else if (tsc_now-tsc_last) {
340 val64 = 100*(msr_now-msr_last);
341 do_div(val64, (tsc_now-tsc_last));
342 current_ratio = val64;
343 }
344
345
346 msr_last = msr_now;
347 tsc_last = tsc_now;
348
349 adjust_compensation(target_ratio, win);
350
351
352
353
354 reduce_irq = atomic_read(&idle_wakeup_counter) >=
355 2 * win * num_online_cpus();
356
357 atomic_set(&idle_wakeup_counter, 0);
358
359 return set_target_ratio + guard <= current_ratio;
360}
361
362static void clamp_balancing_func(struct kthread_work *work)
363{
364 struct powerclamp_worker_data *w_data;
365 int sleeptime;
366 unsigned long target_jiffies;
367 unsigned int compensated_ratio;
368 int interval;
369
370 w_data = container_of(work, struct powerclamp_worker_data,
371 balancing_work);
372
373
374
375
376
377
378 w_data->target_ratio = READ_ONCE(set_target_ratio);
379 w_data->guard = 1 + w_data->target_ratio / 20;
380 w_data->window_size_now = window_size;
381 w_data->duration_jiffies = msecs_to_jiffies(duration);
382 w_data->count++;
383
384
385
386
387
388
389 compensated_ratio = w_data->target_ratio +
390 get_compensation(w_data->target_ratio);
391 if (compensated_ratio <= 0)
392 compensated_ratio = 1;
393 interval = w_data->duration_jiffies * 100 / compensated_ratio;
394
395
396 target_jiffies = roundup(jiffies, interval);
397 sleeptime = target_jiffies - jiffies;
398 if (sleeptime <= 0)
399 sleeptime = 1;
400
401 if (clamping && w_data->clamping && cpu_online(w_data->cpu))
402 kthread_queue_delayed_work(w_data->worker,
403 &w_data->idle_injection_work,
404 sleeptime);
405}
406
407static void clamp_idle_injection_func(struct kthread_work *work)
408{
409 struct powerclamp_worker_data *w_data;
410
411 w_data = container_of(work, struct powerclamp_worker_data,
412 idle_injection_work.work);
413
414
415
416
417
418 if (w_data->cpu == control_cpu &&
419 !(w_data->count % w_data->window_size_now)) {
420 should_skip =
421 powerclamp_adjust_controls(w_data->target_ratio,
422 w_data->guard,
423 w_data->window_size_now);
424 smp_mb();
425 }
426
427 if (should_skip)
428 goto balance;
429
430 play_idle(jiffies_to_usecs(w_data->duration_jiffies));
431
432balance:
433 if (clamping && w_data->clamping && cpu_online(w_data->cpu))
434 kthread_queue_work(w_data->worker, &w_data->balancing_work);
435}
436
437
438
439
440
441static void poll_pkg_cstate(struct work_struct *dummy);
442static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
443static void poll_pkg_cstate(struct work_struct *dummy)
444{
445 static u64 msr_last;
446 static u64 tsc_last;
447
448 u64 msr_now;
449 u64 tsc_now;
450 u64 val64;
451
452 msr_now = pkg_state_counter();
453 tsc_now = rdtsc();
454
455
456 if (!msr_last || !tsc_last)
457 pkg_cstate_ratio_cur = 1;
458 else {
459 if (tsc_now - tsc_last) {
460 val64 = 100 * (msr_now - msr_last);
461 do_div(val64, (tsc_now - tsc_last));
462 pkg_cstate_ratio_cur = val64;
463 }
464 }
465
466
467 msr_last = msr_now;
468 tsc_last = tsc_now;
469
470 if (true == clamping)
471 schedule_delayed_work(&poll_pkg_cstate_work, HZ);
472}
473
474static void start_power_clamp_worker(unsigned long cpu)
475{
476 struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
477 struct kthread_worker *worker;
478
479 worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
480 if (IS_ERR(worker))
481 return;
482
483 w_data->worker = worker;
484 w_data->count = 0;
485 w_data->cpu = cpu;
486 w_data->clamping = true;
487 set_bit(cpu, cpu_clamping_mask);
488 sched_set_fifo(worker->task);
489 kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
490 kthread_init_delayed_work(&w_data->idle_injection_work,
491 clamp_idle_injection_func);
492 kthread_queue_work(w_data->worker, &w_data->balancing_work);
493}
494
495static void stop_power_clamp_worker(unsigned long cpu)
496{
497 struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
498
499 if (!w_data->worker)
500 return;
501
502 w_data->clamping = false;
503
504
505
506
507
508
509 smp_wmb();
510 kthread_cancel_work_sync(&w_data->balancing_work);
511 kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
512
513
514
515
516
517
518
519 clear_bit(w_data->cpu, cpu_clamping_mask);
520 kthread_destroy_worker(w_data->worker);
521
522 w_data->worker = NULL;
523}
524
525static int start_power_clamp(void)
526{
527 unsigned long cpu;
528
529 set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
530
531 get_online_cpus();
532
533
534 control_cpu = 0;
535 if (!cpu_online(control_cpu))
536 control_cpu = smp_processor_id();
537
538 clamping = true;
539 schedule_delayed_work(&poll_pkg_cstate_work, 0);
540
541
542 for_each_online_cpu(cpu) {
543 start_power_clamp_worker(cpu);
544 }
545 put_online_cpus();
546
547 return 0;
548}
549
550static void end_power_clamp(void)
551{
552 int i;
553
554
555
556
557
558 clamping = false;
559 if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
560 for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
561 pr_debug("clamping worker for cpu %d alive, destroy\n",
562 i);
563 stop_power_clamp_worker(i);
564 }
565 }
566}
567
568static int powerclamp_cpu_online(unsigned int cpu)
569{
570 if (clamping == false)
571 return 0;
572 start_power_clamp_worker(cpu);
573
574 if (cpu == 0) {
575 control_cpu = 0;
576 smp_mb();
577 }
578 return 0;
579}
580
581static int powerclamp_cpu_predown(unsigned int cpu)
582{
583 if (clamping == false)
584 return 0;
585
586 stop_power_clamp_worker(cpu);
587 if (cpu != control_cpu)
588 return 0;
589
590 control_cpu = cpumask_first(cpu_online_mask);
591 if (control_cpu == cpu)
592 control_cpu = cpumask_next(cpu, cpu_online_mask);
593 smp_mb();
594 return 0;
595}
596
597static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
598 unsigned long *state)
599{
600 *state = MAX_TARGET_RATIO;
601
602 return 0;
603}
604
605static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
606 unsigned long *state)
607{
608 if (true == clamping)
609 *state = pkg_cstate_ratio_cur;
610 else
611
612 *state = -1;
613
614 return 0;
615}
616
617static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
618 unsigned long new_target_ratio)
619{
620 int ret = 0;
621
622 new_target_ratio = clamp(new_target_ratio, 0UL,
623 (unsigned long) (MAX_TARGET_RATIO-1));
624 if (set_target_ratio == 0 && new_target_ratio > 0) {
625 pr_info("Start idle injection to reduce power\n");
626 set_target_ratio = new_target_ratio;
627 ret = start_power_clamp();
628 goto exit_set;
629 } else if (set_target_ratio > 0 && new_target_ratio == 0) {
630 pr_info("Stop forced idle injection\n");
631 end_power_clamp();
632 set_target_ratio = 0;
633 } else {
634 set_target_ratio = new_target_ratio;
635
636 smp_mb();
637 }
638
639exit_set:
640 return ret;
641}
642
643
644static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
645 .get_max_state = powerclamp_get_max_state,
646 .get_cur_state = powerclamp_get_cur_state,
647 .set_cur_state = powerclamp_set_cur_state,
648};
649
650static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
651 X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
652 {}
653};
654MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
655
656static int __init powerclamp_probe(void)
657{
658
659 if (!x86_match_cpu(intel_powerclamp_ids)) {
660 pr_err("CPU does not support MWAIT\n");
661 return -ENODEV;
662 }
663
664
665 if (!has_pkg_state_counter()) {
666 pr_info("No package C-state available\n");
667 return -ENODEV;
668 }
669
670
671 find_target_mwait();
672
673 return 0;
674}
675
676static int powerclamp_debug_show(struct seq_file *m, void *unused)
677{
678 int i = 0;
679
680 seq_printf(m, "controlling cpu: %d\n", control_cpu);
681 seq_printf(m, "pct confidence steady dynamic (compensation)\n");
682 for (i = 0; i < MAX_TARGET_RATIO; i++) {
683 seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
684 i,
685 cal_data[i].confidence,
686 cal_data[i].steady_comp,
687 cal_data[i].dynamic_comp);
688 }
689
690 return 0;
691}
692
693DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);
694
695static inline void powerclamp_create_debug_files(void)
696{
697 debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
698
699 debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
700 &powerclamp_debug_fops);
701}
702
703static enum cpuhp_state hp_state;
704
705static int __init powerclamp_init(void)
706{
707 int retval;
708 int bitmap_size;
709
710 bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
711 cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
712 if (!cpu_clamping_mask)
713 return -ENOMEM;
714
715
716 retval = powerclamp_probe();
717 if (retval)
718 goto exit_free;
719
720
721 window_size = 2;
722 retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
723 "thermal/intel_powerclamp:online",
724 powerclamp_cpu_online,
725 powerclamp_cpu_predown);
726 if (retval < 0)
727 goto exit_free;
728
729 hp_state = retval;
730
731 worker_data = alloc_percpu(struct powerclamp_worker_data);
732 if (!worker_data) {
733 retval = -ENOMEM;
734 goto exit_unregister;
735 }
736
737 cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
738 &powerclamp_cooling_ops);
739 if (IS_ERR(cooling_dev)) {
740 retval = -ENODEV;
741 goto exit_free_thread;
742 }
743
744 if (!duration)
745 duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);
746
747 powerclamp_create_debug_files();
748
749 return 0;
750
751exit_free_thread:
752 free_percpu(worker_data);
753exit_unregister:
754 cpuhp_remove_state_nocalls(hp_state);
755exit_free:
756 kfree(cpu_clamping_mask);
757 return retval;
758}
759module_init(powerclamp_init);
760
761static void __exit powerclamp_exit(void)
762{
763 end_power_clamp();
764 cpuhp_remove_state_nocalls(hp_state);
765 free_percpu(worker_data);
766 thermal_cooling_device_unregister(cooling_dev);
767 kfree(cpu_clamping_mask);
768
769 cancel_delayed_work_sync(&poll_pkg_cstate_work);
770 debugfs_remove_recursive(debug_dir);
771}
772module_exit(powerclamp_exit);
773
774MODULE_LICENSE("GPL");
775MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
776MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
777MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
778