1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
41
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <linux/math64.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>
61
62#define MAX_TARGET_RATIO (50U)
63
64
65
66
67
68#define CONFIDENCE_OK (3)
69
70
71
72#define DEFAULT_DURATION_JIFFIES (6)
73
74static unsigned int target_mwait;
75static struct dentry *debug_dir;
76
77
78static unsigned int set_target_ratio;
79static unsigned int current_ratio;
80static bool should_skip;
81static bool reduce_irq;
82static atomic_t idle_wakeup_counter;
83static unsigned int control_cpu;
84
85
86
87static bool clamping;
88
89
90static struct task_struct * __percpu *powerclamp_thread;
91static struct thermal_cooling_device *cooling_dev;
92static unsigned long *cpu_clamping_mask;
93
94
95
96static unsigned int duration;
97static unsigned int pkg_cstate_ratio_cur;
98static unsigned int window_size;
99
100static int duration_set(const char *arg, const struct kernel_param *kp)
101{
102 int ret = 0;
103 unsigned long new_duration;
104
105 ret = kstrtoul(arg, 10, &new_duration);
106 if (ret)
107 goto exit;
108 if (new_duration > 25 || new_duration < 6) {
109 pr_err("Out of recommended range %lu, between 6-25ms\n",
110 new_duration);
111 ret = -EINVAL;
112 }
113
114 duration = clamp(new_duration, 6ul, 25ul);
115 smp_mb();
116
117exit:
118
119 return ret;
120}
121
122static struct kernel_param_ops duration_ops = {
123 .set = duration_set,
124 .get = param_get_int,
125};
126
127
128module_param_cb(duration, &duration_ops, &duration, 0644);
129MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
130
131struct powerclamp_calibration_data {
132 unsigned long confidence;
133
134
135
136
137
138 unsigned long steady_comp;
139
140
141 unsigned long dynamic_comp;
142
143
144};
145
146static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
147
148static int window_size_set(const char *arg, const struct kernel_param *kp)
149{
150 int ret = 0;
151 unsigned long new_window_size;
152
153 ret = kstrtoul(arg, 10, &new_window_size);
154 if (ret)
155 goto exit_win;
156 if (new_window_size > 10 || new_window_size < 2) {
157 pr_err("Out of recommended window size %lu, between 2-10\n",
158 new_window_size);
159 ret = -EINVAL;
160 }
161
162 window_size = clamp(new_window_size, 2ul, 10ul);
163 smp_mb();
164
165exit_win:
166
167 return ret;
168}
169
170static struct kernel_param_ops window_size_ops = {
171 .set = window_size_set,
172 .get = param_get_int,
173};
174
175module_param_cb(window_size, &window_size_ops, &window_size, 0644);
176MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
177 "\tpowerclamp controls idle ratio within this window. larger\n"
178 "\twindow size results in slower response time but more smooth\n"
179 "\tclamping results. default to 2.");
180
181static void find_target_mwait(void)
182{
183 unsigned int eax, ebx, ecx, edx;
184 unsigned int highest_cstate = 0;
185 unsigned int highest_subcstate = 0;
186 int i;
187
188 if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
189 return;
190
191 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
192
193 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
194 !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
195 return;
196
197 edx >>= MWAIT_SUBSTATE_SIZE;
198 for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
199 if (edx & MWAIT_SUBSTATE_MASK) {
200 highest_cstate = i;
201 highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
202 }
203 }
204 target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
205 (highest_subcstate - 1);
206
207}
208
209static bool has_pkg_state_counter(void)
210{
211 u64 tmp;
212 return !rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &tmp) ||
213 !rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &tmp) ||
214 !rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &tmp) ||
215 !rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &tmp);
216}
217
218static u64 pkg_state_counter(void)
219{
220 u64 val;
221 u64 count = 0;
222
223 static bool skip_c2;
224 static bool skip_c3;
225 static bool skip_c6;
226 static bool skip_c7;
227
228 if (!skip_c2) {
229 if (!rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &val))
230 count += val;
231 else
232 skip_c2 = true;
233 }
234
235 if (!skip_c3) {
236 if (!rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &val))
237 count += val;
238 else
239 skip_c3 = true;
240 }
241
242 if (!skip_c6) {
243 if (!rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &val))
244 count += val;
245 else
246 skip_c6 = true;
247 }
248
249 if (!skip_c7) {
250 if (!rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &val))
251 count += val;
252 else
253 skip_c7 = true;
254 }
255
256 return count;
257}
258
/*
 * Timer callback that deliberately does nothing; clamp_thread() arms a
 * timer with it only for the wakeup the expiry causes.
 */
static void noop_timer(unsigned long foo)
{
}
263
264static unsigned int get_compensation(int ratio)
265{
266 unsigned int comp = 0;
267
268
269 if (ratio == 1 &&
270 cal_data[ratio].confidence >= CONFIDENCE_OK &&
271 cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
272 cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
273 comp = (cal_data[ratio].steady_comp +
274 cal_data[ratio + 1].steady_comp +
275 cal_data[ratio + 2].steady_comp) / 3;
276 } else if (ratio == MAX_TARGET_RATIO - 1 &&
277 cal_data[ratio].confidence >= CONFIDENCE_OK &&
278 cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
279 cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
280 comp = (cal_data[ratio].steady_comp +
281 cal_data[ratio - 1].steady_comp +
282 cal_data[ratio - 2].steady_comp) / 3;
283 } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
284 cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
285 cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
286 comp = (cal_data[ratio].steady_comp +
287 cal_data[ratio - 1].steady_comp +
288 cal_data[ratio + 1].steady_comp) / 3;
289 }
290
291
292 if (reduce_irq)
293 comp = ratio;
294
295 if (comp + ratio >= MAX_TARGET_RATIO)
296 comp = MAX_TARGET_RATIO - ratio - 1;
297
298 return comp;
299}
300
301static void adjust_compensation(int target_ratio, unsigned int win)
302{
303 int delta;
304 struct powerclamp_calibration_data *d = &cal_data[target_ratio];
305
306
307
308
309
310
311 if (d->confidence >= CONFIDENCE_OK ||
312 atomic_read(&idle_wakeup_counter) >
313 win * num_online_cpus())
314 return;
315
316 delta = set_target_ratio - current_ratio;
317
318 if (delta >= 0 && delta <= (1+target_ratio/10)) {
319 if (d->steady_comp)
320 d->steady_comp =
321 roundup(delta+d->steady_comp, 2)/2;
322 else
323 d->steady_comp = delta;
324 d->confidence++;
325 }
326}
327
328static bool powerclamp_adjust_controls(unsigned int target_ratio,
329 unsigned int guard, unsigned int win)
330{
331 static u64 msr_last, tsc_last;
332 u64 msr_now, tsc_now;
333 u64 val64;
334
335
336 msr_now = pkg_state_counter();
337 rdtscll(tsc_now);
338
339
340 if (!msr_last || !tsc_last)
341 current_ratio = 1;
342 else if (tsc_now-tsc_last) {
343 val64 = 100*(msr_now-msr_last);
344 do_div(val64, (tsc_now-tsc_last));
345 current_ratio = val64;
346 }
347
348
349 msr_last = msr_now;
350 tsc_last = tsc_now;
351
352 adjust_compensation(target_ratio, win);
353
354
355
356
357 reduce_irq = atomic_read(&idle_wakeup_counter) >=
358 2 * win * num_online_cpus();
359
360 atomic_set(&idle_wakeup_counter, 0);
361
362 return set_target_ratio + guard <= current_ratio;
363}
364
365static int clamp_thread(void *arg)
366{
367 int cpunr = (unsigned long)arg;
368 DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
369 static const struct sched_param param = {
370 .sched_priority = MAX_USER_RT_PRIO/2,
371 };
372 unsigned int count = 0;
373 unsigned int target_ratio;
374
375 set_bit(cpunr, cpu_clamping_mask);
376 set_freezable();
377 init_timer_on_stack(&wakeup_timer);
378 sched_setscheduler(current, SCHED_FIFO, ¶m);
379
380 while (true == clamping && !kthread_should_stop() &&
381 cpu_online(cpunr)) {
382 int sleeptime;
383 unsigned long target_jiffies;
384 unsigned int guard;
385 unsigned int compensation = 0;
386 int interval;
387 unsigned int duration_jiffies = msecs_to_jiffies(duration);
388 unsigned int window_size_now;
389
390 try_to_freeze();
391
392
393
394
395
396 target_ratio = set_target_ratio;
397 guard = 1 + target_ratio/20;
398 window_size_now = window_size;
399 count++;
400
401
402
403
404
405
406 compensation = get_compensation(target_ratio);
407 interval = duration_jiffies*100/(target_ratio+compensation);
408
409
410 target_jiffies = roundup(jiffies, interval);
411 sleeptime = target_jiffies - jiffies;
412 if (sleeptime <= 0)
413 sleeptime = 1;
414 schedule_timeout_interruptible(sleeptime);
415
416
417
418
419 if (cpunr == control_cpu && !(count%window_size_now)) {
420 should_skip =
421 powerclamp_adjust_controls(target_ratio,
422 guard, window_size_now);
423 smp_mb();
424 }
425
426 if (should_skip)
427 continue;
428
429 target_jiffies = jiffies + duration_jiffies;
430 mod_timer(&wakeup_timer, target_jiffies);
431 if (unlikely(local_softirq_pending()))
432 continue;
433
434
435
436
437 preempt_disable();
438
439 while (time_before(jiffies, target_jiffies)) {
440 unsigned long ecx = 1;
441 unsigned long eax = target_mwait;
442
443
444
445
446
447 local_touch_nmi();
448 stop_critical_timings();
449 mwait_idle_with_hints(eax, ecx);
450 start_critical_timings();
451 atomic_inc(&idle_wakeup_counter);
452 }
453 preempt_enable();
454 }
455 del_timer_sync(&wakeup_timer);
456 clear_bit(cpunr, cpu_clamping_mask);
457
458 return 0;
459}
460
461
462
463
464
465static void poll_pkg_cstate(struct work_struct *dummy);
466static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
467static void poll_pkg_cstate(struct work_struct *dummy)
468{
469 static u64 msr_last;
470 static u64 tsc_last;
471 static unsigned long jiffies_last;
472
473 u64 msr_now;
474 unsigned long jiffies_now;
475 u64 tsc_now;
476 u64 val64;
477
478 msr_now = pkg_state_counter();
479 rdtscll(tsc_now);
480 jiffies_now = jiffies;
481
482
483 if (!msr_last || !tsc_last)
484 pkg_cstate_ratio_cur = 1;
485 else {
486 if (tsc_now - tsc_last) {
487 val64 = 100 * (msr_now - msr_last);
488 do_div(val64, (tsc_now - tsc_last));
489 pkg_cstate_ratio_cur = val64;
490 }
491 }
492
493
494 msr_last = msr_now;
495 jiffies_last = jiffies_now;
496 tsc_last = tsc_now;
497
498 if (true == clamping)
499 schedule_delayed_work(&poll_pkg_cstate_work, HZ);
500}
501
502static int start_power_clamp(void)
503{
504 unsigned long cpu;
505 struct task_struct *thread;
506
507
508 if (!has_pkg_state_counter()) {
509 pr_err("pkg cstate counter not functional, abort\n");
510 return -EINVAL;
511 }
512
513 set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
514
515 get_online_cpus();
516
517
518 control_cpu = 0;
519 if (!cpu_online(control_cpu))
520 control_cpu = smp_processor_id();
521
522 clamping = true;
523 schedule_delayed_work(&poll_pkg_cstate_work, 0);
524
525
526 for_each_online_cpu(cpu) {
527 struct task_struct **p =
528 per_cpu_ptr(powerclamp_thread, cpu);
529
530 thread = kthread_create_on_node(clamp_thread,
531 (void *) cpu,
532 cpu_to_node(cpu),
533 "kidle_inject/%ld", cpu);
534
535 if (likely(!IS_ERR(thread))) {
536 kthread_bind(thread, cpu);
537 wake_up_process(thread);
538 *p = thread;
539 }
540
541 }
542 put_online_cpus();
543
544 return 0;
545}
546
547static void end_power_clamp(void)
548{
549 int i;
550 struct task_struct *thread;
551
552 clamping = false;
553
554
555
556
557 smp_mb();
558 msleep(20);
559 if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
560 for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
561 pr_debug("clamping thread for cpu %d alive, kill\n", i);
562 thread = *per_cpu_ptr(powerclamp_thread, i);
563 kthread_stop(thread);
564 }
565 }
566}
567
568static int powerclamp_cpu_callback(struct notifier_block *nfb,
569 unsigned long action, void *hcpu)
570{
571 unsigned long cpu = (unsigned long)hcpu;
572 struct task_struct *thread;
573 struct task_struct **percpu_thread =
574 per_cpu_ptr(powerclamp_thread, cpu);
575
576 if (false == clamping)
577 goto exit_ok;
578
579 switch (action) {
580 case CPU_ONLINE:
581 thread = kthread_create_on_node(clamp_thread,
582 (void *) cpu,
583 cpu_to_node(cpu),
584 "kidle_inject/%lu", cpu);
585 if (likely(!IS_ERR(thread))) {
586 kthread_bind(thread, cpu);
587 wake_up_process(thread);
588 *percpu_thread = thread;
589 }
590
591 if (cpu == 0) {
592 control_cpu = 0;
593 smp_mb();
594 }
595 break;
596 case CPU_DEAD:
597 if (test_bit(cpu, cpu_clamping_mask)) {
598 pr_err("cpu %lu dead but powerclamping thread is not\n",
599 cpu);
600 kthread_stop(*percpu_thread);
601 }
602 if (cpu == control_cpu) {
603 control_cpu = smp_processor_id();
604 smp_mb();
605 }
606 }
607
608exit_ok:
609 return NOTIFY_OK;
610}
611
612static struct notifier_block powerclamp_cpu_notifier = {
613 .notifier_call = powerclamp_cpu_callback,
614};
615
616static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
617 unsigned long *state)
618{
619 *state = MAX_TARGET_RATIO;
620
621 return 0;
622}
623
624static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
625 unsigned long *state)
626{
627 if (true == clamping)
628 *state = pkg_cstate_ratio_cur;
629 else
630
631 *state = -1;
632
633 return 0;
634}
635
636static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
637 unsigned long new_target_ratio)
638{
639 int ret = 0;
640
641 new_target_ratio = clamp(new_target_ratio, 0UL,
642 (unsigned long) (MAX_TARGET_RATIO-1));
643 if (set_target_ratio == 0 && new_target_ratio > 0) {
644 pr_info("Start idle injection to reduce power\n");
645 set_target_ratio = new_target_ratio;
646 ret = start_power_clamp();
647 goto exit_set;
648 } else if (set_target_ratio > 0 && new_target_ratio == 0) {
649 pr_info("Stop forced idle injection\n");
650 set_target_ratio = 0;
651 end_power_clamp();
652 } else {
653 set_target_ratio = new_target_ratio;
654
655 smp_mb();
656 }
657
658exit_set:
659 return ret;
660}
661
662
663static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
664 .get_max_state = powerclamp_get_max_state,
665 .get_cur_state = powerclamp_get_cur_state,
666 .set_cur_state = powerclamp_set_cur_state,
667};
668
669
670static const struct x86_cpu_id intel_powerclamp_ids[] = {
671 { X86_VENDOR_INTEL, 6, 0x1a},
672 { X86_VENDOR_INTEL, 6, 0x1c},
673 { X86_VENDOR_INTEL, 6, 0x1e},
674 { X86_VENDOR_INTEL, 6, 0x1f},
675 { X86_VENDOR_INTEL, 6, 0x25},
676 { X86_VENDOR_INTEL, 6, 0x26},
677 { X86_VENDOR_INTEL, 6, 0x2a},
678 { X86_VENDOR_INTEL, 6, 0x2c},
679 { X86_VENDOR_INTEL, 6, 0x2d},
680 { X86_VENDOR_INTEL, 6, 0x2e},
681 { X86_VENDOR_INTEL, 6, 0x2f},
682 { X86_VENDOR_INTEL, 6, 0x37},
683 { X86_VENDOR_INTEL, 6, 0x3a},
684 { X86_VENDOR_INTEL, 6, 0x3c},
685 { X86_VENDOR_INTEL, 6, 0x3d},
686 { X86_VENDOR_INTEL, 6, 0x3e},
687 { X86_VENDOR_INTEL, 6, 0x3f},
688 { X86_VENDOR_INTEL, 6, 0x45},
689 { X86_VENDOR_INTEL, 6, 0x46},
690 { X86_VENDOR_INTEL, 6, 0x4c},
691 { X86_VENDOR_INTEL, 6, 0x4d},
692 { X86_VENDOR_INTEL, 6, 0x56},
693 {}
694};
695MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
696
697static int powerclamp_probe(void)
698{
699 if (!x86_match_cpu(intel_powerclamp_ids)) {
700 pr_err("Intel powerclamp does not run on family %d model %d\n",
701 boot_cpu_data.x86, boot_cpu_data.x86_model);
702 return -ENODEV;
703 }
704 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
705 !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
706 !boot_cpu_has(X86_FEATURE_MWAIT) ||
707 !boot_cpu_has(X86_FEATURE_ARAT))
708 return -ENODEV;
709
710
711 find_target_mwait();
712
713 return 0;
714}
715
716static int powerclamp_debug_show(struct seq_file *m, void *unused)
717{
718 int i = 0;
719
720 seq_printf(m, "controlling cpu: %d\n", control_cpu);
721 seq_printf(m, "pct confidence steady dynamic (compensation)\n");
722 for (i = 0; i < MAX_TARGET_RATIO; i++) {
723 seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
724 i,
725 cal_data[i].confidence,
726 cal_data[i].steady_comp,
727 cal_data[i].dynamic_comp);
728 }
729
730 return 0;
731}
732
733static int powerclamp_debug_open(struct inode *inode,
734 struct file *file)
735{
736 return single_open(file, powerclamp_debug_show, inode->i_private);
737}
738
739static const struct file_operations powerclamp_debug_fops = {
740 .open = powerclamp_debug_open,
741 .read = seq_read,
742 .llseek = seq_lseek,
743 .release = single_release,
744 .owner = THIS_MODULE,
745};
746
747static inline void powerclamp_create_debug_files(void)
748{
749 debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
750 if (!debug_dir)
751 return;
752
753 if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
754 cal_data, &powerclamp_debug_fops))
755 goto file_error;
756
757 return;
758
759file_error:
760 debugfs_remove_recursive(debug_dir);
761}
762
763static int powerclamp_init(void)
764{
765 int retval;
766 int bitmap_size;
767
768 bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
769 cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
770 if (!cpu_clamping_mask)
771 return -ENOMEM;
772
773
774 retval = powerclamp_probe();
775 if (retval)
776 goto exit_free;
777
778
779 window_size = 2;
780 register_hotcpu_notifier(&powerclamp_cpu_notifier);
781
782 powerclamp_thread = alloc_percpu(struct task_struct *);
783 if (!powerclamp_thread) {
784 retval = -ENOMEM;
785 goto exit_unregister;
786 }
787
788 cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
789 &powerclamp_cooling_ops);
790 if (IS_ERR(cooling_dev)) {
791 retval = -ENODEV;
792 goto exit_free_thread;
793 }
794
795 if (!duration)
796 duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);
797
798 powerclamp_create_debug_files();
799
800 return 0;
801
802exit_free_thread:
803 free_percpu(powerclamp_thread);
804exit_unregister:
805 unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
806exit_free:
807 kfree(cpu_clamping_mask);
808 return retval;
809}
810module_init(powerclamp_init);
811
812static void powerclamp_exit(void)
813{
814 unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
815 end_power_clamp();
816 free_percpu(powerclamp_thread);
817 thermal_cooling_device_unregister(cooling_dev);
818 kfree(cpu_clamping_mask);
819
820 cancel_delayed_work_sync(&poll_pkg_cstate_work);
821 debugfs_remove_recursive(debug_dir);
822}
823module_exit(powerclamp_exit);
824
825MODULE_LICENSE("GPL");
826MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
827MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
828MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
829