linux/drivers/thermal/x86_pkg_temp_thermal.c
/*
 * x86_pkg_temp_thermal driver
 * Copyright (c) 2013, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.
 *
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/param.h>
#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/pm.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>
#include <asm/cpu_device_id.h>
#include <asm/mce.h>

/*
 * Rate control delay: The idea is to introduce a debounce effect.
 * This should be long enough to avoid a flood of events when a
 * threshold is set to a temperature which is constantly violated,
 * but short enough that an action can still be taken. The action
 * can be to remove the threshold or to change it to the next
 * interesting setting. Based on experiments, roughly every 5
 * seconds under load gives us a significant temperature change.
 */
#define PKG_TEMP_THERMAL_NOTIFY_DELAY   5000
static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
module_param(notify_delay_ms, int, 0644);
MODULE_PARM_DESC(notify_delay_ms,
        "User space notification delay in milliseconds.");

/*
 * Number of trip points in the thermal zone. Currently it can't be
 * more than 2. The MSR only allows setting and getting notifications
 * for 2 thresholds. This define enforces the limit, in case cpuid
 * returns a wrong value for the number of thresholds.
 */
#define MAX_NUMBER_OF_TRIPS     2

struct pkg_device {
        int                             cpu;
        bool                            work_scheduled;
        u32                             tj_max;
        u32                             msr_pkg_therm_low;
        u32                             msr_pkg_therm_high;
        struct delayed_work             work;
        struct thermal_zone_device      *tzone;
        struct cpumask                  cpumask;
};

static struct thermal_zone_params pkg_temp_tz_params = {
        .no_hwmon       = true,
};

/* Keep track of how many package pointers we allocated in init() */
static int max_packages __read_mostly;
/* Array of package pointers */
static struct pkg_device **packages;
/* Serializes interrupt notification, work and hotplug */
static DEFINE_SPINLOCK(pkg_temp_lock);
/* Protects zone operation in the work function against hotplug removal */
static DEFINE_MUTEX(thermal_zone_mutex);

/* The dynamically assigned cpu hotplug state for module_exit() */
static enum cpuhp_state pkg_thermal_hp_state __read_mostly;

/* Debug counters to show using debugfs */
static struct dentry *debugfs;
static unsigned int pkg_interrupt_cnt;
static unsigned int pkg_work_cnt;

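/*
 * Expose the interrupt and work counters via debugfs
 * (typically /sys/kernel/debug/pkg_temp_thermal/).
 */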
static int pkg_temp_debugfs_init(void)
{
        struct dentry *d;

        debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);
        if (!debugfs)
                return -ENOENT;

        d = debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
                               &pkg_interrupt_cnt);
        if (!d)
                goto err_out;

        d = debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
                               &pkg_work_cnt);
        if (!d)
                goto err_out;

        return 0;

err_out:
        debugfs_remove_recursive(debugfs);
        return -ENOENT;
}

/*
 * Protection:
 *
 * - cpu hotplug: Read serialized by cpu hotplug lock
 *                Write must hold pkg_temp_lock
 *
 * - Other callsites: Must hold pkg_temp_lock
 */
static struct pkg_device *pkg_temp_thermal_get_dev(unsigned int cpu)
{
        int pkgid = topology_logical_package_id(cpu);

        if (pkgid >= 0 && pkgid < max_packages)
                return packages[pkgid];
        return NULL;
}

/*
 * tj-max is interesting because the thresholds are set relative to this
 * temperature.
 */
static int get_tj_max(int cpu, u32 *tj_max)
{
        u32 eax, edx, val;
        int err;

        err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
        if (err)
                return err;

        val = (eax >> 16) & 0xff;
        *tj_max = val * 1000;

        return val ? 0 : -EINVAL;
}

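/*
 * The package temperature is reported in MSR_IA32_PACKAGE_THERM_STATUS:
 * bit 31 indicates a valid reading, bits 22:16 hold the digital readout
 * in degrees Celsius below tj_max.
 */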
static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
{
        struct pkg_device *pkgdev = tzd->devdata;
        u32 eax, edx;

        rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_STATUS, &eax, &edx);
        if (eax & 0x80000000) {
                *temp = pkgdev->tj_max - ((eax >> 16) & 0x7f) * 1000;
                pr_debug("sys_get_curr_temp %d\n", *temp);
                return 0;
        }
        return -EINVAL;
}

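/*
 * Trip temperatures are stored in MSR_IA32_PACKAGE_THERM_INTERRUPT as
 * offsets below tj_max; a threshold field of 0 means the trip is unused.
 */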
static int sys_get_trip_temp(struct thermal_zone_device *tzd,
                             int trip, int *temp)
{
        struct pkg_device *pkgdev = tzd->devdata;
        unsigned long thres_reg_value;
        u32 mask, shift, eax, edx;
        int ret;

        if (trip >= MAX_NUMBER_OF_TRIPS)
                return -EINVAL;

        if (trip) {
                mask = THERM_MASK_THRESHOLD1;
                shift = THERM_SHIFT_THRESHOLD1;
        } else {
                mask = THERM_MASK_THRESHOLD0;
                shift = THERM_SHIFT_THRESHOLD0;
        }

        ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
                           &eax, &edx);
        if (ret < 0)
                return ret;

        thres_reg_value = (eax & mask) >> shift;
        if (thres_reg_value)
                *temp = pkgdev->tj_max - thres_reg_value * 1000;
        else
                *temp = 0;
        pr_debug("sys_get_trip_temp %d\n", *temp);

        return 0;
}

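/*
 * Program a trip point: write the requested temperature as an offset below
 * tj_max into the threshold field and enable the matching threshold
 * interrupt; a temperature of 0 disables the threshold instead.
 */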
static int
sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
{
        struct pkg_device *pkgdev = tzd->devdata;
        u32 l, h, mask, shift, intr;
        int ret;

        if (trip >= MAX_NUMBER_OF_TRIPS || temp >= pkgdev->tj_max)
                return -EINVAL;

        ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
                           &l, &h);
        if (ret < 0)
                return ret;

        if (trip) {
                mask = THERM_MASK_THRESHOLD1;
                shift = THERM_SHIFT_THRESHOLD1;
                intr = THERM_INT_THRESHOLD1_ENABLE;
        } else {
                mask = THERM_MASK_THRESHOLD0;
                shift = THERM_SHIFT_THRESHOLD0;
                intr = THERM_INT_THRESHOLD0_ENABLE;
        }
        l &= ~mask;
        /*
         * A trip temperature of 0 is user space's indication that it is no
         * longer interested in receiving notifications.
         */
        if (!temp) {
                l &= ~intr;
        } else {
                l |= (pkgdev->tj_max - temp)/1000 << shift;
                l |= intr;
        }

        return wrmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip,
                             enum thermal_trip_type *type)
{
        *type = THERMAL_TRIP_PASSIVE;
        return 0;
}

/* Thermal zone callback registry */
static struct thermal_zone_device_ops tzone_ops = {
        .get_temp = sys_get_curr_temp,
        .get_trip_temp = sys_get_trip_temp,
        .get_trip_type = sys_get_trip_type,
        .set_trip_temp = sys_set_trip_temp,
};

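/*
 * Returning true tells the core thermal interrupt code that this driver
 * does its own rate control (via the delayed work below), so threshold
 * events are handed straight to pkg_thermal_notify without extra throttling.
 */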
static bool pkg_thermal_rate_control(void)
{
        return true;
}

/* Enable threshold interrupt on local package/cpu */
static inline void enable_pkg_thres_interrupt(void)
{
        u8 thres_0, thres_1;
        u32 l, h;

        rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
        /* Only enable a threshold interrupt if its threshold value is valid. */
        thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
        thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
        if (thres_0)
                l |= THERM_INT_THRESHOLD0_ENABLE;
        if (thres_1)
                l |= THERM_INT_THRESHOLD1_ENABLE;
        wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

/* Disable threshold interrupt on local package/cpu */
static inline void disable_pkg_thres_interrupt(void)
{
        u32 l, h;

        rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);

        l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
        wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

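/*
 * Delayed work run on a CPU of the affected package: clear the threshold
 * log bits in the status MSR, re-enable the threshold interrupt and, if an
 * event was actually logged, notify the thermal core about the zone update.
 */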
static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
{
        struct thermal_zone_device *tzone = NULL;
        int cpu = smp_processor_id();
        struct pkg_device *pkgdev;
        u64 msr_val, wr_val;

        mutex_lock(&thermal_zone_mutex);
        spin_lock_irq(&pkg_temp_lock);
        ++pkg_work_cnt;

        pkgdev = pkg_temp_thermal_get_dev(cpu);
        if (!pkgdev) {
                spin_unlock_irq(&pkg_temp_lock);
                mutex_unlock(&thermal_zone_mutex);
                return;
        }
        pkgdev->work_scheduled = false;

        rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
        wr_val = msr_val & ~(THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
        if (wr_val != msr_val) {
                wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, wr_val);
                tzone = pkgdev->tzone;
        }

        enable_pkg_thres_interrupt();
        spin_unlock_irq(&pkg_temp_lock);

        /*
         * If tzone is not NULL, then thermal_zone_mutex will prevent the
         * concurrent removal in the cpu offline callback.
         */
        if (tzone)
                thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);

        mutex_unlock(&thermal_zone_mutex);
}

static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
{
        unsigned long ms = msecs_to_jiffies(notify_delay_ms);

        schedule_delayed_work_on(cpu, work, ms);
}

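/*
 * Package threshold interrupt callback, invoked from the thermal interrupt
 * handler. Mask further threshold interrupts and debounce by scheduling the
 * delayed work on the package's target CPU; the work re-enables them later.
 */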
static int pkg_thermal_notify(u64 msr_val)
{
        int cpu = smp_processor_id();
        struct pkg_device *pkgdev;
        unsigned long flags;

        spin_lock_irqsave(&pkg_temp_lock, flags);
        ++pkg_interrupt_cnt;

        disable_pkg_thres_interrupt();

        /* Work is per package, so scheduling it once is enough. */
        pkgdev = pkg_temp_thermal_get_dev(cpu);
        if (pkgdev && !pkgdev->work_scheduled) {
                pkgdev->work_scheduled = true;
                pkg_thermal_schedule_work(pkgdev->cpu, &pkgdev->work);
        }

        spin_unlock_irqrestore(&pkg_temp_lock, flags);
        return 0;
}

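/*
 * Called for the first online CPU of a package: read the number of
 * programmable thresholds from CPUID leaf 6 (EBX), look up tj_max and
 * register a thermal zone for the package.
 */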
static int pkg_temp_thermal_device_add(unsigned int cpu)
{
        int pkgid = topology_logical_package_id(cpu);
        u32 tj_max, eax, ebx, ecx, edx;
        struct pkg_device *pkgdev;
        int thres_count, err;

        if (pkgid >= max_packages)
                return -ENOMEM;

        cpuid(6, &eax, &ebx, &ecx, &edx);
        thres_count = ebx & 0x07;
        if (!thres_count)
                return -ENODEV;

        thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);

        err = get_tj_max(cpu, &tj_max);
        if (err)
                return err;

        pkgdev = kzalloc(sizeof(*pkgdev), GFP_KERNEL);
        if (!pkgdev)
                return -ENOMEM;

        INIT_DELAYED_WORK(&pkgdev->work, pkg_temp_thermal_threshold_work_fn);
        pkgdev->cpu = cpu;
        pkgdev->tj_max = tj_max;
        pkgdev->tzone = thermal_zone_device_register("x86_pkg_temp",
                        thres_count,
                        (thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
                        pkgdev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
        if (IS_ERR(pkgdev->tzone)) {
                err = PTR_ERR(pkgdev->tzone);
                kfree(pkgdev);
                return err;
        }
        /* Store MSR value for package thermal interrupt, to restore at exit */
        rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, pkgdev->msr_pkg_therm_low,
              pkgdev->msr_pkg_therm_high);

        cpumask_set_cpu(cpu, &pkgdev->cpumask);
        spin_lock_irq(&pkg_temp_lock);
        packages[pkgid] = pkgdev;
        spin_unlock_irq(&pkg_temp_lock);
        return 0;
}

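/*
 * CPU offline callback: if other CPUs in the package remain, move the
 * interrupt/work target to one of them; if this is the last CPU, unregister
 * the thermal zone, restore the saved interrupt MSR and free the package.
 */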
static int pkg_thermal_cpu_offline(unsigned int cpu)
{
        struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu);
        bool lastcpu, was_target;
        int target;

        if (!pkgdev)
                return 0;

        target = cpumask_any_but(&pkgdev->cpumask, cpu);
        cpumask_clear_cpu(cpu, &pkgdev->cpumask);
        lastcpu = target >= nr_cpu_ids;
        /*
         * If this is the last cpu in the package, remove the sysfs files
         * before doing further cleanups.
         */
        if (lastcpu) {
                struct thermal_zone_device *tzone = pkgdev->tzone;

                /*
                 * We must protect against a work function calling
                 * thermal_zone_device_update after/while unregister. We null
                 * out the pointer under the zone mutex, so the worker
                 * function won't try to call it.
                 */
                mutex_lock(&thermal_zone_mutex);
                pkgdev->tzone = NULL;
                mutex_unlock(&thermal_zone_mutex);

                thermal_zone_device_unregister(tzone);
        }

        /* Protect against work and interrupts */
        spin_lock_irq(&pkg_temp_lock);

        /*
         * Check whether this cpu was the current target and store the new
         * one. When we drop the lock, then the interrupt notify function
         * will see the new target.
         */
        was_target = pkgdev->cpu == cpu;
        pkgdev->cpu = target;

        /*
         * If this is the last CPU in the package remove the package
         * reference from the array and restore the interrupt MSR. When we
         * drop the lock neither the interrupt notify function nor the
         * worker will see the package anymore.
         */
        if (lastcpu) {
                packages[topology_logical_package_id(cpu)] = NULL;
                /* After this point nothing touches the MSR anymore. */
                wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
                      pkgdev->msr_pkg_therm_low, pkgdev->msr_pkg_therm_high);
        }

        /*
         * Check whether there is work scheduled and whether the work is
         * targeted at the outgoing CPU.
         */
        if (pkgdev->work_scheduled && was_target) {
                /*
                 * To cancel the work we need to drop the lock, otherwise
                 * we might deadlock if the work needs to be flushed.
                 */
                spin_unlock_irq(&pkg_temp_lock);
                cancel_delayed_work_sync(&pkgdev->work);
                spin_lock_irq(&pkg_temp_lock);
                /*
                 * If this is not the last cpu in the package and the work
                 * did not run after we dropped the lock above, then we
                 * need to reschedule the work, otherwise the interrupt
                 * stays disabled forever.
                 */
                if (!lastcpu && pkgdev->work_scheduled)
                        pkg_thermal_schedule_work(target, &pkgdev->work);
        }

        spin_unlock_irq(&pkg_temp_lock);

        /* Final cleanup if this is the last cpu */
        if (lastcpu)
                kfree(pkgdev);
        return 0;
}

static int pkg_thermal_cpu_online(unsigned int cpu)
{
        struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu);
        struct cpuinfo_x86 *c = &cpu_data(cpu);

        /* Paranoia check */
        if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
                return -ENODEV;

        /* If the package exists, nothing to do */
        if (pkgdev) {
                cpumask_set_cpu(cpu, &pkgdev->cpumask);
                return 0;
        }
        return pkg_temp_thermal_device_add(cpu);
}

static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_PTS },
        {}
};
MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);

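/*
 * Module init: check for a capable Intel CPU, allocate the per-package
 * pointer array, register the CPU hotplug callbacks (which create the
 * thermal zones) and install the package thermal interrupt callbacks.
 */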
static int __init pkg_temp_thermal_init(void)
{
        int ret;

        if (!x86_match_cpu(pkg_temp_thermal_ids))
                return -ENODEV;

        max_packages = topology_max_packages();
        packages = kcalloc(max_packages, sizeof(struct pkg_device *),
                           GFP_KERNEL);
        if (!packages)
                return -ENOMEM;

        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
                                pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
        if (ret < 0)
                goto err;

        /* Store the state for module exit */
        pkg_thermal_hp_state = ret;

        platform_thermal_package_notify = pkg_thermal_notify;
        platform_thermal_package_rate_control = pkg_thermal_rate_control;

        /* Don't care if it fails */
        pkg_temp_debugfs_init();
        return 0;

err:
        kfree(packages);
        return ret;
}
module_init(pkg_temp_thermal_init)

static void __exit pkg_temp_thermal_exit(void)
{
        platform_thermal_package_notify = NULL;
        platform_thermal_package_rate_control = NULL;

        cpuhp_remove_state(pkg_thermal_hp_state);
        debugfs_remove_recursive(debugfs);
        kfree(packages);
}
module_exit(pkg_temp_thermal_exit)

MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
MODULE_LICENSE("GPL v2");