linux/drivers/thermal/intel/x86_pkg_temp_thermal.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86_pkg_temp_thermal driver
 * Copyright (c) 2013, Intel Corporation.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/param.h>
#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/pm.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>
#include <asm/cpu_device_id.h>
#include <asm/mce.h>

/*
 * Rate control delay: the idea is to introduce a debounce effect.
 * This should be long enough to avoid a flood of events when a
 * threshold is set to a temperature which is constantly violated,
 * but short enough that user space can still take a timely action.
 * The action can be to remove the threshold or to change it to the
 * next interesting setting. Based on experiments, a delay of around
 * 5 seconds under load gives a significant temperature change.
 */
#define PKG_TEMP_THERMAL_NOTIFY_DELAY   5000
static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
module_param(notify_delay_ms, int, 0644);
MODULE_PARM_DESC(notify_delay_ms,
        "User space notification delay in milliseconds.");

/*
 * Number of trip points in the thermal zone. Currently it can't be more
 * than 2. The MSR supports setting and getting notifications for only
 * 2 thresholds. This define enforces the limit in case CPUID returns a
 * wrong value for the number of thresholds.
 */
#define MAX_NUMBER_OF_TRIPS     2

struct zone_device {
        int                             cpu;
        bool                            work_scheduled;
        u32                             tj_max;
        u32                             msr_pkg_therm_low;
        u32                             msr_pkg_therm_high;
        struct delayed_work             work;
        struct thermal_zone_device      *tzone;
        struct cpumask                  cpumask;
};

static struct thermal_zone_params pkg_temp_tz_params = {
        .no_hwmon       = true,
};

/* Keep track of how many zone pointers we allocated in init() */
static int max_id __read_mostly;
/* Array of zone pointers */
static struct zone_device **zones;
/* Serializes interrupt notification, work and hotplug */
static DEFINE_SPINLOCK(pkg_temp_lock);
/* Protects zone operation in the work function against hotplug removal */
static DEFINE_MUTEX(thermal_zone_mutex);

/* The dynamically assigned cpu hotplug state for module_exit() */
static enum cpuhp_state pkg_thermal_hp_state __read_mostly;

/* Debug counters to show using debugfs */
static struct dentry *debugfs;
static unsigned int pkg_interrupt_cnt;
static unsigned int pkg_work_cnt;

static void pkg_temp_debugfs_init(void)
{
        debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);

        debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
                           &pkg_interrupt_cnt);
        debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
                           &pkg_work_cnt);
}

/*
 * Protection:
 *
 * - cpu hotplug: Read serialized by cpu hotplug lock
 *                Write must hold pkg_temp_lock
 *
 * - Other callsites: Must hold pkg_temp_lock
 */
static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu)
{
        int id = topology_logical_die_id(cpu);

        if (id >= 0 && id < max_id)
                return zones[id];
        return NULL;
}

/*
 * tj-max is interesting because the thresholds are set relative to this
 * temperature.
 */
static int get_tj_max(int cpu, u32 *tj_max)
{
        u32 eax, edx, val;
        int err;

        err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
        if (err)
                return err;

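        /*
         * Bits 23:16 of MSR_IA32_TEMPERATURE_TARGET hold the TCC activation
         * temperature (TJ max) in degrees Celsius.
         */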
        val = (eax >> 16) & 0xff;
        *tj_max = val * 1000;

        return val ? 0 : -EINVAL;
}

static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
{
        struct zone_device *zonedev = tzd->devdata;
        u32 eax, edx;

        rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_STATUS,
                        &eax, &edx);
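        /*
         * Bit 31 of the package thermal status MSR indicates that the
         * reading is valid; bits 22:16 give the offset below TJ max.
         */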
        if (eax & 0x80000000) {
                *temp = zonedev->tj_max - ((eax >> 16) & 0x7f) * 1000;
                pr_debug("sys_get_curr_temp %d\n", *temp);
                return 0;
        }
        return -EINVAL;
}

static int sys_get_trip_temp(struct thermal_zone_device *tzd,
                             int trip, int *temp)
{
        struct zone_device *zonedev = tzd->devdata;
        unsigned long thres_reg_value;
        u32 mask, shift, eax, edx;
        int ret;

        if (trip >= MAX_NUMBER_OF_TRIPS)
                return -EINVAL;

        if (trip) {
                mask = THERM_MASK_THRESHOLD1;
                shift = THERM_SHIFT_THRESHOLD1;
        } else {
                mask = THERM_MASK_THRESHOLD0;
                shift = THERM_SHIFT_THRESHOLD0;
        }

        ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
                           &eax, &edx);
        if (ret < 0)
                return ret;

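        /* The hardware stores each threshold as an offset below TJ max */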
        thres_reg_value = (eax & mask) >> shift;
        if (thres_reg_value)
                *temp = zonedev->tj_max - thres_reg_value * 1000;
        else
                *temp = 0;
        pr_debug("sys_get_trip_temp %d\n", *temp);

        return 0;
}

static int
sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
{
        struct zone_device *zonedev = tzd->devdata;
        u32 l, h, mask, shift, intr;
        int ret;

        if (trip >= MAX_NUMBER_OF_TRIPS || temp >= zonedev->tj_max)
                return -EINVAL;

        ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
                           &l, &h);
        if (ret < 0)
                return ret;

        if (trip) {
                mask = THERM_MASK_THRESHOLD1;
                shift = THERM_SHIFT_THRESHOLD1;
                intr = THERM_INT_THRESHOLD1_ENABLE;
        } else {
                mask = THERM_MASK_THRESHOLD0;
                shift = THERM_SHIFT_THRESHOLD0;
                intr = THERM_INT_THRESHOLD0_ENABLE;
        }
        l &= ~mask;
        /*
         * A trip temperature of 0 from user space indicates that it is no
         * longer interested in receiving notifications.
         */
        if (!temp) {
                l &= ~intr;
        } else {
                l |= (zonedev->tj_max - temp)/1000 << shift;
                l |= intr;
        }

        return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
                        l, h);
}

static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip,
                             enum thermal_trip_type *type)
{
        *type = THERMAL_TRIP_PASSIVE;
        return 0;
}

/* Thermal zone callback registry */
static struct thermal_zone_device_ops tzone_ops = {
        .get_temp = sys_get_curr_temp,
        .get_trip_temp = sys_get_trip_temp,
        .get_trip_type = sys_get_trip_type,
        .set_trip_temp = sys_set_trip_temp,
};

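/*
 * Hooked up as platform_thermal_package_rate_control in init(). Returning
 * true indicates that this driver does its own rate control (the
 * notify_delay_ms debounce above), so the caller need not rate limit
 * package threshold notifications.
 */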
static bool pkg_thermal_rate_control(void)
{
        return true;
}

/* Enable threshold interrupt on local package/cpu */
static inline void enable_pkg_thres_interrupt(void)
{
        u8 thres_0, thres_1;
        u32 l, h;

        rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
        /* only enable/disable if it had valid threshold value */
        thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
        thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
        if (thres_0)
                l |= THERM_INT_THRESHOLD0_ENABLE;
        if (thres_1)
                l |= THERM_INT_THRESHOLD1_ENABLE;
        wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

/* Disable threshold interrupt on local package/cpu */
static inline void disable_pkg_thres_interrupt(void)
{
        u32 l, h;

        rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);

        l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
        wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
{
        struct thermal_zone_device *tzone = NULL;
        int cpu = smp_processor_id();
        struct zone_device *zonedev;
        u64 msr_val, wr_val;

        mutex_lock(&thermal_zone_mutex);
        spin_lock_irq(&pkg_temp_lock);
        ++pkg_work_cnt;

        zonedev = pkg_temp_thermal_get_dev(cpu);
        if (!zonedev) {
                spin_unlock_irq(&pkg_temp_lock);
                mutex_unlock(&thermal_zone_mutex);
                return;
        }
        zonedev->work_scheduled = false;

        rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
        wr_val = msr_val & ~(THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
        if (wr_val != msr_val) {
                wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, wr_val);
                tzone = zonedev->tzone;
        }

        enable_pkg_thres_interrupt();
        spin_unlock_irq(&pkg_temp_lock);

        /*
         * If tzone is not NULL, then thermal_zone_mutex will prevent the
         * concurrent removal in the cpu offline callback.
         */
        if (tzone)
                thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);

        mutex_unlock(&thermal_zone_mutex);
}

static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
{
        unsigned long ms = msecs_to_jiffies(notify_delay_ms);

        schedule_delayed_work_on(cpu, work, ms);
}

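/*
 * Hooked up as platform_thermal_package_notify in init(), so this runs in
 * the thermal interrupt path. Only minimal work is done here; the MSR
 * handling and thermal zone update are deferred to the delayed work above.
 */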
static int pkg_thermal_notify(u64 msr_val)
{
        int cpu = smp_processor_id();
        struct zone_device *zonedev;
        unsigned long flags;

        spin_lock_irqsave(&pkg_temp_lock, flags);
        ++pkg_interrupt_cnt;

        disable_pkg_thres_interrupt();

        /* Work is per package, so scheduling it once is enough. */
        zonedev = pkg_temp_thermal_get_dev(cpu);
        if (zonedev && !zonedev->work_scheduled) {
                zonedev->work_scheduled = true;
                pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work);
        }

        spin_unlock_irqrestore(&pkg_temp_lock, flags);
        return 0;
}

static int pkg_temp_thermal_device_add(unsigned int cpu)
{
        int id = topology_logical_die_id(cpu);
        u32 tj_max, eax, ebx, ecx, edx;
        struct zone_device *zonedev;
        int thres_count, err;

        if (id >= max_id)
                return -ENOMEM;

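        /*
         * The low bits of CPUID.06H:EBX report the number of supported
         * digital thermal sensor interrupt thresholds.
         */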
        cpuid(6, &eax, &ebx, &ecx, &edx);
        thres_count = ebx & 0x07;
        if (!thres_count)
                return -ENODEV;

        thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);

        err = get_tj_max(cpu, &tj_max);
        if (err)
                return err;

        zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL);
        if (!zonedev)
                return -ENOMEM;

        INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn);
        zonedev->cpu = cpu;
        zonedev->tj_max = tj_max;
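        /*
         * The third argument is the bitmask of writable trip points:
         * 0x03 when both thresholds are available, 0x01 otherwise.
         */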
        zonedev->tzone = thermal_zone_device_register("x86_pkg_temp",
                        thres_count,
                        (thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
                        zonedev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
        if (IS_ERR(zonedev->tzone)) {
                err = PTR_ERR(zonedev->tzone);
                kfree(zonedev);
                return err;
        }
        /* Store MSR value for package thermal interrupt, to restore at exit */
        rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low,
              zonedev->msr_pkg_therm_high);

        cpumask_set_cpu(cpu, &zonedev->cpumask);
        spin_lock_irq(&pkg_temp_lock);
        zones[id] = zonedev;
        spin_unlock_irq(&pkg_temp_lock);
        return 0;
}

static int pkg_thermal_cpu_offline(unsigned int cpu)
{
        struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
        bool lastcpu, was_target;
        int target;

        if (!zonedev)
                return 0;

        target = cpumask_any_but(&zonedev->cpumask, cpu);
        cpumask_clear_cpu(cpu, &zonedev->cpumask);
        lastcpu = target >= nr_cpu_ids;
        /*
         * Remove the sysfs files if this is the last cpu in the package,
         * before doing further cleanups.
         */
        if (lastcpu) {
                struct thermal_zone_device *tzone = zonedev->tzone;

                /*
                 * We must protect against a work function calling
                 * thermal_zone_device_update() after/while the zone is
                 * unregistered. We null out the pointer under the zone
                 * mutex, so the work function won't try to call it.
                 */
                mutex_lock(&thermal_zone_mutex);
                zonedev->tzone = NULL;
                mutex_unlock(&thermal_zone_mutex);

                thermal_zone_device_unregister(tzone);
        }

        /* Protect against work and interrupts */
        spin_lock_irq(&pkg_temp_lock);

        /*
         * Check whether this cpu was the current target and store the new
         * one. When we drop the lock, then the interrupt notify function
         * will see the new target.
         */
        was_target = zonedev->cpu == cpu;
        zonedev->cpu = target;

        /*
         * If this is the last CPU in the package remove the package
         * reference from the array and restore the interrupt MSR. When we
         * drop the lock neither the interrupt notify function nor the
         * worker will see the package anymore.
         */
        if (lastcpu) {
                zones[topology_logical_die_id(cpu)] = NULL;
                /* After this point nothing touches the MSR anymore. */
                wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
                      zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high);
        }

        /*
         * Check whether there is work scheduled and whether the work is
         * targeted at the outgoing CPU.
         */
        if (zonedev->work_scheduled && was_target) {
                /*
                 * To cancel the work we need to drop the lock, otherwise
                 * we might deadlock if the work needs to be flushed.
                 */
                spin_unlock_irq(&pkg_temp_lock);
                cancel_delayed_work_sync(&zonedev->work);
                spin_lock_irq(&pkg_temp_lock);
                /*
                 * If this is not the last cpu in the package and the work
                 * did not run after we dropped the lock above, then we
                 * need to reschedule the work, otherwise the interrupt
                 * stays disabled forever.
                 */
                if (!lastcpu && zonedev->work_scheduled)
                        pkg_thermal_schedule_work(target, &zonedev->work);
        }

        spin_unlock_irq(&pkg_temp_lock);

        /* Final cleanup if this is the last cpu */
        if (lastcpu)
                kfree(zonedev);
        return 0;
}

static int pkg_thermal_cpu_online(unsigned int cpu)
{
        struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
        struct cpuinfo_x86 *c = &cpu_data(cpu);

        /* Paranoia check */
        if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
                return -ENODEV;

        /* If the package exists, nothing to do */
        if (zonedev) {
                cpumask_set_cpu(cpu, &zonedev->cpumask);
                return 0;
        }
        return pkg_temp_thermal_device_add(cpu);
}

static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_PTS },
        {}
};
MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);

static int __init pkg_temp_thermal_init(void)
{
        int ret;

        if (!x86_match_cpu(pkg_temp_thermal_ids))
                return -ENODEV;

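        /*
         * There is one zone per die, so size the lookup array for the
         * maximum possible number of dies in the system.
         */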
        max_id = topology_max_packages() * topology_max_die_per_package();
        zones = kcalloc(max_id, sizeof(struct zone_device *),
                           GFP_KERNEL);
        if (!zones)
                return -ENOMEM;

        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
                                pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
        if (ret < 0)
                goto err;

        /* Store the state for module exit */
        pkg_thermal_hp_state = ret;

        platform_thermal_package_notify = pkg_thermal_notify;
        platform_thermal_package_rate_control = pkg_thermal_rate_control;

        /* Don't care if it fails */
        pkg_temp_debugfs_init();
        return 0;

err:
        kfree(zones);
        return ret;
}
module_init(pkg_temp_thermal_init)

static void __exit pkg_temp_thermal_exit(void)
{
        platform_thermal_package_notify = NULL;
        platform_thermal_package_rate_control = NULL;

        cpuhp_remove_state(pkg_thermal_hp_state);
        debugfs_remove_recursive(debugfs);
        kfree(zones);
}
module_exit(pkg_temp_thermal_exit)

MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
MODULE_LICENSE("GPL v2");