linux/drivers/cpuidle/cpuidle-pseries.c
// SPDX-License-Identifier: GPL-2.0
/*
 *  cpuidle-pseries - idle state cpuidle driver.
 *  Adapted from drivers/idle/intel_idle.c and
 *  drivers/acpi/processor_idle.c
 *
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/cpuidle.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

#include <asm/paca.h>
#include <asm/reg.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/runlatch.h>
#include <asm/idle.h>
#include <asm/plpar_wrappers.h>
#include <asm/rtas.h>

static struct cpuidle_driver pseries_idle_driver = {
	.name             = "pseries_idle",
	.owner            = THIS_MODULE,
};

static int max_idle_state __read_mostly;
static struct cpuidle_state *cpuidle_state_table __read_mostly;
static u64 snooze_timeout __read_mostly;
static bool snooze_timeout_en __read_mostly;

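/*
 * snooze_loop() - Poll at low hardware-thread priority until the scheduler
 * needs this CPU or, if snooze_timeout_en is set, until the snooze timeout
 * expires; then return the entered state index.
 */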
static int snooze_loop(struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
			int index)
{
	u64 snooze_exit_time;

	set_thread_flag(TIF_POLLING_NRFLAG);

	pseries_idle_prolog();
	local_irq_enable();
	snooze_exit_time = get_tb() + snooze_timeout;

	while (!need_resched()) {
		HMT_low();
		HMT_very_low();
		if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
			/*
			 * Task has not woken up but we are exiting the polling
			 * loop anyway. Require a barrier after polling is
			 * cleared to order subsequent test of need_resched().
			 */
			clear_thread_flag(TIF_POLLING_NRFLAG);
			smp_mb();
			break;
		}
	}

	HMT_medium();
	clear_thread_flag(TIF_POLLING_NRFLAG);

	local_irq_disable();

	pseries_idle_epilog();

	return index;
}

static void check_and_cede_processor(void)
{
	/*
	 * Ensure our interrupt state is properly tracked, and check that
	 * no interrupt occurred while we were soft-disabled; only cede
	 * if none did.
	 */
	if (prep_irq_for_idle()) {
		cede_processor();
#ifdef CONFIG_TRACE_IRQFLAGS
		/* Ensure that H_CEDE returns with IRQs on */
		if (WARN_ON(!(mfmsr() & MSR_EE)))
			__hard_irq_enable();
#endif
	}
}

/*
 * XCEDE: Extended CEDE states discovered through the
 *        "ibm,get-system-parameter" RTAS call with the token
 *        CEDE_LATENCY_TOKEN
 */

/*
 * Section 7.3.16 System Parameters Option of PAPR version 2.8.1 has a
 * table with all the parameters to ibm,get-system-parameter.
 * CEDE_LATENCY_TOKEN corresponds to the token value for Cede Latency
 * Settings Information.
 */
#define CEDE_LATENCY_TOKEN	45

/*
 * If the platform supports the cede latency settings information system
 * parameter it must provide the following information in the NULL terminated
 * parameter string:
 *
 * a. The first byte is the length “N” of each cede latency setting record minus
 *    one (zero indicates a length of 1 byte).
 *
 * b. For each supported cede latency setting, a cede latency setting record
 *    consisting of the first “N” bytes as per the following table.
 *
 *    -----------------------------
 *    | Field           | Field   |
 *    | Name            | Length  |
 *    -----------------------------
 *    | Cede Latency    | 1 Byte  |
 *    | Specifier Value |         |
 *    -----------------------------
 *    | Maximum wakeup  |         |
 *    | latency in      | 8 Bytes |
 *    | tb-ticks        |         |
 *    -----------------------------
 *    | Responsive to   |         |
 *    | external        | 1 Byte  |
 *    | interrupts      |         |
 *    -----------------------------
 *
 * This version has a cede latency record size of 10 bytes.
 *
 * The structure xcede_latency_payload represents a) and b) with
 * xcede_latency_record representing the table in b).
 *
 * xcede_latency_parameter is what gets returned by the
 * ibm,get-system-parameter RTAS call when made with
 * CEDE_LATENCY_TOKEN.
 *
 * These structures are only used to represent the data obtained by the RTAS
 * call. The data is in big-endian.
 */
struct xcede_latency_record {
	u8	hint;
	__be64	latency_ticks;
	u8	wake_on_irqs;
} __packed;

// Make space for 16 records, which "should be enough".
struct xcede_latency_payload {
	u8     record_size;
	struct xcede_latency_record records[16];
} __packed;

struct xcede_latency_parameter {
	__be16	payload_size;
	struct xcede_latency_payload payload;
	u8 null_char;
} __packed;

static unsigned int nr_xcede_records;
static struct xcede_latency_parameter xcede_latency_parameter __initdata;

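/*
 * parse_cede_parameters() - Retrieve the Cede Latency Settings Information
 * system parameter via the "ibm,get-system-parameter" RTAS call, validate
 * the advertised record size, log each extended CEDE record, and set
 * nr_xcede_records. Returns 0 on success, non-zero otherwise.
 */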
static int __init parse_cede_parameters(void)
{
	struct xcede_latency_payload *payload;
	u32 total_xcede_records_size;
	u8 xcede_record_size;
	u16 payload_size;
	int ret, i;

	ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
			NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter),
			sizeof(xcede_latency_parameter));
	if (ret) {
		pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
		return ret;
	}

	payload_size = be16_to_cpu(xcede_latency_parameter.payload_size);
	payload = &xcede_latency_parameter.payload;

	xcede_record_size = payload->record_size + 1;

	if (xcede_record_size != sizeof(struct xcede_latency_record)) {
		pr_err("xcede: Expected record-size %lu. Observed size %u.\n",
		       sizeof(struct xcede_latency_record), xcede_record_size);
		return -EINVAL;
	}

	pr_info("xcede: xcede_record_size = %d\n", xcede_record_size);

	/*
	 * payload_size counts the record-size byte at the start of the
	 * payload and the trailing NULL byte, so the remaining
	 * payload_size - 2 bytes form the array of cede latency setting
	 * records.
	 */
	total_xcede_records_size = payload_size - 2;
	nr_xcede_records = total_xcede_records_size / xcede_record_size;

	for (i = 0; i < nr_xcede_records; i++) {
		struct xcede_latency_record *record = &payload->records[i];
		u64 latency_ticks = be64_to_cpu(record->latency_ticks);
		u8 wake_on_irqs = record->wake_on_irqs;
		u8 hint = record->hint;

		pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n",
			i, hint, latency_ticks, wake_on_irqs);
	}

	return 0;
}

#define NR_DEDICATED_STATES	2 /* snooze, CEDE */
static u8 cede_latency_hint[NR_DEDICATED_STATES];

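/*
 * dedicated_cede_loop() - Idle entry for dedicated-processor LPARs: mark the
 * virtual processor as donating its cycles, program the CEDE latency hint
 * for the selected state, cede to the hypervisor, and restore the previous
 * hint on wakeup.
 */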
static int dedicated_cede_loop(struct cpuidle_device *dev,
				struct cpuidle_driver *drv,
				int index)
{
	u8 old_latency_hint;

	pseries_idle_prolog();
	get_lppaca()->donate_dedicated_cpu = 1;
	old_latency_hint = get_lppaca()->cede_latency_hint;
	get_lppaca()->cede_latency_hint = cede_latency_hint[index];

	HMT_medium();
	check_and_cede_processor();

	local_irq_disable();
	get_lppaca()->donate_dedicated_cpu = 0;
	get_lppaca()->cede_latency_hint = old_latency_hint;

	pseries_idle_epilog();

	return index;
}

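/*
 * shared_cede_loop() - Idle entry for shared-processor LPARs: cede the
 * virtual processor so the hypervisor may dispatch other partitions on the
 * underlying physical processor.
 */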
static int shared_cede_loop(struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
			int index)
{

	pseries_idle_prolog();

	/*
	 * Yield the processor to the hypervisor.  We return when an
	 * external interrupt occurs (external interrupts are delivered
	 * before we return here) or when another processor prods us.
	 * When we return here, external interrupts are enabled.
	 */
	check_and_cede_processor();

	local_irq_disable();
	pseries_idle_epilog();

	return index;
}

/*
 * States for dedicated partition case.
 */
static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
	{ /* Snooze */
		.name = "snooze",
		.desc = "snooze",
		.exit_latency = 0,
		.target_residency = 0,
		.enter = &snooze_loop },
	{ /* CEDE */
		.name = "CEDE",
		.desc = "CEDE",
		.exit_latency = 10,
		.target_residency = 100,
		.enter = &dedicated_cede_loop },
};

/*
 * States for shared partition case.
 */
static struct cpuidle_state shared_states[] = {
	{ /* Snooze */
		.name = "snooze",
		.desc = "snooze",
		.exit_latency = 0,
		.target_residency = 0,
		.enter = &snooze_loop },
	{ /* Shared Cede */
		.name = "Shared Cede",
		.desc = "Shared Cede",
		.exit_latency = 10,
		.target_residency = 100,
		.enter = &shared_cede_loop },
};

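/*
 * CPU hotplug callbacks: enable the per-CPU cpuidle device when a CPU comes
 * online and disable it when the CPU is dead, with cpuidle paused and the
 * cpuidle lock held.
 */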
static int pseries_cpuidle_cpu_online(unsigned int cpu)
{
	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);

	if (dev && cpuidle_get_driver()) {
		cpuidle_pause_and_lock();
		cpuidle_enable_device(dev);
		cpuidle_resume_and_unlock();
	}
	return 0;
}

static int pseries_cpuidle_cpu_dead(unsigned int cpu)
{
	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);

	if (dev && cpuidle_get_driver()) {
		cpuidle_pause_and_lock();
		cpuidle_disable_device(dev);
		cpuidle_resume_and_unlock();
	}
	return 0;
}

/*
 * pseries_cpuidle_driver_init() - Copy the enabled entries of
 * cpuidle_state_table into the driver's state array.
 */
static int pseries_cpuidle_driver_init(void)
{
	int idle_state;
	struct cpuidle_driver *drv = &pseries_idle_driver;

	drv->state_count = 0;

	for (idle_state = 0; idle_state < max_idle_state; ++idle_state) {
		/* Is the state not enabled? */
		if (cpuidle_state_table[idle_state].enter == NULL)
			continue;

		drv->states[drv->state_count] = /* structure copy */
			cpuidle_state_table[idle_state];

		drv->state_count += 1;
	}

	return 0;
}

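/*
 * fixup_cede0_latency() - If the platform advertises extended CEDE latency
 * records, use the smallest non-zero advertised wakeup latency in place of
 * the default exit latency of the CEDE state, and scale its target
 * residency accordingly.
 */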
static void __init fixup_cede0_latency(void)
{
	struct xcede_latency_payload *payload;
	u64 min_xcede_latency_us = UINT_MAX;
	int i;

	if (parse_cede_parameters())
		return;

	pr_info("cpuidle: Skipping the %d Extended CEDE idle states\n",
		nr_xcede_records);

	payload = &xcede_latency_parameter.payload;

	/*
	 * The CEDE idle state maps to CEDE(0). While the hypervisor
	 * does not advertise CEDE(0) exit latency values, it does
	 * advertise the latency values of the extended CEDE states.
	 * We use the lowest advertised exit latency value as a proxy
	 * for the exit latency of CEDE(0).
	 */
	for (i = 0; i < nr_xcede_records; i++) {
		struct xcede_latency_record *record = &payload->records[i];
		u8 hint = record->hint;
		u64 latency_tb = be64_to_cpu(record->latency_ticks);
		u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);

		/*
		 * We expect the exit latency of an extended CEDE
		 * state to be non-zero, since it takes at least a
		 * few nanoseconds to wake up the idle CPU and
		 * dispatch the virtual processor into the Linux
		 * guest.
		 *
		 * So only non-zero values are considered when
		 * performing the fixup of the CEDE(0) latency.
		 */
		if (latency_us == 0) {
			pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n",
				i, hint);
			continue;
		}

		if (latency_us < min_xcede_latency_us)
			min_xcede_latency_us = latency_us;
	}

	if (min_xcede_latency_us != UINT_MAX) {
		dedicated_states[1].exit_latency = min_xcede_latency_us;
		dedicated_states[1].target_residency = 10 * min_xcede_latency_us;
		pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
			min_xcede_latency_us);
	}
}

/*
 * pseries_idle_probe()
 * Choose state table for shared versus dedicated partition
 */
static int __init pseries_idle_probe(void)
{

	if (cpuidle_disable != IDLE_NO_OVERRIDE)
		return -ENODEV;

	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
		/*
		 * Use local_paca instead of get_lppaca() because preemption
		 * is not disabled here. That is fine: lppaca_ptr does not
		 * need to be the one associated with the current CPU, any
		 * CPU's will do.
		 */
		if (lppaca_shared_proc(local_paca->lppaca_ptr)) {
			cpuidle_state_table = shared_states;
			max_idle_state = ARRAY_SIZE(shared_states);
		} else {
			/*
			 * Use firmware provided latency values
			 * starting with POWER10 platforms. In the
			 * case that we are running on a POWER10
			 * platform but in an earlier compat mode, we
			 * can still use the firmware provided values.
			 *
			 * However, on platforms prior to POWER10, we
			 * cannot rely on the accuracy of the firmware
			 * provided latency values. On such platforms,
			 * go with the conservative default estimate
			 * of 10us.
			 */
			if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
				fixup_cede0_latency();
			cpuidle_state_table = dedicated_states;
			max_idle_state = NR_DEDICATED_STATES;
		}
	} else
		return -ENODEV;

	if (max_idle_state > 1) {
		snooze_timeout_en = true;
		snooze_timeout = cpuidle_state_table[1].target_residency *
				 tb_ticks_per_usec;
	}
	return 0;
}

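/*
 * pseries_processor_idle_init() - Module entry point: probe the partition
 * type, populate and register the cpuidle driver, and register the CPU
 * online/dead hotplug callbacks.
 */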
static int __init pseries_processor_idle_init(void)
{
	int retval;

	retval = pseries_idle_probe();
	if (retval)
		return retval;

	pseries_cpuidle_driver_init();
	retval = cpuidle_register(&pseries_idle_driver, NULL);
	if (retval) {
		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
		return retval;
	}

	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "cpuidle/pseries:online",
					   pseries_cpuidle_cpu_online, NULL);
	WARN_ON(retval < 0);
	retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD,
					   "cpuidle/pseries:DEAD", NULL,
					   pseries_cpuidle_cpu_dead);
	WARN_ON(retval < 0);
	printk(KERN_DEBUG "pseries_idle_driver registered\n");
	return 0;
}

device_initcall(pseries_processor_idle_init);