linux/arch/powerpc/platforms/pseries/mobility.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support for Partition Mobility/Migration
 *
 * Copyright (C) 2010 Nathan Fontenot
 * Copyright (C) 2010 IBM Corporation
 */


#define pr_fmt(fmt) "mobility: " fmt

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/stat.h>
#include <linux/stop_machine.h>
#include <linux/completion.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/stringify.h>

#include <asm/machdep.h>
#include <asm/rtas.h>
#include "pseries.h"
#include "../../kernel/cacheinfo.h"

static struct kobject *mobility_kobj;

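/*
 * Header of the work area returned by the ibm,update-properties RTAS
 * call, as parsed by update_dt_node() below: the phandle of the node
 * being updated, a state word, a reserved doubleword, and the number
 * of property descriptors that follow in the buffer.
 */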
struct update_props_workarea {
        __be32 phandle;
        __be32 state;
        __be64 reserved;
        __be32 nprops;
} __packed;

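/*
 * Each node list returned by ibm,update-nodes starts with a word that
 * packs the requested action into its top byte and the number of
 * affected nodes into the low 24 bits; pseries_devicetree_update()
 * splits it with the two masks below. For example (illustrative only),
 * a header word of 0x02000003 selects UPDATE_DT_NODE for three nodes.
 */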
#define NODE_ACTION_MASK        0xff000000
#define NODE_COUNT_MASK         0x00ffffff

#define DELETE_DT_NODE  0x01000000
#define UPDATE_DT_NODE  0x02000000
#define ADD_DT_NODE     0x03000000

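/*
 * Scope values passed to ibm,update-nodes/properties: MIGRATION_SCOPE
 * requests the device tree changes caused by a partition migration,
 * while PRRN_SCOPE (-2) is the value used for Platform Resource
 * Reassignment Notification events. Only MIGRATION_SCOPE is used
 * within this file.
 */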
#define MIGRATION_SCOPE (1)
#define PRRN_SCOPE -2

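/*
 * mobility_rtas_call() marshals the caller's buffer through the global
 * rtas_data_buf, a buffer set aside in low memory that RTAS is able to
 * address; the spinlock serialises against other users of that buffer.
 * The work area contents are copied back into the caller's buffer
 * before the RTAS status is returned.
 */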
static int mobility_rtas_call(int token, char *buf, s32 scope)
{
        int rc;

        spin_lock(&rtas_data_buf_lock);

        memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
        rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
        memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);

        spin_unlock(&rtas_data_buf_lock);
        return rc;
}

static int delete_dt_node(struct device_node *dn)
{
        pr_debug("removing node %pOFfp\n", dn);
        dlpar_detach_node(dn);
        return 0;
}

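/*
 * update_dt_property() assembles one property from the value chunks
 * returned by ibm,update-properties. A chunk whose length descriptor
 * has the high bit set is partial: its data is appended to *prop and
 * the caller must fetch more. Once a chunk with a plain length
 * arrives, the accumulated property is committed with
 * of_update_property() and *prop is reset to NULL for the next
 * property.
 */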
static int update_dt_property(struct device_node *dn, struct property **prop,
                              const char *name, u32 vd, char *value)
{
        struct property *new_prop = *prop;
        int more = 0;

        /* A negative 'vd' value indicates that only part of the new property
         * value is contained in the buffer and we need to call
         * ibm,update-properties again to get the rest of the value.
         *
         * A negative value is also the two's complement of the actual length.
         */
        if (vd & 0x80000000) {
                vd = ~vd + 1;
                more = 1;
        }

        if (new_prop) {
                /* partial property fixup */
                char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
                if (!new_data)
                        return -ENOMEM;

                memcpy(new_data, new_prop->value, new_prop->length);
                memcpy(new_data + new_prop->length, value, vd);

                kfree(new_prop->value);
                new_prop->value = new_data;
                new_prop->length += vd;
        } else {
                new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
                if (!new_prop)
                        return -ENOMEM;

                new_prop->name = kstrdup(name, GFP_KERNEL);
                if (!new_prop->name) {
                        kfree(new_prop);
                        return -ENOMEM;
                }

                new_prop->length = vd;
                new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
                if (!new_prop->value) {
                        kfree(new_prop->name);
                        kfree(new_prop);
                        return -ENOMEM;
                }

                memcpy(new_prop->value, value, vd);
                *prop = new_prop;
        }

        if (!more) {
                pr_debug("updating node %pOF property %s\n", dn, name);
                of_update_property(dn, new_prop);
                *prop = NULL;
        }

        return 0;
}

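/*
 * update_dt_node() repeatedly calls ibm,update-properties for one node
 * and walks the returned work area. After the header, the buffer holds
 * a sequence of property descriptors, conceptually:
 *
 *   <name NUL> <u32 value-descriptor> <value bytes ...>
 *
 * A value descriptor of 0 means a name-only property, 0x80000000 means
 * the property should be deleted, and any other value with the high
 * bit set marks a partial value continued in the next RTAS call. The
 * very first descriptor returned for a node has an empty name and
 * carries the node path as its value; it is skipped.
 */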
static int update_dt_node(struct device_node *dn, s32 scope)
{
        struct update_props_workarea *upwa;
        struct property *prop = NULL;
        int i, rc, rtas_rc;
        char *prop_data;
        char *rtas_buf;
        int update_properties_token;
        u32 nprops;
        u32 vd;

        update_properties_token = rtas_token("ibm,update-properties");
        if (update_properties_token == RTAS_UNKNOWN_SERVICE)
                return -EINVAL;

        rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
        if (!rtas_buf)
                return -ENOMEM;

        upwa = (struct update_props_workarea *)&rtas_buf[0];
        upwa->phandle = cpu_to_be32(dn->phandle);

        do {
                rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf,
                                        scope);
                if (rtas_rc < 0)
                        break;

                prop_data = rtas_buf + sizeof(*upwa);
                nprops = be32_to_cpu(upwa->nprops);

                /* On the first call to ibm,update-properties for a node the
                 * first property value descriptor contains an empty
                 * property name, the property value length encoded as u32,
                 * and the property value is the node path being updated.
                 */
                if (*prop_data == 0) {
                        prop_data++;
                        vd = be32_to_cpu(*(__be32 *)prop_data);
                        prop_data += vd + sizeof(vd);
                        nprops--;
                }

                for (i = 0; i < nprops; i++) {
                        char *prop_name;

                        prop_name = prop_data;
                        prop_data += strlen(prop_name) + 1;
                        vd = be32_to_cpu(*(__be32 *)prop_data);
                        prop_data += sizeof(vd);

                        switch (vd) {
                        case 0x00000000:
                                /* name only property, nothing to do */
                                break;

                        case 0x80000000:
                                of_remove_property(dn, of_find_property(dn,
                                                        prop_name, NULL));
                                prop = NULL;
                                break;

                        default:
                                rc = update_dt_property(dn, &prop, prop_name,
                                                        vd, prop_data);
                                if (rc) {
                                        pr_err("updating %s property failed: %d\n",
                                               prop_name, rc);
                                }

                                prop_data += vd;
                                break;
                        }

                        cond_resched();
                }

                cond_resched();
        } while (rtas_rc == 1);

        kfree(rtas_buf);
        return 0;
}

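/*
 * add_dt_node() pulls the full subtree for a newly added node from
 * firmware via the configure-connector sequence wrapped by
 * dlpar_configure_connector(), then grafts it under parent_dn. The
 * configure-connector nodes are freed again if the attach fails.
 */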
static int add_dt_node(struct device_node *parent_dn, __be32 drc_index)
{
        struct device_node *dn;
        int rc;

        dn = dlpar_configure_connector(drc_index, parent_dn);
        if (!dn)
                return -ENOENT;

        rc = dlpar_attach_node(dn, parent_dn);
        if (rc)
                dlpar_free_cc_nodes(dn);

        pr_debug("added node %pOFfp\n", dn);

        return rc;
}

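/*
 * pseries_devicetree_update() drives ibm,update-nodes and dispatches
 * on the returned node lists. The first four 32-bit words of the work
 * area are skipped (hence "rtas_buf + 4" below); what follows is a
 * series of lists, each headed by an action/count word and then one
 * phandle per node, with ADD_DT_NODE entries carrying an extra
 * drc-index word. The RTAS call is repeated while it returns 1,
 * meaning more updates are pending than fit in one buffer.
 */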
int pseries_devicetree_update(s32 scope)
{
        char *rtas_buf;
        __be32 *data;
        int update_nodes_token;
        int rc;

        update_nodes_token = rtas_token("ibm,update-nodes");
        if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
                return 0;

        rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
        if (!rtas_buf)
                return -ENOMEM;

        do {
                rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
                if (rc && rc != 1)
                        break;

                data = (__be32 *)rtas_buf + 4;
                while (be32_to_cpu(*data) & NODE_ACTION_MASK) {
                        int i;
                        u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;
                        u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;

                        data++;

                        for (i = 0; i < node_count; i++) {
                                struct device_node *np;
                                __be32 phandle = *data++;
                                __be32 drc_index;

                                np = of_find_node_by_phandle(be32_to_cpu(phandle));
                                if (!np) {
                                        pr_warn("Failed lookup: phandle 0x%x for action 0x%x\n",
                                                be32_to_cpu(phandle), action);
                                        continue;
                                }

                                switch (action) {
                                case DELETE_DT_NODE:
                                        delete_dt_node(np);
                                        break;
                                case UPDATE_DT_NODE:
                                        update_dt_node(np, scope);
                                        break;
                                case ADD_DT_NODE:
                                        drc_index = *data++;
                                        add_dt_node(np, drc_index);
                                        break;
                                }

                                of_node_put(np);
                                cond_resched();
                        }
                }

                cond_resched();
        } while (rc == 1);

        kfree(rtas_buf);
        return rc;
}

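/*
 * post_mobility_fixup() runs after a successful suspend/resume, once
 * the partition is executing on the destination system. It asks
 * firmware to activate the new firmware level (ibm,activate-firmware),
 * refreshes the device tree with CPU hotplug blocked and the cacheinfo
 * hierarchy torn down around the update (the cache nodes commonly
 * change), re-evaluates the security mitigations for the new machine,
 * and refreshes the hv-24x7 system information.
 */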
void post_mobility_fixup(void)
{
        int rc;

        rtas_activate_firmware();

        /*
         * We don't want CPUs to go online/offline while the device
         * tree is being updated.
         */
        cpus_read_lock();

        /*
         * It's common for the destination firmware to replace cache
         * nodes.  Release all of the cacheinfo hierarchy's references
         * before updating the device tree.
         */
        cacheinfo_teardown();

        rc = pseries_devicetree_update(MIGRATION_SCOPE);
        if (rc)
                pr_err("device tree update failed: %d\n", rc);

        cacheinfo_rebuild();

        cpus_read_unlock();

        /* Possibly switch to a new L1 flush type */
        pseries_setup_security_mitigations();

        /* Reinitialise system information for hv-24x7 */
        read_24x7_sys_info();
}

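/*
 * poll_vasi_state() queries the hypervisor for the state of the
 * migration session identified by @handle via H_VASI_STATE, maps the
 * hcall status onto errno values, and returns the raw state word in
 * *res on success.
 */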
static int poll_vasi_state(u64 handle, unsigned long *res)
{
        unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
        long hvrc;
        int ret;

        hvrc = plpar_hcall(H_VASI_STATE, retbuf, handle);
        switch (hvrc) {
        case H_SUCCESS:
                ret = 0;
                *res = retbuf[0];
                break;
        case H_PARAMETER:
                ret = -EINVAL;
                break;
        case H_FUNCTION:
                ret = -EOPNOTSUPP;
                break;
        case H_HARDWARE:
        default:
                pr_err("unexpected H_VASI_STATE result %ld\n", hvrc);
                ret = -EIO;
                break;
        }
        return ret;
}

static int wait_for_vasi_session_suspending(u64 handle)
{
        unsigned long state;
        int ret;

        /*
         * Wait for transition from H_VASI_ENABLED to
         * H_VASI_SUSPENDING. Treat anything else as an error.
         */
        while (true) {
                ret = poll_vasi_state(handle, &state);

                if (ret != 0 || state == H_VASI_SUSPENDING) {
                        break;
                } else if (state == H_VASI_ENABLED) {
                        ssleep(1);
                } else {
                        pr_err("unexpected H_VASI_STATE result %lu\n", state);
                        ret = -EIO;
                        break;
                }
        }

        /*
         * Proceed even if H_VASI_STATE is unavailable. If H_JOIN or
         * ibm,suspend-me are also unimplemented, we'll recover then.
         */
        if (ret == -EOPNOTSUPP)
                ret = 0;

        return ret;
}

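/*
 * prod_single() wakes one specific CPU out of H_JOIN by issuing H_PROD
 * against its hardware thread id; prod_others() does this for every
 * online CPU except the caller once the suspend has completed or
 * failed.
 */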
static void prod_single(unsigned int target_cpu)
{
        long hvrc;
        int hwid;

        hwid = get_hard_smp_processor_id(target_cpu);
        hvrc = plpar_hcall_norets(H_PROD, hwid);
        if (hvrc == H_SUCCESS)
                return;
        pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n",
                           target_cpu, hwid, hvrc);
}

static void prod_others(void)
{
        unsigned int cpu;

        for_each_online_cpu(cpu) {
                if (cpu != smp_processor_id())
                        prod_single(cpu);
        }
}

static u16 clamp_slb_size(void)
{
        u16 prev = mmu_slb_size;

        slb_set_size(SLB_MIN_SIZE);

        return prev;
}

static int do_suspend(void)
{
        u16 saved_slb_size;
        int status;
        int ret;

        pr_info("calling ibm,suspend-me on CPU %i\n", smp_processor_id());

        /*
         * The destination processor model may have fewer SLB entries
         * than the source. We reduce mmu_slb_size to a safe minimum
         * before suspending in order to minimize the possibility of
         * programming non-existent entries on the destination. If
         * suspend fails, we restore it before returning. On success
         * the OF reconfig path will update it from the new device
         * tree after resuming on the destination.
         */
        saved_slb_size = clamp_slb_size();

        ret = rtas_ibm_suspend_me(&status);
        if (ret != 0) {
                pr_err("ibm,suspend-me error: %d\n", status);
                slb_set_size(saved_slb_size);
        }

        return ret;
}

/**
 * struct pseries_suspend_info - State shared between CPUs for join/suspend.
 * @counter: Threads are to increment this upon resuming from suspend
 *           or if an error is received from H_JOIN. The thread which performs
 *           the first increment (i.e. sets it to 1) is responsible for
 *           waking the other threads.
 * @done: False if join/suspend is in progress. True if the operation is
 *        complete (successful or not).
 */
struct pseries_suspend_info {
        atomic_t counter;
        bool done;
};

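/*
 * do_join() is run on every online CPU under stop_machine(). Each CPU
 * calls H_JOIN with interrupts hard-disabled; the hypervisor lets
 * exactly one caller continue (H_CONTINUE) once every other online CPU
 * has joined or is offline, and that CPU performs the actual
 * ibm,suspend-me sequence in do_suspend(). The remaining CPUs sleep in
 * the hypervisor until they are prodded. A stray H_PROD (e.g. from
 * paravirt spinlocks) can wake a CPU before the operation is finished,
 * so a woken CPU re-joins unless info->done has been set.
 */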
static int do_join(void *arg)
{
        struct pseries_suspend_info *info = arg;
        atomic_t *counter = &info->counter;
        long hvrc;
        int ret;

retry:
        /* Must ensure MSR.EE off for H_JOIN. */
        hard_irq_disable();
        hvrc = plpar_hcall_norets(H_JOIN);

        switch (hvrc) {
        case H_CONTINUE:
                /*
                 * All other CPUs are offline or in H_JOIN. This CPU
                 * attempts the suspend.
                 */
                ret = do_suspend();
                break;
        case H_SUCCESS:
                /*
                 * The suspend is complete and this CPU has received a
                 * prod, or we've received a stray prod from unrelated
                 * code (e.g. paravirt spinlocks) and we need to join
                 * again.
                 *
                 * This barrier orders the return from H_JOIN above vs
                 * the load of info->done. It pairs with the barrier
                 * in the wakeup/prod path below.
                 */
                smp_mb();
                if (READ_ONCE(info->done) == false) {
                        pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying\n",
                                            smp_processor_id());
                        goto retry;
                }
                ret = 0;
                break;
        case H_BAD_MODE:
        case H_HARDWARE:
        default:
                ret = -EIO;
                pr_err_ratelimited("H_JOIN error %ld on CPU %i\n",
                                   hvrc, smp_processor_id());
                break;
        }

        if (atomic_inc_return(counter) == 1) {
                pr_info("CPU %u waking all threads\n", smp_processor_id());
                WRITE_ONCE(info->done, true);
                /*
                 * This barrier orders the store to info->done vs subsequent
                 * H_PRODs to wake the other CPUs. It pairs with the barrier
                 * in the H_SUCCESS case above.
                 */
                smp_mb();
                prod_others();
        }
        /*
         * Execution may have been suspended for several seconds, so
         * reset the watchdog.
         */
        touch_nmi_watchdog();
        return ret;
}

/*
 * Abort reason code byte 0. We use only the 'Migrating partition' value.
 */
enum vasi_aborting_entity {
        ORCHESTRATOR        = 1,
        VSP_SOURCE          = 2,
        PARTITION_FIRMWARE  = 3,
        PLATFORM_FIRMWARE   = 4,
        VSP_TARGET          = 5,
        MIGRATING_PARTITION = 6,
};

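/*
 * pseries_cancel_migration() tells the hypervisor to abandon the
 * migration after a failed suspend. The H_VASI_SIGNAL reason code
 * carries the aborting entity in its top byte and a detail value in
 * the low 24 bits; this code uses the positive errno as the detail,
 * so e.g. a final status of -EBUSY (-16) is reported as 0x06000010
 * (MIGRATING_PARTITION << 24 | 16).
 */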
static void pseries_cancel_migration(u64 handle, int err)
{
        u32 reason_code;
        u32 detail;
        u8 entity;
        long hvrc;

        entity = MIGRATING_PARTITION;
        detail = abs(err) & 0xffffff;
        reason_code = (entity << 24) | detail;

        hvrc = plpar_hcall_norets(H_VASI_SIGNAL, handle,
                                  H_VASI_SIGNAL_CANCEL, reason_code);
        if (hvrc)
                pr_err("H_VASI_SIGNAL error: %ld\n", hvrc);
}

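/*
 * pseries_suspend() wraps the join/suspend in a retry loop: up to five
 * attempts with an exponentially growing delay between them (1 ms,
 * 10 ms, 100 ms, 1 s). Retries are only worthwhile while the VASI
 * session is still in the Suspending state; any other state, or a
 * non-transient VASI error, ends the loop with the last error.
 */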
static int pseries_suspend(u64 handle)
{
        const unsigned int max_attempts = 5;
        unsigned int retry_interval_ms = 1;
        unsigned int attempt = 1;
        int ret;

        while (true) {
                struct pseries_suspend_info info;
                unsigned long vasi_state;
                int vasi_err;

                info = (struct pseries_suspend_info) {
                        .counter = ATOMIC_INIT(0),
                        .done = false,
                };

                ret = stop_machine(do_join, &info, cpu_online_mask);
                if (ret == 0)
                        break;
                /*
                 * Encountered an error. If the VASI stream is still
                 * in Suspending state, it's likely a transient
                 * condition related to some device in the partition
                 * and we can retry in the hope that the cause has
                 * cleared after some delay.
                 *
                 * A better design would allow drivers etc to prepare
                 * for the suspend and avoid conditions which prevent
                 * the suspend from succeeding. For now, we have this
                 * mitigation.
                 */
                pr_notice("Partition suspend attempt %u of %u error: %d\n",
                          attempt, max_attempts, ret);

                if (attempt == max_attempts)
                        break;

                vasi_err = poll_vasi_state(handle, &vasi_state);
                if (vasi_err == 0) {
                        if (vasi_state != H_VASI_SUSPENDING) {
                                pr_notice("VASI state %lu after failed suspend\n",
                                          vasi_state);
                                break;
                        }
                } else if (vasi_err != -EOPNOTSUPP) {
                        pr_err("VASI state poll error: %d\n", vasi_err);
                        break;
                }

                pr_notice("Will retry partition suspend after %u ms\n",
                          retry_interval_ms);

                msleep(retry_interval_ms);
                retry_interval_ms *= 10;
                attempt++;
        }

        return ret;
}

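/*
 * pseries_migrate_partition() is the top-level flow for one migration
 * stream: wait for the VASI session to reach Suspending, perform the
 * join/suspend, then either run the post-resume fixups on success or
 * signal the hypervisor to cancel the migration on failure.
 */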
static int pseries_migrate_partition(u64 handle)
{
        int ret;

        ret = wait_for_vasi_session_suspending(handle);
        if (ret)
                return ret;

        ret = pseries_suspend(handle);
        if (ret == 0)
                post_mobility_fixup();
        else
                pseries_cancel_migration(handle, ret);

        return ret;
}

int rtas_syscall_dispatch_ibm_suspend_me(u64 handle)
{
        return pseries_migrate_partition(handle);
}

static ssize_t migration_store(struct class *class,
                               struct class_attribute *attr, const char *buf,
                               size_t count)
{
        u64 streamid;
        int rc;

        rc = kstrtou64(buf, 0, &streamid);
        if (rc)
                return rc;

        rc = pseries_migrate_partition(streamid);
        if (rc)
                return rc;

        return count;
}

/*
 * Used by drmgr to determine the kernel behavior of the migration interface.
 *
 * Version 1: Performs all PAPR requirements for migration including
 *      firmware activation and device tree update.
 */
#define MIGRATION_API_VERSION   1

static CLASS_ATTR_WO(migration);
static CLASS_ATTR_STRING(api_version, 0444, __stringify(MIGRATION_API_VERSION));

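/*
 * mobility_sysfs_init() publishes the user interface under
 * /sys/kernel/mobility: a write-only "migration" file that takes the
 * VASI stream id to start a migration, and a read-only "api_version"
 * file that drmgr uses to discover how much of the PAPR sequence the
 * kernel performs itself. The class_attr wrappers are reused here as
 * plain kobject attributes.
 */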
static int __init mobility_sysfs_init(void)
{
        int rc;

        mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
        if (!mobility_kobj)
                return -ENOMEM;

        rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
        if (rc)
                pr_err("unable to create migration sysfs file (%d)\n", rc);

        rc = sysfs_create_file(mobility_kobj, &class_attr_api_version.attr.attr);
        if (rc)
                pr_err("unable to create api_version sysfs file (%d)\n", rc);

        return 0;
}
machine_device_initcall(pseries, mobility_sysfs_init);