linux/drivers/misc/mic/cosm/cosm_scif_server.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Intel MIC Platform Software Stack (MPSS)
   4 *
   5 * Copyright(c) 2015 Intel Corporation.
   6 *
   7 * Intel MIC Coprocessor State Management (COSM) Driver
   8 */
   9#include <linux/kthread.h>
  10#include <linux/sched/signal.h>
  11
  12#include "cosm_main.h"
  13
  14/*
  15 * The COSM driver uses SCIF to communicate between the management node and the
  16 * MIC cards. SCIF is used to (a) Send a shutdown command to the card (b)
  17 * receive a shutdown status back from the card upon completion of shutdown and
  18 * (c) receive periodic heartbeat messages from the card used to deduce if the
  19 * card has crashed.
  20 *
  21 * A COSM server consisting of a SCIF listening endpoint waits for incoming
  22 * connections from the card. Upon acceptance of the connection, a separate
  23 * work-item is scheduled to handle SCIF message processing for that card. The
  24 * life-time of this work-item is therefore the time from which the connection
  25 * from a card is accepted to the time at which the connection is closed. A new
  26 * work-item starts each time the card boots and is alive till the card (a)
  27 * shuts down (b) is reset (c) crashes (d) cosm_client driver on the card is
  28 * unloaded.
  29 *
  30 * The COSM interactions with SCIF during card shutdown, reset and crash
  31 * are as follows:
  32 *
  33 * Card shutdown
  34 * -------------
  35 * 1. COSM client on the card invokes orderly_poweroff() in response to SHUTDOWN
  36 *    message from the host.
  37 * 2. Card driver shutdown callback invokes scif_unregister_device(..) resulting
  38 *    in scif_remove(..) getting called on the card
  39 * 3. scif_remove -> scif_stop -> scif_handle_remove_node ->
  40 *    scif_peer_unregister_device -> device_unregister for the host peer device
  41 * 4. During device_unregister remove(..) method of cosm_client is invoked which
  42 *    closes the COSM SCIF endpoint on the card. This results in a SCIF_DISCNCT
  43 *    message being sent to host SCIF. SCIF_DISCNCT message processing on the
  44 *    host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes
  45 *    up the host COSM thread blocked in scif_poll(..) resulting in
  46 *    scif_poll(..)  returning EPOLLHUP.
  47 * 5. On the card, scif_peer_release_dev is next called which results in an
  48 *    SCIF_EXIT message being sent to the host and after receiving the
  49 *    SCIF_EXIT_ACK from the host the peer device teardown on the card is
  50 *    complete.
  51 * 6. As part of the SCIF_EXIT message processing on the host, host sends a
  52 *    SCIF_REMOVE_NODE to itself corresponding to the card being removed. This
  53 *    starts a similar SCIF peer device teardown sequence on the host
  54 *    corresponding to the card being shut down.
  55 *
  56 * Card reset
  57 * ----------
  58 * The case of interest here is when the card has not been previously shut down
  59 * since most of the steps below are skipped in that case:
  60 *
  61 * 1. cosm_stop(..) invokes hw_ops->stop(..) method of the base PCIe driver
  62 *    which unregisters the SCIF HW device resulting in scif_remove(..) being
  63 *    called on the host.
  64 * 2. scif_remove(..) calls scif_disconnect_node(..) which results in a
  65 *    SCIF_EXIT message being sent to the card.
  66 * 3. The card executes scif_stop() as part of SCIF_EXIT message
  67 *    processing. This results in the COSM endpoint on the card being closed and
  68 *    the SCIF host peer device on the card getting unregistered similar to
  69 *    steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the
  70 *    host returns EPOLLHUP as a result.
  71 * 4. On the host, card peer device unregister and SCIF HW remove(..) also
  72 *    subsequently complete.
  73 *
  74 * Card crash
  75 * ----------
  76 * If a reset is issued after the card has crashed, there is no SCIF_DISCNCT
  77 * message from the card which would result in scif_poll(..) returning
  78 * EPOLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE
  79 * message to itself resulting in the card SCIF peer device being unregistered,
  80 * this results in a scif_peer_release_dev -> scif_cleanup_scifdev->
  81 * scif_invalidate_ep call sequence which sets the endpoint state to
  82 * DISCONNECTED and results in scif_poll(..) returning EPOLLHUP.
  83 */
  84
  85#define COSM_SCIF_BACKLOG 16
  86#define COSM_HEARTBEAT_CHECK_DELTA_SEC 10
  87#define COSM_HEARTBEAT_TIMEOUT_SEC \
  88                (COSM_HEARTBEAT_SEND_SEC + COSM_HEARTBEAT_CHECK_DELTA_SEC)
  89#define COSM_HEARTBEAT_TIMEOUT_MSEC (COSM_HEARTBEAT_TIMEOUT_SEC * MSEC_PER_SEC)
  90
  91static struct task_struct *server_thread;
  92static scif_epd_t listen_epd;
  93
  94/* Publish MIC card's shutdown status to user space MIC daemon */
  95static void cosm_update_mic_status(struct cosm_device *cdev)
  96{
  97        if (cdev->shutdown_status_int != MIC_NOP) {
  98                cosm_set_shutdown_status(cdev, cdev->shutdown_status_int);
  99                cdev->shutdown_status_int = MIC_NOP;
 100        }
 101}
 102
 103/* Store MIC card's shutdown status internally when it is received */
 104static void cosm_shutdown_status_int(struct cosm_device *cdev,
 105                                     enum mic_status shutdown_status)
 106{
 107        switch (shutdown_status) {
 108        case MIC_HALTED:
 109        case MIC_POWER_OFF:
 110        case MIC_RESTART:
 111        case MIC_CRASHED:
 112                break;
 113        default:
 114                dev_err(&cdev->dev, "%s %d Unexpected shutdown_status %d\n",
 115                        __func__, __LINE__, shutdown_status);
 116                return;
 117        };
 118        cdev->shutdown_status_int = shutdown_status;
 119        cdev->heartbeat_watchdog_enable = false;
 120
 121        if (cdev->state != MIC_SHUTTING_DOWN)
 122                cosm_set_state(cdev, MIC_SHUTTING_DOWN);
 123}
 124
 125/* Non-blocking recv. Read and process all available messages */
 126static void cosm_scif_recv(struct cosm_device *cdev)
 127{
 128        struct cosm_msg msg;
 129        int rc;
 130
 131        while (1) {
 132                rc = scif_recv(cdev->epd, &msg, sizeof(msg), 0);
 133                if (!rc) {
 134                        break;
 135                } else if (rc < 0) {
 136                        dev_dbg(&cdev->dev, "%s: %d rc %d\n",
 137                                __func__, __LINE__, rc);
 138                        break;
 139                }
 140                dev_dbg(&cdev->dev, "%s: %d rc %d id 0x%llx\n",
 141                        __func__, __LINE__, rc, msg.id);
 142
 143                switch (msg.id) {
 144                case COSM_MSG_SHUTDOWN_STATUS:
 145                        cosm_shutdown_status_int(cdev, msg.shutdown_status);
 146                        break;
 147                case COSM_MSG_HEARTBEAT:
 148                        /* Nothing to do, heartbeat only unblocks scif_poll */
 149                        break;
 150                default:
 151                        dev_err(&cdev->dev, "%s: %d unknown msg.id %lld\n",
 152                                __func__, __LINE__, msg.id);
 153                        break;
 154                }
 155        }
 156}
 157
 158/* Publish crashed status for this MIC card */
 159static void cosm_set_crashed(struct cosm_device *cdev)
 160{
 161        dev_err(&cdev->dev, "node alive timeout\n");
 162        cosm_shutdown_status_int(cdev, MIC_CRASHED);
 163        cosm_update_mic_status(cdev);
 164}
 165
 166/* Send host time to the MIC card to sync system time between host and MIC */
 167static void cosm_send_time(struct cosm_device *cdev)
 168{
 169        struct cosm_msg msg = { .id = COSM_MSG_SYNC_TIME };
 170        struct timespec64 ts;
 171        int rc;
 172
 173        ktime_get_real_ts64(&ts);
 174        msg.timespec.tv_sec = ts.tv_sec;
 175        msg.timespec.tv_nsec = ts.tv_nsec;
 176
 177        rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
 178        if (rc < 0)
 179                dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n",
 180                        __func__, __LINE__, rc);
 181}
 182
 183/*
 184 * Close this cosm_device's endpoint after its peer endpoint on the card has
 185 * been closed. In all cases except MIC card crash EPOLLHUP on the host is
 186 * triggered by the client's endpoint being closed.
 187 */
 188static void cosm_scif_close(struct cosm_device *cdev)
 189{
 190        /*
 191         * Because SHUTDOWN_STATUS message is sent by the MIC cards in the
 192         * reboot notifier when shutdown is still not complete, we notify mpssd
 193         * to reset the card when SCIF endpoint is closed.
 194         */
 195        cosm_update_mic_status(cdev);
 196        scif_close(cdev->epd);
 197        cdev->epd = NULL;
 198        dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
 199}
 200
 201/*
 202 * Set card state to ONLINE when a new SCIF connection from a MIC card is
 203 * received. Normally the state is BOOTING when the connection comes in, but can
 204 * be ONLINE if cosm_client driver on the card was unloaded and then reloaded.
 205 */
 206static int cosm_set_online(struct cosm_device *cdev)
 207{
 208        int rc = 0;
 209
 210        if (MIC_BOOTING == cdev->state || MIC_ONLINE == cdev->state) {
 211                cdev->heartbeat_watchdog_enable = cdev->sysfs_heartbeat_enable;
 212                cdev->epd = cdev->newepd;
 213                if (cdev->state == MIC_BOOTING)
 214                        cosm_set_state(cdev, MIC_ONLINE);
 215                cosm_send_time(cdev);
 216                dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
 217        } else {
 218                dev_warn(&cdev->dev, "%s %d not going online in state: %s\n",
 219                         __func__, __LINE__, cosm_state_string[cdev->state]);
 220                rc = -EINVAL;
 221        }
 222        /* Drop reference acquired by bus_find_device in the server thread */
 223        put_device(&cdev->dev);
 224        return rc;
 225}
 226
 227/*
 228 * Work function for handling work for a SCIF connection from a particular MIC
 229 * card. It first sets the card state to ONLINE and then calls scif_poll to
 230 * block on activity such as incoming messages on the SCIF endpoint. When the
 231 * endpoint is closed, the work function exits, completing its life cycle, from
 232 * MIC card boot to card shutdown/reset/crash.
 233 */
 234void cosm_scif_work(struct work_struct *work)
 235{
 236        struct cosm_device *cdev = container_of(work, struct cosm_device,
 237                                                scif_work);
 238        struct scif_pollepd pollepd;
 239        int rc;
 240
 241        mutex_lock(&cdev->cosm_mutex);
 242        if (cosm_set_online(cdev))
 243                goto exit;
 244
 245        while (1) {
 246                pollepd.epd = cdev->epd;
 247                pollepd.events = EPOLLIN;
 248
 249                /* Drop the mutex before blocking in scif_poll(..) */
 250                mutex_unlock(&cdev->cosm_mutex);
 251                /* poll(..) with timeout on our endpoint */
 252                rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_TIMEOUT_MSEC);
 253                mutex_lock(&cdev->cosm_mutex);
 254                if (rc < 0) {
 255                        dev_err(&cdev->dev, "%s %d scif_poll rc %d\n",
 256                                __func__, __LINE__, rc);
 257                        continue;
 258                }
 259
 260                /* There is a message from the card */
 261                if (pollepd.revents & EPOLLIN)
 262                        cosm_scif_recv(cdev);
 263
 264                /* The peer endpoint is closed or this endpoint disconnected */
 265                if (pollepd.revents & EPOLLHUP) {
 266                        cosm_scif_close(cdev);
 267                        break;
 268                }
 269
 270                /* Did we timeout from poll? */
 271                if (!rc && cdev->heartbeat_watchdog_enable)
 272                        cosm_set_crashed(cdev);
 273        }
 274exit:
 275        dev_dbg(&cdev->dev, "%s %d exiting\n", __func__, __LINE__);
 276        mutex_unlock(&cdev->cosm_mutex);
 277}
 278
 279/*
 280 * COSM SCIF server thread function. Accepts incoming SCIF connections from MIC
 281 * cards, finds the correct cosm_device to associate that connection with and
 282 * schedules individual work items for each MIC card.
 283 */
 284static int cosm_scif_server(void *unused)
 285{
 286        struct cosm_device *cdev;
 287        scif_epd_t newepd;
 288        struct scif_port_id port_id;
 289        int rc;
 290
 291        allow_signal(SIGKILL);
 292
 293        while (!kthread_should_stop()) {
 294                rc = scif_accept(listen_epd, &port_id, &newepd,
 295                                 SCIF_ACCEPT_SYNC);
 296                if (rc < 0) {
 297                        if (-ERESTARTSYS != rc)
 298                                pr_err("%s %d rc %d\n", __func__, __LINE__, rc);
 299                        continue;
 300                }
 301
 302                /*
 303                 * Associate the incoming connection with a particular
 304                 * cosm_device, COSM device ID == SCIF node ID - 1
 305                 */
 306                cdev = cosm_find_cdev_by_id(port_id.node - 1);
 307                if (!cdev)
 308                        continue;
 309                cdev->newepd = newepd;
 310                schedule_work(&cdev->scif_work);
 311        }
 312
 313        pr_debug("%s %d Server thread stopped\n", __func__, __LINE__);
 314        return 0;
 315}
 316
 317static int cosm_scif_listen(void)
 318{
 319        int rc;
 320
 321        listen_epd = scif_open();
 322        if (!listen_epd) {
 323                pr_err("%s %d scif_open failed\n", __func__, __LINE__);
 324                return -ENOMEM;
 325        }
 326
 327        rc = scif_bind(listen_epd, SCIF_COSM_LISTEN_PORT);
 328        if (rc < 0) {
 329                pr_err("%s %d scif_bind failed rc %d\n",
 330                       __func__, __LINE__, rc);
 331                goto err;
 332        }
 333
 334        rc = scif_listen(listen_epd, COSM_SCIF_BACKLOG);
 335        if (rc < 0) {
 336                pr_err("%s %d scif_listen rc %d\n", __func__, __LINE__, rc);
 337                goto err;
 338        }
 339        pr_debug("%s %d listen_epd set up\n", __func__, __LINE__);
 340        return 0;
 341err:
 342        scif_close(listen_epd);
 343        listen_epd = NULL;
 344        return rc;
 345}
 346
 347static void cosm_scif_listen_exit(void)
 348{
 349        pr_debug("%s %d closing listen_epd\n", __func__, __LINE__);
 350        if (listen_epd) {
 351                scif_close(listen_epd);
 352                listen_epd = NULL;
 353        }
 354}
 355
 356/*
 357 * Create a listening SCIF endpoint and a server kthread which accepts incoming
 358 * SCIF connections from MIC cards
 359 */
 360int cosm_scif_init(void)
 361{
 362        int rc = cosm_scif_listen();
 363
 364        if (rc) {
 365                pr_err("%s %d cosm_scif_listen rc %d\n",
 366                       __func__, __LINE__, rc);
 367                goto err;
 368        }
 369
 370        server_thread = kthread_run(cosm_scif_server, NULL, "cosm_server");
 371        if (IS_ERR(server_thread)) {
 372                rc = PTR_ERR(server_thread);
 373                pr_err("%s %d kthread_run rc %d\n", __func__, __LINE__, rc);
 374                goto listen_exit;
 375        }
 376        return 0;
 377listen_exit:
 378        cosm_scif_listen_exit();
 379err:
 380        return rc;
 381}
 382
 383/* Stop the running server thread and close the listening SCIF endpoint */
 384void cosm_scif_exit(void)
 385{
 386        int rc;
 387
 388        if (!IS_ERR_OR_NULL(server_thread)) {
 389                rc = send_sig(SIGKILL, server_thread, 0);
 390                if (rc) {
 391                        pr_err("%s %d send_sig rc %d\n",
 392                               __func__, __LINE__, rc);
 393                        return;
 394                }
 395                kthread_stop(server_thread);
 396        }
 397
 398        cosm_scif_listen_exit();
 399}
 400