linux/drivers/misc/mic/cosm/cosm_scif_server.c
<<
>>
Prefs
   1/*
   2 * Intel MIC Platform Software Stack (MPSS)
   3 *
   4 * Copyright(c) 2015 Intel Corporation.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License, version 2, as
   8 * published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13 * General Public License for more details.
  14 *
  15 * The full GNU General Public License is included in this distribution in
  16 * the file called "COPYING".
  17 *
  18 * Intel MIC Coprocessor State Management (COSM) Driver
  19 *
  20 */
  21#include <linux/kthread.h>
  22#include "cosm_main.h"
  23
  24/*
  25 * The COSM driver uses SCIF to communicate between the management node and the
  26 * MIC cards. SCIF is used to (a) Send a shutdown command to the card (b)
  27 * receive a shutdown status back from the card upon completion of shutdown and
  28 * (c) receive periodic heartbeat messages from the card used to deduce if the
  29 * card has crashed.
  30 *
  31 * A COSM server consisting of a SCIF listening endpoint waits for incoming
  32 * connections from the card. Upon acceptance of the connection, a separate
  33 * work-item is scheduled to handle SCIF message processing for that card. The
  34 * life-time of this work-item is therefore the time from which the connection
  35 * from a card is accepted to the time at which the connection is closed. A new
  36 * work-item starts each time the card boots and is alive till the card (a)
  37 * shuts down (b) is reset (c) crashes (d) cosm_client driver on the card is
  38 * unloaded.
  39 *
   40 * The COSM interactions with SCIF during card shutdown, reset and crash are
   41 * as follows:
  42 *
  43 * Card shutdown
  44 * -------------
  45 * 1. COSM client on the card invokes orderly_poweroff() in response to SHUTDOWN
  46 *    message from the host.
  47 * 2. Card driver shutdown callback invokes scif_unregister_device(..) resulting
  48 *    in scif_remove(..) getting called on the card
  49 * 3. scif_remove -> scif_stop -> scif_handle_remove_node ->
  50 *    scif_peer_unregister_device -> device_unregister for the host peer device
  51 * 4. During device_unregister remove(..) method of cosm_client is invoked which
  52 *    closes the COSM SCIF endpoint on the card. This results in a SCIF_DISCNCT
  53 *    message being sent to host SCIF. SCIF_DISCNCT message processing on the
  54 *    host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes
  55 *    up the host COSM thread blocked in scif_poll(..) resulting in
  56 *    scif_poll(..)  returning POLLHUP.
  57 * 5. On the card, scif_peer_release_dev is next called which results in an
  58 *    SCIF_EXIT message being sent to the host and after receiving the
  59 *    SCIF_EXIT_ACK from the host the peer device teardown on the card is
  60 *    complete.
  61 * 6. As part of the SCIF_EXIT message processing on the host, host sends a
  62 *    SCIF_REMOVE_NODE to itself corresponding to the card being removed. This
  63 *    starts a similar SCIF peer device teardown sequence on the host
  64 *    corresponding to the card being shut down.
  65 *
  66 * Card reset
  67 * ----------
  68 * The case of interest here is when the card has not been previously shut down
  69 * since most of the steps below are skipped in that case:
   70 *
  71 * 1. cosm_stop(..) invokes hw_ops->stop(..) method of the base PCIe driver
  72 *    which unregisters the SCIF HW device resulting in scif_remove(..) being
  73 *    called on the host.
  74 * 2. scif_remove(..) calls scif_disconnect_node(..) which results in a
  75 *    SCIF_EXIT message being sent to the card.
  76 * 3. The card executes scif_stop() as part of SCIF_EXIT message
  77 *    processing. This results in the COSM endpoint on the card being closed and
  78 *    the SCIF host peer device on the card getting unregistered similar to
  79 *    steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the
  80 *    host returns POLLHUP as a result.
  81 * 4. On the host, card peer device unregister and SCIF HW remove(..) also
  82 *    subsequently complete.
  83 *
  84 * Card crash
  85 * ----------
   86 * If a reset is issued after the card has crashed, there is no SCIF_DISCNCT
  87 * message from the card which would result in scif_poll(..) returning
  88 * POLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE
  89 * message to itself resulting in the card SCIF peer device being unregistered,
  90 * this results in a scif_peer_release_dev -> scif_cleanup_scifdev->
  91 * scif_invalidate_ep call sequence which sets the endpoint state to
  92 * DISCONNECTED and results in scif_poll(..) returning POLLHUP.
  93 */
  94
  95#define COSM_SCIF_BACKLOG 16
  96#define COSM_HEARTBEAT_CHECK_DELTA_SEC 10
  97#define COSM_HEARTBEAT_TIMEOUT_SEC \
  98                (COSM_HEARTBEAT_SEND_SEC + COSM_HEARTBEAT_CHECK_DELTA_SEC)
  99#define COSM_HEARTBEAT_TIMEOUT_MSEC (COSM_HEARTBEAT_TIMEOUT_SEC * MSEC_PER_SEC)
 100
 101static struct task_struct *server_thread;
 102static scif_epd_t listen_epd;
 103
 104/* Publish MIC card's shutdown status to user space MIC daemon */
 105static void cosm_update_mic_status(struct cosm_device *cdev)
 106{
 107        if (cdev->shutdown_status_int != MIC_NOP) {
 108                cosm_set_shutdown_status(cdev, cdev->shutdown_status_int);
 109                cdev->shutdown_status_int = MIC_NOP;
 110        }
 111}
 112
 113/* Store MIC card's shutdown status internally when it is received */
 114static void cosm_shutdown_status_int(struct cosm_device *cdev,
 115                                     enum mic_status shutdown_status)
 116{
 117        switch (shutdown_status) {
 118        case MIC_HALTED:
 119        case MIC_POWER_OFF:
 120        case MIC_RESTART:
 121        case MIC_CRASHED:
 122                break;
 123        default:
 124                dev_err(&cdev->dev, "%s %d Unexpected shutdown_status %d\n",
 125                        __func__, __LINE__, shutdown_status);
 126                return;
 127        };
 128        cdev->shutdown_status_int = shutdown_status;
 129        cdev->heartbeat_watchdog_enable = false;
 130
 131        if (cdev->state != MIC_SHUTTING_DOWN)
 132                cosm_set_state(cdev, MIC_SHUTTING_DOWN);
 133}
 134
 135/* Non-blocking recv. Read and process all available messages */
 136static void cosm_scif_recv(struct cosm_device *cdev)
 137{
 138        struct cosm_msg msg;
 139        int rc;
 140
 141        while (1) {
 142                rc = scif_recv(cdev->epd, &msg, sizeof(msg), 0);
 143                if (!rc) {
 144                        break;
 145                } else if (rc < 0) {
 146                        dev_dbg(&cdev->dev, "%s: %d rc %d\n",
 147                                __func__, __LINE__, rc);
 148                        break;
 149                }
 150                dev_dbg(&cdev->dev, "%s: %d rc %d id 0x%llx\n",
 151                        __func__, __LINE__, rc, msg.id);
 152
 153                switch (msg.id) {
 154                case COSM_MSG_SHUTDOWN_STATUS:
 155                        cosm_shutdown_status_int(cdev, msg.shutdown_status);
 156                        break;
 157                case COSM_MSG_HEARTBEAT:
 158                        /* Nothing to do, heartbeat only unblocks scif_poll */
 159                        break;
 160                default:
 161                        dev_err(&cdev->dev, "%s: %d unknown msg.id %lld\n",
 162                                __func__, __LINE__, msg.id);
 163                        break;
 164                }
 165        }
 166}
 167
 168/* Publish crashed status for this MIC card */
 169static void cosm_set_crashed(struct cosm_device *cdev)
 170{
 171        dev_err(&cdev->dev, "node alive timeout\n");
 172        cosm_shutdown_status_int(cdev, MIC_CRASHED);
 173        cosm_update_mic_status(cdev);
 174}
 175
 176/* Send host time to the MIC card to sync system time between host and MIC */
 177static void cosm_send_time(struct cosm_device *cdev)
 178{
 179        struct cosm_msg msg = { .id = COSM_MSG_SYNC_TIME };
 180        int rc;
 181
 182        getnstimeofday64(&msg.timespec);
 183        rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
 184        if (rc < 0)
 185                dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n",
 186                        __func__, __LINE__, rc);
 187}
 188
 189/*
 190 * Close this cosm_device's endpoint after its peer endpoint on the card has
 191 * been closed. In all cases except MIC card crash POLLHUP on the host is
 192 * triggered by the client's endpoint being closed.
 193 */
 194static void cosm_scif_close(struct cosm_device *cdev)
 195{
 196        /*
 197         * Because SHUTDOWN_STATUS message is sent by the MIC cards in the
 198         * reboot notifier when shutdown is still not complete, we notify mpssd
 199         * to reset the card when SCIF endpoint is closed.
 200         */
 201        cosm_update_mic_status(cdev);
 202        scif_close(cdev->epd);
 203        cdev->epd = NULL;
 204        dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
 205}
 206
 207/*
 208 * Set card state to ONLINE when a new SCIF connection from a MIC card is
 209 * received. Normally the state is BOOTING when the connection comes in, but can
 210 * be ONLINE if cosm_client driver on the card was unloaded and then reloaded.
 211 */
 212static int cosm_set_online(struct cosm_device *cdev)
 213{
 214        int rc = 0;
 215
 216        if (MIC_BOOTING == cdev->state || MIC_ONLINE == cdev->state) {
 217                cdev->heartbeat_watchdog_enable = cdev->sysfs_heartbeat_enable;
 218                cdev->epd = cdev->newepd;
 219                if (cdev->state == MIC_BOOTING)
 220                        cosm_set_state(cdev, MIC_ONLINE);
 221                cosm_send_time(cdev);
 222                dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
 223        } else {
 224                dev_warn(&cdev->dev, "%s %d not going online in state: %s\n",
 225                         __func__, __LINE__, cosm_state_string[cdev->state]);
 226                rc = -EINVAL;
 227        }
 228        /* Drop reference acquired by bus_find_device in the server thread */
 229        put_device(&cdev->dev);
 230        return rc;
 231}
 232
 233/*
 234 * Work function for handling work for a SCIF connection from a particular MIC
 235 * card. It first sets the card state to ONLINE and then calls scif_poll to
 236 * block on activity such as incoming messages on the SCIF endpoint. When the
 237 * endpoint is closed, the work function exits, completing its life cycle, from
 238 * MIC card boot to card shutdown/reset/crash.
 239 */
 240void cosm_scif_work(struct work_struct *work)
 241{
 242        struct cosm_device *cdev = container_of(work, struct cosm_device,
 243                                                scif_work);
 244        struct scif_pollepd pollepd;
 245        int rc;
 246
 247        mutex_lock(&cdev->cosm_mutex);
 248        if (cosm_set_online(cdev))
 249                goto exit;
 250
 251        while (1) {
 252                pollepd.epd = cdev->epd;
 253                pollepd.events = POLLIN;
 254
 255                /* Drop the mutex before blocking in scif_poll(..) */
 256                mutex_unlock(&cdev->cosm_mutex);
 257                /* poll(..) with timeout on our endpoint */
 258                rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_TIMEOUT_MSEC);
 259                mutex_lock(&cdev->cosm_mutex);
 260                if (rc < 0) {
 261                        dev_err(&cdev->dev, "%s %d scif_poll rc %d\n",
 262                                __func__, __LINE__, rc);
 263                        continue;
 264                }
 265
 266                /* There is a message from the card */
 267                if (pollepd.revents & POLLIN)
 268                        cosm_scif_recv(cdev);
 269
 270                /* The peer endpoint is closed or this endpoint disconnected */
 271                if (pollepd.revents & POLLHUP) {
 272                        cosm_scif_close(cdev);
 273                        break;
 274                }
 275
 276                /* Did we timeout from poll? */
 277                if (!rc && cdev->heartbeat_watchdog_enable)
 278                        cosm_set_crashed(cdev);
 279        }
 280exit:
 281        dev_dbg(&cdev->dev, "%s %d exiting\n", __func__, __LINE__);
 282        mutex_unlock(&cdev->cosm_mutex);
 283}
 284
 285/*
 286 * COSM SCIF server thread function. Accepts incoming SCIF connections from MIC
 287 * cards, finds the correct cosm_device to associate that connection with and
 288 * schedules individual work items for each MIC card.
 289 */
 290static int cosm_scif_server(void *unused)
 291{
 292        struct cosm_device *cdev;
 293        scif_epd_t newepd;
 294        struct scif_port_id port_id;
 295        int rc;
 296
 297        allow_signal(SIGKILL);
 298
 299        while (!kthread_should_stop()) {
 300                rc = scif_accept(listen_epd, &port_id, &newepd,
 301                                 SCIF_ACCEPT_SYNC);
 302                if (rc < 0) {
 303                        if (-ERESTARTSYS != rc)
 304                                pr_err("%s %d rc %d\n", __func__, __LINE__, rc);
 305                        continue;
 306                }
 307
 308                /*
 309                 * Associate the incoming connection with a particular
 310                 * cosm_device, COSM device ID == SCIF node ID - 1
 311                 */
 312                cdev = cosm_find_cdev_by_id(port_id.node - 1);
 313                if (!cdev)
 314                        continue;
 315                cdev->newepd = newepd;
 316                schedule_work(&cdev->scif_work);
 317        }
 318
 319        pr_debug("%s %d Server thread stopped\n", __func__, __LINE__);
 320        return 0;
 321}
 322
 323static int cosm_scif_listen(void)
 324{
 325        int rc;
 326
 327        listen_epd = scif_open();
 328        if (!listen_epd) {
 329                pr_err("%s %d scif_open failed\n", __func__, __LINE__);
 330                return -ENOMEM;
 331        }
 332
 333        rc = scif_bind(listen_epd, SCIF_COSM_LISTEN_PORT);
 334        if (rc < 0) {
 335                pr_err("%s %d scif_bind failed rc %d\n",
 336                       __func__, __LINE__, rc);
 337                goto err;
 338        }
 339
 340        rc = scif_listen(listen_epd, COSM_SCIF_BACKLOG);
 341        if (rc < 0) {
 342                pr_err("%s %d scif_listen rc %d\n", __func__, __LINE__, rc);
 343                goto err;
 344        }
 345        pr_debug("%s %d listen_epd set up\n", __func__, __LINE__);
 346        return 0;
 347err:
 348        scif_close(listen_epd);
 349        listen_epd = NULL;
 350        return rc;
 351}
 352
 353static void cosm_scif_listen_exit(void)
 354{
 355        pr_debug("%s %d closing listen_epd\n", __func__, __LINE__);
 356        if (listen_epd) {
 357                scif_close(listen_epd);
 358                listen_epd = NULL;
 359        }
 360}
 361
 362/*
 363 * Create a listening SCIF endpoint and a server kthread which accepts incoming
 364 * SCIF connections from MIC cards
 365 */
 366int cosm_scif_init(void)
 367{
 368        int rc = cosm_scif_listen();
 369
 370        if (rc) {
 371                pr_err("%s %d cosm_scif_listen rc %d\n",
 372                       __func__, __LINE__, rc);
 373                goto err;
 374        }
 375
 376        server_thread = kthread_run(cosm_scif_server, NULL, "cosm_server");
 377        if (IS_ERR(server_thread)) {
 378                rc = PTR_ERR(server_thread);
 379                pr_err("%s %d kthread_run rc %d\n", __func__, __LINE__, rc);
 380                goto listen_exit;
 381        }
 382        return 0;
 383listen_exit:
 384        cosm_scif_listen_exit();
 385err:
 386        return rc;
 387}
 388
 389/* Stop the running server thread and close the listening SCIF endpoint */
 390void cosm_scif_exit(void)
 391{
 392        int rc;
 393
 394        if (!IS_ERR_OR_NULL(server_thread)) {
 395                rc = send_sig(SIGKILL, server_thread, 0);
 396                if (rc) {
 397                        pr_err("%s %d send_sig rc %d\n",
 398                               __func__, __LINE__, rc);
 399                        return;
 400                }
 401                kthread_stop(server_thread);
 402        }
 403
 404        cosm_scif_listen_exit();
 405}
 406