linux/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2019 Intel Corporation
 */

#include "i915_drv.h"
#include "i915_request.h"

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine.h"
#include "intel_gt.h"
#include "intel_reset.h"

/*
 * While the engine is active, we send a periodic pulse along the engine
 * to check on its health and to flush any idle-barriers. If that request
 * is stuck, and we fail to preempt it, we declare the engine hung and
 * issue a reset -- in the hope that it restores progress.
 */

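/*
 * Schedule the next heartbeat tick: returns false if heartbeats are
 * disabled (an interval of 0), otherwise (re)arms the delayed worker
 * to fire after the configured interval.
 */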
static bool next_heartbeat(struct intel_engine_cs *engine)
{
        long delay;

        delay = READ_ONCE(engine->props.heartbeat_interval_ms);
        if (!delay)
                return false;

        delay = msecs_to_jiffies_timeout(delay);
        if (delay >= HZ)
                delay = round_jiffies_up_relative(delay);
        mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);

        return true;
}

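/*
 * Create a new request on the kernel context, marking the context as
 * active for the duration of the allocation.
 */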
static struct i915_request *
heartbeat_create(struct intel_context *ce, gfp_t gfp)
{
        struct i915_request *rq;

        intel_context_enter(ce);
        rq = __i915_request_create(ce, gfp);
        intel_context_exit(ce);

        return rq;
}

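/*
 * Account for the pulse in the engine's wakeref serial, attach any
 * pending idle barriers to the request and, while heartbeats are
 * enabled, keep a reference to it as the in-flight systole.
 */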
static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
{
        engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
        i915_request_add_active_barriers(rq);
        if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
                engine->heartbeat.systole = i915_request_get(rq);
}

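/* Attach the idle barriers, then commit and queue the pulse for execution. */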
static void heartbeat_commit(struct i915_request *rq,
                             const struct i915_sched_attr *attr)
{
        idle_pulse(rq->engine, rq);

        __i915_request_commit(rq);
        __i915_request_queue(rq, attr);
}

static void show_heartbeat(const struct i915_request *rq,
                           struct intel_engine_cs *engine)
{
        struct drm_printer p = drm_debug_printer("heartbeat");

        if (!rq) {
                intel_engine_dump(engine, &p,
                                  "%s heartbeat not ticking\n",
                                  engine->name);
        } else {
                intel_engine_dump(engine, &p,
                                  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
                                  engine->name,
                                  rq->fence.context,
                                  rq->fence.seqno,
                                  rq->sched.attr.priority);
        }
}

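/*
 * The heartbeat has stopped ticking: dump the engine state for debug,
 * ask the GuC to identify the hung context where GuC submission is in
 * use, then declare the engine hung so that an error capture and reset
 * are issued.
 */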
static void
reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
{
        if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
                show_heartbeat(rq, engine);

        if (intel_engine_uses_guc(engine))
                /*
                 * GuC itself is toast or GuC's hang detection
                 * is disabled. Either way, need to find the
                 * hang culprit manually.
                 */
                intel_guc_find_hung_context(engine);

        intel_gt_handle_error(engine->gt, engine->mask,
                              I915_ERROR_CAPTURE,
                              "stopped heartbeat on %s",
                              engine->name);
}

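/*
 * Heartbeat worker. If the previous pulse is still in flight, raise its
 * priority one step on each tick (0, then HEARTBEAT, then BARRIER) to
 * try to preempt the current workload; if even a barrier-priority pulse
 * fails to complete, declare the engine hung and reset it. If the
 * engine has seen new work since the last pulse, emit a fresh pulse,
 * then re-arm the timer for the next interval.
 */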
static void heartbeat(struct work_struct *wrk)
{
        struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
        struct intel_engine_cs *engine =
                container_of(wrk, typeof(*engine), heartbeat.work.work);
        struct intel_context *ce = engine->kernel_context;
        struct i915_request *rq;
        unsigned long serial;

        /* Just in case everything has gone horribly wrong, give it a kick */
        intel_engine_flush_submission(engine);

        rq = engine->heartbeat.systole;
        if (rq && i915_request_completed(rq)) {
                i915_request_put(rq);
                engine->heartbeat.systole = NULL;
        }

        if (!intel_engine_pm_get_if_awake(engine))
                return;

        if (intel_gt_is_wedged(engine->gt))
                goto out;

        if (i915_sched_engine_disabled(engine->sched_engine)) {
                reset_engine(engine, engine->heartbeat.systole);
                goto out;
        }

        if (engine->heartbeat.systole) {
                long delay = READ_ONCE(engine->props.heartbeat_interval_ms);

                /* Safeguard against too-fast worker invocations */
                if (!time_after(jiffies,
                                rq->emitted_jiffies + msecs_to_jiffies(delay)))
                        goto out;

                if (!i915_sw_fence_signaled(&rq->submit)) {
                        /*
                         * Not yet submitted, system is stalled.
                         *
                         * This more often happens for ring submission,
                         * where all contexts are funnelled into a common
                         * ringbuffer. If one context is blocked on an
                         * external fence, not only is it not submitted,
                         * but all other contexts, including the kernel
                         * context are stuck waiting for the signal.
                         */
                } else if (engine->sched_engine->schedule &&
                           rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
                        /*
                         * Gradually raise the priority of the heartbeat to
                         * give high priority work [which presumably desires
                         * low latency and no jitter] the chance to naturally
                         * complete before being preempted.
                         */
                        attr.priority = 0;
                        if (rq->sched.attr.priority >= attr.priority)
                                attr.priority = I915_PRIORITY_HEARTBEAT;
                        if (rq->sched.attr.priority >= attr.priority)
                                attr.priority = I915_PRIORITY_BARRIER;

                        local_bh_disable();
                        engine->sched_engine->schedule(rq, &attr);
                        local_bh_enable();
                } else {
                        reset_engine(engine, rq);
                }

                rq->emitted_jiffies = jiffies;
                goto out;
        }

        serial = READ_ONCE(engine->serial);
        if (engine->wakeref_serial == serial)
                goto out;

        if (!mutex_trylock(&ce->timeline->mutex)) {
                /* Unable to lock the kernel timeline, is the engine stuck? */
                if (xchg(&engine->heartbeat.blocked, serial) == serial)
                        intel_gt_handle_error(engine->gt, engine->mask,
                                              I915_ERROR_CAPTURE,
                                              "no heartbeat on %s",
                                              engine->name);
                goto out;
        }

        rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
        if (IS_ERR(rq))
                goto unlock;

        heartbeat_commit(rq, &attr);

unlock:
        mutex_unlock(&ce->timeline->mutex);
out:
        if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
                i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
        intel_engine_pm_put(engine);
}

void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
{
        if (!IS_ACTIVE(CONFIG_DRM_I915_HEARTBEAT_INTERVAL))
                return;

        next_heartbeat(engine);
}

void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
        if (cancel_delayed_work(&engine->heartbeat.work))
                i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
}

void intel_gt_unpark_heartbeats(struct intel_gt *gt)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        for_each_engine(engine, gt, id)
                if (intel_engine_pm_is_awake(engine))
                        intel_engine_unpark_heartbeat(engine);
}

void intel_gt_park_heartbeats(struct intel_gt *gt)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        for_each_engine(engine, gt, id)
                intel_engine_park_heartbeat(engine);
}

void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
        INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}

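/*
 * Send a single barrier-priority pulse along the engine. The caller
 * must hold the kernel context timeline mutex and an awake wakeref.
 */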
static int __intel_engine_pulse(struct intel_engine_cs *engine)
{
        struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
        struct intel_context *ce = engine->kernel_context;
        struct i915_request *rq;

        lockdep_assert_held(&ce->timeline->mutex);
        GEM_BUG_ON(!intel_engine_has_preemption(engine));
        GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

        rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
        if (IS_ERR(rq))
                return PTR_ERR(rq);

        __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);

        heartbeat_commit(rq, &attr);
        GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);

        return 0;
}

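/*
 * Swap in the new interval and park or re-arm the heartbeat worker
 * accordingly, returning the previous interval.
 */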
static unsigned long set_heartbeat(struct intel_engine_cs *engine,
                                   unsigned long delay)
{
        unsigned long old;

        old = xchg(&engine->props.heartbeat_interval_ms, delay);
        if (delay)
                intel_engine_unpark_heartbeat(engine);
        else
                intel_engine_park_heartbeat(engine);

        return old;
}

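/*
 * Update the heartbeat interval. Where possible, a pulse is sent so
 * that the change is applied against the currently executing workload;
 * if the pulse cannot be emitted, the previous interval is restored.
 */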
int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
                               unsigned long delay)
{
        struct intel_context *ce = engine->kernel_context;
        int err = 0;

        if (!delay && !intel_engine_has_preempt_reset(engine))
                return -ENODEV;

        intel_engine_pm_get(engine);

        err = mutex_lock_interruptible(&ce->timeline->mutex);
        if (err)
                goto out_rpm;

        if (delay != engine->props.heartbeat_interval_ms) {
                unsigned long saved = set_heartbeat(engine, delay);

                /* recheck current execution */
                if (intel_engine_has_preemption(engine)) {
                        err = __intel_engine_pulse(engine);
                        if (err)
                                set_heartbeat(engine, saved);
                }
        }

        mutex_unlock(&ce->timeline->mutex);

out_rpm:
        intel_engine_pm_put(engine);
        return err;
}

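/*
 * Send one barrier-priority pulse along the engine. Returns 0 without
 * doing anything if the engine is already parked, or -ENODEV if the
 * engine does not support preemption.
 */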
int intel_engine_pulse(struct intel_engine_cs *engine)
{
        struct intel_context *ce = engine->kernel_context;
        int err;

        if (!intel_engine_has_preemption(engine))
                return -ENODEV;

        if (!intel_engine_pm_get_if_awake(engine))
                return 0;

        err = -EINTR;
        if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
                err = __intel_engine_pulse(engine);
                mutex_unlock(&ce->timeline->mutex);
        }

        intel_engine_flush_submission(engine);
        intel_engine_pm_put(engine);
        return err;
}

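/*
 * Flush any pending idle barriers by emitting a minimum-priority pulse
 * on the kernel context. Does nothing if no barriers are queued or the
 * engine is already parked.
 */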
int intel_engine_flush_barriers(struct intel_engine_cs *engine)
{
        struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
        struct intel_context *ce = engine->kernel_context;
        struct i915_request *rq;
        int err;

        if (llist_empty(&engine->barrier_tasks))
                return 0;

        if (!intel_engine_pm_get_if_awake(engine))
                return 0;

        if (mutex_lock_interruptible(&ce->timeline->mutex)) {
                err = -EINTR;
                goto out_rpm;
        }

        rq = heartbeat_create(ce, GFP_KERNEL);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto out_unlock;
        }

        heartbeat_commit(rq, &attr);

        err = 0;
out_unlock:
        mutex_unlock(&ce->timeline->mutex);
out_rpm:
        intel_engine_pm_put(engine);
        return err;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_engine_heartbeat.c"
#endif